# Autoencoder 신용카드 사기 거래 감지하기
> unbalanced data

## Step 1. EDA

In [32]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [33]:
data = pd.read_csv('https://github.com/nsethi31/Kaggle-Data-Credit-Card-Fraud-Detection/raw/master/creditcard.csv')
data.shape

(284807, 31)

In [34]:
data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,-0.5516,-0.617801,-0.99139,-0.311169,1.468177,-0.470401,0.207971,0.025791,0.403993,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,1.612727,1.065235,0.489095,-0.143772,0.635558,0.463917,-0.114805,-0.183361,-0.145783,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,0.624501,0.066084,0.717293,-0.165946,2.345865,-2.890083,1.109969,-0.121359,-2.261857,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,-0.226487,0.178228,0.507757,-0.287924,-0.631418,-1.059647,-0.684093,1.965775,-1.232622,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,-0.822843,0.538196,1.345852,-1.11967,0.175121,-0.451449,-0.237033,-0.038195,0.803487,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [35]:
tmp = data['Class'].value_counts().to_frame().reset_index()
tmp['Percent(%)'] = tmp["Class"].apply(lambda x : round(100*float(x) / len(data), 2))
tmp = tmp.rename(columns = {"index" : "Target", "Class" : "Count"})

tmp

Unnamed: 0,Target,Count,Percent(%)
0,0,284315,99.83
1,1,492,0.17


## Step 2. Data Engneering

In [36]:
x_data = data.loc[:, 'V1' : 'V28']
y_data = data.loc[:, 'Class']

print(x_data.shape)
print(y_data.shape)

(284807, 28)
(284807,)


#### 최악의 불균형 데이터 만들기 위해 1:9 비율로 Train, Test 분리 

In [37]:
# 데이터를 랜덤하게 섞은 후에 Train, Test 데이터를 각각 1:9 으로 나누었습니다.
shuffle_index = np.random.permutation(len(data))
x_data = x_data.values[shuffle_index]
y_data = y_data.values[shuffle_index]

n_train = int(len(x_data) * 0.1)

x_train = x_data[:n_train]
y_train = y_data[:n_train]
x_test = x_data[n_train:]
y_test = y_data[n_train:]

print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(28480, 28)
(28480,)
(256327, 28)
(256327,)


In [38]:
# Train, Test 데이터를 각각 사기/정상 으로 분류하여 보면 다음과 같습니다
pd.DataFrame([[sum(y_train == 0), sum(y_test == 0)], [sum(y_train == 1), sum(y_test == 1)]], 
             columns=['train', 'test'], index=['0 (non-fraud)', '1 (fraud)'])

Unnamed: 0,train,test
0 (non-fraud),28429,255886
1 (fraud),51,441


## Step 3. Modeling

### Modeling1 - RandomForest

In [39]:
from sklearn.ensemble import RandomForestClassifier

# modeling
model_rf = RandomForestClassifier(n_estimators = 15)
# train
model_rf.fit(x_train, y_train)
# predict
y_pred = model_rf.predict(x_test)
y_real = y_test

In [40]:
accuracy = round(sum(y_pred == y_real) / len(y_pred), 4)
precision = round(sum([p == 1 & r == 1 for p, r in zip(y_pred, y_real)]) / sum(y_pred == 1), 4)
recall = round(sum([p == 1 & r == 1 for p, r in zip(y_pred, y_real)]) / sum(y_real == 1), 4)
f1 = round(2 / ((1/precision) + (1/recall)), 4)

print('Accuracy : ', accuracy)
print('Precision : ', precision)
print('Recall : ', recall)
print('f1-score : ', f1)

Accuracy :  0.9993
Precision :  0.876
Recall :  0.7211
f1-score :  0.791


In [41]:
from sklearn.metrics import classification_report

print(classification_report(y_real, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    255886
           1       0.88      0.72      0.79       441

    accuracy                           1.00    256327
   macro avg       0.94      0.86      0.90    256327
weighted avg       1.00      1.00      1.00    256327



### Modeling 2 - Logistic regression with Neural Network

In [42]:
import tensorflow as tf
import tensorflow.keras.layers as layers
import tensorflow.keras.models as models

n_inputs = x_train.shape[1]
n_output = 2

model_nn = tf.keras.Sequential([
    layers.Dense(64, input_shape=(n_inputs, ), activation='tanh'),
    layers.Dense(32, activation='relu'),
    layers.Dense(16, activation='relu'),
    layers.Dense(n_output, activation='softmax'),
])
model_nn.compile(loss = 'sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model_nn.summary()

# train
model_nn.fit(x_train, y_train, batch_size=100, epochs=10, validation_data=(x_test, y_test))

# predict
y_pred = model_nn.predict(x_test)
y_real = y_test

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_15 (Dense)             (None, 64)                1856      
_________________________________________________________________
dense_16 (Dense)             (None, 32)                2080      
_________________________________________________________________
dense_17 (Dense)             (None, 16)                528       
_________________________________________________________________
dense_18 (Dense)             (None, 2)                 34        
Total params: 4,498
Trainable params: 4,498
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [43]:
y_pred = y_pred.argmax(axis=1)

accuracy = round(sum(y_pred == y_real) / len(y_pred), 4)
print('Accuracy : ', accuracy)
print(classification_report(y_real, y_pred))

Accuracy :  0.9993
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    255886
           1       0.78      0.78      0.78       441

    accuracy                           1.00    256327
   macro avg       0.89      0.89      0.89    256327
weighted avg       1.00      1.00      1.00    256327



### Modeling 3 - Autoencoder

In [48]:
import tensorflow as tf
import tensorflow.keras.layers as layers
import tensorflow.keras.models as models

n_inputs = x_train.shape[1]
n_outputs = 2
n_latent = 50

inputs = tf.keras.layers.Input(shape=(n_inputs, ))
x = tf.keras.layers.Dense(100, activation='tanh')(inputs)
latent = tf.keras.layers.Dense(n_latent, activation='tanh')(x)

# Encoder
encoder = tf.keras.models.Model(inputs, latent, name='encoder')
encoder.summary()

latent_inputs = tf.keras.layers.Input(shape=(n_latent, ))
x = tf.keras.layers.Dense(100, activation='tanh')(latent_inputs)
outputs = tf.keras.layers.Dense(n_inputs, activation='sigmoid')(x)

# Decoder
decoder = tf.keras.models.Model(latent_inputs, outputs, name='decoder')
decoder.summary()

# 정상 데이터 만을 학습
x_train_norm = x_train[y_train == 0]

es = tf.keras.callbacks.EarlyStopping(patience=5)
mc = tf.keras.callbacks.ModelCheckpoint('./best_model.h5', save_best_only=True, save_weights_only=True, verbose=1)

autoencoder = tf.keras.models.Model(inputs, decoder(encoder(inputs)))
autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.fit(x_train_norm, x_train_norm, epochs=100, batch_size = 100, validation_data=(x_test, x_test), callbacks=[es,mc])


Model: "encoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         [(None, 28)]              0         
_________________________________________________________________
dense_30 (Dense)             (None, 100)               2900      
_________________________________________________________________
dense_31 (Dense)             (None, 50)                5050      
Total params: 7,950
Trainable params: 7,950
Non-trainable params: 0
_________________________________________________________________
Model: "decoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_8 (InputLayer)         [(None, 50)]              0         
_________________________________________________________________
dense_32 (Dense)             (None, 100)               5100      
________________________________________

<tensorflow.python.keras.callbacks.History at 0x7f0ce2873668>

In [49]:
encoded = encoder.predict(x_train)

classifier = tf.keras.Sequential([
    layers.Dense(32, input_dim=n_latent, activation='tanh'),
    layers.Dense(16, activation='relu'),
    layers.Dense(n_outputs, activation ='softmax')
])
classifier.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
classifier.summary()

classifier.fit(encoded, y_train, batch_size=100, epochs=10)

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_34 (Dense)             (None, 32)                1632      
_________________________________________________________________
dense_35 (Dense)             (None, 16)                528       
_________________________________________________________________
dense_36 (Dense)             (None, 2)                 34        
Total params: 2,194
Trainable params: 2,194
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f0d501c86d8>

In [50]:
pred_y = classifier.predict(encoder.predict(x_test)).argmax(axis=1)
y = y_test

print(classification_report(y, pred_y))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    255886
           1       0.83      0.81      0.82       441

    accuracy                           1.00    256327
   macro avg       0.91      0.90      0.91    256327
weighted avg       1.00      1.00      1.00    256327

