In [73]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')
import warnings
warnings.filterwarnings('ignore')
from sklearn import preprocessing, decomposition, model_selection, linear_model, metrics, ensemble, svm, utils
from sklearn.datasets import make_classification
import gc

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras import metrics

import sklearn

In [70]:
# Get the data
train_transaction = pd.read_csv('data/train_transaction.csv')
train_identity = pd.read_csv('data/train_identity.csv')

In [71]:
# Join datasets
dataset = train_transaction.merge(train_identity, how='left', left_index=True, right_index=True)

In [72]:
# Reduce memory usage
def reduce_mem(df):
  start_mem=df.memory_usage().sum()/1024**2
  print('Initial Memory Usage : {:.2f} MB'.format(start_mem))
  for col in df.columns:
    col_type=df[col].dtype
    if col_type != object:
      mn, mx = df[col].min(), df[col].max()
      if str(col_type)[:3]=='int':
        if mn>np.iinfo(np.int8).min and mx<np.iinfo(np.int8).max:
          df[col]=df[col].astype(np.int8)
        elif mn>np.iinfo(np.int16).min and mx<np.iinfo(np.int16).max:
          df[col]=df[col].astype(np.int16)
        elif mn>np.iinfo(np.int32).min and mx<np.iinfo(np.int32).max:
          df[col]=df[col].astype(np.int32)
      else:
        if mn>np.finfo(np.float16).min and mx<np.finfo(np.float16).max:
          df[col]=df[col].astype(np.float16)
        elif mn>np.finfo(np.float32).min and mx<np.finfo(np.float32).max:
          df[col]=df[col].astype(np.float32)
  end_mem = df.memory_usage().sum()/1024**2
  print('Final Memory Usage : {:.2f} MB'.format(end_mem))
  print('Decreased by {:.2f}%'.format(100*(start_mem-end_mem)/start_mem))
  return df

In [74]:
dataset = reduce_mem(dataset)

Initial Memory Usage : 1959.88 MB
Final Memory Usage : 648.22 MB
Decreased by 66.93%


In [75]:
del train_transaction, train_identity
gc.collect()

589

In [76]:
# Fill NaN values
dataset = dataset.fillna(0)

In [77]:
# Encode labels
for f in dataset.columns:
    if dataset[f].dtype=='object': 
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(dataset[f].values))
        dataset[f] = lbl.transform(list(dataset[f].values))

In [78]:
y = dataset.isFraud.values
dataset = dataset.drop('isFraud',axis=1)

In [79]:
# Standardize the data
scaler = preprocessing.StandardScaler()
cols = list(dataset.columns)
dataset[cols] = scaler.fit_transform(dataset[cols])

In [80]:
# Reduce number of dimensions through PCA
N = 50
svd = decomposition.TruncatedSVD(n_components=N, random_state=42)
X = svd.fit_transform(dataset[cols])  
print(svd.explained_variance_ratio_.sum())

0.8517370441936513


In [81]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=.2, random_state=1)

In [82]:
random_forest_model = ensemble.RandomForestClassifier(n_estimators=200,criterion='gini',n_jobs=-1).fit(X_train, y_train)

In [83]:
logistic_regression_model = linear_model.LogisticRegression(C=1, solver="newton-cg", penalty="l2", n_jobs=-1, max_iter=200).fit(X_train, y_train) 

In [84]:
svm_model = svm.SVC(max_iter=200).fit(X_train, y_train) 

In [88]:
print(sklearn.metrics.classification_report(y_test, random_forest_model.predict(X_test)))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99    113974
           1       0.91      0.34      0.50      4134

    accuracy                           0.98    118108
   macro avg       0.94      0.67      0.74    118108
weighted avg       0.97      0.98      0.97    118108



In [89]:
print(sklearn.metrics.classification_report(y_test, logistic_regression_model.predict(X_test)))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98    113974
           1       0.81      0.15      0.25      4134

    accuracy                           0.97    118108
   macro avg       0.89      0.57      0.62    118108
weighted avg       0.96      0.97      0.96    118108



In [91]:
print(sklearn.metrics.classification_report(y_test, svm_model.predict(X_test)))

              precision    recall  f1-score   support

           0       0.96      0.98      0.97    113974
           1       0.02      0.01      0.02      4134

    accuracy                           0.95    118108
   macro avg       0.49      0.50      0.49    118108
weighted avg       0.93      0.95      0.94    118108



In [92]:
#lr_param_grid = {}
#rfc_param_grid = {'n_estimators':[25, 50, 100], 'criterion':('gini', 'entropy')}
#svc_param_grid = {}

#rfc = ensemble.RandomForestClassifier()
#rfc_gs = GridSearchCV(rfc, rfc_param_grid, cv=10)
#rfc_gs.fit(X_train, y_train)

In [65]:
def split_with_PCA(k, x_tr, y):
    X_PCA = decomposition.TruncatedSVD(n_components=k).fit_transform(x_tr)  
    return model_selection.train_test_split(X_PCA, y, test_size=.2, random_state=1)

In [68]:


# use PCA from sklearn instead?





K = [50, 75, 100, 125, 150, 175, 200]
nns = [Sequential() for _ in range(len(K))]
results = []
batch_size = 5000
num_epochs = 10

for k, cur_nn in zip(K, nns):
    n_cols = k
    x_tr, x_test, y_tr, y_test = split_with_PCA(k, dataset[cols], y)
    cur_nn.add(Dense(300, activation='relu', input_shape=(n_cols,)))
    cur_nn.add(Dropout(0.2))
    cur_nn.add(Dense(500, activation='relu'))
    cur_nn.add(Dropout(0.2))
    cur_nn.add(Dense(100, activation='relu'))
    cur_nn.add(Dropout(0.2))
    cur_nn.add(Dense(25, activation='relu'))
    cur_nn.add(Dropout(0.2))
    cur_nn.add(Dense(1, activation='sigmoid'))
    cur_nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', 'categorical_accuracy'])
    x_vl, y_vl = x_tr[:batch_size], y_tr[:batch_size]
    x_train, y_train = x_tr[batch_size:], y_tr[batch_size:]
    print("For k=" + str(k))
    cur_nn.fit(x_train, y_train, validation_data=(x_vl, y_vl), epochs=num_epochs, batch_size=batch_size)
    #res = cur_nn.evaluate(x_test, y_test, batch_size=128, verbose=0)
    print(sklearn.metrics.classification_report(y_test, cur_nn.predict_classes(x_test)))
    #results.append(res)
    #print('test loss, test acc, categorical accuracy:', res)

For k=50
Train on 467432 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
              precision    recall  f1-score   support

           0       0.98      1.00      0.99    113974
           1       0.80      0.33      0.47      4134

    accuracy                           0.97    118108
   macro avg       0.89      0.66      0.73    118108
weighted avg       0.97      0.97      0.97    118108

For k=75
Train on 467432 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
              precision    recall  f1-score   support

           0       0.98      1.00      0.99    113974
           1       0.84      0.36      0.50      4134

    accuracy                           0.98    118108
   macro avg       0.91      0.68      0.75    118108
weighted avg       0.97      0.98      0.97    118108

For k=

Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
              precision    recall  f1-score   support

           0       0.98      1.00      0.99    113974
           1       0.89      0.39      0.54      4134

    accuracy                           0.98    118108
   macro avg       0.93      0.69      0.76    118108
weighted avg       0.98      0.98      0.97    118108

For k=150
Train on 467432 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
              precision    recall  f1-score   support

           0       0.98      1.00      0.99    113974
           1       0.86      0.41      0.56      4134

    accuracy                           0.98    118108
   macro avg       0.92      0.71      0.77    118108
weighted avg       0.97      0.98      0.97    118108

For k=175
Train on 467432 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoc

Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
              precision    recall  f1-score   support

           0       0.98      1.00      0.99    113974
           1       0.84      0.45      0.58      4134

    accuracy                           0.98    118108
   macro avg       0.91      0.72      0.79    118108
weighted avg       0.98      0.98      0.97    118108

