In [1]:
%%time
# Import Modules
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
pd.options.display.float_format = '{:,.2f}'.format
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score , confusion_matrix , precision_score , precision_recall_curve , f1_score
from sklearn.metrics import classification_report , roc_auc_score, recall_score
from sklearn.pipeline import Pipeline
import sklearn.metrics as metrics
from joblib import dump, load
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
#Import Models
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.cluster import KMeans
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier


Using TensorFlow backend.


CPU times: user 4 s, sys: 1.48 s, total: 5.48 s
Wall time: 9.63 s


In [2]:
%%time
# Read csv file into dataframe
df = pd.read_csv('paysim_features.csv').drop(['Unnamed: 0'], axis=1)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 20 columns):
step              int64
amount            float64
ChangeOrig        float64
ChangeDest        float64
nameOrig          object
oldbalanceOrg     float64
newbalanceOrig    float64
nameDest          object
oldbalanceDest    float64
newbalanceDest    float64
isFraud           int64
isFlaggedFraud    int64
type_CASH_IN      int64
type_CASH_OUT     int64
type_DEBIT        int64
type_PAYMENT      int64
type_TRANSFER     int64
steptime          object
dayofweek         int64
timeofday         float64
dtypes: float64(8), int64(9), object(3)
memory usage: 970.9+ MB
CPU times: user 21.6 s, sys: 6.18 s, total: 27.8 s
Wall time: 29.5 s


In [3]:
df.describe()

Unnamed: 0,step,amount,ChangeOrig,ChangeDest,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER,dayofweek,timeofday
count,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0
mean,243.4,179861.9,21230.56,124294.73,833883.1,855113.67,1100701.67,1224996.4,0.0,0.0,0.22,0.35,0.01,0.34,0.08,3.12,15.32
std,142.33,603858.23,146643.29,812939.08,2888242.67,2924048.5,3399180.11,3674128.94,0.04,0.0,0.41,0.48,0.08,0.47,0.28,1.65,4.32
min,1.0,0.0,-10000000.0,-13060826.21,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,156.0,13389.57,-10150.44,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,12.0
50%,239.0,74871.94,0.0,0.0,14208.0,0.0,132705.66,214661.44,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,16.0
75%,335.0,208721.48,0.0,149105.43,107315.18,144258.41,943036.71,1111909.25,0.0,0.0,0.0,1.0,0.0,1.0,0.0,4.0,19.0
max,743.0,92445516.64,1915267.9,105687838.82,59585040.37,49585040.37,356015889.35,356179278.92,1.0,1.0,1.0,1.0,1.0,1.0,1.0,6.0,23.0


In [4]:
df.head()

Unnamed: 0,step,amount,ChangeOrig,ChangeDest,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER,steptime,dayofweek,timeofday
0,1,9839.64,-9839.64,0.0,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0,0,0,0,1,0,1970-01-01 01:00:00,3,1.0
1,1,1864.28,-1864.28,0.0,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0,0,0,0,1,0,1970-01-01 01:00:00,3,1.0
2,1,181.0,-181.0,0.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0,0,0,0,0,1,1970-01-01 01:00:00,3,1.0
3,1,181.0,-181.0,-21182.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0,0,1,0,0,0,1970-01-01 01:00:00,3,1.0
4,1,11668.14,-11668.14,0.0,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0,0,0,0,1,0,1970-01-01 01:00:00,3,1.0


In [5]:
# Define X with selected columns, y as the binary variable to predict
X = df.drop(['isFraud' , 'isFlaggedFraud' , 'nameOrig' , 'nameDest' ,'steptime'] , axis = 1 ).values
y = df['isFraud'].values
# Train, Test, Split to segregate Training from Test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [6]:
X.shape

(6362620, 15)

In [7]:
%%time
# Replace the sklearn Pipeline with imblearn Pipeline for SMOTE
from imblearn.pipeline import Pipeline

steps = [('scaler', StandardScaler()),
         ('pca', PCA()),
         ('smote', SMOTE()),         
         ('lr', LogisticRegression())]

model = Pipeline(steps)


# Fit it to the training data
model.fit(X_train, y_train)

model_predict = model.predict(X_test)

# Compute and print metrics
print (confusion_matrix(y_test , model_predict))
print ("Accuracy score: ", accuracy_score(y_test , model_predict))
print("F1 score: ", f1_score(y_test , model_predict))
print("Precision score: " , precision_score(y_test , model_predict))
print("Recall score: " , recall_score(y_test , model_predict))
print("ROC-AUC score: " , roc_auc_score(y_test , model_predict))
print (classification_report(y_test , model_predict))
lr = model
lr



[[1823999   82352]
 [    128    2307]]
Accuracy score:  0.9567892891083652
F1 score:  0.05297724297885044
Precision score:  0.02725049906093859
Recall score:  0.9474332648870637
ROC-AUC score:  0.9521172522664291
              precision    recall  f1-score   support

           0       1.00      0.96      0.98   1906351
           1       0.03      0.95      0.05      2435

    accuracy                           0.96   1908786
   macro avg       0.51      0.95      0.52   1908786
weighted avg       1.00      0.96      0.98   1908786

CPU times: user 1min 52s, sys: 11.9 s, total: 2min 4s
Wall time: 1min 55s


Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('pca',
                 PCA(copy=True, iterated_power='auto', n_components=None,
                     random_state=None, svd_solver='auto', tol=0.0,
                     whiten=False)),
                ('smote',
                 SMOTE(k_neighbors=5, kind='deprecated',
                       m_neighbors='deprecated', n_jobs=1,
                       out_step='deprecated', random_state=None, ratio=None,
                       sampling_strategy='auto', svm_estimator='deprecated')),
                ('lr',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='warn', n_jobs=None,
                                    penalty='l2', random_state=Non

In [8]:
%%time
# Setup the pipeline
steps = [('scaler', StandardScaler()),
         ('pca', PCA()),
         ('smote', SMOTE()),         
         ('gnb', GaussianNB())]

model = Pipeline(steps)


# Fit it to the training data
model.fit(X_train, y_train)

model_predict = model.predict(X_test)

# Compute and print metrics
print (confusion_matrix(y_test , model_predict))
print ("Accuracy score: ", accuracy_score(y_test , model_predict))
print("F1 score: ", f1_score(y_test , model_predict))
print("Precision score: " , precision_score(y_test , model_predict))
print("Recall score: " , recall_score(y_test , model_predict))
print("ROC-AUC score: " , roc_auc_score(y_test , model_predict))
print (classification_report(y_test , model_predict))
gnb = model
gnb

[[1862508   43843]
 [   1247    1188]]
Accuracy score:  0.9763776557455891
F1 score:  0.05005688282138795
Precision score:  0.02638182585330106
Recall score:  0.4878850102669405
ROC-AUC score:  0.7324433111235529
              precision    recall  f1-score   support

           0       1.00      0.98      0.99   1906351
           1       0.03      0.49      0.05      2435

    accuracy                           0.98   1908786
   macro avg       0.51      0.73      0.52   1908786
weighted avg       1.00      0.98      0.99   1908786

CPU times: user 55.8 s, sys: 5.53 s, total: 1min 1s
Wall time: 47.4 s


Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('pca',
                 PCA(copy=True, iterated_power='auto', n_components=None,
                     random_state=None, svd_solver='auto', tol=0.0,
                     whiten=False)),
                ('smote',
                 SMOTE(k_neighbors=5, kind='deprecated',
                       m_neighbors='deprecated', n_jobs=1,
                       out_step='deprecated', random_state=None, ratio=None,
                       sampling_strategy='auto', svm_estimator='deprecated')),
                ('gnb', GaussianNB(priors=None, var_smoothing=1e-09))],
         verbose=False)

In [9]:
%%time
# Setup the pipeline
steps = [('scaler', StandardScaler()),
         ('pca', PCA()),
         ('smote', SMOTE()),         
         ('kmeans', KMeans(n_clusters=2))]

model = Pipeline(steps)


# Fit it to the training data
model.fit(X_train, y_train)

model_predict = model.predict(X_test)

# Compute and print metrics
print (confusion_matrix(y_test , model_predict))
print ("Accuracy score: ", accuracy_score(y_test , model_predict))
print("F1 score: ", f1_score(y_test , model_predict))
print("Precision score: " , precision_score(y_test , model_predict))
print("Recall score: " , recall_score(y_test , model_predict))
print("ROC-AUC score: " , roc_auc_score(y_test , model_predict))
print (classification_report(y_test , model_predict))
kmeans = model
kmeans

[[1906342       9]
 [   2180     255]]
Accuracy score:  0.9988531977916854
F1 score:  0.18895887365690997
Precision score:  0.9659090909090909
Recall score:  0.10472279260780287
ROC-AUC score:  0.5523590357732332
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1906351
           1       0.97      0.10      0.19      2435

    accuracy                           1.00   1908786
   macro avg       0.98      0.55      0.59   1908786
weighted avg       1.00      1.00      1.00   1908786

CPU times: user 2min 31s, sys: 26.5 s, total: 2min 57s
Wall time: 2min 14s


Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('pca',
                 PCA(copy=True, iterated_power='auto', n_components=None,
                     random_state=None, svd_solver='auto', tol=0.0,
                     whiten=False)),
                ('smote',
                 SMOTE(k_neighbors=5, kind='deprecated',
                       m_neighbors='deprecated', n_jobs=1,
                       out_step='deprecated', random_state=None, ratio=None,
                       sampling_strategy='auto', svm_estimator='deprecated')),
                ('kmeans',
                 KMeans(algorithm='auto', copy_x=True, init='k-means++',
                        max_iter=300, n_clusters=2, n_init=10, n_jobs=None,
                        precompute_distances='auto', random_state=None,
                        tol=0.0001, verbose=0))],
         verbose=False)

In [10]:
%%time
# Setup the pipeline
steps = [('scaler', StandardScaler()),
         ('pca', PCA()),
         ('smote', SMOTE()),         
         ('mlp', MLPClassifier())]

model = Pipeline(steps)


# Fit it to the training data
model.fit(X_train, y_train)

model_predict = model.predict(X_test)

# Compute and print metrics
print (confusion_matrix(y_test , model_predict))
print ("Accuracy score: ", accuracy_score(y_test , model_predict))
print("F1 score: ", f1_score(y_test , model_predict))
print("Precision score: " , precision_score(y_test , model_predict))
print("Recall score: " , recall_score(y_test , model_predict))
print("ROC-AUC score: " , roc_auc_score(y_test , model_predict))
print (classification_report(y_test , model_predict))
mlp = model
mlp



[[1899103    7248]
 [     43    2392]]
Accuracy score:  0.9961802947004011
F1 score:  0.3961904761904762
Precision score:  0.24813278008298756
Recall score:  0.9823408624229979
ROC-AUC score:  0.9892694171799801
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1906351
           1       0.25      0.98      0.40      2435

    accuracy                           1.00   1908786
   macro avg       0.62      0.99      0.70   1908786
weighted avg       1.00      1.00      1.00   1908786

CPU times: user 6h 39min 6s, sys: 4min 33s, total: 6h 43min 39s
Wall time: 3h 26min 16s


Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('pca',
                 PCA(copy=True, iterated_power='auto', n_components=None,
                     random_state=None, svd_solver='auto', tol=0.0,
                     whiten=False)),
                ('smote',
                 SMOTE(k_neighbors=5, kind='deprecated',
                       m_neighbors='deprecated', n_jobs=1,
                       out_step='deprecated', random_state=None, ratio=None,
                       samp...
                               batch_size='auto', beta_1=0.9, beta_2=0.999,
                               early_stopping=False, epsilon=1e-08,
                               hidden_layer_sizes=(100,),
                               learning_rate='constant',
                               learning_rate_init=0.001, max_iter=200,
                               momentum=0.9, n_iter_no_change=10,
                          

In [11]:
%%time
# Setup the pipeline
steps = [('scaler', StandardScaler()),
         ('pca', PCA()),
         ('smote', SMOTE()),         
         ('rfc', RandomForestClassifier())]

model = Pipeline(steps)


# Fit it to the training data
model.fit(X_train, y_train)

model_predict = model.predict(X_test)

# Compute and print metrics
print (confusion_matrix(y_test , model_predict))
print ("Accuracy score: ", accuracy_score(y_test , model_predict))
print("F1 score: ", f1_score(y_test , model_predict))
print("Precision score: " , precision_score(y_test , model_predict))
print("Recall score: " , recall_score(y_test , model_predict))
print("ROC-AUC score: " , roc_auc_score(y_test , model_predict))
print (classification_report(y_test , model_predict))
rfc = model
rfc



[[1904884    1467]
 [    418    2017]]
Accuracy score:  0.9990124613235847
F1 score:  0.6815340429126542
Precision score:  0.5789322617680827
Recall score:  0.8283367556468173
ROC-AUC score:  0.9137836113244795
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1906351
           1       0.58      0.83      0.68      2435

    accuracy                           1.00   1908786
   macro avg       0.79      0.91      0.84   1908786
weighted avg       1.00      1.00      1.00   1908786

CPU times: user 21min 26s, sys: 23.3 s, total: 21min 50s
Wall time: 21min 33s


Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('pca',
                 PCA(copy=True, iterated_power='auto', n_components=None,
                     random_state=None, svd_solver='auto', tol=0.0,
                     whiten=False)),
                ('smote',
                 SMOTE(k_neighbors=5, kind='deprecated',
                       m_neighbors='deprecated', n_jobs=1,
                       out_step='deprecated', random_state=None, ratio=None,
                       samp...
                 RandomForestClassifier(bootstrap=True, class_weight=None,
                                        criterion='gini', max_depth=None,
                                        max_features='auto',
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                   

In [12]:
%%time
# Setup the pipeline
steps = [('scaler', StandardScaler()),
         ('pca', PCA()),
         ('smote', SMOTE()),         
         ('brfc', BalancedRandomForestClassifier())]

model = Pipeline(steps)


# Fit it to the training data
model.fit(X_train, y_train)

model_predict = model.predict(X_test)

# Compute and print metrics
print (confusion_matrix(y_test , model_predict))
print ("Accuracy score: ", accuracy_score(y_test , model_predict))
print("F1 score: ", f1_score(y_test , model_predict))
print("Precision score: " , precision_score(y_test , model_predict))
print("Recall score: " , recall_score(y_test , model_predict))
print("ROC-AUC score: " , roc_auc_score(y_test , model_predict))
print (classification_report(y_test , model_predict))
brfc = model
brfc

[[1904689    1662]
 [    382    2053]]
Accuracy score:  0.9989291623052557
F1 score:  0.6676422764227641
Precision score:  0.5526244952893674
Recall score:  0.8431211498973306
ROC-AUC score:  0.9211246636185902
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1906351
           1       0.55      0.84      0.67      2435

    accuracy                           1.00   1908786
   macro avg       0.78      0.92      0.83   1908786
weighted avg       1.00      1.00      1.00   1908786

CPU times: user 2h 18min 44s, sys: 2min 30s, total: 2h 21min 14s
Wall time: 5h 54min 9s


Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('pca',
                 PCA(copy=True, iterated_power='auto', n_components=None,
                     random_state=None, svd_solver='auto', tol=0.0,
                     whiten=False)),
                ('smote',
                 SMOTE(k_neighbors=5, kind='deprecated',
                       m_neighbors='deprecated', n_jobs=1,
                       out_step='deprecated', random_state=None, ratio=None,
                       samp...
                 BalancedRandomForestClassifier(bootstrap=True,
                                                class_weight=None,
                                                criterion='gini',
                                                max_depth=None,
                                                max_features='auto',
                                                max_leaf_nodes=None,
                       

In [13]:
%%time
# Setup the pipeline
steps = [('scaler', StandardScaler()),
         ('pca', PCA()),
         ('smote', SMOTE()),         
         ('xgb', GradientBoostingClassifier())]

model = Pipeline(steps)


# Fit it to the training data
model.fit(X_train, y_train)

model_predict = model.predict(X_test)

# Compute and print metrics
print (confusion_matrix(y_test , model_predict))
print ("Accuracy score: ", accuracy_score(y_test , model_predict))
print("F1 score: ", f1_score(y_test , model_predict))
print("Precision score: " , precision_score(y_test , model_predict))
print("Recall score: " , recall_score(y_test , model_predict))
print("ROC-AUC score: " , roc_auc_score(y_test , model_predict))
print (classification_report(y_test , model_predict))
xgb = model
xgb

[[1841299   65052]
 [     91    2344]]
Accuracy score:  0.9658720254654005
F1 score:  0.06713350804084145
Precision score:  0.034779512137218825
Recall score:  0.9626283367556469
ROC-AUC score:  0.9642522527075194
              precision    recall  f1-score   support

           0       1.00      0.97      0.98   1906351
           1       0.03      0.96      0.07      2435

    accuracy                           0.97   1908786
   macro avg       0.52      0.96      0.52   1908786
weighted avg       1.00      0.97      0.98   1908786

CPU times: user 1h 25min 45s, sys: 1min 12s, total: 1h 26min 58s
Wall time: 1h 25min 33s


Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('pca',
                 PCA(copy=True, iterated_power='auto', n_components=None,
                     random_state=None, svd_solver='auto', tol=0.0,
                     whiten=False)),
                ('smote',
                 SMOTE(k_neighbors=5, kind='deprecated',
                       m_neighbors='deprecated', n_jobs=1,
                       out_step='deprecated', random_state=None, ratio=None,
                       samp...
                                            learning_rate=0.1, loss='deviance',
                                            max_depth=3, max_features=None,
                                            max_leaf_nodes=None,
                                            min_impurity_decrease=0.0,
                                            min_impurity_split=None,
                                            min_samples_leaf=

In [14]:
%%time
# Setup the pipeline
steps = [('scaler', StandardScaler()),
         ('pca', PCA()),
         ('smote', SMOTE()),         
         ('knn', KNeighborsClassifier())]

model = Pipeline(steps)


# Fit it to the training data
model.fit(X_train, y_train)

model_predict = model.predict(X_test)

# Compute and print metrics
print (confusion_matrix(y_test , model_predict))
print ("Accuracy score: ", accuracy_score(y_test , model_predict))
print("F1 score: ", f1_score(y_test , model_predict))
print("Precision score: " , precision_score(y_test , model_predict))
print("Recall score: " , recall_score(y_test , model_predict))
print("ROC-AUC score: " , roc_auc_score(y_test , model_predict))
print (classification_report(y_test , model_predict))
knn = model
knn

[[1902025    4326]
 [    456    1979]]
Accuracy score:  0.9974947427317677
F1 score:  0.45286041189931353
Precision score:  0.31387787470261697
Recall score:  0.8127310061601642
ROC-AUC score:  0.9052308746721971
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1906351
           1       0.31      0.81      0.45      2435

    accuracy                           1.00   1908786
   macro avg       0.66      0.91      0.73   1908786
weighted avg       1.00      1.00      1.00   1908786

CPU times: user 5min 48s, sys: 8.58 s, total: 5min 56s
Wall time: 5min 45s


Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('pca',
                 PCA(copy=True, iterated_power='auto', n_components=None,
                     random_state=None, svd_solver='auto', tol=0.0,
                     whiten=False)),
                ('smote',
                 SMOTE(k_neighbors=5, kind='deprecated',
                       m_neighbors='deprecated', n_jobs=1,
                       out_step='deprecated', random_state=None, ratio=None,
                       sampling_strategy='auto', svm_estimator='deprecated')),
                ('knn',
                 KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                      metric='minkowski', metric_params=None,
                                      n_jobs=None, n_neighbors=5, p=2,
                                      weights='uniform'))],
         verbose=False)

In [None]:
%%time
# Setup the pipeline
steps = [('scaler', StandardScaler()),
         ('pca', PCA()),
         ('smote', SMOTE()),         
         ('svm', svm.SVC())]

model = Pipeline(steps)


# Fit it to the training data
model.fit(X_train, y_train)

model_predict = model.predict(X_test)

# Compute and print metrics
print (confusion_matrix(y_test , model_predict))
print ("Accuracy score: ", accuracy_score(y_test , model_predict))
print("F1 score: ", f1_score(y_test , model_predict))
print("Precision score: " , precision_score(y_test , model_predict))
print("Recall score: " , recall_score(y_test , model_predict))
print("ROC-AUC score: " , roc_auc_score(y_test , model_predict))
print (classification_report(y_test , model_predict))
svm = model
svm

