In [1]:
%%time
# Import Modules
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
pd.options.display.float_format = '{:,.2f}'.format
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score , confusion_matrix , precision_score , precision_recall_curve , f1_score
from sklearn.metrics import classification_report , roc_auc_score, recall_score
from sklearn.pipeline import Pipeline
import sklearn.metrics as metrics
from joblib import dump, load
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
#Import Models
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.cluster import KMeans
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier


Using TensorFlow backend.


CPU times: user 3.99 s, sys: 1.32 s, total: 5.31 s
Wall time: 9.14 s


In [2]:
%%time
# Read csv file into dataframe
df = pd.read_csv('paysim_features.csv').drop(['Unnamed: 0'], axis=1)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 20 columns):
step              int64
amount            float64
ChangeOrig        float64
ChangeDest        float64
nameOrig          object
oldbalanceOrg     float64
newbalanceOrig    float64
nameDest          object
oldbalanceDest    float64
newbalanceDest    float64
isFraud           int64
isFlaggedFraud    int64
type_CASH_IN      int64
type_CASH_OUT     int64
type_DEBIT        int64
type_PAYMENT      int64
type_TRANSFER     int64
steptime          object
dayofweek         int64
timeofday         float64
dtypes: float64(8), int64(9), object(3)
memory usage: 970.9+ MB
CPU times: user 18.8 s, sys: 2.33 s, total: 21.1 s
Wall time: 20.9 s


In [3]:
df.describe()

Unnamed: 0,step,amount,ChangeOrig,ChangeDest,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER,dayofweek,timeofday
count,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0
mean,243.4,179861.9,21230.56,124294.73,833883.1,855113.67,1100701.67,1224996.4,0.0,0.0,0.22,0.35,0.01,0.34,0.08,3.12,15.32
std,142.33,603858.23,146643.29,812939.08,2888242.67,2924048.5,3399180.11,3674128.94,0.04,0.0,0.41,0.48,0.08,0.47,0.28,1.65,4.32
min,1.0,0.0,-10000000.0,-13060826.21,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,156.0,13389.57,-10150.44,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,12.0
50%,239.0,74871.94,0.0,0.0,14208.0,0.0,132705.66,214661.44,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,16.0
75%,335.0,208721.48,0.0,149105.43,107315.18,144258.41,943036.71,1111909.25,0.0,0.0,0.0,1.0,0.0,1.0,0.0,4.0,19.0
max,743.0,92445516.64,1915267.9,105687838.82,59585040.37,49585040.37,356015889.35,356179278.92,1.0,1.0,1.0,1.0,1.0,1.0,1.0,6.0,23.0


In [4]:
df.head()

Unnamed: 0,step,amount,ChangeOrig,ChangeDest,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER,steptime,dayofweek,timeofday
0,1,9839.64,-9839.64,0.0,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0,0,0,0,1,0,1970-01-01 01:00:00,3,1.0
1,1,1864.28,-1864.28,0.0,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0,0,0,0,1,0,1970-01-01 01:00:00,3,1.0
2,1,181.0,-181.0,0.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0,0,0,0,0,1,1970-01-01 01:00:00,3,1.0
3,1,181.0,-181.0,-21182.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0,0,1,0,0,0,1970-01-01 01:00:00,3,1.0
4,1,11668.14,-11668.14,0.0,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0,0,0,0,1,0,1970-01-01 01:00:00,3,1.0


In [5]:
# Define X with selected columns, y as the binary variable to predict
X = df.drop(['isFraud' , 'isFlaggedFraud' , 'nameOrig' , 'nameDest' ,'steptime'] , axis = 1 ).values
y = df['isFraud'].values
# Train, Test, Split to segregate Training from Test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [6]:
X.shape

(6362620, 15)

In [7]:
%%time
# Setup the pipeline
steps = [('scaler', StandardScaler()),
         ('lr', LogisticRegression())]

model = Pipeline(steps)


# Fit it to the training data
model.fit(X_train, y_train)

model_predict = model.predict(X_test)

# Compute and print metrics
print (confusion_matrix(y_test , model_predict))
print ("Accuracy score: ", accuracy_score(y_test , model_predict))
print("F1 score: ", f1_score(y_test , model_predict))
print("Precision score: " , precision_score(y_test , model_predict))
print("Recall score: " , recall_score(y_test , model_predict))
print("ROC-AUC score: " , roc_auc_score(y_test , model_predict))
print (classification_report(y_test , model_predict))
lr = model
lr



[[1906208     143]
 [   1183    1252]]
Accuracy score:  0.9993053176207286
F1 score:  0.6537859007832898
Precision score:  0.8974910394265233
Recall score:  0.5141683778234086
ROC-AUC score:  0.7570466827021973
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1906351
           1       0.90      0.51      0.65      2435

    accuracy                           1.00   1908786
   macro avg       0.95      0.76      0.83   1908786
weighted avg       1.00      1.00      1.00   1908786

CPU times: user 55.4 s, sys: 3.55 s, total: 58.9 s
Wall time: 51.9 s


Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('lr',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='warn', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='warn', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

In [8]:
%%time
# Setup the pipeline
steps = [('scaler', StandardScaler()),
         ('gnb', GaussianNB())]

model = Pipeline(steps)


# Fit it to the training data
model.fit(X_train, y_train)

model_predict = model.predict(X_test)

# Compute and print metrics
print (confusion_matrix(y_test , model_predict))
print ("Accuracy score: ", accuracy_score(y_test , model_predict))
print("F1 score: ", f1_score(y_test , model_predict))
print("Precision score: " , precision_score(y_test , model_predict))
print("Recall score: " , recall_score(y_test , model_predict))
print("ROC-AUC score: " , roc_auc_score(y_test , model_predict))
print (classification_report(y_test , model_predict))
gnb = model
gnb

[[1078399  827952]
 [      0    2435]]
Accuracy score:  0.5662415797265906
F1 score:  0.005847588080045917
Precision score:  0.002932367679166461
Recall score:  1.0
ROC-AUC score:  0.782843768015439
              precision    recall  f1-score   support

           0       1.00      0.57      0.72   1906351
           1       0.00      1.00      0.01      2435

    accuracy                           0.57   1908786
   macro avg       0.50      0.78      0.36   1908786
weighted avg       1.00      0.57      0.72   1908786

CPU times: user 21.7 s, sys: 2.45 s, total: 24.1 s
Wall time: 14.5 s


Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('gnb', GaussianNB(priors=None, var_smoothing=1e-09))],
         verbose=False)

In [9]:
%%time
# Setup the pipeline
steps = [('scaler', StandardScaler()),
         ('kmeans', KMeans(n_clusters=2))]

model = Pipeline(steps)


# Fit it to the training data
model.fit(X_train, y_train)

model_predict = model.predict(X_test)

# Compute and print metrics
print (confusion_matrix(y_test , model_predict))
print ("Accuracy score: ", accuracy_score(y_test , model_predict))
print("F1 score: ", f1_score(y_test , model_predict))
print("Precision score: " , precision_score(y_test , model_predict))
print("Recall score: " , recall_score(y_test , model_predict))
print("ROC-AUC score: " , roc_auc_score(y_test , model_predict))
print (classification_report(y_test , model_predict))
kmeans = model
kmeans

[[1485870  420481]
 [   2433       2]]
Accuracy score:  0.7784382324681761
F1 score:  9.458098260182825e-06
Precision score:  4.756434861813676e-06
Recall score:  0.0008213552361396304
ROC-AUC score:  0.3901264225149959
              precision    recall  f1-score   support

           0       1.00      0.78      0.88   1906351
           1       0.00      0.00      0.00      2435

    accuracy                           0.78   1908786
   macro avg       0.50      0.39      0.44   1908786
weighted avg       1.00      0.78      0.87   1908786

CPU times: user 1min 16s, sys: 8.25 s, total: 1min 24s
Wall time: 55.6 s


Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('kmeans',
                 KMeans(algorithm='auto', copy_x=True, init='k-means++',
                        max_iter=300, n_clusters=2, n_init=10, n_jobs=None,
                        precompute_distances='auto', random_state=None,
                        tol=0.0001, verbose=0))],
         verbose=False)

In [10]:
%%time
# Setup the pipeline
steps = [('scaler', StandardScaler()),
         ('mlp', MLPClassifier())]

model = Pipeline(steps)


# Fit it to the training data
model.fit(X_train, y_train)

model_predict = model.predict(X_test)

# Compute and print metrics
print (confusion_matrix(y_test , model_predict))
print ("Accuracy score: ", accuracy_score(y_test , model_predict))
print("F1 score: ", f1_score(y_test , model_predict))
print("Precision score: " , precision_score(y_test , model_predict))
print("Recall score: " , recall_score(y_test , model_predict))
print("ROC-AUC score: " , roc_auc_score(y_test , model_predict))
print (classification_report(y_test , model_predict))
mlp = model
mlp

[[1906248     103]
 [    529    1906]]
Accuracy score:  0.9996688994994725
F1 score:  0.8577857785778578
Precision score:  0.9487307117969139
Recall score:  0.7827515400410677
ROC-AUC score:  0.8913487550584412
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1906351
           1       0.95      0.78      0.86      2435

    accuracy                           1.00   1908786
   macro avg       0.97      0.89      0.93   1908786
weighted avg       1.00      1.00      1.00   1908786

CPU times: user 16min 10s, sys: 12.9 s, total: 16min 23s
Wall time: 15min 54s


Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('mlp',
                 MLPClassifier(activation='relu', alpha=0.0001,
                               batch_size='auto', beta_1=0.9, beta_2=0.999,
                               early_stopping=False, epsilon=1e-08,
                               hidden_layer_sizes=(100,),
                               learning_rate='constant',
                               learning_rate_init=0.001, max_iter=200,
                               momentum=0.9, n_iter_no_change=10,
                               nesterovs_momentum=True, power_t=0.5,
                               random_state=None, shuffle=True, solver='adam',
                               tol=0.0001, validation_fraction=0.1,
                               verbose=False, warm_start=False))],
         verbose=False)

In [11]:
%%time
# Setup the pipeline
steps = [('scaler', StandardScaler()),
         ('rfc', RandomForestClassifier())]

model = Pipeline(steps)


# Fit it to the training data
model.fit(X_train, y_train)

model_predict = model.predict(X_test)

# Compute and print metrics
print (confusion_matrix(y_test , model_predict))
print ("Accuracy score: ", accuracy_score(y_test , model_predict))
print("F1 score: ", f1_score(y_test , model_predict))
print("Precision score: " , precision_score(y_test , model_predict))
print("Recall score: " , recall_score(y_test , model_predict))
print("ROC-AUC score: " , roc_auc_score(y_test , model_predict))
print (classification_report(y_test , model_predict))
rfc = model
rfc



[[1906325      26]
 [    439    1996]]
Accuracy score:  0.9997563896633777
F1 score:  0.8956697330042629
Precision score:  0.9871414441147379
Recall score:  0.8197125256673511
ROC-AUC score:  0.909849443522856
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1906351
           1       0.99      0.82      0.90      2435

    accuracy                           1.00   1908786
   macro avg       0.99      0.91      0.95   1908786
weighted avg       1.00      1.00      1.00   1908786

CPU times: user 3min 39s, sys: 2.68 s, total: 3min 42s
Wall time: 7min


Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('rfc',
                 RandomForestClassifier(bootstrap=True, class_weight=None,
                                        criterion='gini', max_depth=None,
                                        max_features='auto',
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=10, n_jobs=None,
                                        oob_score=False, random_state=None,
                                        verbose=0, warm_start=False))],
         verbose=False)

In [12]:
%%time
# Setup the pipeline
steps = [('scaler', StandardScaler()),
         ('brfc', BalancedRandomForestClassifier())]

model = Pipeline(steps)


# Fit it to the training data
model.fit(X_train, y_train)

model_predict = model.predict(X_test)

# Compute and print metrics
print (confusion_matrix(y_test , model_predict))
print ("Accuracy score: ", accuracy_score(y_test , model_predict))
print("F1 score: ", f1_score(y_test , model_predict))
print("Precision score: " , precision_score(y_test , model_predict))
print("Recall score: " , recall_score(y_test , model_predict))
print("ROC-AUC score: " , roc_auc_score(y_test , model_predict))
print (classification_report(y_test , model_predict))
brfc = model
brfc

[[1886432   19919]
 [     19    2416]]
Accuracy score:  0.9895546174374708
F1 score:  0.19507468712151796
Precision score:  0.10817103201253638
Recall score:  0.9921971252566735
ROC-AUC score:  0.9908741836970696
              precision    recall  f1-score   support

           0       1.00      0.99      0.99   1906351
           1       0.11      0.99      0.20      2435

    accuracy                           0.99   1908786
   macro avg       0.55      0.99      0.59   1908786
weighted avg       1.00      0.99      0.99   1908786

CPU times: user 5min 20s, sys: 31.1 s, total: 5min 51s
Wall time: 4min 12s


Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('brfc',
                 BalancedRandomForestClassifier(bootstrap=True,
                                                class_weight=None,
                                                criterion='gini',
                                                max_depth=None,
                                                max_features='auto',
                                                max_leaf_nodes=None,
                                                min_impurity_decrease=0.0,
                                                min_samples_leaf=2,
                                                min_samples_split=2,
                                                min_weight_fraction_leaf=0.0,
                                                n_estimators=100, n_jobs=1,
                                                oob_score=False,
                    

In [13]:
%%time
# Setup the pipeline
steps = [('scaler', StandardScaler()),
         ('xgb', GradientBoostingClassifier())]

model = Pipeline(steps)


# Fit it to the training data
model.fit(X_train, y_train)

model_predict = model.predict(X_test)

# Compute and print metrics
print (confusion_matrix(y_test , model_predict))
print ("Accuracy score: ", accuracy_score(y_test , model_predict))
print("F1 score: ", f1_score(y_test , model_predict))
print("Precision score: " , precision_score(y_test , model_predict))
print("Recall score: " , recall_score(y_test , model_predict))
print("ROC-AUC score: " , roc_auc_score(y_test , model_predict))
print (classification_report(y_test , model_predict))
xgb = model
xgb

[[1905852     499]
 [   1988     447]]
Accuracy score:  0.9986970776189683
F1 score:  0.2644188110026619
Precision score:  0.4725158562367865
Recall score:  0.18357289527720738
ROC-AUC score:  0.5916555693271071
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1906351
           1       0.47      0.18      0.26      2435

    accuracy                           1.00   1908786
   macro avg       0.74      0.59      0.63   1908786
weighted avg       1.00      1.00      1.00   1908786

CPU times: user 20min 35s, sys: 10 s, total: 20min 45s
Wall time: 20min 57s


Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('xgb',
                 GradientBoostingClassifier(criterion='friedman_mse', init=None,
                                            learning_rate=0.1, loss='deviance',
                                            max_depth=3, max_features=None,
                                            max_leaf_nodes=None,
                                            min_impurity_decrease=0.0,
                                            min_impurity_split=None,
                                            min_samples_leaf=1,
                                            min_samples_split=2,
                                            min_weight_fraction_leaf=0.0,
                                            n_estimators=100,
                                            n_iter_no_change=None,
                                            presort='auto', random_state=None

In [14]:
%%time
# Setup the pipeline
steps = [('scaler', StandardScaler()),
         ('knn', KNeighborsClassifier())]

model = Pipeline(steps)


# Fit it to the training data
model.fit(X_train, y_train)

model_predict = model.predict(X_test)

# Compute and print metrics
print (confusion_matrix(y_test , model_predict))
print ("Accuracy score: ", accuracy_score(y_test , model_predict))
print("F1 score: ", f1_score(y_test , model_predict))
print("Precision score: " , precision_score(y_test , model_predict))
print("Recall score: " , recall_score(y_test , model_predict))
print("ROC-AUC score: " , roc_auc_score(y_test , model_predict))
print (classification_report(y_test , model_predict))
knn = model
knn

[[1906292      59]
 [    905    1530]]
Accuracy score:  0.9994949669580561
F1 score:  0.7604373757455269
Precision score:  0.9628697293895532
Recall score:  0.6283367556468172
ROC-AUC score:  0.8141529032334721
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1906351
           1       0.96      0.63      0.76      2435

    accuracy                           1.00   1908786
   macro avg       0.98      0.81      0.88   1908786
weighted avg       1.00      1.00      1.00   1908786

CPU times: user 5h 29min 52s, sys: 1min 50s, total: 5h 31min 43s
Wall time: 5h 35min 2s


Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('knn',
                 KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                      metric='minkowski', metric_params=None,
                                      n_jobs=None, n_neighbors=5, p=2,
                                      weights='uniform'))],
         verbose=False)

In [15]:
%%time
# Setup the pipeline
steps = [('scaler', StandardScaler()),
         ('svm', svm.SVC())]

model = Pipeline(steps)


# Fit it to the training data
model.fit(X_train, y_train)

model_predict = model.predict(X_test)

# Compute and print metrics
print (confusion_matrix(y_test , model_predict))
print ("Accuracy score: ", accuracy_score(y_test , model_predict))
print("F1 score: ", f1_score(y_test , model_predict))
print("Precision score: " , precision_score(y_test , model_predict))
print("Recall score: " , recall_score(y_test , model_predict))
print("ROC-AUC score: " , roc_auc_score(y_test , model_predict))
print (classification_report(y_test , model_predict))
svm = model
svm

[[1906326      25]
 [    998    1437]]
Accuracy score:  0.9994640572594309
F1 score:  0.737490377213241
Precision score:  0.9829001367989056
Recall score:  0.5901437371663244
ROC-AUC score:  0.7950653115535281
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1906351
           1       0.98      0.59      0.74      2435

    accuracy                           1.00   1908786
   macro avg       0.99      0.80      0.87   1908786
weighted avg       1.00      1.00      1.00   1908786

CPU times: user 2h 18min 35s, sys: 1min 13s, total: 2h 19min 49s
Wall time: 4h 26min 49s


Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('svm',
                 SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                     decision_function_shape='ovr', degree=3,
                     gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                     probability=False, random_state=None, shrinking=True,
                     tol=0.001, verbose=False))],
         verbose=False)