In [10]:
%%time
# Import Modules
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
pd.options.display.float_format = '{:,.2f}'.format
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score , confusion_matrix , precision_score , precision_recall_curve , f1_score
from sklearn.metrics import classification_report , roc_auc_score, recall_score
from sklearn.pipeline import Pipeline
import sklearn.metrics as metrics
from joblib import dump, load
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
#Import Models
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.cluster import KMeans
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier


Wall time: 998 µs


In [3]:
%%time
# Read csv file into dataframe
df = pd.read_csv('paysim_features.csv').drop(['Unnamed: 0'], axis=1)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 20 columns):
step              int64
amount            float64
ChangeOrig        float64
ChangeDest        float64
nameOrig          object
oldbalanceOrg     float64
newbalanceOrig    float64
nameDest          object
oldbalanceDest    float64
newbalanceDest    float64
isFraud           int64
isFlaggedFraud    int64
type_CASH_IN      int64
type_CASH_OUT     int64
type_DEBIT        int64
type_PAYMENT      int64
type_TRANSFER     int64
steptime          object
dayofweek         int64
timeofday         float64
dtypes: float64(8), int64(9), object(3)
memory usage: 970.9+ MB
Wall time: 24 s


In [4]:
df.describe()

Unnamed: 0,step,amount,ChangeOrig,ChangeDest,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER,dayofweek,timeofday
count,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0
mean,243.4,179861.9,21230.56,124294.73,833883.1,855113.67,1100701.67,1224996.4,0.0,0.0,0.22,0.35,0.01,0.34,0.08,3.12,15.32
std,142.33,603858.23,146643.29,812939.08,2888242.67,2924048.5,3399180.11,3674128.94,0.04,0.0,0.41,0.48,0.08,0.47,0.28,1.65,4.32
min,1.0,0.0,-10000000.0,-13060826.21,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,156.0,13389.57,-10150.44,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,12.0
50%,239.0,74871.94,0.0,0.0,14208.0,0.0,132705.66,214661.44,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,16.0
75%,335.0,208721.48,0.0,149105.43,107315.18,144258.41,943036.71,1111909.25,0.0,0.0,0.0,1.0,0.0,1.0,0.0,4.0,19.0
max,743.0,92445516.64,1915267.9,105687838.82,59585040.37,49585040.37,356015889.35,356179278.92,1.0,1.0,1.0,1.0,1.0,1.0,1.0,6.0,23.0


In [5]:
df.head()

Unnamed: 0,step,amount,ChangeOrig,ChangeDest,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER,steptime,dayofweek,timeofday
0,1,9839.64,-9839.64,0.0,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0,0,0,0,1,0,1970-01-01 01:00:00,3,1.0
1,1,1864.28,-1864.28,0.0,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0,0,0,0,1,0,1970-01-01 01:00:00,3,1.0
2,1,181.0,-181.0,0.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0,0,0,0,0,1,1970-01-01 01:00:00,3,1.0
3,1,181.0,-181.0,-21182.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0,0,1,0,0,0,1970-01-01 01:00:00,3,1.0
4,1,11668.14,-11668.14,0.0,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0,0,0,0,1,0,1970-01-01 01:00:00,3,1.0


In [6]:
# Define X with selected columns, y as the binary variable to predict
X = df.drop(['isFraud' , 'isFlaggedFraud' , 'nameOrig' , 'nameDest' ,'steptime'] , axis = 1 ).values
y = df['isFraud'].values
# Train, Test, Split to segregate Training from Test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [14]:
X.shape

(6362620, 15)

In [15]:
%%time
# Replace the sklearn Pipeline with imblearn Pipeline for SMOTE
from imblearn.pipeline import Pipeline

steps = [('scaler', StandardScaler()),
         ('smote', SMOTE()),         
         ('lr', LogisticRegression())]

model = Pipeline(steps)


# Fit it to the training data
model.fit(X_train, y_train)

model_predict = model.predict(X_test)

# Compute and print metrics
print (confusion_matrix(y_test , model_predict))
print ("Accuracy score: ", accuracy_score(y_test , model_predict))
print("F1 score: ", f1_score(y_test , model_predict))
print("Precision score: " , precision_score(y_test , model_predict))
print("Recall score: " , recall_score(y_test , model_predict))
print("ROC-AUC score: " , roc_auc_score(y_test , model_predict))
print (classification_report(y_test , model_predict))
lr = model
lr



[[1823974   82377]
 [    128    2307]]
Accuracy score:  0.9567761917784393
F1 score:  0.052962040427461284
Precision score:  0.027242454300694347
Recall score:  0.9474332648870637
ROC-AUC score:  0.9521106952367949
              precision    recall  f1-score   support

           0       1.00      0.96      0.98   1906351
           1       0.03      0.95      0.05      2435

    accuracy                           0.96   1908786
   macro avg       0.51      0.95      0.52   1908786
weighted avg       1.00      0.96      0.98   1908786

Wall time: 2min 13s


Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('smote',
                 SMOTE(k_neighbors=5, kind='deprecated',
                       m_neighbors='deprecated', n_jobs=1,
                       out_step='deprecated', random_state=None, ratio=None,
                       sampling_strategy='auto', svm_estimator='deprecated')),
                ('lr',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='warn', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='warn', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

In [16]:
%%time
# Setup the pipeline
steps = [('scaler', StandardScaler()),
         ('smote', SMOTE()),         
         ('gnb', GaussianNB())]

model = Pipeline(steps)


# Fit it to the training data
model.fit(X_train, y_train)

model_predict = model.predict(X_test)

# Compute and print metrics
print (confusion_matrix(y_test , model_predict))
print ("Accuracy score: ", accuracy_score(y_test , model_predict))
print("F1 score: ", f1_score(y_test , model_predict))
print("Precision score: " , precision_score(y_test , model_predict))
print("Recall score: " , recall_score(y_test , model_predict))
print("ROC-AUC score: " , roc_auc_score(y_test , model_predict))
print (classification_report(y_test , model_predict))
gnb = model
gnb

[[1081677  824674]
 [      0    2435]]
Accuracy score:  0.5679589016264788
F1 score:  0.005870695225328615
Precision score:  0.0029439892444647563
Recall score:  1.0
ROC-AUC score:  0.7837035257410624
              precision    recall  f1-score   support

           0       1.00      0.57      0.72   1906351
           1       0.00      1.00      0.01      2435

    accuracy                           0.57   1908786
   macro avg       0.50      0.78      0.36   1908786
weighted avg       1.00      0.57      0.72   1908786

Wall time: 39.6 s


Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('smote',
                 SMOTE(k_neighbors=5, kind='deprecated',
                       m_neighbors='deprecated', n_jobs=1,
                       out_step='deprecated', random_state=None, ratio=None,
                       sampling_strategy='auto', svm_estimator='deprecated')),
                ('gnb', GaussianNB(priors=None, var_smoothing=1e-09))],
         verbose=False)

In [17]:
%%time
# Setup the pipeline
steps = [('scaler', StandardScaler()),
         ('smote', SMOTE()),         
         ('kmeans', KMeans(n_clusters=2))]

model = Pipeline(steps)


# Fit it to the training data
model.fit(X_train, y_train)

model_predict = model.predict(X_test)

# Compute and print metrics
print (confusion_matrix(y_test , model_predict))
print ("Accuracy score: ", accuracy_score(y_test , model_predict))
print("F1 score: ", f1_score(y_test , model_predict))
print("Precision score: " , precision_score(y_test , model_predict))
print("Recall score: " , recall_score(y_test , model_predict))
print("ROC-AUC score: " , roc_auc_score(y_test , model_predict))
print (classification_report(y_test , model_predict))
kmeans = model
kmeans

[[1906342       9]
 [   2180     255]]
Accuracy score:  0.9988531977916854
F1 score:  0.18895887365690997
Precision score:  0.9659090909090909
Recall score:  0.10472279260780287
ROC-AUC score:  0.5523590357732332
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1906351
           1       0.97      0.10      0.19      2435

    accuracy                           1.00   1908786
   macro avg       0.98      0.55      0.59   1908786
weighted avg       1.00      1.00      1.00   1908786

Wall time: 2min 10s


Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('smote',
                 SMOTE(k_neighbors=5, kind='deprecated',
                       m_neighbors='deprecated', n_jobs=1,
                       out_step='deprecated', random_state=None, ratio=None,
                       sampling_strategy='auto', svm_estimator='deprecated')),
                ('kmeans',
                 KMeans(algorithm='auto', copy_x=True, init='k-means++',
                        max_iter=300, n_clusters=2, n_init=10, n_jobs=None,
                        precompute_distances='auto', random_state=None,
                        tol=0.0001, verbose=0))],
         verbose=False)

In [18]:
%%time
# Setup the pipeline
steps = [('scaler', StandardScaler()),
         ('smote', SMOTE()),         
         ('mlp', MLPClassifier())]

model = Pipeline(steps)


# Fit it to the training data
model.fit(X_train, y_train)

model_predict = model.predict(X_test)

# Compute and print metrics
print (confusion_matrix(y_test , model_predict))
print ("Accuracy score: ", accuracy_score(y_test , model_predict))
print("F1 score: ", f1_score(y_test , model_predict))
print("Precision score: " , precision_score(y_test , model_predict))
print("Recall score: " , recall_score(y_test , model_predict))
print("ROC-AUC score: " , roc_auc_score(y_test , model_predict))
print (classification_report(y_test , model_predict))
mlp = model
mlp



[[1901094    5257]
 [     52    2383]]
Accuracy score:  0.9972186510169291
F1 score:  0.4730521091811415
Precision score:  0.31191099476439793
Recall score:  0.9786447638603696
ROC-AUC score:  0.9879435697387259
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1906351
           1       0.31      0.98      0.47      2435

    accuracy                           1.00   1908786
   macro avg       0.66      0.99      0.74   1908786
weighted avg       1.00      1.00      1.00   1908786

Wall time: 3h 43min 30s


Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('smote',
                 SMOTE(k_neighbors=5, kind='deprecated',
                       m_neighbors='deprecated', n_jobs=1,
                       out_step='deprecated', random_state=None, ratio=None,
                       sampling_strategy='auto', svm_estimator='deprecated')),
                ('mlp',
                 MLPClassifier(activation='relu', alpha=0.0001,
                               batch_size='auto', beta_1=0.9, beta_2=0.999,
                               early_stopping=False, epsilon=1e-08,
                               hidden_layer_sizes=(100,),
                               learning_rate='constant',
                               learning_rate_init=0.001, max_iter=200,
                               momentum=0.9, n_iter_no_change=10,
                               nesterovs_momentum=True, power_t=0.5,
                        

In [19]:
%%time
# Setup the pipeline
steps = [('scaler', StandardScaler()),
         ('smote', SMOTE()),         
         ('rfc', RandomForestClassifier())]

model = Pipeline(steps)


# Fit it to the training data
model.fit(X_train, y_train)

model_predict = model.predict(X_test)

# Compute and print metrics
print (confusion_matrix(y_test , model_predict))
print ("Accuracy score: ", accuracy_score(y_test , model_predict))
print("F1 score: ", f1_score(y_test , model_predict))
print("Precision score: " , precision_score(y_test , model_predict))
print("Recall score: " , recall_score(y_test , model_predict))
print("ROC-AUC score: " , roc_auc_score(y_test , model_predict))
print (classification_report(y_test , model_predict))
rfc = model
rfc



[[1905515     836]
 [    144    2291]]
Accuracy score:  0.9994865846669034
F1 score:  0.8238043869111831
Precision score:  0.7326511032938919
Recall score:  0.9408624229979466
ROC-AUC score:  0.970211944428009
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1906351
           1       0.73      0.94      0.82      2435

    accuracy                           1.00   1908786
   macro avg       0.87      0.97      0.91   1908786
weighted avg       1.00      1.00      1.00   1908786

Wall time: 5min 42s


Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('smote',
                 SMOTE(k_neighbors=5, kind='deprecated',
                       m_neighbors='deprecated', n_jobs=1,
                       out_step='deprecated', random_state=None, ratio=None,
                       sampling_strategy='auto', svm_estimator='deprecated')),
                ('rfc',
                 RandomForestClassifier(bootstrap=True, class_weight=None,
                                        criterion='gini', max_depth=None,
                                        max_features='auto',
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,

In [20]:
%%time
# Setup the pipeline
steps = [('scaler', StandardScaler()),
         ('smote', SMOTE()),         
         ('brfc', BalancedRandomForestClassifier())]

model = Pipeline(steps)


# Fit it to the training data
model.fit(X_train, y_train)

model_predict = model.predict(X_test)

# Compute and print metrics
print (confusion_matrix(y_test , model_predict))
print ("Accuracy score: ", accuracy_score(y_test , model_predict))
print("F1 score: ", f1_score(y_test , model_predict))
print("Precision score: " , precision_score(y_test , model_predict))
print("Recall score: " , recall_score(y_test , model_predict))
print("ROC-AUC score: " , roc_auc_score(y_test , model_predict))
print (classification_report(y_test , model_predict))
brfc = model
brfc

[[1905338    1013]
 [     97    2338]]
Accuracy score:  0.9994184785512886
F1 score:  0.8081576218458347
Precision score:  0.6977021784541928
Recall score:  0.9601642710472279
ROC-AUC score:  0.9798164446828402
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1906351
           1       0.70      0.96      0.81      2435

    accuracy                           1.00   1908786
   macro avg       0.85      0.98      0.90   1908786
weighted avg       1.00      1.00      1.00   1908786

Wall time: 1h 1min 53s


Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('smote',
                 SMOTE(k_neighbors=5, kind='deprecated',
                       m_neighbors='deprecated', n_jobs=1,
                       out_step='deprecated', random_state=None, ratio=None,
                       sampling_strategy='auto', svm_estimator='deprecated')),
                ('brfc',
                 BalancedRandomForestClassifier(bootstrap=True,
                                                class_weight=None,
                                                criterion='gini',
                                                max_depth=None,
                                                max_features='auto',
                                                max_leaf_nodes=None,
                                                min_impurity_decrease=0.0,
                                                min_samples_leaf=2,
          

In [21]:
%%time
# Setup the pipeline
steps = [('scaler', StandardScaler()),
         ('smote', SMOTE()),         
         ('xgb', GradientBoostingClassifier())]

model = Pipeline(steps)


# Fit it to the training data
model.fit(X_train, y_train)

model_predict = model.predict(X_test)

# Compute and print metrics
print (confusion_matrix(y_test , model_predict))
print ("Accuracy score: ", accuracy_score(y_test , model_predict))
print("F1 score: ", f1_score(y_test , model_predict))
print("Precision score: " , precision_score(y_test , model_predict))
print("Recall score: " , recall_score(y_test , model_predict))
print("ROC-AUC score: " , roc_auc_score(y_test , model_predict))
print (classification_report(y_test , model_predict))
xgb = model
xgb

[[1881366   24985]
 [     44    2391]]
Accuracy score:  0.9868874771713539
F1 score:  0.160410586696186
Precision score:  0.08733927527761542
Recall score:  0.9819301848049281
ROC-AUC score:  0.9844119969861425
              precision    recall  f1-score   support

           0       1.00      0.99      0.99   1906351
           1       0.09      0.98      0.16      2435

    accuracy                           0.99   1908786
   macro avg       0.54      0.98      0.58   1908786
weighted avg       1.00      0.99      0.99   1908786

Wall time: 41min 48s


Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('smote',
                 SMOTE(k_neighbors=5, kind='deprecated',
                       m_neighbors='deprecated', n_jobs=1,
                       out_step='deprecated', random_state=None, ratio=None,
                       sampling_strategy='auto', svm_estimator='deprecated')),
                ('xgb',
                 GradientBoostingClassifier(criterion='friedman_mse', init=None,
                                            l..., loss='deviance',
                                            max_depth=3, max_features=None,
                                            max_leaf_nodes=None,
                                            min_impurity_decrease=0.0,
                                            min_impurity_split=None,
                                            min_samples_leaf=1,
                                            min_samples_spli

In [None]:
%%time
# Setup the pipeline
steps = [('scaler', StandardScaler()),
         ('smote', SMOTE()),         
         ('knn', KNeighborsClassifier())]

model = Pipeline(steps)


# Fit it to the training data
model.fit(X_train, y_train)

model_predict = model.predict(X_test)

# Compute and print metrics
print (confusion_matrix(y_test , model_predict))
print ("Accuracy score: ", accuracy_score(y_test , model_predict))
print("F1 score: ", f1_score(y_test , model_predict))
print("Precision score: " , precision_score(y_test , model_predict))
print("Recall score: " , recall_score(y_test , model_predict))
print("ROC-AUC score: " , roc_auc_score(y_test , model_predict))
print (classification_report(y_test , model_predict))
knn = model
knn

In [None]:
%%time
# Setup the pipeline
steps = [('scaler', StandardScaler()),
         ('smote', SMOTE()),         
         ('svm', svm.SVC())]

model = Pipeline(steps)


# Fit it to the training data
model.fit(X_train, y_train)

model_predict = model.predict(X_test)

# Compute and print metrics
print (confusion_matrix(y_test , model_predict))
print ("Accuracy score: ", accuracy_score(y_test , model_predict))
print("F1 score: ", f1_score(y_test , model_predict))
print("Precision score: " , precision_score(y_test , model_predict))
print("Recall score: " , recall_score(y_test , model_predict))
print("ROC-AUC score: " , roc_auc_score(y_test , model_predict))
print (classification_report(y_test , model_predict))
svm = model
svm