In [1]:
%%time
# Core data-handling and plotting libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# Render floats with a thousands separator and two decimals in pandas output
pd.options.display.float_format = '{:,.2f}'.format
# Data splitting, hyper-parameter search, metrics and pipeline utilities
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score , confusion_matrix , precision_score , precision_recall_curve , f1_score
from sklearn.metrics import classification_report , roc_auc_score, recall_score
from sklearn.pipeline import Pipeline
import sklearn.metrics as metrics
from joblib import dump, load
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
# Estimator classes (scikit-learn and imbalanced-learn)
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.cluster import KMeans
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier


Using TensorFlow backend.


CPU times: user 4.35 s, sys: 2.27 s, total: 6.62 s
Wall time: 13.7 s


In [2]:
%%time
# Read csv file into dataframe
df = pd.read_csv('paysim_features.csv').drop(['Unnamed: 0'], axis=1)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 20 columns):
step              int64
amount            float64
ChangeOrig        float64
ChangeDest        float64
nameOrig          object
oldbalanceOrg     float64
newbalanceOrig    float64
nameDest          object
oldbalanceDest    float64
newbalanceDest    float64
isFraud           int64
isFlaggedFraud    int64
type_CASH_IN      int64
type_CASH_OUT     int64
type_DEBIT        int64
type_PAYMENT      int64
type_TRANSFER     int64
steptime          object
dayofweek         int64
timeofday         float64
dtypes: float64(8), int64(9), object(3)
memory usage: 970.9+ MB
CPU times: user 20.5 s, sys: 5.8 s, total: 26.3 s
Wall time: 27.4 s


In [3]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns
df.describe()

Unnamed: 0,step,amount,ChangeOrig,ChangeDest,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER,dayofweek,timeofday
count,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0
mean,243.4,179861.9,21230.56,124294.73,833883.1,855113.67,1100701.67,1224996.4,0.0,0.0,0.22,0.35,0.01,0.34,0.08,3.12,15.32
std,142.33,603858.23,146643.29,812939.08,2888242.67,2924048.5,3399180.11,3674128.94,0.04,0.0,0.41,0.48,0.08,0.47,0.28,1.65,4.32
min,1.0,0.0,-10000000.0,-13060826.21,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,156.0,13389.57,-10150.44,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,12.0
50%,239.0,74871.94,0.0,0.0,14208.0,0.0,132705.66,214661.44,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,16.0
75%,335.0,208721.48,0.0,149105.43,107315.18,144258.41,943036.71,1111909.25,0.0,0.0,0.0,1.0,0.0,1.0,0.0,4.0,19.0
max,743.0,92445516.64,1915267.9,105687838.82,59585040.37,49585040.37,356015889.35,356179278.92,1.0,1.0,1.0,1.0,1.0,1.0,1.0,6.0,23.0


In [4]:
# Preview the first five rows to eyeball the column layout
df.head()

Unnamed: 0,step,amount,ChangeOrig,ChangeDest,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER,steptime,dayofweek,timeofday
0,1,9839.64,-9839.64,0.0,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0,0,0,0,1,0,1970-01-01 01:00:00,3,1.0
1,1,1864.28,-1864.28,0.0,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0,0,0,0,1,0,1970-01-01 01:00:00,3,1.0
2,1,181.0,-181.0,0.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0,0,0,0,0,1,1970-01-01 01:00:00,3,1.0
3,1,181.0,-181.0,-21182.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0,0,1,0,0,0,1970-01-01 01:00:00,3,1.0
4,1,11668.14,-11668.14,0.0,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0,0,0,0,1,0,1970-01-01 01:00:00,3,1.0


In [5]:
# Feature matrix: every column except the label (isFraud), the isFlaggedFraud
# flag, the two account-name strings and the string-typed steptime column.
non_feature_columns = ['isFraud', 'isFlaggedFraud', 'nameOrig', 'nameDest', 'steptime']
X = df.drop(columns=non_feature_columns).values
# Binary target to predict
y = df['isFraud'].values

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

In [6]:
# Sanity check: (number of rows, number of feature columns) of the feature matrix
X.shape

(6362620, 15)

In [7]:
%%time
# Setup the pipeline
steps = [('scaler', StandardScaler()),
         ('lr', LogisticRegression(class_weight='balanced'))]

model = Pipeline(steps)


# Fit it to the training data
model.fit(X_train, y_train)

model_predict = model.predict(X_test)

# Compute and print metrics
print (confusion_matrix(y_test , model_predict))
print ("Accuracy score: ", accuracy_score(y_test , model_predict))
print("F1 score: ", f1_score(y_test , model_predict))
print("Precision score: " , precision_score(y_test , model_predict))
print("Recall score: " , recall_score(y_test , model_predict))
print("ROC-AUC score: " , roc_auc_score(y_test , model_predict))
print (classification_report(y_test , model_predict))
lr = model
lr



[[1819643   86708]
 [    140    2295]]
Accuracy score:  0.9545009236237064
F1 score:  0.05019794833657779
Precision score:  0.02578564767479748
Recall score:  0.9425051334702259
ROC-AUC score:  0.9485106897145642
              precision    recall  f1-score   support

           0       1.00      0.95      0.98   1906351
           1       0.03      0.94      0.05      2435

    accuracy                           0.95   1908786
   macro avg       0.51      0.95      0.51   1908786
weighted avg       1.00      0.95      0.98   1908786

CPU times: user 1min 36s, sys: 6.47 s, total: 1min 42s
Wall time: 1min 36s


Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('lr',
                 LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='warn', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='warn', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

In [8]:
%%time
# Setup the pipeline
steps = [('scaler', StandardScaler()),
         ('rfc', RandomForestClassifier(class_weight='balanced'))]

model = Pipeline(steps)


# Fit it to the training data
model.fit(X_train, y_train)

model_predict = model.predict(X_test)

# Compute and print metrics
print (confusion_matrix(y_test , model_predict))
print ("Accuracy score: ", accuracy_score(y_test , model_predict))
print("F1 score: ", f1_score(y_test , model_predict))
print("Precision score: " , precision_score(y_test , model_predict))
print("Recall score: " , recall_score(y_test , model_predict))
print("ROC-AUC score: " , roc_auc_score(y_test , model_predict))
print (classification_report(y_test , model_predict))
rfc = model
rfc



[[1906332      19]
 [    482    1953]]
Accuracy score:  0.9997375295082843
F1 score:  0.8863172226004085
Precision score:  0.9903651115618661
Recall score:  0.8020533880903491
ROC-AUC score:  0.9010217107026526
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1906351
           1       0.99      0.80      0.89      2435

    accuracy                           1.00   1908786
   macro avg       1.00      0.90      0.94   1908786
weighted avg       1.00      1.00      1.00   1908786

CPU times: user 2min 39s, sys: 3.01 s, total: 2min 42s
Wall time: 2min 27s


Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('rfc',
                 RandomForestClassifier(bootstrap=True, class_weight='balanced',
                                        criterion='gini', max_depth=None,
                                        max_features='auto',
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=10, n_jobs=None,
                                        oob_score=False, random_state=None,
                                        verbose=0, warm_start=False))],
         verbose=False)

In [9]:
%%time
# Setup the pipeline
steps = [('scaler', StandardScaler()),
         ('brfc', BalancedRandomForestClassifier(class_weight='balanced'))]

model = Pipeline(steps)


# Fit it to the training data
model.fit(X_train, y_train)

model_predict = model.predict(X_test)

# Compute and print metrics
print (confusion_matrix(y_test , model_predict))
print ("Accuracy score: ", accuracy_score(y_test , model_predict))
print("F1 score: ", f1_score(y_test , model_predict))
print("Precision score: " , precision_score(y_test , model_predict))
print("Recall score: " , recall_score(y_test , model_predict))
print("ROC-AUC score: " , roc_auc_score(y_test , model_predict))
print (classification_report(y_test , model_predict))
brfc = model
brfc

[[1877057   29294]
 [     20    2415]]
Accuracy score:  0.9846425948220492
F1 score:  0.14145970009372072
Precision score:  0.07616134220568294
Recall score:  0.9917864476386037
ROC-AUC score:  0.9882099587752465
              precision    recall  f1-score   support

           0       1.00      0.98      0.99   1906351
           1       0.08      0.99      0.14      2435

    accuracy                           0.98   1908786
   macro avg       0.54      0.99      0.57   1908786
weighted avg       1.00      0.98      0.99   1908786

CPU times: user 5min 22s, sys: 32.7 s, total: 5min 55s
Wall time: 4min 10s


Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('brfc',
                 BalancedRandomForestClassifier(bootstrap=True,
                                                class_weight='balanced',
                                                criterion='gini',
                                                max_depth=None,
                                                max_features='auto',
                                                max_leaf_nodes=None,
                                                min_impurity_decrease=0.0,
                                                min_samples_leaf=2,
                                                min_samples_split=2,
                                                min_weight_fraction_leaf=0.0,
                                                n_estimators=100, n_jobs=1,
                                                oob_score=False,
              

In [10]:
%%time
# Setup the pipeline
steps = [('scaler', StandardScaler()),
         ('lsvm', svm.LinearSVC(class_weight='balanced'))]

model = Pipeline(steps)


# Fit it to the training data
model.fit(X_train, y_train)

model_predict = model.predict(X_test)

# Compute and print metrics
print (confusion_matrix(y_test , model_predict))
print ("Accuracy score: ", accuracy_score(y_test , model_predict))
print("F1 score: ", f1_score(y_test , model_predict))
print("Precision score: " , precision_score(y_test , model_predict))
print("Recall score: " , recall_score(y_test , model_predict))
print("ROC-AUC score: " , roc_auc_score(y_test , model_predict))
print (classification_report(y_test , model_predict))
lsvm = model
lsvm



[[1894472   11879]
 [    397    2038]]
Accuracy score:  0.9935686871131704
F1 score:  0.24926614481409
Precision score:  0.14643960623697636
Recall score:  0.8369609856262834
ROC-AUC score:  0.9153648546122019
              precision    recall  f1-score   support

           0       1.00      0.99      1.00   1906351
           1       0.15      0.84      0.25      2435

    accuracy                           0.99   1908786
   macro avg       0.57      0.92      0.62   1908786
weighted avg       1.00      0.99      1.00   1908786

CPU times: user 20min 54s, sys: 8.34 s, total: 21min 2s
Wall time: 21min


Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('lsvm',
                 LinearSVC(C=1.0, class_weight='balanced', dual=True,
                           fit_intercept=True, intercept_scaling=1,
                           loss='squared_hinge', max_iter=1000,
                           multi_class='ovr', penalty='l2', random_state=None,
                           tol=0.0001, verbose=0))],
         verbose=False)