In [83]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from collections import Counter
from imblearn.over_sampling import RandomOverSampler, SMOTE
import dataframe_image as dfi

https://www.analyticsvidhya.com/blog/2017/03/imbalanced-data-classification/
https://stackoverflow.com/questions/55921286/should-i-balance-the-test-set-when-i-have-highly-unbalanced-data


In [92]:
df = pd.read_csv("../data/raw/creditcard.csv")
df['normAmount'] = StandardScaler().fit_transform(df['Amount'].values.reshape (-1,1))
df = df.drop(['Amount'], axis = 1)

df.head()


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Class,normAmount
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0,0.244964
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,0,-0.342475
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,0,1.160686
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0,0.140534
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,0,-0.073403


In [93]:
result_dir = "../results"
results_name = "all_previous_in_training"

In [94]:
GROUP_TIME_LENGTH = 7200
max_time = df["Time"].max()
group_df = df.groupby(pd.cut(df["Time"], np.arange(-1, max_time + GROUP_TIME_LENGTH, GROUP_TIME_LENGTH)))


Distribution of Target class

In [95]:
df["Class"].value_counts()

0    284315
1       492
Name: Class, dtype: int64

Defining classifier

In [96]:
def predict_with_metric(model,X_test, y_test):
    prediction = model.predict(X_test)
    conf_matrix = confusion_matrix(y_test,prediction)
    tn= conf_matrix[0][0]
    fp= conf_matrix[0][1]
    fn = conf_matrix[1][0]
    tp =conf_matrix[1][1]
    return {"Prediction":list(prediction.flatten()),"Accuracy":accuracy_score(y_test, prediction),"Precision":precision_score(y_test, prediction),"Recall":recall_score(y_test,prediction),"F1":f1_score(y_test,prediction),"TN":tn,"FP":fp,"FN":fn,"TP":tp}

In [97]:
%%time
NUM_ESTIMATOR = 50
rf_clf = RandomForestClassifier(n_estimators=NUM_ESTIMATOR)

CPU times: user 35 µs, sys: 1e+03 ns, total: 36 µs
Wall time: 40.1 µs


In [98]:
normal_acc_l = []
normal_pre_l = []
normal_rec_l = []
normal_f1_l = []
normal_tn_l= []
normal_fp_l= []
normal_fn_l = []
normal_tp_l = []
group_time_l = []
train_data = pd.DataFrame()
for group_time, group in group_df:
    # print("")
    print(f"===== Processing group:{group_time} , group number:{int((group_time.left + 1) / GROUP_TIME_LENGTH + 1)}=====")
    group_number = int((group_time.left + 1) / GROUP_TIME_LENGTH + 1)
    if group_number > 3:
        print(f"Predicting group {group_number}")
        X_test = group[group.columns.difference(["Class","Time"])]
        y_train = group["Class"]
        model_prediction  = predict_with_metric(rf_clf,X_test,y_train)
        group_time_l.append(group_time)
        normal_acc_l.append(model_prediction["Accuracy"])
        normal_pre_l.append(model_prediction["Precision"])
        normal_rec_l.append(model_prediction["Recall"])
        normal_f1_l.append(model_prediction["F1"])
        normal_tn_l.append(model_prediction["TN"])
        normal_fp_l.append(model_prediction["FP"])
        normal_fn_l.append(model_prediction["FN"])
        normal_tp_l.append(model_prediction["TP"])
    train_data = pd.concat([train_data,group])
    X_train = train_data[train_data.columns.difference(["Class","Time"])]
    y_train = train_data["Class"]
    rf_clf.fit(X_train,y_train)

===== Processing group:(-1.0, 7199.0] , group number:1=====
===== Processing group:(7199.0, 14399.0] , group number:2=====
===== Processing group:(14399.0, 21599.0] , group number:3=====
===== Processing group:(21599.0, 28799.0] , group number:4=====
Predicting group 4
===== Processing group:(28799.0, 35999.0] , group number:5=====
Predicting group 5
===== Processing group:(35999.0, 43199.0] , group number:6=====
Predicting group 6
===== Processing group:(43199.0, 50399.0] , group number:7=====
Predicting group 7
===== Processing group:(50399.0, 57599.0] , group number:8=====
Predicting group 8
===== Processing group:(57599.0, 64799.0] , group number:9=====
Predicting group 9
===== Processing group:(64799.0, 71999.0] , group number:10=====
Predicting group 10
===== Processing group:(71999.0, 79199.0] , group number:11=====
Predicting group 11
===== Processing group:(79199.0, 86399.0] , group number:12=====
Predicting group 12
===== Processing group:(86399.0, 93599.0] , group number:13=

In [99]:
metric_data = list(zip(group_time_l,normal_acc_l,normal_pre_l,normal_rec_l,normal_f1_l,normal_tn_l,normal_fp_l,normal_fn_l,normal_tp_l))
df = pd.DataFrame(metric_data, columns =['period','Accuracy', 'Precision', 'Recall','F-1','TN','FP','FN','TP'])
df.to_csv(f"{result_dir}/dynamic_clf_normal_{results_name}.csv")
dfi.export(df.style.hide_index(), f"{result_dir}/dynamic_clf_normal_{results_name}.png")
df

  dfi.export(df.style.hide_index(), f"{result_dir}/dynamic_clf_normal_{results_name}.png")
[1113/141557.178384:INFO:headless_shell.cc(660)] Written to file /tmp/tmptrd6_jn1/temp.png.


Unnamed: 0,period,Accuracy,Precision,Recall,F-1,TN,FP,FN,TP
0,"(21599.0, 28799.0]",0.997307,0.6875,0.846154,0.758621,5163,10,4,22
1,"(28799.0, 35999.0]",0.999234,0.916667,0.55,0.6875,13036,1,9,11
2,"(35999.0, 43199.0]",0.999464,0.928571,0.866667,0.896552,16757,3,6,39
3,"(43199.0, 50399.0]",0.999217,0.875,0.388889,0.538462,15298,1,11,7
4,"(50399.0, 57599.0]",0.999685,1.0,0.814815,0.897959,15838,0,5,22
5,"(57599.0, 64799.0]",0.999553,1.0,0.730769,0.844444,15642,0,7,19
6,"(64799.0, 71999.0]",0.999578,1.0,0.681818,0.810811,16579,0,7,15
7,"(71999.0, 79199.0]",0.999311,0.764706,0.590909,0.666667,18849,4,9,13
8,"(79199.0, 86399.0]",0.999801,1.0,0.85,0.918919,15039,0,3,17
9,"(86399.0, 93599.0]",0.997733,0.454545,0.416667,0.434783,5717,6,7,5


## Random Over sampling

In [100]:
%%time
NUM_ESTIMATOR = 50
rf_clf = RandomForestClassifier(n_estimators=NUM_ESTIMATOR)

CPU times: user 75 µs, sys: 0 ns, total: 75 µs
Wall time: 78.2 µs


In [101]:
ros_acc_l = []
ros_pre_l = []
ros_rec_l = []
ros_f1_l = []
ros_tn_l= []
ros_fp_l= []
ros_fn_l = []
ros_tp_l = []
group_time_l = []
train_data = pd.DataFrame()
for group_time, group in group_df:
    # print("")
    print(f"===== Processing group:{group_time} , group number:{int((group_time.left + 1) / GROUP_TIME_LENGTH + 1)}=====")
    group_number = int((group_time.left + 1) / GROUP_TIME_LENGTH + 1)
    if group_number > 3:
        print(f"Predicting group {group_number}")
        X_test = group[group.columns.difference(["Class","Time"])]
        y_train = group["Class"]
        model_prediction  = predict_with_metric(rf_clf,X_test,y_train)
        group_time_l.append(group_time)
        ros_acc_l.append(model_prediction["Accuracy"])
        ros_pre_l.append(model_prediction["Precision"])
        ros_rec_l.append(model_prediction["Recall"])
        ros_f1_l.append(model_prediction["F1"])
        ros_tn_l.append(model_prediction["TN"])
        ros_fp_l.append(model_prediction["FP"])
        ros_fn_l.append(model_prediction["FN"])
        ros_tp_l.append(model_prediction["TP"])
    # Random over sampling
    train_data = pd.concat([train_data,group])
    X_train = train_data[train_data.columns.difference(["Class","Time"])]
    y_train = train_data["Class"]
    ros = RandomOverSampler(random_state=1234)
    X_group_ros, y_group_ros = ros.fit_resample(X_train,y_train)
    # Check the number of records after over sampling
    print(sorted(Counter(y_group_ros).items()))
    # Training model
    rf_clf.fit(X_train,y_train)


===== Processing group:(-1.0, 7199.0] , group number:1=====
[(0, 6176), (1, 6176)]
===== Processing group:(7199.0, 14399.0] , group number:2=====
[(0, 9539), (1, 9539)]
===== Processing group:(14399.0, 21599.0] , group number:3=====
[(0, 12285), (1, 12285)]
===== Processing group:(21599.0, 28799.0] , group number:4=====
Predicting group 4
[(0, 17458), (1, 17458)]
===== Processing group:(28799.0, 35999.0] , group number:5=====
Predicting group 5
[(0, 30495), (1, 30495)]
===== Processing group:(35999.0, 43199.0] , group number:6=====
Predicting group 6
[(0, 47255), (1, 47255)]
===== Processing group:(43199.0, 50399.0] , group number:7=====
Predicting group 7
[(0, 62554), (1, 62554)]
===== Processing group:(50399.0, 57599.0] , group number:8=====
Predicting group 8
[(0, 78392), (1, 78392)]
===== Processing group:(57599.0, 64799.0] , group number:9=====
Predicting group 9
[(0, 94034), (1, 94034)]
===== Processing group:(64799.0, 71999.0] , group number:10=====
Predicting group 10
[(0, 1106

In [102]:
metric_data = list(zip(group_time_l,ros_acc_l,ros_pre_l,ros_rec_l,ros_f1_l,ros_tn_l,ros_fp_l,ros_fn_l,ros_tp_l))
df = pd.DataFrame(metric_data, columns =['period','Accuracy', 'Precision', 'Recall','F-1','TN','FP','FN','TP'])
df.to_csv(f"{result_dir}/dynamic_clf_ros_{results_name}.csv")
dfi.export(df.style.hide_index(), f"{result_dir}/dynamic_clf_ros_{results_name}.png")
df

  dfi.export(df.style.hide_index(), f"{result_dir}/dynamic_clf_ros_{results_name}.png")
[1113/143424.310653:INFO:headless_shell.cc(660)] Written to file /tmp/tmp2hvxkbd5/temp.png.


Unnamed: 0,period,Accuracy,Precision,Recall,F-1,TN,FP,FN,TP
0,"(21599.0, 28799.0]",0.995961,0.777778,0.269231,0.4,5171,2,19,7
1,"(28799.0, 35999.0]",0.999234,0.916667,0.55,0.6875,13036,1,9,11
2,"(35999.0, 43199.0]",0.999464,0.928571,0.866667,0.896552,16757,3,6,39
3,"(43199.0, 50399.0]",0.999217,0.8,0.444444,0.571429,15297,2,10,8
4,"(50399.0, 57599.0]",0.999685,1.0,0.814815,0.897959,15838,0,5,22
5,"(57599.0, 64799.0]",0.999553,1.0,0.730769,0.844444,15642,0,7,19
6,"(64799.0, 71999.0]",0.999578,1.0,0.681818,0.810811,16579,0,7,15
7,"(71999.0, 79199.0]",0.99947,0.833333,0.681818,0.75,18850,3,7,15
8,"(79199.0, 86399.0]",0.999801,1.0,0.85,0.918919,15039,0,3,17
9,"(86399.0, 93599.0]",0.997559,0.4,0.333333,0.363636,5717,6,8,4


# Over sampling with SMOTHE

In [103]:
%%time
NUM_ESTIMATOR = 50
rf_clf = RandomForestClassifier(n_estimators=NUM_ESTIMATOR)

CPU times: user 162 µs, sys: 1e+03 ns, total: 163 µs
Wall time: 311 µs


In [104]:
smote_acc_l = []
smote_pre_l = []
smote_rec_l = []
smote_f1_l = []
smote_tn_l= []
smote_fp_l= []
smote_fn_l = []
smote_tp_l = []
group_time_l = []
train_data = pd.DataFrame()
for group_time, group in group_df:
    # print("")
    print(f"===== Processing group:{group_time} , group number:{int((group_time.left + 1) / GROUP_TIME_LENGTH + 1)}=====")
    group_number = int((group_time.left + 1) / GROUP_TIME_LENGTH + 1)
    if group_number > 3:
        print(f"Predicting group {group_number}")
        X_test = group[group.columns.difference(["Class","Time"])]
        y_train = group["Class"]
        model_prediction  = predict_with_metric(rf_clf,X_test,y_train)
        group_time_l.append(group_time)
        smote_acc_l.append(model_prediction["Accuracy"])
        smote_pre_l.append(model_prediction["Precision"])
        smote_rec_l.append(model_prediction["Recall"])
        smote_f1_l.append(model_prediction["F1"])
        smote_tn_l.append(model_prediction["TN"])
        smote_fp_l.append(model_prediction["FP"])
        smote_fn_l.append(model_prediction["FN"])
        smote_tp_l.append(model_prediction["TP"])
    # Random over sampling
    train_data = pd.concat([train_data,group])
    X_train = train_data[train_data.columns.difference(["Class","Time"])]
    y_train = train_data["Class"]
    smote = SMOTE(random_state=1234,k_neighbors=3)
    X_group_smote, y_group_smote = smote.fit_resample(X_train,y_train)
    # Check the number of records after over sampling
    print(sorted(Counter(y_group_smote).items()))
    # Training model
    rf_clf.fit(X_train,y_train)
    ## pdate train data


===== Processing group:(-1.0, 7199.0] , group number:1=====
[(0, 6176), (1, 6176)]
===== Processing group:(7199.0, 14399.0] , group number:2=====
[(0, 9539), (1, 9539)]
===== Processing group:(14399.0, 21599.0] , group number:3=====
[(0, 12285), (1, 12285)]
===== Processing group:(21599.0, 28799.0] , group number:4=====
Predicting group 4
[(0, 17458), (1, 17458)]
===== Processing group:(28799.0, 35999.0] , group number:5=====
Predicting group 5
[(0, 30495), (1, 30495)]
===== Processing group:(35999.0, 43199.0] , group number:6=====
Predicting group 6
[(0, 47255), (1, 47255)]
===== Processing group:(43199.0, 50399.0] , group number:7=====
Predicting group 7
[(0, 62554), (1, 62554)]
===== Processing group:(50399.0, 57599.0] , group number:8=====
Predicting group 8
[(0, 78392), (1, 78392)]
===== Processing group:(57599.0, 64799.0] , group number:9=====
Predicting group 9
[(0, 94034), (1, 94034)]
===== Processing group:(64799.0, 71999.0] , group number:10=====
Predicting group 10
[(0, 1106

In [105]:
metric_data = list(zip(group_time_l,smote_acc_l,smote_pre_l,smote_rec_l,smote_f1_l,smote_tn_l,smote_fp_l,smote_fn_l,smote_tp_l))
df = pd.DataFrame(metric_data, columns =['period','Accuracy', 'Precision', 'Recall','F-1','TN','FP','FN','TP'])
df.to_csv(f"{result_dir}/dynamic_clf_smote_{results_name}.csv")
dfi.export(df.style.hide_index(), f"{result_dir}/dynamic_clf_smote_{results_name}.png")
df

  dfi.export(df.style.hide_index(), f"{result_dir}/dynamic_clf_smote_{results_name}.png")
[1113/145217.941054:INFO:headless_shell.cc(660)] Written to file /tmp/tmpfxsewu4w/temp.png.


Unnamed: 0,period,Accuracy,Precision,Recall,F-1,TN,FP,FN,TP
0,"(21599.0, 28799.0]",0.996538,0.611111,0.846154,0.709677,5159,14,4,22
1,"(28799.0, 35999.0]",0.999311,1.0,0.55,0.709677,13037,0,9,11
2,"(35999.0, 43199.0]",0.999464,0.928571,0.866667,0.896552,16757,3,6,39
3,"(43199.0, 50399.0]",0.999282,0.888889,0.444444,0.592593,15298,1,10,8
4,"(50399.0, 57599.0]",0.999433,1.0,0.666667,0.8,15838,0,9,18
5,"(57599.0, 64799.0]",0.999553,1.0,0.730769,0.844444,15642,0,7,19
6,"(64799.0, 71999.0]",0.999578,1.0,0.681818,0.810811,16579,0,7,15
7,"(71999.0, 79199.0]",0.99947,0.833333,0.681818,0.75,18850,3,7,15
8,"(79199.0, 86399.0]",0.999734,0.944444,0.85,0.894737,15038,1,3,17
9,"(86399.0, 93599.0]",0.997733,0.454545,0.416667,0.434783,5717,6,7,5
