In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from collections import Counter
from imblearn.over_sampling import RandomOverSampler, SMOTE
import dataframe_image as dfi

https://www.analyticsvidhya.com/blog/2017/03/imbalanced-data-classification/
https://stackoverflow.com/questions/55921286/should-i-balance-the-test-set-when-i-have-highly-unbalanced-data


In [3]:
df = pd.read_csv("../../data/final/creditcard_extra_graph_100_L2_2022-11-26_19:10:54.csv")
df['normAmount'] = StandardScaler().fit_transform(df['Amount'].values.reshape (-1,1))
df = df.drop(['Amount'], axis = 1)

df.head()


Unnamed: 0.1,Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,...,V25,V26,V27,V28,Class,inverted_dist,fraud_neighbor_count,community_risk,personalized_page_rank,normAmount
0,0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,...,0.128539,-0.189115,0.133558,-0.021053,0,0.0,0,0.002778,0.000177,0.244964
1,1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,...,0.16717,0.125895,-0.008983,0.014724,0,0.0,0,0.0,0.000359,-0.342475
2,2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,...,-0.327642,-0.139097,-0.055353,-0.059752,0,0.0,0,0.0,4.7e-05,1.160686
3,3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,...,0.647376,-0.221929,0.062723,0.061458,0,0.0,0,0.001603,6.4e-05,0.140534
4,4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,...,-0.20601,0.502292,0.219422,0.215153,0,0.0,0,0.002778,0.000136,-0.073403


In [4]:
result_dir = "../results"
results_name = "all_previous_in_training"

In [5]:
GROUP_TIME_LENGTH = 7200
max_time = df["Time"].max()
group_df = df.groupby(pd.cut(df["Time"], np.arange(-1, max_time + GROUP_TIME_LENGTH, GROUP_TIME_LENGTH)))


Distribution of Target class

In [6]:
df["Class"].value_counts()

0    284315
1       492
Name: Class, dtype: int64

Defining classifier

In [7]:
def predict_with_metric(model,X_test, y_test):
    prediction = model.predict(X_test)
    conf_matrix = confusion_matrix(y_test,prediction)
    tn= conf_matrix[0][0]
    fp= conf_matrix[0][1]
    fn = conf_matrix[1][0]
    tp =conf_matrix[1][1]
    return {"Prediction":list(prediction.flatten()),"Accuracy":accuracy_score(y_test, prediction),"Precision":precision_score(y_test, prediction),"Recall":recall_score(y_test,prediction),"F1":f1_score(y_test,prediction),"TN":tn,"FP":fp,"FN":fn,"TP":tp}

In [8]:
%%time
NUM_ESTIMATOR = 50
rf_clf = RandomForestClassifier(n_estimators=NUM_ESTIMATOR)

CPU times: user 28 µs, sys: 0 ns, total: 28 µs
Wall time: 31 µs


In [9]:
normal_acc_l = []
normal_pre_l = []
normal_rec_l = []
normal_f1_l = []
normal_tn_l= []
normal_fp_l= []
normal_fn_l = []
normal_tp_l = []
group_time_l = []
train_data = pd.DataFrame()
for group_time, group in group_df:
    # print("")
    print(f"===== Processing group:{group_time} , group number:{int((group_time.left + 1) / GROUP_TIME_LENGTH + 1)}=====")
    group_number = int((group_time.left + 1) / GROUP_TIME_LENGTH + 1)
    if group_number > 3:
        print(f"Predicting group {group_number}")
        X_test = group[group.columns.difference(["Class","Time"])]
        y_train = group["Class"]
        model_prediction  = predict_with_metric(rf_clf,X_test,y_train)
        group_time_l.append(group_time)
        normal_acc_l.append(model_prediction["Accuracy"])
        normal_pre_l.append(model_prediction["Precision"])
        normal_rec_l.append(model_prediction["Recall"])
        normal_f1_l.append(model_prediction["F1"])
        normal_tn_l.append(model_prediction["TN"])
        normal_fp_l.append(model_prediction["FP"])
        normal_fn_l.append(model_prediction["FN"])
        normal_tp_l.append(model_prediction["TP"])
    train_data = pd.concat([train_data,group])
    X_train = train_data[train_data.columns.difference(["Class","Time"])]
    y_train = train_data["Class"]
    rf_clf.fit(X_train,y_train)

===== Processing group:(-1.0, 7199.0] , group number:1=====
===== Processing group:(7199.0, 14399.0] , group number:2=====
===== Processing group:(14399.0, 21599.0] , group number:3=====
===== Processing group:(21599.0, 28799.0] , group number:4=====
Predicting group 4
===== Processing group:(28799.0, 35999.0] , group number:5=====
Predicting group 5
===== Processing group:(35999.0, 43199.0] , group number:6=====
Predicting group 6
===== Processing group:(43199.0, 50399.0] , group number:7=====
Predicting group 7
===== Processing group:(50399.0, 57599.0] , group number:8=====
Predicting group 8
===== Processing group:(57599.0, 64799.0] , group number:9=====
Predicting group 9
===== Processing group:(64799.0, 71999.0] , group number:10=====
Predicting group 10
===== Processing group:(71999.0, 79199.0] , group number:11=====
Predicting group 11
===== Processing group:(79199.0, 86399.0] , group number:12=====
Predicting group 12
===== Processing group:(86399.0, 93599.0] , group number:13=

In [10]:
metric_data = list(zip(group_time_l,normal_acc_l,normal_pre_l,normal_rec_l,normal_f1_l,normal_tn_l,normal_fp_l,normal_fn_l,normal_tp_l))
df = pd.DataFrame(metric_data, columns =['period','Accuracy', 'Precision', 'Recall','F-1','TN','FP','FN','TP'])
df.to_csv(f"{result_dir}/dynamic_clf_normal_extra_feature_{results_name}.csv")
dfi.export(df.style.hide_index(), f"{result_dir}/dynamic_clf_normal_extra_feature_{results_name}.png")
df

OSError: Cannot save file into a non-existent directory: '../results'

## Random Over sampling

In [None]:
%%time
NUM_ESTIMATOR = 50
rf_clf = RandomForestClassifier(n_estimators=NUM_ESTIMATOR)

In [None]:
ros_acc_l = []
ros_pre_l = []
ros_rec_l = []
ros_f1_l = []
ros_tn_l= []
ros_fp_l= []
ros_fn_l = []
ros_tp_l = []
group_time_l = []
train_data = pd.DataFrame()
for group_time, group in group_df:
    # print("")
    print(f"===== Processing group:{group_time} , group number:{int((group_time.left + 1) / GROUP_TIME_LENGTH + 1)}=====")
    group_number = int((group_time.left + 1) / GROUP_TIME_LENGTH + 1)
    if group_number > 3:
        print(f"Predicting group {group_number}")
        X_test = group[group.columns.difference(["Class","Time"])]
        y_train = group["Class"]
        model_prediction  = predict_with_metric(rf_clf,X_test,y_train)
        group_time_l.append(group_time)
        ros_acc_l.append(model_prediction["Accuracy"])
        ros_pre_l.append(model_prediction["Precision"])
        ros_rec_l.append(model_prediction["Recall"])
        ros_f1_l.append(model_prediction["F1"])
        ros_tn_l.append(model_prediction["TN"])
        ros_fp_l.append(model_prediction["FP"])
        ros_fn_l.append(model_prediction["FN"])
        ros_tp_l.append(model_prediction["TP"])
    # Random over sampling
    train_data = pd.concat([train_data,group])
    X_train = train_data[train_data.columns.difference(["Class","Time"])]
    y_train = train_data["Class"]
    ros = RandomOverSampler(random_state=1234)
    X_group_ros, y_group_ros = ros.fit_resample(X_train,y_train)
    # Check the number of records after over sampling
    print(sorted(Counter(y_group_ros).items()))
    # Training model
    rf_clf.fit(X_train,y_train)


In [None]:
metric_data = list(zip(group_time_l,ros_acc_l,ros_pre_l,ros_rec_l,ros_f1_l,ros_tn_l,ros_fp_l,ros_fn_l,ros_tp_l))
df = pd.DataFrame(metric_data, columns =['period','Accuracy', 'Precision', 'Recall','F-1','TN','FP','FN','TP'])
df.to_csv(f"{result_dir}/dynamic_clf_ros_extra_feature_{results_name}.csv")
dfi.export(df.style.hide_index(), f"{result_dir}/dynamic_clf_ros_extra_feature_{results_name}.png")
df

# Over sampling with SMOTHE

In [None]:
%%time
NUM_ESTIMATOR = 50
rf_clf = RandomForestClassifier(n_estimators=NUM_ESTIMATOR)

In [None]:
smote_acc_l = []
smote_pre_l = []
smote_rec_l = []
smote_f1_l = []
smote_tn_l= []
smote_fp_l= []
smote_fn_l = []
smote_tp_l = []
group_time_l = []
train_data = pd.DataFrame()
for group_time, group in group_df:
    # print("")
    print(f"===== Processing group:{group_time} , group number:{int((group_time.left + 1) / GROUP_TIME_LENGTH + 1)}=====")
    group_number = int((group_time.left + 1) / GROUP_TIME_LENGTH + 1)
    if group_number > 3:
        print(f"Predicting group {group_number}")
        X_test = group[group.columns.difference(["Class","Time"])]
        y_train = group["Class"]
        model_prediction  = predict_with_metric(rf_clf,X_test,y_train)
        group_time_l.append(group_time)
        smote_acc_l.append(model_prediction["Accuracy"])
        smote_pre_l.append(model_prediction["Precision"])
        smote_rec_l.append(model_prediction["Recall"])
        smote_f1_l.append(model_prediction["F1"])
        smote_tn_l.append(model_prediction["TN"])
        smote_fp_l.append(model_prediction["FP"])
        smote_fn_l.append(model_prediction["FN"])
        smote_tp_l.append(model_prediction["TP"])
    # Random over sampling
    train_data = pd.concat([train_data,group])
    X_train = train_data[train_data.columns.difference(["Class","Time"])]
    y_train = train_data["Class"]
    smote = SMOTE(random_state=1234,k_neighbors=3)
    X_group_smote, y_group_smote = smote.fit_resample(X_train,y_train)
    # Check the number of records after over sampling
    print(sorted(Counter(y_group_smote).items()))
    # Training model
    rf_clf.fit(X_train,y_train)
    ## pdate train data


In [None]:
metric_data = list(zip(group_time_l,smote_acc_l,smote_pre_l,smote_rec_l,smote_f1_l,smote_tn_l,smote_fp_l,smote_fn_l,smote_tp_l))
df = pd.DataFrame(metric_data, columns =['period','Accuracy', 'Precision', 'Recall','F-1','TN','FP','FN','TP'])
df.to_csv(f"{result_dir}/dynamic_clf_smote_extra_feature_{results_name}.csv")
dfi.export(df.style.hide_index(), f"{result_dir}/dynamic_clf_smote_extra_feature_{results_name}.png")
df

In [None]:
smote_acc_l