In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from collections import Counter
from imblearn.over_sampling import RandomOverSampler, SMOTE
import dataframe_image as dfi

https://www.analyticsvidhya.com/blog/2017/03/imbalanced-data-classification/
https://stackoverflow.com/questions/55921286/should-i-balance-the-test-set-when-i-have-highly-unbalanced-data


In [7]:
df = pd.read_csv("../../data/final/trans_fraud_extra_2022-10-31_12:38:18.csv")
df['normAmount'] = StandardScaler().fit_transform(df['Amount'].values.reshape (-1,1))
df = df.drop(['Amount'], axis = 1)

df.head()


Unnamed: 0.1,Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,...,V25,V26,V27,V28,Class,inversed_dist,fraud_neighbor_count,community_risk,personalized_page_rank,normAmount
0,0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,...,0.128539,-0.189115,0.133558,-0.021053,0,0.0,0,0.002786,0.000177,0.244964
1,1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,...,0.16717,0.125895,-0.008983,0.014724,0,0.0,0,0.0,0.000359,-0.342475
2,2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,...,-0.327642,-0.139097,-0.055353,-0.059752,0,0.0,0,0.0,4.7e-05,1.160686
3,3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,...,0.647376,-0.221929,0.062723,0.061458,0,0.0,0,0.001603,6.4e-05,0.140534
4,4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,...,-0.20601,0.502292,0.219422,0.215153,0,0.0,0,0.002786,0.000136,-0.073403


In [8]:
df.columns

Index(['Unnamed: 0', 'Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8',
       'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18',
       'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28',
       'Class', 'inversed_dist', 'fraud_neighbor_count', 'community_risk',
       'personalized_page_rank', 'normAmount'],
      dtype='object')

In [19]:
result_dir = "../../results"
results_name = "ablation_all_previous_in_training"

In [20]:
GROUP_TIME_LENGTH = 7200
max_time = df["Time"].max()
group_df = df.groupby(pd.cut(df["Time"], np.arange(-1, max_time + GROUP_TIME_LENGTH, GROUP_TIME_LENGTH)))


KeyError: 'Time'

Distribution of Target class

In [None]:
df["Class"].value_counts()

Defining classifier

In [None]:
def predict_with_metric(model,X_test, y_test):
    prediction = model.predict(X_test)
    conf_matrix = confusion_matrix(y_test,prediction)
    tn= conf_matrix[0][0]
    fp= conf_matrix[0][1]
    fn = conf_matrix[1][0]
    tp =conf_matrix[1][1]
    return {"Prediction":list(prediction.flatten()),"Accuracy":accuracy_score(y_test, prediction),"Precision":precision_score(y_test, prediction),"Recall":recall_score(y_test,prediction),"F1":f1_score(y_test,prediction),"TN":tn,"FP":fp,"FN":fn,"TP":tp}

In [None]:
%%time
NUM_ESTIMATOR = 50
rf_clf = RandomForestClassifier(n_estimators=NUM_ESTIMATOR)

In [24]:
def ablation_test(feature_name):
    acc_l = []
    pre_l = []
    rec_l = []
    f1_l = []
    tn_l= []
    fp_l= []
    fn_l = []
    tp_l = []
    group_time_l = []
    train_data = pd.DataFrame()
    for group_time, group in group_df:
        # print("")
        print(f"===== Processing group:{group_time} , group number:{int((group_time.left + 1) / GROUP_TIME_LENGTH + 1)}=====")
        group_number = int((group_time.left + 1) / GROUP_TIME_LENGTH + 1)
        if group_number > 3:
            print(f"Predicting group {group_number}")
            X_test = group[group.columns.difference(["Class","Time",feature_name])]
            y_train = group["Class"]
            model_prediction  = predict_with_metric(rf_clf,X_test,y_train)
            group_time_l.append(group_time)
            acc_l.append(model_prediction["Accuracy"])
            pre_l.append(model_prediction["Precision"])
            rec_l.append(model_prediction["Recall"])
            f1_l.append(model_prediction["F1"])
            tn_l.append(model_prediction["TN"])
            fp_l.append(model_prediction["FP"])
            fn_l.append(model_prediction["FN"])
            tp_l.append(model_prediction["TP"])
        train_data = pd.concat([train_data,group])
        X_train = train_data[train_data.columns.difference(["Class","Time",feature_name])]
        y_train = train_data["Class"]
        rf_clf.fit(X_train,y_train)
    metric_data = list(zip(group_time_l,acc_l,pre_l,rec_l,f1_l,tn_l,fp_l,fn_l,tp_l))
    df = pd.DataFrame(metric_data, columns =['period','Accuracy', 'Precision', 'Recall','F-1','TN','FP','FN','TP'])
    df.to_csv(f"{result_dir}/dynamic_clf_normal_extra_feature_{results_name}_{feature_name}.csv")
    dfi.export(df.style.hide_index(), f"{result_dir}/dynamic_clf_normal_extra_feature_{results_name}_{feature_name}.png")
    ## Total metrics
    TN = sum(tn_l)
    TP = sum(tp_l)
    FN = sum(fn_l)
    FP = sum(fp_l)
    metrics = {}
    metrics["Precision"] = TP/(TP+FP)
    metrics["Recall"] = TP/(TP+FN)
    metrics["F1"] = (metrics["Precision"]*metrics["Recall"]*2)/(metrics["Precision"]+metrics["Recall"])
    metrics_df = pd.DataFrame(columns=["Precision","Recall","F1"])
    metrics_df = metrics_df.append(metrics,ignore_index=True)
    dfi.export(metrics_df.style.hide_index(), f"{result_dir}/dynamic_clf_normal_extra_feature_{results_name}_{feature_name}_total_metrics.png")



In [25]:
ablation_feature_list = ['inversed_dist', 'fraud_neighbor_count', 'community_risk','personalized_page_rank', 'normAmount']
for f_name in ablation_feature_list:
    ablation_test(f_name)

===== Processing group:(-1.0, 7199.0] , group number:1=====
===== Processing group:(7199.0, 14399.0] , group number:2=====
===== Processing group:(14399.0, 21599.0] , group number:3=====
===== Processing group:(21599.0, 28799.0] , group number:4=====
Predicting group 4
===== Processing group:(28799.0, 35999.0] , group number:5=====
Predicting group 5
===== Processing group:(35999.0, 43199.0] , group number:6=====
Predicting group 6
===== Processing group:(43199.0, 50399.0] , group number:7=====
Predicting group 7
===== Processing group:(50399.0, 57599.0] , group number:8=====
Predicting group 8
===== Processing group:(57599.0, 64799.0] , group number:9=====
Predicting group 9
===== Processing group:(64799.0, 71999.0] , group number:10=====
Predicting group 10
===== Processing group:(71999.0, 79199.0] , group number:11=====
Predicting group 11
===== Processing group:(79199.0, 86399.0] , group number:12=====
Predicting group 12
===== Processing group:(86399.0, 93599.0] , group number:13=

  dfi.export(df.style.hide_index(), f"{result_dir}/dynamic_clf_normal_extra_feature_{results_name}_{feature_name}.png")
[1114/021340.821852:INFO:headless_shell.cc(660)] Written to file /tmp/tmpvn9x3i1w/temp.png.
  metrics_df = metrics_df.append(metrics,ignore_index=True)
  dfi.export(metrics_df.style.hide_index(), f"{result_dir}/dynamic_clf_normal_extra_feature_{results_name}_{feature_name}_total_metrics.png")
[1114/021341.075363:INFO:headless_shell.cc(660)] Written to file /tmp/tmp6jm01ohy/temp.png.


===== Processing group:(-1.0, 7199.0] , group number:1=====
===== Processing group:(7199.0, 14399.0] , group number:2=====
===== Processing group:(14399.0, 21599.0] , group number:3=====
===== Processing group:(21599.0, 28799.0] , group number:4=====
Predicting group 4
===== Processing group:(28799.0, 35999.0] , group number:5=====
Predicting group 5
===== Processing group:(35999.0, 43199.0] , group number:6=====
Predicting group 6
===== Processing group:(43199.0, 50399.0] , group number:7=====
Predicting group 7
===== Processing group:(50399.0, 57599.0] , group number:8=====
Predicting group 8
===== Processing group:(57599.0, 64799.0] , group number:9=====
Predicting group 9
===== Processing group:(64799.0, 71999.0] , group number:10=====
Predicting group 10
===== Processing group:(71999.0, 79199.0] , group number:11=====
Predicting group 11
===== Processing group:(79199.0, 86399.0] , group number:12=====
Predicting group 12
===== Processing group:(86399.0, 93599.0] , group number:13=

  dfi.export(df.style.hide_index(), f"{result_dir}/dynamic_clf_normal_extra_feature_{results_name}_{feature_name}.png")
[1114/022548.856950:INFO:headless_shell.cc(660)] Written to file /tmp/tmpn29dkzm1/temp.png.
  metrics_df = metrics_df.append(metrics,ignore_index=True)
  dfi.export(metrics_df.style.hide_index(), f"{result_dir}/dynamic_clf_normal_extra_feature_{results_name}_{feature_name}_total_metrics.png")
[1114/022549.136075:INFO:headless_shell.cc(660)] Written to file /tmp/tmpllrjxr21/temp.png.


===== Processing group:(-1.0, 7199.0] , group number:1=====
===== Processing group:(7199.0, 14399.0] , group number:2=====
===== Processing group:(14399.0, 21599.0] , group number:3=====
===== Processing group:(21599.0, 28799.0] , group number:4=====
Predicting group 4
===== Processing group:(28799.0, 35999.0] , group number:5=====
Predicting group 5
===== Processing group:(35999.0, 43199.0] , group number:6=====
Predicting group 6
===== Processing group:(43199.0, 50399.0] , group number:7=====
Predicting group 7
===== Processing group:(50399.0, 57599.0] , group number:8=====
Predicting group 8
===== Processing group:(57599.0, 64799.0] , group number:9=====
Predicting group 9
===== Processing group:(64799.0, 71999.0] , group number:10=====
Predicting group 10
===== Processing group:(71999.0, 79199.0] , group number:11=====
Predicting group 11
===== Processing group:(79199.0, 86399.0] , group number:12=====
Predicting group 12
===== Processing group:(86399.0, 93599.0] , group number:13=

  dfi.export(df.style.hide_index(), f"{result_dir}/dynamic_clf_normal_extra_feature_{results_name}_{feature_name}.png")
[1114/023726.248517:INFO:headless_shell.cc(660)] Written to file /tmp/tmpd8kaf4kz/temp.png.
  metrics_df = metrics_df.append(metrics,ignore_index=True)
  dfi.export(metrics_df.style.hide_index(), f"{result_dir}/dynamic_clf_normal_extra_feature_{results_name}_{feature_name}_total_metrics.png")
[1114/023726.506177:INFO:headless_shell.cc(660)] Written to file /tmp/tmpsxb5_qgp/temp.png.


===== Processing group:(-1.0, 7199.0] , group number:1=====
===== Processing group:(7199.0, 14399.0] , group number:2=====
===== Processing group:(14399.0, 21599.0] , group number:3=====
===== Processing group:(21599.0, 28799.0] , group number:4=====
Predicting group 4
===== Processing group:(28799.0, 35999.0] , group number:5=====
Predicting group 5
===== Processing group:(35999.0, 43199.0] , group number:6=====
Predicting group 6
===== Processing group:(43199.0, 50399.0] , group number:7=====
Predicting group 7
===== Processing group:(50399.0, 57599.0] , group number:8=====
Predicting group 8
===== Processing group:(57599.0, 64799.0] , group number:9=====
Predicting group 9
===== Processing group:(64799.0, 71999.0] , group number:10=====
Predicting group 10
===== Processing group:(71999.0, 79199.0] , group number:11=====
Predicting group 11
===== Processing group:(79199.0, 86399.0] , group number:12=====
Predicting group 12
===== Processing group:(86399.0, 93599.0] , group number:13=

  dfi.export(df.style.hide_index(), f"{result_dir}/dynamic_clf_normal_extra_feature_{results_name}_{feature_name}.png")
[1114/024913.626419:INFO:headless_shell.cc(660)] Written to file /tmp/tmpm97bk76a/temp.png.
  metrics_df = metrics_df.append(metrics,ignore_index=True)
  dfi.export(metrics_df.style.hide_index(), f"{result_dir}/dynamic_clf_normal_extra_feature_{results_name}_{feature_name}_total_metrics.png")
[1114/024913.895068:INFO:headless_shell.cc(660)] Written to file /tmp/tmpy4fczut1/temp.png.


===== Processing group:(-1.0, 7199.0] , group number:1=====
===== Processing group:(7199.0, 14399.0] , group number:2=====
===== Processing group:(14399.0, 21599.0] , group number:3=====
===== Processing group:(21599.0, 28799.0] , group number:4=====
Predicting group 4
===== Processing group:(28799.0, 35999.0] , group number:5=====
Predicting group 5
===== Processing group:(35999.0, 43199.0] , group number:6=====
Predicting group 6
===== Processing group:(43199.0, 50399.0] , group number:7=====
Predicting group 7
===== Processing group:(50399.0, 57599.0] , group number:8=====
Predicting group 8
===== Processing group:(57599.0, 64799.0] , group number:9=====
Predicting group 9
===== Processing group:(64799.0, 71999.0] , group number:10=====
Predicting group 10
===== Processing group:(71999.0, 79199.0] , group number:11=====
Predicting group 11
===== Processing group:(79199.0, 86399.0] , group number:12=====
Predicting group 12
===== Processing group:(86399.0, 93599.0] , group number:13=

  dfi.export(df.style.hide_index(), f"{result_dir}/dynamic_clf_normal_extra_feature_{results_name}_{feature_name}.png")
[1114/030109.706846:INFO:headless_shell.cc(660)] Written to file /tmp/tmp6l8niop1/temp.png.
  metrics_df = metrics_df.append(metrics,ignore_index=True)
  dfi.export(metrics_df.style.hide_index(), f"{result_dir}/dynamic_clf_normal_extra_feature_{results_name}_{feature_name}_total_metrics.png")
[1114/030109.981632:INFO:headless_shell.cc(660)] Written to file /tmp/tmpwo669kq3/temp.png.
