# Notebook 04: Model Evaluation and Insights

In [2]:
# import the libraries
import pandas as pd
import numpy as np

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, \
                            recall_score, f1_score, plot_roc_curve, roc_auc_score, average_precision_score, \
                            precision_recall_curve, ConfusionMatrixDisplay, plot_precision_recall_curve

from pycaret.classification import *
from pycaret.classification import load_config
load_config('../datasets/my_config') # load the configuration from the setup pycaret

In [3]:
# load the best_model
filename = '../datasets/best_model'
best_model = load_model(filename)

Transformation Pipeline and Model Successfully Loaded


In [4]:
# load the test_encoded file
test_encoded = pd.read_pickle('../datasets/test_encoded.pkl')

In [5]:
# check the shape and first 2 rows of test_encoded
print(test_encoded.shape)
test_encoded.head(2)

(555719, 291)


Unnamed: 0,amt,is_male,is_fraud,age,distance,pre_amt,cum_sum,amt_diff,pct_change,amt_avg,diff_minutes_pre_trans,diff_distance,trans_last_5T,trans_last_1h,trans_last_24h,trans_last_7d,trans_last_30d,avg_last_5T,avg_last_1h,avg_last_24h,avg_last_7d,avg_last_30d,min_last_24h,min_last_7d,min_last_30d,max_last_24h,max_last_7d,max_last_30d,merch_last_24h,merch_last_7d,merch_last_14d,merch_last_30d,avg_merch_last_24h,avg_merch_last_7d,avg_merch_last_14d,avg_merch_last_30d,min_merch_last_24h,min_merch_last_7d,min_merch_last_14d,min_merch_last_30d,max_merch_last_24h,max_merch_last_7d,max_merch_last_14d,max_merch_last_30d,internet_transaction,category_food_dining,category_gas_transport,category_grocery_net,category_grocery_pos,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel,state_AL,state_AR,state_AZ,state_CA,state_CO,state_CT,state_DC,state_FL,state_GA,state_HI,state_IA,state_ID,state_IL,state_IN,state_KS,state_KY,state_LA,state_MA,state_MD,state_ME,state_MI,state_MN,state_MO,state_MS,state_MT,state_NC,state_ND,state_NE,state_NH,state_NJ,state_NM,state_NV,state_NY,state_OH,state_OK,state_OR,state_PA,state_RI,state_SC,state_SD,state_TN,state_TX,state_UT,state_VA,state_VT,state_WA,state_WI,state_WV,state_WY,trans_month_7,trans_month_8,trans_month_9,trans_month_10,trans_month_11,trans_month_12,trans_week_26,trans_week_27,trans_week_28,trans_week_29,trans_week_30,trans_week_31,trans_week_32,trans_week_33,trans_week_34,trans_week_35,trans_week_36,trans_week_37,trans_week_38,trans_week_39,trans_week_40,trans_week_41,trans_week_42,trans_week_43,trans_week_44,trans_week_45,trans_week_46,trans_week_47,trans_week_48,trans_week_49,trans_week_50,trans_week_51,trans_week_52,trans_week_53,trans_day_2,trans_day_3,trans_day_4,trans_day_5,trans_day_6,trans_day_7,trans_day_8,trans_day_9,trans_day_10,trans_day_11,trans_day_12,trans_day_13,trans_day_14,trans_day_15,trans_day_16,trans_day_17,trans_day_18,trans_day_19,trans_day_20,trans_day_21,trans_day_22,trans_day_23,trans_day_24,trans_day_25,trans_day_26,trans_day_27,trans_day_28,trans_day_29,trans_day_30,trans_day_31,trans_hour_1,trans_hour_2,trans_hour_3,trans_hour_4,trans_hour_5,trans_hour_6,trans_hour_7,trans_hour_8,trans_hour_9,trans_hour_10,trans_hour_11,trans_hour_12,trans_hour_13,trans_hour_14,trans_hour_15,trans_hour_16,trans_hour_17,trans_hour_18,trans_hour_19,trans_hour_20,trans_hour_21,trans_hour_22,trans_hour_23,trans_dayofweek_1,trans_dayofweek_2,trans_dayofweek_3,trans_dayofweek_4,trans_dayofweek_5,trans_dayofweek_6,amt_group_high,amt_group_low,amt_group_medium,amt_group_very_high,amt_group_very_low,age_group_23_33,age_group_33_43,age_group_43_53,age_group_53_63,age_group_63_73,age_group_73_83,age_group_83_93,age_group_above_93,customer_segment_Lost_Cust,customer_segment_Low_Value_Cust,customer_segment_Medium_Value_Cust,cluster_1,cluster_2,cluster_3,cluster_4,cluster_5,cluster_6,cluster_7,cluster_8,cluster_9,cluster_10,cluster_11,cluster_12,cluster_13,cluster_14,cluster_15,cluster_16,cluster_17,cluster_18,cluster_19,cluster_20,cluster_21,cluster_22,cluster_23,cluster_24,cluster_25,cluster_26,cluster_27,cluster_28,cluster_29,cluster_30,cluster_31,cluster_32,cluster_33,cluster_34,cluster_35,cluster_36,cluster_37,cluster_38,cluster_39,cluster_40,cluster_41,cluster_42,cluster_43,state_DE,trans_month_2,trans_month_3,trans_month_4,trans_month_5,trans_month_6,trans_week_2,trans_week_3,trans_week_4,trans_week_5,trans_week_6,trans_week_7,trans_week_8,trans_week_9,trans_week_10,trans_week_11,trans_week_12,trans_week_13,trans_week_14,trans_week_15,trans_week_16,trans_week_17,trans_week_18,trans_week_19,trans_week_20,trans_week_21,trans_week_22,trans_week_23,trans_week_24,trans_week_25,amt_group_above_medium,customer_segment_Top_Cust
0,124.66,0,0,34,30.533617,0.0,124.66,0.0,0.0,124.66,0.0,0.0,1.0,1.0,1.0,1.0,1.0,124.66,124.66,124.66,124.66,124.66,124.66,124.66,124.66,124.66,124.66,124.66,1.0,1.0,1.0,1.0,124.66,124.66,124.66,124.66,124.66,124.66,124.66,124.66,124.66,124.66,124.66,124.66,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,78.52,0,0,34,91.864216,124.66,203.18,-46.14,-0.370127,101.59,199.9,84.495812,1.0,1.0,2.0,2.0,2.0,78.52,78.52,101.59,101.59,101.59,78.52,78.52,78.52,124.66,124.66,124.66,1.0,1.0,1.0,1.0,78.52,78.52,78.52,78.52,78.52,78.52,78.52,78.52,78.52,78.52,78.52,78.52,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# check on the tuned model's metrics
# data = none (default), predict label and score on the holdout set (validation datasets)
predict_model(best_model) 

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extreme Gradient Boosting,0.9994,0.9997,0.9361,0.9654,0.9505,0.9502,0.9503


Unnamed: 0,amt,is_male,age,distance,pre_amt,cum_sum,amt_diff,pct_change,amt_avg,diff_minutes_pre_trans,diff_distance,trans_last_5T,trans_last_1h,trans_last_24h,trans_last_7d,trans_last_30d,avg_last_5T,avg_last_1h,avg_last_24h,avg_last_7d,avg_last_30d,min_last_24h,min_last_7d,min_last_30d,max_last_24h,max_last_7d,max_last_30d,merch_last_24h,merch_last_7d,merch_last_14d,merch_last_30d,avg_merch_last_24h,avg_merch_last_7d,avg_merch_last_14d,avg_merch_last_30d,min_merch_last_24h,min_merch_last_7d,min_merch_last_14d,min_merch_last_30d,max_merch_last_24h,max_merch_last_7d,max_merch_last_14d,max_merch_last_30d,internet_transaction,category_food_dining,category_gas_transport,category_grocery_net,category_grocery_pos,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel,state_AL,state_AR,state_AZ,state_CA,state_CO,state_CT,state_DC,state_DE,state_FL,state_GA,state_HI,state_IA,state_ID,state_IL,state_IN,state_KS,state_KY,state_LA,state_MA,state_MD,state_ME,state_MI,state_MN,state_MO,state_MS,state_MT,state_NC,state_ND,state_NE,state_NH,state_NJ,state_NM,state_NV,state_NY,state_OH,state_OK,state_OR,state_PA,state_RI,state_SC,state_SD,state_TN,state_TX,state_UT,state_VA,state_VT,state_WA,state_WI,state_WV,state_WY,trans_month_2,trans_month_3,trans_month_4,trans_month_5,trans_month_6,trans_month_7,trans_month_8,trans_month_9,trans_month_10,trans_month_11,trans_month_12,trans_week_2,trans_week_3,trans_week_4,trans_week_5,trans_week_6,trans_week_7,trans_week_8,trans_week_9,trans_week_10,trans_week_11,trans_week_12,trans_week_13,trans_week_14,trans_week_15,trans_week_16,trans_week_17,trans_week_18,trans_week_19,trans_week_20,trans_week_21,trans_week_22,trans_week_23,trans_week_24,trans_week_25,trans_week_26,trans_week_27,trans_week_28,trans_week_29,trans_week_30,trans_week_31,trans_week_32,trans_week_33,trans_week_34,trans_week_35,trans_week_36,trans_week_37,trans_week_38,trans_week_39,trans_week_40,trans_week_41,trans_week_42,trans_week_43,trans_week_44,trans_week_45,trans_week_46,trans_week_47,trans_week_48,trans_week_49,trans_week_50,trans_week_51,trans_week_52,trans_day_2,trans_day_3,trans_day_4,trans_day_5,trans_day_6,trans_day_7,trans_day_8,trans_day_9,trans_day_10,trans_day_11,trans_day_12,trans_day_13,trans_day_14,trans_day_15,trans_day_16,trans_day_17,trans_day_18,trans_day_19,trans_day_20,trans_day_21,trans_day_22,trans_day_23,trans_day_24,trans_day_25,trans_day_26,trans_day_27,trans_day_28,trans_day_29,trans_day_30,trans_day_31,trans_hour_1,trans_hour_2,trans_hour_3,trans_hour_4,trans_hour_5,trans_hour_6,trans_hour_7,trans_hour_8,trans_hour_9,trans_hour_10,trans_hour_11,trans_hour_12,trans_hour_13,trans_hour_14,trans_hour_15,trans_hour_16,trans_hour_17,trans_hour_18,trans_hour_19,trans_hour_20,trans_hour_21,trans_hour_22,trans_hour_23,trans_dayofweek_1,trans_dayofweek_2,trans_dayofweek_3,trans_dayofweek_4,trans_dayofweek_5,trans_dayofweek_6,amt_group_low,amt_group_medium,amt_group_above_medium,amt_group_high,amt_group_very_high,age_group_23_33,age_group_33_43,age_group_43_53,age_group_53_63,age_group_63_73,age_group_73_83,age_group_83_93,age_group_above_93,customer_segment_Lost_Cust,customer_segment_Low_Value_Cust,customer_segment_Medium_Value_Cust,customer_segment_Top_Cust,cluster_1,cluster_2,cluster_3,cluster_4,cluster_5,cluster_6,cluster_7,cluster_8,cluster_9,cluster_10,cluster_11,cluster_12,cluster_13,cluster_14,cluster_15,cluster_16,cluster_17,cluster_18,cluster_19,cluster_20,cluster_21,cluster_22,cluster_23,cluster_24,cluster_25,cluster_26,cluster_27,cluster_28,cluster_29,cluster_30,cluster_31,cluster_32,cluster_33,cluster_34,cluster_35,cluster_36,cluster_37,cluster_38,cluster_39,cluster_40,cluster_41,cluster_42,cluster_43,trans_week_53,amt_group_very_low,is_fraud,Label,Score
0,51.189999,0,65,78.064827,119.080002,4631.080078,-67.889999,-0.570121,53.230804,527.150024,88.290123,1.0,1.0,5.0,21.0,79.0,51.189999,51.189999,51.410000,47.161903,53.539112,9.580000,3.300000,1.20,119.080002,152.160004,365.170013,4.0,31.0,56.0,118.0,53.847500,65.368065,65.661430,64.724579,48.750000,30.959999,29.549999,9.44,61.450001,113.360001,113.360001,113.360001,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0.0,0.0,0,0,1.0
1,117.180000,1,84,46.729183,67.620003,101794.773438,49.560001,0.732919,75.966248,295.649994,80.111580,1.0,1.0,6.0,24.0,103.0,117.180000,117.180000,56.741665,56.247501,66.656311,7.760000,1.980000,1.04,117.180000,191.970001,541.580017,5.0,16.0,28.0,54.0,76.713997,58.208126,57.657856,53.770000,43.259998,23.190001,9.230000,9.23,117.180000,117.180000,117.180000,117.180000,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,1.0
2,12.600000,1,24,34.141266,44.000000,15895.009766,-31.400000,-0.713636,47.306576,52.116665,106.660583,1.0,2.0,3.0,14.0,40.0,12.600000,28.299999,23.796667,55.744999,52.554501,12.600000,1.760000,1.76,44.000000,341.070007,341.070007,4.0,51.0,101.0,150.0,44.470001,58.590786,50.447227,54.488201,12.600000,1.370000,1.190000,1.19,79.330002,369.989990,369.989990,386.529999,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,1.0
3,67.000000,1,45,84.936119,77.000000,101437.726562,-10.000000,-0.129870,94.713097,21.516666,39.872356,1.0,2.0,8.0,37.0,157.0,67.000000,72.000000,80.823753,88.332703,87.699043,63.009998,32.169998,30.08,94.190002,483.359985,566.270020,10.0,39.0,85.0,187.0,46.384998,49.799744,46.819294,50.002247,4.530000,1.020000,1.020000,1.02,165.119995,182.229996,182.229996,257.839996,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0.0,0.0,0,0,1.0
4,36.919998,0,32,74.577301,47.340000,134935.781250,-10.420000,-0.220110,62.125130,491.700012,94.524391,1.0,1.0,14.0,87.0,339.0,36.919998,36.919998,101.807144,53.040573,61.151356,1.480000,1.220000,1.09,894.049988,894.049988,894.049988,11.0,29.0,51.0,108.0,68.130913,60.294827,58.494904,52.845833,36.919998,10.020000,10.020000,10.02,111.410004,111.410004,111.410004,111.410004,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
324164,93.360001,0,41,124.870926,53.070000,46174.789062,40.290001,0.759186,96.398308,234.466660,129.808990,1.0,1.0,7.0,13.0,31.0,93.360001,93.360001,91.075714,104.370003,96.137100,35.230000,2.500000,2.50,229.350006,412.619995,625.760010,6.0,18.0,44.0,90.0,52.068333,70.362221,62.341591,64.671997,1.460000,1.190000,1.190000,1.06,132.220001,374.299988,374.299988,374.299988,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0.0,0.0,0,0,1.0
324165,58.730000,0,62,79.407158,5.400000,80701.398438,53.330002,9.875926,68.217583,842.233337,90.145065,1.0,1.0,4.0,20.0,80.0,58.730000,58.730000,29.709999,41.571499,59.358124,1.890000,1.520000,1.04,58.730000,145.199997,707.090027,4.0,23.0,45.0,129.0,40.715000,51.888695,61.302891,61.822636,26.410000,7.840000,7.840000,1.24,58.730000,133.500000,195.300003,357.739990,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,1.0
324166,126.959999,0,35,72.621765,379.899994,32278.859375,-252.940002,-0.665807,92.755348,1797.849976,71.122665,1.0,1.0,1.0,15.0,57.0,126.959999,126.959999,126.959999,91.959999,100.983856,126.959999,2.380000,1.02,126.959999,379.899994,956.080017,5.0,49.0,87.0,191.0,48.270000,54.097755,51.601151,51.916336,3.170000,2.420000,1.120000,1.12,126.959999,272.420013,272.420013,274.109985,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0.0,0.0,0,0,1.0
324167,1.300000,1,24,108.287743,2.130000,44264.488281,-0.830000,-0.389671,63.966026,19.200001,153.192551,1.0,3.0,11.0,43.0,160.0,1.300000,2.956667,15.944546,36.455582,51.414562,1.300000,1.200000,1.03,78.379997,302.640015,1173.699951,5.0,8.0,23.0,40.0,104.987999,552.911255,297.392181,225.899994,1.300000,1.300000,1.290000,1.29,503.250000,3892.290039,3892.290039,3892.290039,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0.0,0.0,0,0,1.0


In [7]:
# Returns the last printed scoring grid.
train_result = pull()
train_result

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extreme Gradient Boosting,0.9994,0.9997,0.9361,0.9654,0.9505,0.9502,0.9503


In [8]:
# split the test_encoded into test_eval and y_true
test_eval = test_encoded.drop('is_fraud', axis=1)
y_true = test_encoded['is_fraud']

In [9]:
# used the tuned model for prediction
predicted_prob = predict_model(best_model, 
                               test_eval,
                               raw_score = True)

In [10]:
# predict y_pred
y_pred = predicted_prob['Label']

In [11]:
# check the confusion matrix
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
tn, fp, fn, tp

(553415, 159, 199, 1946)

In [13]:
# evaluate tuned model on the test datasets
print(f"accuracy score: {accuracy_score(y_true, y_pred)}")
print(f"recall score: {recall_score(y_true, y_pred)}")
print(f"f1 score: {f1_score(y_true, y_pred)}")
print(f"precision score: {precision_score(y_true, y_pred)}")
print(f"average precision score: {average_precision_score(y_true, y_pred)}")
print(f"roc auc score: {roc_auc_score(y_true, y_pred)}")

accuracy score: 0.9993557895267212
recall score: 0.9072261072261072
f1 score: 0.9157647058823529
precision score: 0.9244655581947744
average precision score: 0.8390573842742098
roc auc score: 0.9534694413769298


Although the overall metrics score drop a little bit as compared to the metrics on validation dataset, the overall metrics are good and decent with above 90%. It will be able to generalize the future data and I will suggest to proceed and deploy the model for credit card fraud detection.

## Dashboard Preparation

In [12]:
print(predicted_prob.shape)
predicted_prob.head(2)

(555719, 293)


Unnamed: 0,amt,is_male,age,distance,pre_amt,cum_sum,amt_diff,pct_change,amt_avg,diff_minutes_pre_trans,diff_distance,trans_last_5T,trans_last_1h,trans_last_24h,trans_last_7d,trans_last_30d,avg_last_5T,avg_last_1h,avg_last_24h,avg_last_7d,avg_last_30d,min_last_24h,min_last_7d,min_last_30d,max_last_24h,max_last_7d,max_last_30d,merch_last_24h,merch_last_7d,merch_last_14d,merch_last_30d,avg_merch_last_24h,avg_merch_last_7d,avg_merch_last_14d,avg_merch_last_30d,min_merch_last_24h,min_merch_last_7d,min_merch_last_14d,min_merch_last_30d,max_merch_last_24h,max_merch_last_7d,max_merch_last_14d,max_merch_last_30d,internet_transaction,category_food_dining,category_gas_transport,category_grocery_net,category_grocery_pos,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel,state_AL,state_AR,state_AZ,state_CA,state_CO,state_CT,state_DC,state_FL,state_GA,state_HI,state_IA,state_ID,state_IL,state_IN,state_KS,state_KY,state_LA,state_MA,state_MD,state_ME,state_MI,state_MN,state_MO,state_MS,state_MT,state_NC,state_ND,state_NE,state_NH,state_NJ,state_NM,state_NV,state_NY,state_OH,state_OK,state_OR,state_PA,state_RI,state_SC,state_SD,state_TN,state_TX,state_UT,state_VA,state_VT,state_WA,state_WI,state_WV,state_WY,trans_month_7,trans_month_8,trans_month_9,trans_month_10,trans_month_11,trans_month_12,trans_week_26,trans_week_27,trans_week_28,trans_week_29,trans_week_30,trans_week_31,trans_week_32,trans_week_33,trans_week_34,trans_week_35,trans_week_36,trans_week_37,trans_week_38,trans_week_39,trans_week_40,trans_week_41,trans_week_42,trans_week_43,trans_week_44,trans_week_45,trans_week_46,trans_week_47,trans_week_48,trans_week_49,trans_week_50,trans_week_51,trans_week_52,trans_week_53,trans_day_2,trans_day_3,trans_day_4,trans_day_5,trans_day_6,trans_day_7,trans_day_8,trans_day_9,trans_day_10,trans_day_11,trans_day_12,trans_day_13,trans_day_14,trans_day_15,trans_day_16,trans_day_17,trans_day_18,trans_day_19,trans_day_20,trans_day_21,trans_day_22,trans_day_23,trans_day_24,trans_day_25,trans_day_26,trans_day_27,trans_day_28,trans_day_29,trans_day_30,trans_day_31,trans_hour_1,trans_hour_2,trans_hour_3,trans_hour_4,trans_hour_5,trans_hour_6,trans_hour_7,trans_hour_8,trans_hour_9,trans_hour_10,trans_hour_11,trans_hour_12,trans_hour_13,trans_hour_14,trans_hour_15,trans_hour_16,trans_hour_17,trans_hour_18,trans_hour_19,trans_hour_20,trans_hour_21,trans_hour_22,trans_hour_23,trans_dayofweek_1,trans_dayofweek_2,trans_dayofweek_3,trans_dayofweek_4,trans_dayofweek_5,trans_dayofweek_6,amt_group_high,amt_group_low,amt_group_medium,amt_group_very_high,amt_group_very_low,age_group_23_33,age_group_33_43,age_group_43_53,age_group_53_63,age_group_63_73,age_group_73_83,age_group_83_93,age_group_above_93,customer_segment_Lost_Cust,customer_segment_Low_Value_Cust,customer_segment_Medium_Value_Cust,cluster_1,cluster_2,cluster_3,cluster_4,cluster_5,cluster_6,cluster_7,cluster_8,cluster_9,cluster_10,cluster_11,cluster_12,cluster_13,cluster_14,cluster_15,cluster_16,cluster_17,cluster_18,cluster_19,cluster_20,cluster_21,cluster_22,cluster_23,cluster_24,cluster_25,cluster_26,cluster_27,cluster_28,cluster_29,cluster_30,cluster_31,cluster_32,cluster_33,cluster_34,cluster_35,cluster_36,cluster_37,cluster_38,cluster_39,cluster_40,cluster_41,cluster_42,cluster_43,state_DE,trans_month_2,trans_month_3,trans_month_4,trans_month_5,trans_month_6,trans_week_2,trans_week_3,trans_week_4,trans_week_5,trans_week_6,trans_week_7,trans_week_8,trans_week_9,trans_week_10,trans_week_11,trans_week_12,trans_week_13,trans_week_14,trans_week_15,trans_week_16,trans_week_17,trans_week_18,trans_week_19,trans_week_20,trans_week_21,trans_week_22,trans_week_23,trans_week_24,trans_week_25,amt_group_above_medium,customer_segment_Top_Cust,Label,Score_0,Score_1
0,124.66,0,34,30.533617,0.0,124.66,0.0,0.0,124.66,0.0,0.0,1.0,1.0,1.0,1.0,1.0,124.66,124.66,124.66,124.66,124.66,124.66,124.66,124.66,124.66,124.66,124.66,1.0,1.0,1.0,1.0,124.66,124.66,124.66,124.66,124.66,124.66,124.66,124.66,124.66,124.66,124.66,124.66,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,0.0
1,78.52,0,34,91.864216,124.66,203.18,-46.14,-0.370127,101.59,199.9,84.495812,1.0,1.0,2.0,2.0,2.0,78.52,78.52,101.59,101.59,101.59,78.52,78.52,78.52,124.66,124.66,124.66,1.0,1.0,1.0,1.0,78.52,78.52,78.52,78.52,78.52,78.52,78.52,78.52,78.52,78.52,78.52,78.52,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,0.0


In [13]:
# load the final test datasets on pickle file
test_final = pd.read_pickle('../datasets/test_final.pkl')

In [14]:
print(test_final.shape)
test_final.head(2)

(555719, 74)


Unnamed: 0,trans_datetime,cc_num,merchant,category,amt,is_male,street,city,state,zip,lat,long,city_pop,job,trans_num,merch_lat,merch_long,is_fraud,name,coords_ori,coords_merch,trans_year,trans_month,trans_week,trans_day,trans_hour,trans_minute,trans_dayofweek,age,distance,amt_group,age_group,pre_amt,cum_sum,amt_diff,pct_change,amt_avg,diff_minutes_pre_trans,diff_distance,trans_last_5T,trans_last_1h,trans_last_24h,trans_last_7d,trans_last_30d,avg_last_5T,avg_last_1h,avg_last_24h,avg_last_7d,avg_last_30d,min_last_24h,min_last_7d,min_last_30d,max_last_24h,max_last_7d,max_last_30d,merch_last_24h,merch_last_7d,merch_last_14d,merch_last_30d,avg_merch_last_24h,avg_merch_last_7d,avg_merch_last_14d,avg_merch_last_30d,min_merch_last_24h,min_merch_last_7d,min_merch_last_14d,min_merch_last_30d,max_merch_last_24h,max_merch_last_7d,max_merch_last_14d,max_merch_last_30d,internet_transaction,customer_segment,cluster
0,2020-06-21 13:05:42,60416207185,fraud_Kutch-Ferry,home,124.66,0,9886 Anita Drive,Fort_Washakie,WY,82514,43.0048,-108.8964,1645,Information systems manager,f21e1a09f59c50a66735409308285b1d,42.945526,-108.530901,0,Mary Diaz,"(43.0048, -108.8964)","(42.945526, -108.530901)",2020,6,25,21,13,5,6,34,30.533617,high,33_43,0.0,124.66,0.0,0.0,124.66,0.0,0.0,1.0,1.0,1.0,1.0,1.0,124.66,124.66,124.66,124.66,124.66,124.66,124.66,124.66,124.66,124.66,124.66,1.0,1.0,1.0,1.0,124.66,124.66,124.66,124.66,124.66,124.66,124.66,124.66,124.66,124.66,124.66,124.66,0,Lost_Cust,43
1,2020-06-21 16:25:36,60416207185,fraud_Halvorson_Group,misc_pos,78.52,0,9886 Anita Drive,Fort_Washakie,WY,82514,43.0048,-108.8964,1645,Information systems manager,08f67c9765438973006f8250351d8c1f,42.19313,-108.682054,0,Mary Diaz,"(43.0048, -108.8964)","(42.19313, -108.682054)",2020,6,25,21,16,25,6,34,91.864216,above_medium,33_43,124.66,203.18,-46.14,-0.370127,101.59,199.9,84.495812,1.0,1.0,2.0,2.0,2.0,78.52,78.52,101.59,101.59,101.59,78.52,78.52,78.52,124.66,124.66,124.66,1.0,1.0,1.0,1.0,78.52,78.52,78.52,78.52,78.52,78.52,78.52,78.52,78.52,78.52,78.52,78.52,0,Lost_Cust,43


In [15]:
# concat the test_final and predicted_prob
test_dashboard = pd.concat([test_final, predicted_prob[predicted_prob.columns[-3:]]], axis=1)

In [16]:
# masking the senstive card information
test_dashboard['cc_num'] = test_dashboard['cc_num'].astype(str).str[:-4] + 'XXXX'

In [17]:
print(test_dashboard.shape)
test_dashboard.head()

(555719, 77)


Unnamed: 0,trans_datetime,cc_num,merchant,category,amt,is_male,street,city,state,zip,lat,long,city_pop,job,trans_num,merch_lat,merch_long,is_fraud,name,coords_ori,coords_merch,trans_year,trans_month,trans_week,trans_day,trans_hour,trans_minute,trans_dayofweek,age,distance,amt_group,age_group,pre_amt,cum_sum,amt_diff,pct_change,amt_avg,diff_minutes_pre_trans,diff_distance,trans_last_5T,trans_last_1h,trans_last_24h,trans_last_7d,trans_last_30d,avg_last_5T,avg_last_1h,avg_last_24h,avg_last_7d,avg_last_30d,min_last_24h,min_last_7d,min_last_30d,max_last_24h,max_last_7d,max_last_30d,merch_last_24h,merch_last_7d,merch_last_14d,merch_last_30d,avg_merch_last_24h,avg_merch_last_7d,avg_merch_last_14d,avg_merch_last_30d,min_merch_last_24h,min_merch_last_7d,min_merch_last_14d,min_merch_last_30d,max_merch_last_24h,max_merch_last_7d,max_merch_last_14d,max_merch_last_30d,internet_transaction,customer_segment,cluster,Label,Score_0,Score_1
0,2020-06-21 13:05:42,6041620XXXX,fraud_Kutch-Ferry,home,124.66,0,9886 Anita Drive,Fort_Washakie,WY,82514,43.0048,-108.8964,1645,Information systems manager,f21e1a09f59c50a66735409308285b1d,42.945526,-108.530901,0,Mary Diaz,"(43.0048, -108.8964)","(42.945526, -108.530901)",2020,6,25,21,13,5,6,34,30.533617,high,33_43,0.0,124.66,0.0,0.0,124.66,0.0,0.0,1.0,1.0,1.0,1.0,1.0,124.66,124.66,124.66,124.66,124.66,124.66,124.66,124.66,124.66,124.66,124.66,1.0,1.0,1.0,1.0,124.66,124.66,124.66,124.66,124.66,124.66,124.66,124.66,124.66,124.66,124.66,124.66,0,Lost_Cust,43,0,1.0,0.0
1,2020-06-21 16:25:36,6041620XXXX,fraud_Halvorson_Group,misc_pos,78.52,0,9886 Anita Drive,Fort_Washakie,WY,82514,43.0048,-108.8964,1645,Information systems manager,08f67c9765438973006f8250351d8c1f,42.19313,-108.682054,0,Mary Diaz,"(43.0048, -108.8964)","(42.19313, -108.682054)",2020,6,25,21,16,25,6,34,91.864216,above_medium,33_43,124.66,203.18,-46.14,-0.370127,101.59,199.9,84.495812,1.0,1.0,2.0,2.0,2.0,78.52,78.52,101.59,101.59,101.59,78.52,78.52,78.52,124.66,124.66,124.66,1.0,1.0,1.0,1.0,78.52,78.52,78.52,78.52,78.52,78.52,78.52,78.52,78.52,78.52,78.52,78.52,0,Lost_Cust,43,0,1.0,0.0
2,2020-06-22 07:58:33,6041620XXXX,fraud_Conroy-Cruickshank,gas_transport,65.25,0,9886 Anita Drive,Fort_Washakie,WY,82514,43.0048,-108.8964,1645,Information systems manager,ca1f04d9d549c507356ffa8f9b43d81f,43.932724,-109.699794,0,Mary Diaz,"(43.0048, -108.8964)","(43.932724, -109.699794)",2020,6,26,22,7,58,0,34,121.877934,above_medium,33_43,78.52,268.43,-13.27,-0.169002,89.476667,932.95,210.284759,1.0,1.0,3.0,3.0,3.0,65.25,65.25,89.476667,89.476667,89.476667,65.25,65.25,65.25,124.66,124.66,124.66,3.0,3.0,3.0,3.0,58.963333,58.963333,58.963333,58.963333,47.58,47.58,47.58,47.58,65.25,65.25,65.25,65.25,0,Lost_Cust,43,0,1.0,0.0
3,2020-06-22 15:32:31,6041620XXXX,fraud_Larkin_Ltd,kids_pets,87.74,0,9886 Anita Drive,Fort_Washakie,WY,82514,43.0048,-108.8964,1645,Information systems manager,d671d98fded8ca75d799959a444577a5,43.546064,-109.212939,0,Mary Diaz,"(43.0048, -108.8964)","(43.546064, -109.212939)",2020,6,26,22,15,32,0,34,65.393092,high,33_43,65.25,356.17,22.49,0.344674,89.0425,453.966667,58.169994,1.0,1.0,3.0,4.0,4.0,87.74,87.74,77.17,89.0425,89.0425,65.25,65.25,65.25,87.74,124.66,124.66,9.0,10.0,10.0,10.0,69.293333,73.988,73.988,73.988,5.5,5.5,5.5,5.5,171.63,171.63,171.63,171.63,0,Lost_Cust,43,0,1.0,0.0
4,2020-06-23 12:28:54,6041620XXXX,fraud_Leffler-Goldner,personal_care,148.02,0,9886 Anita Drive,Fort_Washakie,WY,82514,43.0048,-108.8964,1645,Information systems manager,86d346444b5b262159976aa4d8b64fbb,42.876538,-109.33322,0,Mary Diaz,"(43.0048, -108.8964)","(42.876538, -109.33322)",2020,6,26,23,12,28,1,34,38.39478,very_high,33_43,87.74,504.19,60.28,0.68703,100.838,1256.383333,75.021641,1.0,1.0,2.0,5.0,5.0,148.02,148.02,117.88,100.838,100.838,87.74,65.25,65.25,148.02,148.02,148.02,10.0,17.0,17.0,17.0,46.953,48.147647,48.147647,48.147647,4.7,4.7,4.7,4.7,148.02,182.61,182.61,182.61,0,Lost_Cust,43,0,1.0,0.0


In [62]:
# save the test_dashboard to csv file
test_dashboard.to_csv('../datasets/test_dashboard.csv')

In [63]:
print(test_dashboard['trans_datetime'].min())
print(test_dashboard['trans_datetime'].max())

2020-06-21 12:14:25
2020-12-31 23:59:34


## Tableau Dashboard: Credit Card Fraud Monitoring

I decided to build a dashboard on Tableau Public for monitoring Credit Card Transactions. The predicted outcome displayed on the dashboard from the test datasets. Senstive credit card information has been masked before publish to the Tableau Public.

For full view of published Dashboard can be accessed via [Tableau Public Website](https://public.tableau.com/app/profile/vincent.chua/viz/frauddetection_16470019928680/Dashboard1#1).

In [98]:
%%html
<div class='tableauPlaceholder' id='viz1647095680509' style='position: relative'><object class='tableauViz'  style='display:none;'><param name='host_url' value='https%3A%2F%2Fpublic.tableau.com%2F' /> <param name='embed_code_version' value='3' /> <param name='site_root' value='' /><param name='name' value='frauddetection_16470019928680&#47;Dashboard1' /><param name='tabs' value='no' /><param name='toolbar' value='yes' /><param name='animate_transition' value='yes' /><param name='display_static_image' value='yes' /><param name='display_spinner' value='yes' /><param name='display_overlay' value='yes' /><param name='display_count' value='yes' /><param name='language' value='en-US' /></object></div>                <script type='text/javascript'>                    var divElement = document.getElementById('viz1647095680509');                    var vizElement = divElement.getElementsByTagName('object')[0];                    if ( divElement.offsetWidth > 800 ) { vizElement.style.width='1200px';vizElement.style.height='827px';} else if ( divElement.offsetWidth > 500 ) { vizElement.style.width='1200px';vizElement.style.height='827px';} else { vizElement.style.width='100%';vizElement.style.height='1727px';}                     var scriptElement = document.createElement('script');                    scriptElement.src = 'https://public.tableau.com/javascripts/api/viz_v1.js';                    vizElement.parentNode.insertBefore(scriptElement, vizElement);                </script>

## Conclusions and Recommendations

### Findings

1) XGBoost is our final model that provide the highest recall score and f1-score on classified the credit card fraud transactions. It shows 94.5% in F1-score and 91.4% of Recall Score on our validation dataset. Although both f1-score and recall score decrease on our unseen test datasets, it still manage to maintain 91.6% f1-score and 90.7% recall score.

2) From EDA we found that the very_high transaction amount group has the less transaction numbers, however it have the highest fraud transaction rate than the other amount group. It is about 300 times higher than very_low transaction amount group. We also found out that the age above 73 have relatively higher fraud transaction rate as compared to age below 73, however the lowest fraud transaction rate happens in age group between 33 to 43.

3) State of AK has the highest fraud transaction rate out the rest of the state with credit card transactions 1000 and above. The lowest fraud transaction rate happens to be in state of ID. AK have about 8.5 times higher fraud transaction rate than ID.

4) From the RFM Analysis, we segmentize the customer into 5 segments which are Top_Cust, High_value_Cust, Medium_Value_cust, Low_Value_Cust and Lost_Cust. 14% of the credit cards fall under Top Cust and High Value Cust segments, while 26% of credit cards fall in the bottom segment - Lost Cust. Lost Cust also has the highest Fraud Transactions Rate (2%) among the customer segments, while the lowest Fraud Transactions Rate (0.26%) is under Top Cust segment. It means that the Lost Cust Segment have about 10 times higher chances to have fraud transaction than Top Cust Segment.

5) From our final model, we manage to identify the top predictors and features that important to detect credit card fraud transactions. The top 3 predictors are Average and Max Transaction Amount in the last 24 hours and grocery_pos Category. Besides the top 3 predictors, we also identified features that have strong effect on our target from SHAP Summary Chart. Transaction Amount, previous Transaction Amount, Average Transaction amount in last 5 minutes and 7 days are the features have positive impact on detecting whether or not the credit card transaction is fraud transaction.

### Recommendations

In this project, we manage to accomplish two goals that specified in our problem statement.

1) We tested our Fraud Detection Model on unseen test dataset with about 550k observations. The model achieved 99.94% of accuracy, 91.58% F1-Score, 90.7% Recall Score and Precision Score of 92.4%. This will improve the efficiency and effectiveness on fraud detection by the Risk and Compliance Team and reduce unnecessary false alarm triggered to credit card users due to Type I error. With above 90% recall score, it will also allow the bank save up huge losses from the fraud transactions.

2) The Top 3 Predictors that cause the model to detect fraud transactions are:
1. Average Transaction Amount in last 24 hours *(avg_last_24h)*
2. Maximum Transaction Amount in last 24 hours *(max_last_24h)*
3. Grocery Pos Category *(category_grocery_pos)*

The 4 features with strong effects and impact on target are:
1. Credit Card Transaction Amount *(amt)*
2. Previous/Last Credit Card Transaction Amount *(pre_amt)*
3. Average Transaction Amount in last 5 minutes *(avg_last_5T)*
4. Average Transaction Amount in last 7 days *(avg_last_7d)*

From the predictors and features above, it shows that the model is able to detect the fraud by the difference in spending behaviors and transactions amount. Which it make sense that the fraudster usually have high chance to have different spending behaviors as compared to the credit cardholder did.

3) This model have its limitation that it was build based on US States' data and it may not be able to generalized to unseen data outside the US states. Besides that, the model also detect the fraud heavily rely on the spending behaviors, it may have the limitation if the credit card has not been used at all by the original cardholder. It will not have the spending behavior record, and the model may not able to detect it accurately.

4) I have build the Credit Card Monitoring Dashboard via [Tableau Public](https://public.tableau.com/app/profile/vincent.chua/viz/frauddetection_16470019928680/Dashboard1#1), the Risk and Compliance Team can monitor the credit card transactions via different age groups, geographically, timeline trends, different categories and identify the predicted outcome with anomaly score shown on each individual credit card.

### Future Enhancements

- The fraud transactions may not only happens due to change in spending behavior at all time. Hence feature engineering is crucial in this project, besides what I have done on the feature engineering, I will also explore other possibilities to create new features demographically as well as geographically from the datasets, for example Jobs and Income per capita on different states.
- Besides using the traditional classificaiton algorithms for modeling, I will also like to explore on the Neural Networks deep learning methods to check if the outcome will be better off. My aim is to continue optimize the model to achieve target recall score of minimal 95%.
- Understand the credit card transactions are time sensitve, heavily depend on the changes in behaviors between cardholder and fraudster. I will also like to perform the time-dependent graph (network analysis) to identify and capture the potential anomaly and fraud pattern.
- Deploy the model for real-time alert and automized the necessary prevention measures for the cardholders.

### Citations

1. https://www.kaggle.com/kartik2112/fraud-detection
2. https://towardsdatascience.com/recency-frequency-monetary-model-with-python-and-how-sephora-uses-it-to-optimize-their-google-d6a0707c5f17
3. https://www.ravelin.com/insights/machine-learning-for-fraud-detection
4. https://opengovasia.com/the-importance-of-using-big-data-in-combating-money-laundering/
5. https://www.cylynx.io/blog/network-analytics-for-fraud-detection-in-banking-and-finance/
6. https://www.straitstimes.com/singapore/460-jump-in-unauthorised-online-banking-and-card-transactions-in-2020
7. https://www.channelnewsasia.com/singapore/credit-card-fraud-banks-divert-sms-otp-overseas-imda-mas-spf-2179541#:~:text=Davina%20Tham&text=SINGAPORE%3A%20%22Malicious%20actors%22%20overseas,to%20overseas%20mobile%20network%20systems
8. https://mothership.sg/2021/06/dbs-credit-card-fraud-bypass-otp-sms/#:~:text=Joshua%20Lee-,S'pore%20woman%20loses%20S%2410%2C000%20in%20DBS%20credit%20card,didn't%20receive%20any%20notification.