In [60]:
import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, precision_recall_curve
import statsmodels.api as sm
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from bayes_opt import BayesianOptimization

<h2> Loading datasets

In [2]:
train_identity_df = pd.read_csv('./fraud_detection/train_identity.csv')
train_trasaction_df = pd.read_csv('./fraud_detection/train_transaction.csv')


In [3]:
train_identity_df

Unnamed: 0,TransactionID,id_01,id_02,id_03,id_04,id_05,id_06,id_07,id_08,id_09,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987004,0.0,70787.0,,,,,,,,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M
1,2987008,-5.0,98945.0,,,0.0,-5.0,,,,...,mobile safari 11.0,32.0,1334x750,match_status:1,T,F,F,T,mobile,iOS Device
2,2987010,-5.0,191631.0,0.0,0.0,0.0,0.0,,,0.0,...,chrome 62.0,,,,F,F,T,T,desktop,Windows
3,2987011,-5.0,221832.0,,,0.0,-6.0,,,,...,chrome 62.0,,,,F,F,T,T,desktop,
4,2987016,0.0,7460.0,0.0,0.0,1.0,0.0,,,0.0,...,chrome 62.0,24.0,1280x800,match_status:2,T,F,T,T,desktop,MacOS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
144228,3577521,-15.0,145955.0,0.0,0.0,0.0,0.0,,,0.0,...,chrome 66.0 for android,,,,F,F,T,F,mobile,F3111 Build/33.3.A.1.97
144229,3577526,-5.0,172059.0,,,1.0,-5.0,,,,...,chrome 55.0 for android,32.0,855x480,match_status:2,T,F,T,F,mobile,A574BL Build/NMF26F
144230,3577529,-20.0,632381.0,,,-1.0,-36.0,,,,...,chrome 65.0 for android,,,,F,F,T,F,mobile,Moto E (4) Plus Build/NMA26.42-152
144231,3577531,-5.0,55528.0,0.0,0.0,0.0,-7.0,,,0.0,...,chrome 66.0,24.0,2560x1600,match_status:2,T,F,T,F,desktop,MacOS


In [4]:
train_trasaction_df

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,2987000,0,86400,68.50,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.00,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.00,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.00,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.00,H,4497,514.0,150.0,mastercard,102.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
590535,3577535,0,15811047,49.00,W,6550,,150.0,visa,226.0,...,,,,,,,,,,
590536,3577536,0,15811049,39.50,W,10444,225.0,150.0,mastercard,224.0,...,,,,,,,,,,
590537,3577537,0,15811079,30.95,W,12037,595.0,150.0,mastercard,224.0,...,,,,,,,,,,
590538,3577538,0,15811088,117.00,W,7826,481.0,150.0,mastercard,224.0,...,,,,,,,,,,


Merging transaction df with identity df to generate the complete dataset

In [5]:
dataset_df = train_trasaction_df.merge(train_identity_df, how='left', on='TransactionID')
dataset_df

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987000,0,86400,68.50,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.00,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.00,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.00,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.00,H,4497,514.0,150.0,mastercard,102.0,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
590535,3577535,0,15811047,49.00,W,6550,,150.0,visa,226.0,...,,,,,,,,,,
590536,3577536,0,15811049,39.50,W,10444,225.0,150.0,mastercard,224.0,...,,,,,,,,,,
590537,3577537,0,15811079,30.95,W,12037,595.0,150.0,mastercard,224.0,...,,,,,,,,,,
590538,3577538,0,15811088,117.00,W,7826,481.0,150.0,mastercard,224.0,...,,,,,,,,,,


<h2> Dataset investigation

In this section, features types available are investigated. Furthermore, the columns in the datasets will be grouped to categorical and numerical features. This is because the categorical features needed to be encoded prior to used by the model

In [9]:
column_to_exclude = ['TransactionID', 'isFraud', 'TransactionDT']

In [10]:
column_numeric = []
column_object = []

In [11]:
col_type_arr = []
for col_name, col_type in zip(dataset_df.columns, dataset_df.dtypes):
    col_type_arr.append(str(col_type))
    if col_name not in column_to_exclude:
        if str(col_type) == "float64" or str(col_type) == "int64":
            column_numeric.append(col_name)
        else:
            column_object.append(col_name)
        print(col_name, col_type )
    else:
        print(col_name)

TransactionID
isFraud
TransactionDT
TransactionAmt float64
ProductCD object
card1 int64
card2 float64
card3 float64
card4 object
card5 float64
card6 object
addr1 float64
addr2 float64
dist1 float64
dist2 float64
P_emaildomain object
R_emaildomain object
C1 float64
C2 float64
C3 float64
C4 float64
C5 float64
C6 float64
C7 float64
C8 float64
C9 float64
C10 float64
C11 float64
C12 float64
C13 float64
C14 float64
D1 float64
D2 float64
D3 float64
D4 float64
D5 float64
D6 float64
D7 float64
D8 float64
D9 float64
D10 float64
D11 float64
D12 float64
D13 float64
D14 float64
D15 float64
M1 object
M2 object
M3 object
M4 object
M5 object
M6 object
M7 object
M8 object
M9 object
V1 float64
V2 float64
V3 float64
V4 float64
V5 float64
V6 float64
V7 float64
V8 float64
V9 float64
V10 float64
V11 float64
V12 float64
V13 float64
V14 float64
V15 float64
V16 float64
V17 float64
V18 float64
V19 float64
V20 float64
V21 float64
V22 float64
V23 float64
V24 float64
V25 float64
V26 float64
V27 float64
V28 float64

<H2> Splitting data to training set, validation set, and test set

In [14]:
train_df, test_df = train_test_split(dataset_df[column_numeric+column_object+['isFraud']], 
                                                    test_size=0.2, stratify = dataset_df['isFraud'], random_state=1)

In [15]:
train_df, val_df = train_test_split(train_df[column_numeric+column_object+['isFraud']], 
                                                    test_size=0.2, stratify = train_df['isFraud'], random_state=1)

In [16]:
train_df[['isFraud']].value_counts()

isFraud
0          364721
1           13224
Name: count, dtype: int64

<H2>Encoding categorical features

In [17]:
enc = preprocessing.OrdinalEncoder()

In [18]:
enc.fit(dataset_df[column_object])

In [20]:
train_col_categorical = enc.transform(train_df[column_object])
train_preprocessed = train_df[column_numeric+['isFraud']].copy()
train_preprocessed[column_object] =  train_col_categorical
#pd.concat([train_df[column_numeric+['isFraud']], pd.DataFrame(train_col_categorical, columns = column_object)], axis=0)

In [53]:
train_preprocessed.to_parquet('train_preprocess.parquet')

In [23]:
val_col_categorical = enc.transform(val_df[column_object])
val_preprocessed = val_df[column_numeric+['isFraud']].copy()
val_preprocessed[column_object] =  val_col_categorical

In [54]:
val_preprocessed.to_parquet('val_preprocess.parquet')

In [46]:
test_col_categorical = enc.transform(test_df[column_object])
test_preprocessed = test_df[column_numeric+['isFraud']].copy()
test_preprocessed[column_object] =  test_col_categorical

In [55]:
test_preprocessed.to_parquet('test_preprocess.parquet')

In [24]:
train_preprocessed

Unnamed: 0,TransactionAmt,card1,card2,card3,card5,addr1,addr2,dist1,dist2,C1,...,id_30,id_31,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
293545,17.643,3154,408.0,185.0,224.0,,,,,1.0,...,,,,,,,,,,
137643,50.000,4030,174.0,150.0,226.0,126.0,87.0,,19.0,1.0,...,73.0,100.0,149.0,2.0,1.0,0.0,0.0,1.0,1.0,1727.0
553915,59.000,9175,111.0,150.0,226.0,126.0,87.0,0.0,,84.0,...,,,,,,,,,,
473207,77.000,12116,404.0,150.0,102.0,205.0,87.0,,,1.0,...,,,,,,,,,,
309380,32.000,3281,555.0,150.0,226.0,251.0,87.0,,,1.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
342368,117.000,8394,490.0,150.0,226.0,126.0,87.0,,,2.0,...,,,,,,,,,,
392173,34.983,14276,177.0,185.0,137.0,,,,0.0,1.0,...,,50.0,,,0.0,0.0,1.0,0.0,0.0,
553205,87.000,17188,321.0,150.0,226.0,122.0,87.0,0.0,,5.0,...,,,,,,,,,,
68106,34.000,18215,111.0,150.0,226.0,337.0,87.0,,,4.0,...,,,,,,,,,,


<h2>Feature Selection

Using two tailed hypothesis testing (z-test), we aim to filter out features that does not bring any information to the model

In [26]:
col_numeric_feat = []
for col in column_numeric:
    _, p_val =sm.stats.ztest(train_preprocessed[col].fillna(0), x2=train_preprocessed['isFraud'])
    if p_val < 0.05:
        col_numeric_feat.append(col)

In [27]:
col_obj_feat = []
for col in column_object:
    _, p_val =sm.stats.ztest(train_preprocessed[col].fillna(-1), x2=train_preprocessed['isFraud'])
    if p_val < 0.05:
        col_obj_feat.append(col)

In [110]:
print(f'number of numerical features used {len(col_numeric_feat)} out of {len(column_numeric)}')

number of numerical features used 399 out of 400


In [111]:
print(f'number of categorical features used {len(col_obj_feat)} out of {len(column_object)}')

number of categorical features used 31 out of 31


<H2> Preprocess dataset

apply feature processing for null values in the datasets

In [32]:
train_preprocessed[col_numeric_feat] = train_preprocessed[col_numeric_feat].fillna(0)

In [33]:
val_preprocessed[col_numeric_feat] = val_preprocessed[col_numeric_feat].fillna(0)

In [47]:
test_preprocessed[col_numeric_feat] = test_preprocessed[col_numeric_feat].fillna(0)

<h2>Build Baseline model

In this section, a baseline model will be built using xgboost to get a baseline performance using the dataset

In [34]:
clf = xgb.XGBClassifier(n_estimators=50, max_depth=2, learning_rate=1, objective='binary:logistic')

In [35]:
clf.fit(train_preprocessed[col_numeric_feat + col_obj_feat].fillna(-1), train_preprocessed['isFraud'])

In [36]:
prob = clf.predict_proba(val_preprocessed[col_numeric_feat + col_obj_feat].fillna(-1))

auc roc of baseline model

In [37]:
roc_auc_score(val_preprocessed['isFraud'], prob[:,1])

0.8848072907882915

<H2>Build a challenger model

As a point of comparison, a challenger model using random forest is built

In [38]:
clf_forest = RandomForestClassifier(n_estimators=200, max_depth=6, random_state=0)

In [39]:
clf_forest.fit(train_preprocessed[col_numeric_feat + col_obj_feat].fillna(-1), train_preprocessed['isFraud'])

auc roc of challenger model

In [40]:
prob_forest = clf_forest.predict_proba(val_preprocessed[col_numeric_feat + col_obj_feat].fillna(-1))
roc_auc_score(val_preprocessed['isFraud'], prob_forest[:,1])

0.8496829411843816

<H2> Champion model

As it can be seen, the baseline model auc roc is higher. Hence, the final model will be built using xgboost.

<H2> Hyperparameter tuning

Using bayes opt, which is a more space efficient way of hyperparameter tuning than using grid search, the parameters of the champion model will be further optimised to improve performance further.

In [41]:
def xgb_cl_bo(max_depth, n_estimators):
    params_xgb = {}
    params_xgb['max_depth'] = round(max_depth)
    params_xgb['n_estimators'] = round(n_estimators)
    clf = xgb.XGBClassifier(random_state=123, **params_xgb)
    clf.fit(train_preprocessed[col_numeric_feat + col_obj_feat].fillna(-1), train_preprocessed['isFraud'])
    scores = roc_auc_score(val_preprocessed['isFraud'], clf.predict_proba(val_preprocessed[col_numeric_feat + col_obj_feat].fillna(-1))[:,1])
    return scores
params_xgb ={
    'max_depth':(2, 4),
    'n_estimators':(40, 100),
}
xgb_bo = BayesianOptimization(xgb_cl_bo, params_xgb, random_state=111)
xgb_bo.maximize(init_points=2, n_iter=4)

|   iter    |  target   | max_depth | n_esti... |
-------------------------------------------------
| [0m1        [0m | [0m0.8921   [0m | [0m3.224    [0m | [0m50.14    [0m |
| [95m2        [0m | [95m0.9018   [0m | [95m2.872    [0m | [95m86.16    [0m |
| [95m3        [0m | [95m0.913    [0m | [95m3.96     [0m | [95m85.42    [0m |
| [0m4        [0m | [0m0.913    [0m | [0m3.907    [0m | [0m85.49    [0m |
| [0m5        [0m | [0m0.901    [0m | [0m3.316    [0m | [0m83.2     [0m |
| [0m6        [0m | [0m0.8847   [0m | [0m2.012    [0m | [0m66.92    [0m |


In [42]:
xgb_bo.max

{'target': 0.9130201814406987,
 'params': {'max_depth': 3.9599840823449632,
  'n_estimators': 85.41966506748345}}

<H2>Building model using best parameter

In [43]:
params_xgb = {}
params_xgb['max_depth'] = round(xgb_bo.max['params']['max_depth'])
params_xgb['n_estimators'] = round(xgb_bo.max['params']['n_estimators'])
clf = xgb.XGBClassifier(random_state=123, **params_xgb)

In [44]:
clf.fit(train_preprocessed[col_numeric_feat + col_obj_feat].fillna(-1), train_preprocessed['isFraud'])

In [48]:
scores = roc_auc_score(test_preprocessed['isFraud'], clf.predict_proba(test_preprocessed[col_numeric_feat + col_obj_feat].fillna(-1))[:,1])

auc roc of the best performing model

In [49]:
scores

0.9092278822802701

In [51]:
clf.save_model('./xgb_best_model.json')

<h2>Evaluating performance

In [56]:
predict_label = clf.predict(test_preprocessed[col_numeric_feat + col_obj_feat].fillna(-1))

In [67]:
predict_prob = clf.predict_proba(test_preprocessed[col_numeric_feat + col_obj_feat].fillna(-1))

In [98]:
def performance_evaluation(label, predicted_label):
    tn, fp, fn, tp = confusion_matrix(label, predicted_label).ravel()
    precision = tp/(tp+fp)
    recall = tp/(tp+fn)
    f1 = 2*precision*recall/(precision+recall)
    percentage_fp = fp/len(label)
    return {
        'fp':fp,
        'tp':tp,
        'prop_fp':percentage_fp,
        'precision':precision,
        'recall':recall,
        'f1':f1
    }

In [99]:
performance_evaluation(test_preprocessed['isFraud'], predict_label)

{'fp': 198,
 'tp': 1652,
 'prop_fp': 0.0016764317404409524,
 'precision': 0.892972972972973,
 'recall': 0.3997096540043552,
 'f1': 0.5522313220792245}

<H2> Getting performance at different threholds

As this is a fraud detection scenario, it is important to get higher recall to ensure that more users are protected. This can be achieved by setting a lower threshold than the default prediction threshold of 0.5. In this section, the recall is aimed to be increased to 70% by lowering the threshold.

In [71]:
 precision, recall, thresholds =precision_recall_curve(test_preprocessed['isFraud'], predict_prob[:,1])

In [76]:
pr_rc_df =pd.DataFrame.from_dict({'precision':precision[1:], 'recall': recall[1:], 'thresholds':thresholds})

In [89]:
pr_rc_df = pr_rc_df.sort_values(by=['recall','thresholds'], ascending=True).reset_index()

In [90]:
pr_rc_df

Unnamed: 0,level_0,index,precision,recall,thresholds
0,0,112611,1.000000,0.000000,0.999814
1,1,112610,1.000000,0.000242,0.999661
2,2,112609,1.000000,0.000484,0.999293
3,3,112608,1.000000,0.000726,0.999290
4,4,112607,1.000000,0.000968,0.999225
...,...,...,...,...,...
112607,112567,44,0.035044,1.000000,0.000439
112608,112566,45,0.035046,1.000000,0.000440
112609,112565,46,0.035047,1.000000,0.000441
112610,112564,47,0.035047,1.000000,0.000441


In [101]:
for idx,row in pr_rc_df.iterrows():
    if row['recall'] > 0.7:
        print(row)
        break

level_0         8446.000000
index         104165.000000
precision          0.339154
recall             0.700218
thresholds         0.075218
Name: 8441, dtype: float64


In [105]:
predict_label_custom_threshold = predict_prob[:,1]>0.075218

In [106]:
predict_label_custom_threshold = predict_label_custom_threshold.astype(int)

In [107]:
performance_evaluation(test_preprocessed['isFraud'], predict_label_custom_threshold)

{'fp': 5639,
 'tp': 2894,
 'prop_fp': 0.04774443729467945,
 'precision': 0.3391538731981718,
 'recall': 0.7002177594967336,
 'f1': 0.4569714195483973}

Even at a lower threshold (at recall 70%), the proportion of false positive (people who are negatively impacted eventhough the are not comitting any fraud) is still low. Hence, a lower threshold of ~0.0752 should be used to reduce the fraud risk in the platform