In [10]:
# %pip install xgboost
# %pip install lightgbm

In [11]:
import pandas as pd
import os
from tqdm import tqdm
import numpy as np
from xgboost import XGBClassifier
import xgboost as xgb
from lightgbm import LGBMClassifier
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score,roc_auc_score,f1_score,recall_score,precision_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn import preprocessing

In [12]:
data_root = '../data/'
# Read both raw CSV tables; the identity table is read first, then the transactions.
raw_tables = {
    name: pd.read_csv(f'{data_root}{name}.csv')
    for name in ('train_identity', 'train_transaction')
}
# Left join on TransactionID: every transaction row is kept, and identity columns
# are NaN for transactions without a matching identity row.
df = raw_tables['train_transaction'].merge(raw_tables['train_identity'], on='TransactionID', how='left')
# Drop the references to the raw tables so only the merged frame stays in memory.
del raw_tables

# Data preprocessing

In [13]:
# Drop columns whose proportion of NaN values exceeds the threshold set in the next cell

In [14]:
num_sample = len(df)
threshold = 0.9
# Share of missing entries per column, computed once for the whole frame
# (DataFrame.count() returns the number of non-NaN cells in each column).
nan_rate = (num_sample - df.count()) / num_sample
for col, rate in nan_rate.items():
    if rate > threshold:
        print(f'del {col} nan rate: {rate}')
        # Removed in place, so `df` itself loses the column.
        del df[col]

del dist2 nan rate: 0.9362837403054831
del D7 nan rate: 0.9340992989467267
del id_07 nan rate: 0.9912707013919464
del id_08 nan rate: 0.9912707013919464
del id_18 nan rate: 0.9236072069631185
del id_21 nan rate: 0.9912639279303688
del id_22 nan rate: 0.9912469942764249
del id_23 nan rate: 0.9912469942764249
del id_24 nan rate: 0.9919615944728554
del id_25 nan rate: 0.9913096487960172
del id_26 nan rate: 0.9912571544687913
del id_27 nan rate: 0.9912469942764249


In [15]:
# Replace every remaining NaN in the frame with 0.
# NOTE(review): this is applied to all columns, including the ones listed in
# `cate_cols` in the next cell, so a missing categorical value becomes the
# literal 0 and is later encoded as its own category — confirm this is intended.
df = df.fillna(0)

In [16]:
# Encode the discrete (categorical) features.
# For every column listed below two things are produced:
#   * `<col>_count`: frequency encoding (how often the row's value occurs in `df`)
#   * `<col>`: integer label encoding, overwriting the raw values
cate_cols = ['ProductCD', 'card4', 'card6', 'P_emaildomain', 'R_emaildomain', 'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'id_12', 'id_15', 'id_16', 'id_28', 'id_29', 'id_30', 'id_31', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38', 'DeviceType', 'DeviceInfo']
for f in tqdm(cate_cols):
    # The counts must be taken before the raw values are replaced by codes.
    df[f + '_count'] = df[f].map(df[f].value_counts())
    # pd.factorize numbers the values in order of first appearance, which is the
    # same numbering as zipping unique() with range(nunique()). Unlike that zip it
    # stays correct when a column contains NaN: unique() includes NaN but
    # nunique() does not, so the zip silently left the last category unmapped.
    # With factorize, NaN gets the sentinel code -1.
    df[f] = pd.factorize(df[f])[0]

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:02<00:00, 13.31it/s]


# Slice data sets

In [17]:
# Shuffle the rows with a fixed seed so that the 80/10/10 split, and therefore every
# metric reported below, is identical after "Restart Kernel -> Run All".
# Without a seed each run produced a different split.
SPLIT_SEED = 2019
df = df.sample(frac=1, random_state=SPLIT_SEED).reset_index(drop=True)
train_num = int(0.8*len(df))
valid_num = int(0.1*len(df))

# Consecutive, non-overlapping slices of the shuffled frame; the test split takes
# whatever is left after the train and validation rows.
train_df = df.iloc[:train_num].reset_index(drop=True)
valid_df = df.iloc[train_num:train_num+valid_num].reset_index(drop=True)
test_df = df.iloc[train_num+valid_num:].reset_index(drop=True)

# The three splits must partition the data exactly.
assert len(train_df) + len(valid_df) + len(test_df) == len(df)

# Training model

In [18]:
# Scratch check of what classification_report prints, using a tiny hand-made
# example (6 labels, 2 classes). It is independent of the fraud data.
# The import repeats the one already made in the import cell at the top.
from sklearn.metrics import classification_report
y_true = [0,1,0,0,0,1]
y_pred = [0,0,0,1,1,0]
# `s` is the formatted text report (per-class precision / recall / f1 / support).
s=classification_report(y_true, y_pred)
print(s)

              precision    recall  f1-score   support

           0       0.50      0.50      0.50         4
           1       0.00      0.00      0.00         2

    accuracy                           0.33         6
   macro avg       0.25      0.25      0.25         6
weighted avg       0.33      0.33      0.33         6



In [19]:
def train_xgb(train_df, valid_df, test_df):
    """Fit an XGBoost binary classifier and score it on the three splits.

    Every column except 'isFraud' is used as a feature. The booster is trained
    for at most 1000 rounds, watching the train and validation sets, with
    early stopping after 100 rounds. Predicted probabilities are turned into
    hard labels with a 0.5 cut-off.

    Args:
        train_df: frame used for fitting; must contain 'isFraud'.
        valid_df: frame used as the early-stopping watch set and for scoring.
        test_df: frame used only for scoring.

    Returns:
        A 12-element list ordered metric-major, split-minor:
        accuracy (train, valid, test), f1 (train, valid, test),
        recall (train, valid, test), precision (train, valid, test).

    NOTE(review): 'silent', `ntree_limit` and `best_ntree_limit` belong to older
    XGBoost APIs (newer releases use 'verbosity' and `iteration_range`) —
    confirm against the installed version before upgrading.
    """
    target = 'isFraud'
    predictors = [name for name in train_df.columns if name != target]

    booster_params = {
        'booster': 'gbtree',
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'gamma': 0.1,
        'max_depth': 6,
        'alpha': 0,
        'lambda': 0,
        'subsample': 0.7,
        'colsample_bytree': 0.5,
        'min_child_weight': 3,
        'silent': 0,
        'eta': 0.03,
        'nthread': -1,
        'seed': 2019,
    }

    fit_matrix = xgb.DMatrix(train_df[predictors], label=train_df[target])
    watch_matrix = xgb.DMatrix(valid_df[predictors], label=valid_df[target])
    booster = xgb.train(
        booster_params,
        fit_matrix,
        1000,
        [(fit_matrix, 'train'), (watch_matrix, 'valid')],
        verbose_eval=50,
        early_stopping_rounds=100,
    )

    # Predict each split once (limited to the best early-stopping iteration) and
    # keep the true labels next to the thresholded predictions.
    scored = []
    for part in (train_df, valid_df, test_df):
        proba = booster.predict(xgb.DMatrix(part[predictors]), ntree_limit=booster.best_ntree_limit)
        scored.append((part[target], proba > 0.5))

    # Outer loop over metrics, inner loop over splits -> the documented order.
    return [
        metric(truth, guess)
        for metric in (accuracy_score, f1_score, recall_score, precision_score)
        for truth, guess in scored
    ]

In [20]:
def train_lgb(train_df, valid_df, test_df):
    """Fit a LightGBM binary classifier and score it on the three splits.

    Every column except 'isFraud' is used as a feature. The booster is trained
    for at most 1000 rounds with the train and validation sets as
    `valid_sets`, with early stopping after 60 rounds. Predicted probabilities
    are turned into hard labels with a 0.5 cut-off.

    Args:
        train_df: frame used for fitting; must contain 'isFraud'.
        valid_df: frame used as the early-stopping watch set and for scoring.
        test_df: frame used only for scoring.

    Returns:
        A 12-element list ordered metric-major, split-minor:
        accuracy (train, valid, test), f1 (train, valid, test),
        recall (train, valid, test), precision (train, valid, test).

    NOTE(review): passing `verbose_eval` / `early_stopping_rounds` directly to
    `lgb.train` is the older calling convention (newer LightGBM releases use
    callbacks) — confirm against the installed version before upgrading.
    """
    target = 'isFraud'
    predictors = [name for name in train_df.columns if name != target]

    booster_params = {
        'num_leaves': 60,
        'min_data_in_leaf': 30,
        'objective': 'binary',
        'max_depth': -1,
        'learning_rate': 0.06,
        "min_sum_hessian_in_leaf": 6,
        "boosting": "gbdt",
        "feature_fraction": 0.9,
        "bagging_freq": 1,
        "bagging_fraction": 0.8,
        "bagging_seed": 11,
        "lambda_l1": 0.1,
        "verbosity": -1,
        "nthread": -1,
        'metric': {'binary_logloss', 'auc'},
        "random_state": 2019,
    }

    fit_set = lgb.Dataset(train_df[predictors], label=train_df[target])
    watch_set = lgb.Dataset(valid_df[predictors], label=valid_df[target])
    booster = lgb.train(
        booster_params,
        fit_set,
        1000,
        valid_sets=[fit_set, watch_set],
        verbose_eval=20,
        early_stopping_rounds=60,
    )

    # Predict each split once (limited to the best early-stopping iteration) and
    # keep the true labels next to the thresholded predictions.
    scored = []
    for part in (train_df, valid_df, test_df):
        proba = booster.predict(part[predictors], num_iteration=booster.best_iteration)
        scored.append((part[target], proba > 0.5))

    # Outer loop over metrics, inner loop over splits -> the documented order.
    return [
        metric(truth, guess)
        for metric in (accuracy_score, f1_score, recall_score, precision_score)
        for truth, guess in scored
    ]

In [21]:
def train_rf(train_df,valid_df,test_df):
    """Fit a random forest classifier and score it on the three splits.

    Every column except 'isFraud' is used as a feature. The forest has 100
    trees of depth at most 6 and is fitted on `train_df` only; the hard class
    predictions from `predict` are scored directly.

    Args:
        train_df: frame used for fitting; must contain 'isFraud'.
        valid_df: frame used only for scoring.
        test_df: frame used only for scoring.

    Returns:
        A 12-element list ordered metric-major, split-minor:
        accuracy (train, valid, test), f1 (train, valid, test),
        recall (train, valid, test), precision (train, valid, test).
    """
    target = 'isFraud'
    predictors = [name for name in train_df.columns if name != target]

    forest = RandomForestClassifier(n_estimators=100, max_depth=6, random_state=11, verbose=1, n_jobs=-1)
    forest.fit(train_df[predictors], train_df[target])

    # One prediction pass per split, paired with that split's true labels.
    scored = [
        (part[target], forest.predict(part[predictors]))
        for part in (train_df, valid_df, test_df)
    ]

    # Outer loop over metrics, inner loop over splits -> the documented order.
    return [
        metric(truth, guess)
        for metric in (accuracy_score, f1_score, recall_score, precision_score)
        for truth, guess in scored
    ]

In [22]:
def train_nn(train_df,valid_df,test_df):
    """Fit an MLP classifier on standardized features and score the three splits.

    Every column except 'isFraud' is used as a feature. A StandardScaler is
    fitted on the training features only and then applied to all three splits.
    The scaled values are kept in local arrays: the frames passed in are NOT
    modified. (The earlier version wrote the scaled values back into the
    caller's frames, so any model trained afterwards on the same frames saw
    standardized data, and calling this function twice scaled the data twice.)

    Args:
        train_df: frame used for fitting the scaler and the network; must
            contain 'isFraud'.
        valid_df: frame used only for scoring.
        test_df: frame used only for scoring.

    Returns:
        A 12-element list ordered metric-major, split-minor:
        accuracy (train, valid, test), f1 (train, valid, test),
        recall (train, valid, test), precision (train, valid, test).
    """
    label_col = 'isFraud'
    feature = [x for x in train_df.columns if x != label_col]

    clf = MLPClassifier(solver = 'adam',verbose=1, activation = 'logistic', max_iter = 100,learning_rate_init=0.01,
                        hidden_layer_sizes = (128,32),random_state = 4399, early_stopping=True)

    # Statistics come from the training split only, so nothing leaks from valid/test.
    scaler = preprocessing.StandardScaler().fit(train_df[feature])

    # (scaled feature array, true labels) per split, in train/valid/test order.
    splits = [
        (scaler.transform(part[feature]), part[label_col])
        for part in (train_df, valid_df, test_df)
    ]

    train_X, train_y = splits[0]
    clf.fit(train_X, train_y)

    ##Prediction
    scored = [(truth, clf.predict(scaled)) for scaled, truth in splits]

    # Outer loop over metrics, inner loop over splits -> the documented order.
    return [
        metric(truth, guess)
        for metric in (accuracy_score, f1_score, recall_score, precision_score)
        for truth, guess in scored
    ]

In [12]:
def train_all_model(train_df, valid_df, test_df):
    """Train the four models on the same splits and print a metric summary.

    Models are trained in the order LGB, RF, NN, XGB; the summary is printed
    in the order LGB, XGB, RF, NN. Each model function returns a 12-element
    list laid out as accuracy, f1, recall, precision, each for
    train / valid / test.

    Args:
        train_df: training split, passed unchanged to every model function.
        valid_df: validation split, passed unchanged to every model function.
        test_df: test split, passed unchanged to every model function.

    Returns:
        Tuple `(lgb_acc, xgb_acc, rf_acc, nn_acc)` of the four metric lists.
    """
    print('[INFO] Train LGB')
    lgb_acc = train_lgb(train_df, valid_df, test_df)
    print('[INFO] Train RF')
    rf_acc = train_rf(train_df, valid_df, test_df)
    print('[INFO] Train NN')
    nn_acc = train_nn(train_df, valid_df, test_df)
    print('[INFO] Train XGB')
    xgb_acc = train_xgb(train_df, valid_df, test_df)

    banner = '*'*100
    metric_names = ('acc', 'f1', 'recall', 'precision')
    for tag, scores in (('LGB', lgb_acc), ('XGB', xgb_acc), ('RF', rf_acc), ('NN', nn_acc)):
        print(banner)
        for pos, metric in enumerate(metric_names):
            # Each metric occupies three consecutive slots: train, valid, test.
            start = 3 * pos
            print('{0}: Train {1}:{2:.4f} Valid {1}:{3:.4f} Test {1}:{4:.4f}'.format(
                tag, metric, scores[start], scores[start + 1], scores[start + 2]))
        print(banner)

    return lgb_acc,xgb_acc,rf_acc,nn_acc

In [13]:
# Train and score all four models on the shared train/valid/test split.
# Each returned value is that model's 12-element metric list
# (accuracy, f1, recall, precision — each for train, valid, test).
lgb_acc,xgb_acc,rf_acc,nn_acc = train_all_model(train_df, valid_df, test_df)

[INFO] Train LGB
Training until validation scores don't improve for 60 rounds
[20]	training's binary_logloss: 0.0958519	training's auc: 0.892941	valid_1's binary_logloss: 0.0979825	valid_1's auc: 0.885291
[40]	training's binary_logloss: 0.0841684	training's auc: 0.91331	valid_1's binary_logloss: 0.0878075	valid_1's auc: 0.901737
[60]	training's binary_logloss: 0.0776605	training's auc: 0.926343	valid_1's binary_logloss: 0.0822016	valid_1's auc: 0.913117
[80]	training's binary_logloss: 0.0731154	training's auc: 0.935238	valid_1's binary_logloss: 0.078638	valid_1's auc: 0.919881
[100]	training's binary_logloss: 0.0694938	training's auc: 0.942049	valid_1's binary_logloss: 0.076128	valid_1's auc: 0.924671
[120]	training's binary_logloss: 0.0665733	training's auc: 0.94745	valid_1's binary_logloss: 0.0739167	valid_1's auc: 0.929501
[140]	training's binary_logloss: 0.0641588	training's auc: 0.951405	valid_1's binary_logloss: 0.072197	valid_1's auc: 0.932484
[160]	training's binary_logloss: 0.

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   19.6s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.4s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished


[INFO] Train NN
Iteration 1, loss = 0.11207488
Validation score: 0.971340
Iteration 2, loss = 0.10737790
Validation score: 0.971171
Iteration 3, loss = 0.10634000
Validation score: 0.971531
Iteration 4, loss = 0.10627922
Validation score: 0.971361
Iteration 5, loss = 0.10565607
Validation score: 0.972039
Iteration 6, loss = 0.10547238
Validation score: 0.973436
Iteration 7, loss = 0.10485044
Validation score: 0.973161
Iteration 8, loss = 0.10483017
Validation score: 0.973118
Iteration 9, loss = 0.10457459
Validation score: 0.972631
Iteration 10, loss = 0.10427984
Validation score: 0.973034
Iteration 11, loss = 0.10445549
Validation score: 0.971488
Iteration 12, loss = 0.10428688
Validation score: 0.973182
Iteration 13, loss = 0.10380164
Validation score: 0.973690
Iteration 14, loss = 0.10324046
Validation score: 0.972991
Iteration 15, loss = 0.10302937
Validation score: 0.973563
Iteration 16, loss = 0.10353329
Validation score: 0.973563
Iteration 17, loss = 0.10337950
Validation score:



****************************************************************************************************
LGB: Train acc:0.9927 Valid acc:0.9860 Test acc:0.9862
LGB: Train f1:0.8841 Valid f1:0.7575 Test f1:0.7615
LGB: Train recall:0.7945 Valid recall:0.6252 Test recall:0.6332
LGB: Train precision:0.9964 Valid precision:0.9606 Test precision:0.9551
****************************************************************************************************
****************************************************************************************************
XGB: Train acc:0.9829 Valid acc:0.9808 Test acc:0.9815
XGB: Train f1:0.6856 Valid f1:0.6375 Test f1:0.6516
XGB: Train recall:0.5321 Valid recall:0.4826 Test recall:0.4985
XGB: Train precision:0.9633 Valid precision:0.9389 Test precision:0.9402
****************************************************************************************************
****************************************************************************************************
RF: Tra

# Comparing Results

****************************************************************************************************
LGB: Train acc:0.9927 Valid acc:0.9858 Test acc:0.9857
LGB: Train f1:0.8834 Valid f1:0.7616 Test f1:0.7585
LGB: Train recall:0.7937 Valid recall:0.6339 Test recall:0.6295
LGB: Train precision:0.9958 Valid precision:0.9536 Test precision:0.9540
****************************************************************************************************
****************************************************************************************************
XGB: Train acc:0.9831 Valid acc:0.9804 Test acc:0.9803
XGB: Train f1:0.6871 Valid f1:0.6368 Test f1:0.6352
XGB: Train recall:0.5335 Valid recall:0.4824 Test recall:0.4796
XGB: Train precision:0.9649 Valid precision:0.9364 Test precision:0.9405
****************************************************************************************************
****************************************************************************************************
RF: Train acc:0.9716 Valid acc:0.9706 Test acc:0.9710
RF: Train f1:0.3361 Valid f1:0.3277 Test f1:0.3362
RF: Train recall:0.2061 Valid recall:0.2009 Test recall:0.2059
RF: Train precision:0.9101 Valid precision:0.8887 Test precision:0.9156
****************************************************************************************************
****************************************************************************************************
NN: Train acc:0.9751 Valid acc:0.9735 Test acc:0.9744
NN: Train f1:0.5149 Valid f1:0.4947 Test f1:0.5105
NN: Train recall:0.3788 Valid recall:0.3632 Test recall:0.3738
NN: Train precision:0.8036 Valid precision:0.7751 Test precision:0.8049
****************************************************************************************************