In [5]:
import xgboost as xgb
import numpy as np
import os 
from pathlib import Path
import pickle 
import pandas as pd
from sklearn.metrics import roc_auc_score
import datetime
# Project root: two directory levels above the notebook's working directory.
WORK_PATH = str(Path(os.path.abspath('')).parent.parent)

# NOTE(review): unpickling can execute arbitrary code, so only load this file
# when it was produced by a trusted step of this project.
# The context manager closes the file handle as soon as the load finishes
# (the bare `open(...)` used previously left it open).
with open(WORK_PATH + "/results/features_category.pkl", "rb") as feat_file:
    feat_dict = pickle.load(feat_file)

# The pickled dict stores the column names grouped by feature type.
numeric_cols = feat_dict['numeric_cols']
cat_cols = feat_dict['cat_cols']

print(cat_cols)
print(numeric_cols)


['merchantName', 'posEntryMode', 'posConditionCode', 'merchantCategoryCode']
['accountNumber', 'customerId', 'creditLimit', 'availableMoney', 'transactionAmount', 'cardCVV', 'enteredCVV', 'cardLast4Digits', 'currentBalance', 'cardPresent', 'isFraud']


In [6]:
# Read the processed dataset, from which the unnecessary columns were removed.
# Per the original notes it keeps every row of the raw data and only has
# fewer columns.
processed_csv = WORK_PATH + "/data/processed_data.csv"
pd00 = pd.read_csv(processed_csv)
print(pd00.shape)

(786363, 26)


In [7]:
## Extract date info
def get_week_in_year(date):
    """
    Map a 'YYYY-MM-DD' date string to the week of its calendar year.

    The value is the ISO-8601 week number, adjusted so that it never wraps
    around inside a single calendar year:

    * early-January days that ISO assigns to the last week of the previous
      year (e.g. '2016-01-01', which is ISO week 53 of 2015) return 0;
    * late-December days that ISO assigns to week 1 of the next year
      (e.g. '2018-12-31') return one more than the week of the date seven
      days earlier;
    * every other day returns its ISO week number.

    Parameters
    ----------
    date : str
        Calendar date formatted as 'YYYY-MM-DD'.

    Returns
    -------
    int
        Week index between 0 and 53, non-decreasing within a calendar year.

    Raises
    ------
    ValueError
        If `date` does not split into three '-'-separated integer fields
        or does not describe a valid calendar date.
    """
    y, m, d = date.split("-")
    year = int(y)
    day = datetime.date(year, int(m), int(d))
    iso_year, iso_week = day.isocalendar()[:2]
    if iso_year < year:
        # The day belongs to the trailing ISO week (52 or 53) of last year.
        return 0
    if iso_year > year:
        # The day belongs to ISO week 1 of next year: continue this year's
        # numbering instead of jumping back to 1.
        return (day - datetime.timedelta(days=7)).isocalendar()[1] + 1
    return iso_week
        
# Sanity-check the week helper on the first day, the last day and one
# early-January day of 2016.
for sample_date in ('2016-01-01', '2016-12-31', '2016-01-07'):
    print(get_week_in_year(sample_date))

# The date is the part of the timestamp string that precedes the "T".
date_part = pd00['transactionDateTime'].map(lambda stamp: stamp.split("T")[0])
pd00['date'] = date_part.astype(str)
# Week index of every transaction.
pd00['week_in_year'] = pd00['date'].map(get_week_in_year)

0
52
1


### Part I. Construct basic features from the current transaction

In [8]:
# One-hot encode the two categorical features merchantCategoryCode and
# posEntryMode. Only the 15 most frequent merchant category codes keep
# their own label; all remaining codes are pooled into 'other'.
freq_items = pd00['merchantCategoryCode'].value_counts().index[:15].tolist()


def relabel(x):
    """Return `x` when it is one of the frequent codes, otherwise 'other'."""
    return x if x in freq_items else 'other'


pd00['merchantCategoryCode'] = pd00['merchantCategoryCode'].apply(relabel)
# Encode one column per call so the resulting column order stays the same.
for encoded_col in ('merchantCategoryCode', 'posEntryMode'):
    pd00 = pd.get_dummies(pd00, columns=[encoded_col])


In [9]:
# Basic features: four raw columns plus every column whose name contains
# one of the two one-hot-encoded feature names.
dummy_prefixes = ("posEntryMode", "merchantCategoryCode")
one_hot_cols = [name for name in pd00.columns.tolist()
                if any(prefix in name for prefix in dummy_prefixes)]
basic_feats = ['creditLimit', 'availableMoney', 'transactionAmount', 'cardPresent'] + one_hot_cols

In [10]:
# Keep only transactions from week 12 onwards.
# NOTE(review): presumably so that the 12-week look-back features have a
# full history window; confirm against the feature-building step.
from_week_12 = pd00['week_in_year'].ge(12)
pd00 = pd00.loc[from_week_12]
print(pd00.shape, pd00['date'].min())

(624515, 46) 2016-03-21


### Part II. Load pre-computed time-series features

In [11]:
# Pre-computed feature files, described per the original notes.
file_name_list = [
    # distinct merchant name / merchant code counts over the 4 weeks
    # preceding the week of the transaction
    'basic_cat_ts_feats_lb4wk.csv',
    # same as above with a 12-week look-back
    'basic_cat_ts_feats_lb12wk.csv',
    # max / min / avg of transaction amount, balance, credit limit, ...
    'basic_num_ts_feats_lb4wk.csv',
    'basic_num_ts_feats_lb12wk.csv',
    # statistics of the time gap between two consecutive transactions
    # of one user
    'stats_diff_time_feats_lb4wk.csv',
    'stats_diff_time_feats_lb12wk.csv',
]

# Left-join every feature file onto the transactions by account and week;
# the printed shape shows how many columns each file contributed.
for file_name in file_name_list:
    feat_path = WORK_PATH + '/results/' + file_name
    new_feat_pd = pd.read_csv(feat_path)
    pd00 = pd00.merge(new_feat_pd, how='left', on=['accountNumber', 'week_in_year'])
    print(pd00.shape)

    



(624515, 48)
(624515, 50)
(624515, 75)
(624515, 100)
(624515, 104)
(624515, 108)


In [12]:
# Time-series features are picked out by the look-back tag contained in
# their column name.
lookback_tags = ("4weeks", "12weeks")
new_feats = [name for name in pd00.columns.tolist()
             if any(tag in name for tag in lookback_tags)]

# Full model input: time-series features followed by the basic ones.
all_feats = new_feats + basic_feats

In [13]:

# Split rows by the pre-assigned 'tvt_code_0' label.
# Per the original notes: train and val are disjoint in-sample sets
# (2016-01 to 2016-10) and test is out-of-sample (2016-10 to 2016-12).
# NOTE(review): rows before week 12 were already dropped from pd00, so the
# in-sample window effectively starts later than 2016-01; confirm.
split_code = pd00['tvt_code_0']
is_train = split_code == 'train'
is_val = split_code == 'val'
is_test = split_code == 'test'

x_train = pd00.loc[is_train, all_feats]
y_train = pd00.loc[is_train, 'isFraud']

x_val = pd00.loc[is_val, all_feats]
y_val = pd00.loc[is_val, 'isFraud']

x_test = pd00.loc[is_test, all_feats]
y_test = pd00.loc[is_test, 'isFraud']



In [14]:
# Hyper-parameters of the gradient-boosted tree classifier.
xgb_params = dict(
    max_depth=10,
    # Upper bound on the number of trees; early stopping below normally
    # halts training before it is reached.
    n_estimators=1000,
    learning_rate=0.035,
    nthread=4,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=5,
    seed=42,
)
clf = xgb.XGBClassifier(**xgb_params)

# Track AUC on the validation split, log every 200 rounds and stop once
# the metric has not improved for 50 consecutive rounds.
clf.fit(
    x_train,
    y_train,
    eval_set=[(x_val, y_val)],
    eval_metric="auc",
    early_stopping_rounds=50,
    verbose=200,
)

[0]	validation_0-auc:0.71434
Will train until validation_0-auc hasn't improved in 50 rounds.
[200]	validation_0-auc:0.79761
Stopping. Best iteration:
[307]	validation_0-auc:0.80378



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.035, max_delta_step=0, max_depth=10,
              min_child_weight=5, missing=nan, monotone_constraints='()',
              n_estimators=1000, n_jobs=4, nthread=4, num_parallel_tree=1,
              random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              seed=42, subsample=0.8, tree_method='exact',
              validate_parameters=1, verbosity=None)

In [15]:
# Score the test split: take the second probability column (index 1) as the
# prediction and evaluate it with ROC AUC.
test_proba = clf.predict_proba(x_test)
y_pred = test_proba[:, 1]
roc_auc_score(y_test, y_pred)

0.7728623354981442

In [16]:
import collections
# Gain-based importance per feature, ordered from largest to smallest gain.
gain_by_feature = clf.get_booster().get_score(importance_type="gain")
ranked_gains = sorted(
    gain_by_feature.items(),
    key=lambda name_and_gain: name_and_gain[1],
    reverse=True,
)
feat_imp_dict = collections.OrderedDict(ranked_gains)



In [18]:
# Display the gain-sorted feature importances built in the previous cell.
feat_imp_dict

OrderedDict([('posEntryMode_5.0', 31.63127089915872),
             ('cardPresent', 24.132752438018333),
             ('posEntryMode_90.0', 11.53332648235294),
             ('creditLimit_amax_lb_4weeks', 10.19875669),
             ('merchantCategoryCode_entertainment', 10.160546224711084),
             ('transactionAmount', 9.834974387250506),
             ('merchantCategoryCode_online_gifts', 8.606456994346491),
             ('posEntryMode_9.0', 8.575894963022849),
             ('merchantCategoryCode_fuel', 8.187713519730542),
             ('posEntryMode_80.0', 7.851771985785713),
             ('merchantCategoryCode_mobileapps', 7.673508755750002),
             ('merchantCategoryCode_online_retail', 7.571619195702294),
             ('merchantCategoryCode_health', 7.4594579575364275),
             ('merchantCategoryCode_other', 7.216271566847061),
             ('merchantCategoryCode_online_subscriptions', 7.215727415052632),
             ('creditLimit_mean_lb_4weeks', 7.174981014458333)