Reference: https://www.kaggle.com/cdeotte/xgb-fraud-with-magic-0-9600

# Loading Packages and Functions

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [171]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score

In [3]:
import lightgbm as lgb

In [4]:
import xgboost as xgb

# Custom Functions

In [1]:
def create_uids(dfs, groups):
    """
    Cretes uids based on lists of variables given on groups
    It can be used on a list of dataframes given by dfs
    
    """
    for df in dfs:
        created_columns = []
        for group in groups:
            new_name = '_'.join(group)
            df[new_name] = ''
            for col in group:
                df[new_name] += df[col].astype(str)
            df[new_name] = df[new_name].astype('category')
            created_columns.append(new_name)
    
    return dfs, created_columns
    

In [2]:
def freq_encode_full(train, test, cols):
    
    """
    Frequency encoding for variables in cols, using BOTH train
    and test set. Appends _FULL_FE to the created variables names.
    
    """
    new_names = []
    
    for col in cols:
        
        df = pd.concat([train[[col]], test[[col]]])
        values_dict = df[col].value_counts().to_dict()
        train[col+'_FULL_FE'] = train[col].map(values_dict)
        test[col+'_FULL_FE'] = test[col].map(values_dict)
        
        new_names.append(col+'_FULL_FE')
        
    return train, test, new_names

In [3]:
def uid_aggregation_full(train, test, uids, cols, aggs):
          
    """
    Creates aggregation of features in cols, based on groups in uids,
    using aggregations in aggs. Example: creates a new feature that 
    is the mean of C1 on groups of card1_card2.
    
    """
    for uid in uids:
        for col in cols:
            for agg in aggs:
                
                temp = pd.concat([train[[uid,col]], test[[uid,col]]])
                new_name = uid + '_' + col + '_' + agg
                temp = temp.groupby(uid)[col].agg([agg]).reset_index().rename(columns={agg:new_name})
                temp.index = list(temp[uid])
                temp = temp[new_name].to_dict()
                
                train[new_name] = train[uid].map(temp)
                test[new_name] = test[uid].map(temp)
                
    return train, test

In [4]:
def factorize_dfs(train, test, cols):
    """
    Factorize categorical columns for train and test data.
    Mostly for XGBoost model.
    
    """
    train_size = train.shape[0]
    
    for col in cols:
        df = pd.concat([train[col], test[col]])
        df, _ = df.factorize(sort=True)
        
        train[col] = df[:train_size]
        test[col] = df[train_size:]
    
    return train, test

In [8]:
def fill_nas(dfs):
    """
    Fills categorical NAs with 'Missing' and numerical with -999
    
    """
    
    # Categorical features
    cat_columns = [col for col in dfs[0].columns if dfs[0][col].dtype == object]
    for df in dfs:
        df[cat_columns] = df[cat_columns].fillna('Missing')
    
    # C columns
    # C_columns = ['C'+str(i) for i in range(1, 15)]
    num_columns = [col for col in dfs[0].columns if dfs[0][col].dtype != object]
    for df in dfs:
        #df[c_columns] = df[c_columns].fillna(0)
        df[num_columns] = df[num_columns].fillna(-999)
        
    return dfs


In [82]:
def extract_features(dfs):
    
    """
    Creates some features based on dates, normalize columns D,
    and creates the feature Transaction_cents
    
    """
    
    for df in dfs:
        df['Day'] = df['TransactionDT'].dt.day
        df['Day_Week'] = df['TransactionDT'].dt.dayofweek
        df['Hour'] = df['TransactionDT'].dt.hour
        df['Minute'] = df['TransactionDT'].dt.minute
        
        # Number of days
        first_date = pd.date_range('2017-12-01', periods=1)[0]
        df['days_from_start'] = (df['TransactionDT'] - first_date).dt.days
        
        # Normalizing D features
        d_columns = ['D' + str(i) for i in range(1, 16)]
        for col in d_columns:
            df[col+'_n'] = df['days_from_start'] - df[col]
        
        df.drop(d_columns, axis=1, inplace=True)
        
        # Cents from TransactionAmt
        df['Transaction_cents'] = (df['TransactionAmt'] - np.floor(df['TransactionAmt'])).astype('float32')
        
        # TransactionDT is no longer necessary
        df.drop(['TransactionDT', 'days_from_start'], axis=1, inplace=True)
    
    return dfs

In [10]:
def text_features(dfs):
    
    """
    Features based on text information
    
    """

    for df in dfs:
        
        # P_emaildomain
        df['P_emaildomain_initial'] = df['P_emaildomain'].apply(lambda x: x.split('.')[0])
        df['P_emaildomain_final'] = df['P_emaildomain'].apply(lambda x: '#'+''.join(x.split('.')[1:]))
        # R_emaildomain
        df['R_emaildomain_initial'] = df['R_emaildomain'].apply(lambda x: x.split('.')[0])
        df['R_emaildomain_final'] = df['R_emaildomain'].apply(lambda x: '#'+''.join(x.split('.')[1:]))
        
        # DeviceInfo
        temp = df['DeviceInfo'].apply(lambda x: ''.join([i for i in x if i.isalpha()]))
        temp = temp.str.replace(r'SAMSUNGSM', 'SM')
        df['DeviceInfo_brand'] = temp.apply(lambda x: x.split('Build')[0])
        df['DeviceInfo_build'] = temp.apply(lambda x:'#' + ''.join(x.split('Build')[1:]))
        
        # id_30
        df['id_30'] = df['id_30'].str.lower()
        df['id_30_device'] = df['id_30'].apply(lambda x: ''.join([s for s in x if s.isalpha()]))
        df['id_30_version'] = df['id_30'].apply(lambda x:'#'+ ''.join([s for s in x if s.isnumeric()]))
        
        # id_31
        df['id_31'] = df['id_31'].str.lower()
        df['id_31_device'] = df['id_31'].apply(lambda x: ''.join([s for s in x if s.isalpha()]))
    
    return dfs

In [88]:
def preprocess_full(train, test, features_remove=None):
    
    """
    Preprocess the trian and validation/test dataset, creating uids
    and aggregations. The 'magic' features are defined here.
    
    """
    
    
    if features_remove is None:
        features_remove = []
    
    # Remove NAs
    train, test = fill_nas([train, test])
    
    # Extract features
    train, test = extract_features([train, test])
    
    # Extract text features
    train, test = text_features([train, test])
    
    # Creating New Feature for aggregation
    [train, test], new_cols = create_uids([train, test],
                                        [['card1', 'addr1'],
                                         ['card1', 'addr1', 'P_emaildomain'],
                                         ['card1', 'addr1', 'D1_n']
                                        ])
    
    # Aggregations (no magic feature)
    uids = ['card1', 'card1_addr1', 'card1_addr1_P_emaildomain']
    cols = ['TransactionAmt', 'D9_n', 'D11_n']
    aggs = ['mean', 'std']
    train, test = uid_aggregation_full(train, test, uids, cols, aggs)
    
    # Aggregations with magic feature
    uids = ['card1_addr1_D1_n']
    cols = ['TransactionAmt','D4_n','D9_n','D10_n','D15_n']
    aggs = ['mean', 'std']
    train, test = uid_aggregation_full(train, test, uids, cols, aggs)
    
    # Aggregations with magic features (C columns)
    uids = ['card1_addr1_D1_n']
    cols = ['C'+str(x) for x in range(1,15) if x!=3]
    aggs = ['mean']
    train, test = uid_aggregation_full(train, test, uids, cols, aggs)
    
    # Aggregations with magic features
    uids = ['card1_addr1_D1_n']
    cols = ['P_emaildomain','dist1','id_02','Transaction_cents']
    aggs = ['nunique']
    train, test = uid_aggregation_full(train, test, uids, cols, aggs)
    
    # Aggregations with magic features
    uids = ['card1_addr1_D1_n']
    cols = ['C14']
    aggs = ['std']
    train, test = uid_aggregation_full(train, test, uids, cols, aggs)
    
    # Aggregations with magic features 
    uids = ['card1_addr1_D1_n']
    cols = ['C13', 'V314']
    aggs = ['mean']
    train, test = uid_aggregation_full(train, test, uids, cols, aggs)
    
    # Aggregations with magic features
    uids = ['card1_addr1_D1_n']
    cols = ['V127','V136','V309','V307','V320']
    aggs = ['nunique']
    train, test = uid_aggregation_full(train, test, uids, cols, aggs)
    
    # Frequency Encoding
    columns = ['addr1', 'card1', 'card2', 'card3', 'P_emaildomain',
               'card1_addr1', 'card1_addr1_P_emaildomain',
               'card1_addr1_D1_n']
    train, test, new_names = freq_encode_full(train, test, columns)
    
    # Using frequency encoding for categoricals
    cat_columns = [col for col in train.columns if train[col].dtype == object]
    train[cat_columns] = train[cat_columns].astype('category')
    test[cat_columns] = test[cat_columns].astype('category')
    
    return train.drop(features_remove, axis=1), test.drop(features_remove, axis=1), new_names

In [14]:
def fit_baseline_model(X_train, X_val, y_train, y_val):
    
    lgb_params = {
                    'objective':'binary',
                    'boosting_type':'gbdt',
                    'metric':'auc',
                    'n_jobs':-1,
                    'learning_rate':0.01,
                    'num_leaves': 2**8,
                    'max_depth':-1,
                    'tree_learner':'serial',
                    'colsample_bytree': 0.7,
                    'subsample_freq':1,
                    'subsample':0.7,
                    'num_boosting_rounds':80000,
                    'max_bin':255,
                    'verbose':-1,
                    'seed': 357,
                    'early_stopping_rounds':100, 
                } 
    
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_val = lgb.Dataset(X_val, y_val)
    
    estimator = lgb.train(
            lgb_params,
            lgb_train,
            valid_sets = [lgb_train, lgb_val],
            verbose_eval = 200,
        )   
    
    columns = X_train.columns
    feature_imp = pd.DataFrame((zip(estimator.feature_importance(),columns)), 
                                columns=['Value','Feature'])
    feature_imp = feature_imp.sort_values(by='Value', ascending=False).reset_index(drop=True)
    
    return feature_imp

# Load Data and Clean NAs

In [15]:
# Train and Validation Sets
train = pd.read_pickle('./data/train_pickle.pkl')
val = train.loc[train['TransactionDT']>= '2018-05-01']
train = train.loc[train['TransactionDT']< '2018-05-01']
y_train = train['isFraud']
y_val = val['isFraud']
train.drop(['isFraud', 'TransactionID'], axis=1, inplace=True)
val.drop(['isFraud', 'TransactionID'], axis=1, inplace=True)

# Test Set
test = pd.read_pickle('./data/test_pickle.pkl')
test.drop(['TransactionID'], axis=1, inplace=True)

# Model

In [1]:
# https://www.kaggle.com/cdeotte/xgb-fraud-with-magic-0-9600

v =  [1, 3, 4, 6, 8, 11]
v += [13, 14, 17, 20, 23, 26, 27, 30]
v += [36, 37, 40, 41, 44, 47, 48]
v += [54, 56, 59, 62, 65, 67, 68, 70]
v += [76, 78, 80, 82, 86, 88, 89, 91]

#v += [96, 98, 99, 104] #relates to groups, no NAN 
v += [107, 108, 111, 115, 117, 120, 121, 123] # maybe group, no NAN
v += [124, 127, 129, 130, 136] # relates to groups, no NAN

# LOTS OF NAN BELOW
v += [138, 139, 142, 147, 156, 162] #b1
v += [165, 160, 166] #b1
v += [178, 176, 173, 182] #b2
v += [187, 203, 205, 207, 215] #b2
v += [169, 171, 175, 180, 185, 188, 198, 210, 209] #b2
v += [218, 223, 224, 226, 228, 229, 235] #b3
v += [240, 258, 257, 253, 252, 260, 261] #b3
v += [264, 266, 267, 274, 277] #b3
v += [220, 221, 234, 238, 250, 271] #b3

v += [294, 284, 285, 286, 291, 297] # relates to grous, no NAN
v += [303, 305, 307, 309, 310, 320] # relates to groups, no NAN
v += [281, 283, 289, 296, 301, 314] # relates to groups, no NAN

## LightGBM:

In [165]:
v_columns = ['V' + str(i) for i in range(1, 340) if i not in v]

In [166]:
train, val, cat_columns = preprocess_full(train, val, features_remove=v_columns)

In [168]:
lgb_params = {
    'objective': 'binary',
    'metric': 'auc',
    #'metric': 'None',
    'learning_rate': 0.01,
    'num_leaves': 2**8,
    'max_bin': 255,
    'max_depth': -1,
    'bagging_freq': 5,
    'bagging_fraction': 0.7,
    'bagging_seed': 32,
    'feature_fraction': 0.7,
    'feature_fraction_seed': 32,
    'first_metric_only': True,
    'verbose': 100,
    'n_jobs': -1,
    'seed': 35,
} 

In [169]:
lgb_train = lgb.Dataset(train, y_train)
lgb_val = lgb.Dataset(val, y_val)

In [170]:
clf = lgb.train(lgb_params, lgb_train, 10000, 
                valid_sets = [lgb_train, lgb_val], 
                early_stopping_rounds=500, verbose_eval=200)

Training until validation scores don't improve for 500 rounds
[200]	training's auc: 0.992799	valid_1's auc: 0.936985
[400]	training's auc: 0.99869	valid_1's auc: 0.939169
[600]	training's auc: 0.999786	valid_1's auc: 0.938026
[800]	training's auc: 0.999967	valid_1's auc: 0.938328
Early stopping, best iteration is:
[305]	training's auc: 0.997094	valid_1's auc: 0.940073
Evaluated only: auc


In [171]:
best_iteration = clf.best_iteration

Submitting without retraining:

In [172]:
# Train and Validation Sets
train = pd.read_pickle('./data/train_pickle.pkl')
val = train.loc[train['TransactionDT']>= '2018-05-01']
train = train.loc[train['TransactionDT']< '2018-05-01']
y_train = train['isFraud']
y_val = val['isFraud']
train.drop(['isFraud', 'TransactionID'], axis=1, inplace=True)
val.drop(['isFraud', 'TransactionID'], axis=1, inplace=True)

# Test Set
test = pd.read_pickle('./data/test_pickle.pkl')
test.drop(['TransactionID'], axis=1, inplace=True)

In [173]:
train, test, cat_columns = preprocess_full(train, test, features_remove=v_columns)

In [174]:
y_test_pred = clf.predict(test, best_iteration)

In [175]:
sample = pd.read_csv('./data/sample_submission.csv')
submission = sample
submission['isFraud'] = y_test_pred

In [176]:
submission.to_csv('./data/submissions/lgb_pre_udi.csv', index=False)

## Trying XGBoost:

In [89]:
# Train and Validation Sets
train = pd.read_pickle('./data/train_pickle.pkl')
val = train.loc[train['TransactionDT']>= '2018-05-01']
train = train.loc[train['TransactionDT']< '2018-05-01']
y_train = train['isFraud']
y_val = val['isFraud']
train.drop(['isFraud', 'TransactionID'], axis=1, inplace=True)
val.drop(['isFraud', 'TransactionID'], axis=1, inplace=True)

# Test Set
test = pd.read_pickle('./data/test_pickle.pkl')
test.drop(['TransactionID'], axis=1, inplace=True)

In [90]:
v_columns = ['V' + str(i) for i in range(1, 340) if i not in v]

In [91]:
train, val, cat_columns = preprocess_full(train, val, features_remove=v_columns)

XGBoost does not deal with categorical features the same way the LightGBM does, so we need to encode them:

In [92]:
cat_columns = [col for col in train.columns if train[col].dtype == object]
to_encode = cat_columns + list(train.columns[train.dtypes == 'category'].values)

We need to factorize the categorical variables:

In [96]:
train, val = factorize_dfs(train, val, to_encode)

In [97]:
clf = xgb.XGBClassifier( 
        n_estimators=2000,
        max_depth=12, 
        learning_rate=0.02, 
        subsample=0.8,
        colsample_bytree=0.4, 
        missing=-1, 
        eval_metric='auc',
        nthread=4,
        tree_method='hist' 
        #tree_method='gpu_hist' 
    )

In [98]:
h = clf.fit(train, y_train, 
        eval_set=[(val, y_val)],
        verbose=50, early_stopping_rounds=100)

[0]	validation_0-auc:0.84143
Will train until validation_0-auc hasn't improved in 100 rounds.
[50]	validation_0-auc:0.89667
[100]	validation_0-auc:0.91109
[150]	validation_0-auc:0.92605
[200]	validation_0-auc:0.93690
[250]	validation_0-auc:0.94354
[300]	validation_0-auc:0.94697
[350]	validation_0-auc:0.94902
[400]	validation_0-auc:0.95018
[450]	validation_0-auc:0.95070
[500]	validation_0-auc:0.95078
[550]	validation_0-auc:0.95094
[600]	validation_0-auc:0.95084
Stopping. Best iteration:
[548]	validation_0-auc:0.95101



Predicting test set:

In [150]:
# Train and Validation Sets
train = pd.read_pickle('./data/train_pickle.pkl')
y_train = train['isFraud']
train.drop(['isFraud', 'TransactionID'], axis=1, inplace=True)

# Test Set
test = pd.read_pickle('./data/test_pickle.pkl')
test.drop(['TransactionID'], axis=1, inplace=True)

In [151]:
train, test, cat_columns = preprocess_full(train, test, features_remove=v_columns)

In [154]:
#train.to_pickle('./data/train_processed.pkl')
#test.to_pickle('./data/test_processed.pkl')

In [156]:
train, test = factorize_dfs(train, test, to_encode)

Predictions using GroupCV:

In [168]:
import gc

In [157]:
train_dates = pd.read_pickle('./data/train_dates.pkl')
train_dates['Month'] = train_dates['TransactionDT'].dt.month
# June has few observations, so we merge it with may
train_dates.loc[train_dates['Month']==6, 'Month'] = 5

In [159]:
gkf = GroupKFold(n_splits=6)

oof = np.zeros(len(train))
preds = np.zeros(len(test))

In [177]:
for i, (idxT, idxV) in enumerate(gkf.split(train_dates, y_train, groups=train_dates['Month'])):
    
    if i == 0: continue
    
    month = train_dates.iloc[idxV]['Month'].iloc[0]
    print('Fold',i,'withholding month', month)
    print(' rows of train =',len(idxT),'rows of holdout =',len(idxV))
    
    clf = xgb.XGBClassifier(
            n_estimators=2000,
            max_depth=12,
            learning_rate=0.02,
            subsample=0.8,
            colsample_bytree=0.4,
            missing=-1,
            eval_metric='auc',
            # USE CPU
            nthread=4,
            tree_method='hist'
        )     
    
    h = clf.fit(train.iloc[idxT], y_train.iloc[idxT],
                eval_set=[(train.iloc[idxV], y_train.iloc[idxV])],
                verbose=100, early_stopping_rounds=100)
    
    oof[idxV] += clf.predict_proba(train.iloc[idxV])[:,1]
    preds += clf.predict_proba(test)[:,1]/gkf.n_splits
    del h, clf
    gc.collect()
    
print('#'*20)
print ('XGB95 OOF CV=',roc_auc_score(y_train,oof))
    
    

Fold 1 withholding month 3
 rows of train = 488572 rows of holdout = 101968
[0]	validation_0-auc:0.84176
Will train until validation_0-auc hasn't improved in 100 rounds.
[100]	validation_0-auc:0.92239
[200]	validation_0-auc:0.94588
[300]	validation_0-auc:0.95519
[400]	validation_0-auc:0.95765
[500]	validation_0-auc:0.95842
[600]	validation_0-auc:0.95847
Stopping. Best iteration:
[566]	validation_0-auc:0.95860

Fold 2 withholding month 1
 rows of train = 498030 rows of holdout = 92510
[0]	validation_0-auc:0.83524
Will train until validation_0-auc hasn't improved in 100 rounds.
[100]	validation_0-auc:0.92307
[200]	validation_0-auc:0.94691
[300]	validation_0-auc:0.95888
[400]	validation_0-auc:0.96333
[500]	validation_0-auc:0.96485
[600]	validation_0-auc:0.96556
[700]	validation_0-auc:0.96603
[800]	validation_0-auc:0.96628
Stopping. Best iteration:
[779]	validation_0-auc:0.96634

Fold 3 withholding month 5
 rows of train = 498113 rows of holdout = 92427
[0]	validation_0-auc:0.84177
Will tr

In [183]:
sample = pd.read_csv('./data/sample_submission.csv')
submission = sample
submission['isFraud'] = preds

In [190]:
submission.to_csv('./data/submissions/xbg_6kfold.csv', index=False)