In [293]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score as auc
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

import xgboost as xgb
from xgboost.sklearn import XGBClassifier

import xgbfir

import matplotlib.pyplot as plt

from catboost import CatBoostClassifier

In [265]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sample = pd.read_csv('sample_submissions.csv')

In [135]:
train.shape, test.shape

((348978, 51), (523466, 50))

In [4]:
train.head()

Unnamed: 0,transaction_id,num_var_1,num_var_2,num_var_3,num_var_4,num_var_5,num_var_6,num_var_7,cat_var_1,cat_var_2,...,cat_var_34,cat_var_35,cat_var_36,cat_var_37,cat_var_38,cat_var_39,cat_var_40,cat_var_41,cat_var_42,target
0,id_11,2.302632e-08,0.040182,0.0,1.8e-07,2.302632e-08,2.368421e-08,1.115205e-08,,ce,...,0,0,0,0,0,0,0,0,0,0
1,id_33,7.965789e-06,0.157872,0.0,2.105e-06,2.769737e-07,7.965789e-06,2.433058e-06,da,tn,...,0,0,0,0,0,0,0,0,0,0
2,id_51,7.828947e-08,0.08914,0.0,3.55e-07,4.671053e-08,1.052632e-07,4.276014e-07,gf,ce,...,0,0,0,0,0,0,0,0,0,0
3,id_54,7.894737e-08,0.227239,0.0,1.05e-06,1.381579e-07,2.190789e-07,1.848054e-08,,ce,...,0,0,0,0,0,0,0,0,0,0
4,id_62,3.321053e-06,0.16041,0.0,2.105e-06,2.769737e-07,3.340789e-06,2.152983e-06,da,tn,...,0,0,0,0,0,0,0,0,0,0


In [136]:
train['target'].value_counts()

0    311610
1     37368
Name: target, dtype: int64

In [266]:
test['target'] = 0
train_test = pd.concat([train, test])
train_test.shape

(872444, 51)

In [270]:
train_test.head()

Unnamed: 0,transaction_id,num_var_1,num_var_2,num_var_3,num_var_4,num_var_5,num_var_6,num_var_7,cat_var_1,cat_var_2,...,cat_var_34,cat_var_35,cat_var_36,cat_var_37,cat_var_38,cat_var_39,cat_var_40,cat_var_41,cat_var_42,target
0,id_11,2.302632e-08,0.040182,0.0,1.8e-07,2.302632e-08,2.368421e-08,1.115205e-08,,ce,...,0,0,0,0,0,0,0,0,0,0
1,id_33,7.965789e-06,0.157872,0.0,2.105e-06,2.769737e-07,7.965789e-06,2.433058e-06,da,tn,...,0,0,0,0,0,0,0,0,0,0
2,id_51,7.828947e-08,0.08914,0.0,3.55e-07,4.671053e-08,1.052632e-07,4.276014e-07,gf,ce,...,0,0,0,0,0,0,0,0,0,0
3,id_54,7.894737e-08,0.227239,0.0,1.05e-06,1.381579e-07,2.190789e-07,1.848054e-08,,ce,...,0,0,0,0,0,0,0,0,0,0
4,id_62,3.321053e-06,0.16041,0.0,2.105e-06,2.769737e-07,3.340789e-06,2.152983e-06,da,tn,...,0,0,0,0,0,0,0,0,0,0


In [267]:
predictors = [column for column in train_test.columns if column not in ['transaction_id', 'target']] 
train_test['num_missing_vals'] = train_test.isnull().sum(axis=1)
train_test['num_zeros'] = (train_test[predictors] == 0).astype(int).sum(axis=1)
train_test['num_ones'] = (train_test[predictors] == 1).astype(int).sum(axis=1)

In [222]:
train_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 872444 entries, 0 to 523465
Data columns (total 54 columns):
transaction_id      872444 non-null object
num_var_1           872444 non-null float64
num_var_2           872444 non-null float64
num_var_3           872444 non-null float64
num_var_4           872444 non-null float64
num_var_5           872444 non-null float64
num_var_6           872444 non-null float64
num_var_7           872444 non-null float64
cat_var_1           837970 non-null object
cat_var_2           872444 non-null object
cat_var_3           775229 non-null object
cat_var_4           872444 non-null object
cat_var_5           872444 non-null object
cat_var_6           850501 non-null object
cat_var_7           872444 non-null object
cat_var_8           754568 non-null object
cat_var_9           872444 non-null object
cat_var_10          872444 non-null object
cat_var_11          872444 non-null object
cat_var_12          872444 non-null object
cat_var_13          87

In [268]:
missing_cols = ['cat_var_1', 'cat_var_3', 'cat_var_6', 'cat_var_8']
for missing_col in missing_cols:
    train_test[missing_col].fillna(train_test[missing_col].mode()[0], inplace=True)
    train[missing_col].fillna(train[missing_col].mode()[0], inplace=True)

In [269]:
for i in [6, 13, 17, 18, 14, 10, 19, 5, 12, 9, 24, 11, 16, 15, 21, 4, 23]:
    temp = pd.DataFrame()
    temp[['cat_var_' + str(i), 'target_encode_cat_' + str(i)]] = train.groupby('cat_var_' + str(i))['target'].mean().reset_index()
    train_test = train_test.merge(temp, on='cat_var_' + str(i), how='left')

In [96]:
l = [6, 13, 17, 18, 14, 10, 19, 5, 12, 9, 24, 11, 16, 15, 21, 4, 23]
for i in range(43):
    if i in l:
        print '\'target_encode_cat_' + str(i) + '\',',
    else:
        print '\'cat_var_' + str(i) + '\',',

'cat_var_0', 'cat_var_1', 'cat_var_2', 'cat_var_3', 'target_encode_cat_4', 'target_encode_cat_5', 'target_encode_cat_6', 'cat_var_7', 'cat_var_8', 'target_encode_cat_9', 'target_encode_cat_10', 'target_encode_cat_11', 'target_encode_cat_12', 'target_encode_cat_13', 'target_encode_cat_14', 'target_encode_cat_15', 'target_encode_cat_16', 'target_encode_cat_17', 'target_encode_cat_18', 'target_encode_cat_19', 'cat_var_20', 'target_encode_cat_21', 'cat_var_22', 'target_encode_cat_23', 'target_encode_cat_24', 'cat_var_25', 'cat_var_26', 'cat_var_27', 'cat_var_28', 'cat_var_29', 'cat_var_30', 'cat_var_31', 'cat_var_32', 'cat_var_33', 'cat_var_34', 'cat_var_35', 'cat_var_36', 'cat_var_37', 'cat_var_38', 'cat_var_39', 'cat_var_40', 'cat_var_41', 'cat_var_42',


In [270]:
target_encode_predictors = ['transaction_num', 'num_var_1', 'num_var_2', 'num_var_3', 'num_var_4', 'num_var_5', 'num_var_6', 'num_var_7', 'cat_var_1', 'cat_var_2', 'cat_var_3', 'target_encode_cat_4', 'target_encode_cat_5', 'target_encode_cat_6', 'cat_var_7', 'cat_var_8', 'target_encode_cat_9', 'target_encode_cat_10', 'target_encode_cat_11', 'target_encode_cat_12', 'target_encode_cat_13', 'target_encode_cat_14', 'target_encode_cat_15', 'target_encode_cat_16', 'target_encode_cat_17', 'target_encode_cat_18', 'target_encode_cat_19', 'cat_var_20', 'target_encode_cat_21', 'cat_var_22', 'target_encode_cat_23', 'target_encode_cat_24', 'cat_var_25', 'cat_var_26', 'cat_var_27', 'cat_var_28', 'cat_var_29', 'cat_var_30', 'cat_var_31', 'cat_var_32', 'cat_var_33', 'cat_var_34', 'cat_var_35', 'cat_var_36', 'cat_var_37', 'cat_var_38', 'cat_var_39', 'cat_var_40', 'cat_var_41', 'cat_var_42', 'num_missing_vals', 'num_zeros', 'num_ones', 'mean_num_var_7', 'num_var_7_product_1', 'num_var_7_sum_1', 'num_var_7_diff_1', 'num_var_7_product_2']

58

In [271]:
for column in train_test.columns:
    if column == 'transaction_id' or train_test[column].dtype != 'O':
        continue
    print column
    le = LabelEncoder()
    train_test[column] = le.fit_transform(train_test[column])

cat_var_1
cat_var_2
cat_var_3
cat_var_4
cat_var_5
cat_var_6
cat_var_7
cat_var_8
cat_var_9
cat_var_10
cat_var_11
cat_var_12
cat_var_13
cat_var_14
cat_var_15
cat_var_16
cat_var_17
cat_var_18


In [272]:
temp = pd.DataFrame()
temp[['cat_var_8', 'mean_num_var_7']] = train_test.groupby('cat_var_8')['num_var_7'].mean().reset_index()
train_test = train_test.merge(temp, on='cat_var_8', how='left')

In [273]:
train_test['transaction_num'] = train_test['transaction_id'].apply(lambda x: int(x[3:]))

In [274]:
train_test['num_var_7_product_1'] = train_test['num_var_7'] * train_test['num_var_1']
train_test['num_var_7_sum_1'] = train_test['num_var_7'] + train_test['num_var_1']
train_test['num_var_7_diff_1'] = train_test['num_var_7'] - train_test['num_var_1']

In [275]:
train_test['num_var_7_product_2'] = train_test['num_var_7'] * train_test['num_var_2']

In [205]:
temp = pd.DataFrame()
temp[['cat_var_8', 'num_cat_var_8']] = train_test['cat_var_8'].value_counts().reset_index()
train_test = train_test.merge(temp, on='cat_var_8', how='left')

In [214]:
temp = pd.DataFrame()
temp[['cat_var_3', 'num_cat_var_3']] = train_test['cat_var_3'].value_counts().reset_index()
train_test = train_test.merge(temp, on='cat_var_3', how='left')

In [222]:
temp = pd.DataFrame()
temp[['cat_var_1', 'num_cat_var_1']] = train_test['cat_var_1'].value_counts().reset_index()
train_test = train_test.merge(temp, on='cat_var_1', how='left')

In [276]:
count_columns = [1, 2, 3, 6, 7, 8, 10, 13]
for count_column in count_columns:
    temp = pd.DataFrame()
    temp[['cat_var_' + str(count_column), 'num_cat_var_' + str(count_column)]] = train_test['cat_var_' + str(count_column)].value_counts().reset_index()
    train_test = train_test.merge(temp, on='cat_var_' + str(count_column), how='left')

In [277]:
count_predictors = ['num_var_1', 'num_var_2', 'num_var_3', 'num_var_4',
 'num_var_5', 'num_var_6', 'num_var_7', 'num_cat_var_1',
 'num_cat_var_2', 'num_cat_var_3', 'cat_var_4', 'cat_var_5',
 'num_cat_var_6', 'num_cat_var_7', 'num_cat_var_8', 'cat_var_9',
 'num_cat_var_10', 'cat_var_11', 'cat_var_12', 'num_cat_var_13',
 'cat_var_14', 'cat_var_15', 'cat_var_16', 'cat_var_17',
 'cat_var_18', 'cat_var_19', 'cat_var_20', 'cat_var_21',
 'cat_var_22', 'cat_var_23', 'cat_var_24', 'cat_var_25',
 'cat_var_26', 'cat_var_27', 'cat_var_28', 'cat_var_29', 'cat_var_30', 'cat_var_31',
 'cat_var_32', 'cat_var_33', 'cat_var_34', 'cat_var_35',
 'cat_var_36', 'cat_var_37', 'cat_var_38', 'cat_var_39',
 'cat_var_40', 'cat_var_41', 'cat_var_42', 'num_missing_vals',
 'num_zeros', 'num_ones', 'mean_num_var_7', 'transaction_num',
 'num_var_7_product_1', 'num_var_7_sum_1',
 'num_var_7_diff_1', 'num_var_7_product_2']

In [279]:
predictors = ['num_var_1', 'num_var_2', 'num_var_3', 'num_var_4',
 'num_var_5', 'num_var_6', 'num_var_7', 'cat_var_1',
 'cat_var_2', 'cat_var_4', 'cat_var_5',
 'cat_var_6', 'cat_var_7', 'cat_var_9',
 'cat_var_10', 'cat_var_11', 'cat_var_12', 'cat_var_13',
 'cat_var_14', 'cat_var_15', 'cat_var_16', 'cat_var_17',
 'cat_var_18', 'cat_var_19', 'cat_var_20', 'cat_var_21',
 'cat_var_22', 'cat_var_23', 'cat_var_24', 'cat_var_25',
 'cat_var_26', 'cat_var_27', 'cat_var_28', 'cat_var_29', 'cat_var_30', 'cat_var_31',
 'cat_var_32', 'cat_var_33', 'cat_var_34', 'cat_var_35',
 'cat_var_36', 'cat_var_37', 'cat_var_38', 'cat_var_39',
 'cat_var_40', 'cat_var_41', 'cat_var_42', 'num_missing_vals',
 'num_zeros', 'num_ones', 'mean_num_var_7', 'transaction_num',
 'num_var_7_product_1', 'num_var_7_sum_1',
 'num_var_7_diff_1', 'num_var_7_product_2', 'num_cat_var_8', 'num_cat_var_3']
train = train_test[:train.shape[0]]
test = train_test[train.shape[0]:]
train.shape, test.shape
print len(predictors)

58


In [305]:
df_train = train[target_encode_predictors].values
Y = train['target'].values
df_test = test[target_encode_predictors].values

In [306]:
%%time

oof_train=pd.DataFrame({'transaction_id': train['transaction_id'].values, 'target':0})
best=[]
score=[]
train_score = []

kfolds = 5
nrounds = 2000
skf = StratifiedKFold( n_splits=kfolds, shuffle=True, random_state=123)
i=0
for train_index, test_index in skf.split(df_train, Y):
    print('Fold {0}'.format(i + 1))
    X_train, X_val = df_train[train_index], df_train[test_index]
    y_train, y_val = Y[train_index],Y[test_index]

    dtrain = xgb.DMatrix(X_train,y_train)
    dval = xgb.DMatrix(X_val,y_val)
    watchlist = [(dtrain, 'train'), (dval, 'eval')]

    gbdt = xgb.train(xgb_params, dtrain, nrounds, watchlist, verbose_eval=1000, early_stopping_rounds=50)  
    bst=gbdt.best_ntree_limit
    pred=gbdt.predict(dval, ntree_limit=bst)
        
    oof_train.loc[test_index,"target"]= pred
    
    scr=auc(y_val, pred) 
    train_score.append(auc(y_train, gbdt.predict(dtrain, ntree_limit=bst)))
    print 'score: ', scr
    
    best.append(bst)
    score.append(scr)
    i+=1

print(np.mean(score))
print(np.mean(train_score))
print(np.mean(best))

oof_train.to_csv('xgb_oof_target_encode.csv', index=False)

Fold 1
[0]	train-auc:0.698748	eval-auc:0.698814
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 50 rounds.
Stopping. Best iteration:
[279]	train-auc:0.786025	eval-auc:0.73241

score:  0.732410339219
Fold 2
[0]	train-auc:0.706056	eval-auc:0.699573
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 50 rounds.
Stopping. Best iteration:
[70]	train-auc:0.738815	eval-auc:0.722259

score:  0.72225856585
Fold 3
[0]	train-auc:0.703632	eval-auc:0.708977
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 50 rounds.
Stopping. Best iteration:
[193]	train-auc:0.763728	eval-auc:0.735844

score:  0.735844002692
Fold 4
[0]	train-auc:0.698249	eval-auc:0.700059
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-au

In [307]:
best_nrounds=int(round(np.mean(best)))
X_train=xgb.DMatrix(df_train,Y)
X_test=xgb.DMatrix(df_test)

gbdt = xgb.train(xgb_params, X_train, best_nrounds, verbose_eval=1)
pred=gbdt.predict(X_test)
        
submit=pd.DataFrame()
submit['transaction_id'] = test['transaction_id'].values
submit['target'] = pred

submit.to_csv('xgb_submit.csv', index=False)