In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb
import matplotlib.pyplot as plt

In [None]:
# Load the raw train/test tables; cells holding the literal '-1' are read as NaN.
train = pd.read_csv(filepath_or_buffer = '../input/train.csv', na_values = '-1')
test = pd.read_csv(filepath_or_buffer = '../input/test.csv', na_values = '-1')

In [None]:
# Variable assigning:
# Split labels / ids away from the feature columns and stack train on top of
# test so that the preprocessing below is applied to both in one pass.

y_train_all = train['target']                                  # save the training labels
id_test = test['id']                                           # save the test ids (used for the submission)

train = train.drop(['id', 'target'], axis = 1)                 # keep only the feature columns of train
test = test.drop(['id'], axis = 1)                             # keep only the feature columns of test

num_train = len(train)                                         # number of training rows (split point in df_values)
num_val = int(num_train * 0.2)                                 # 20% of the training row count
# Train rows come first, test rows after them; later cells split the frame
# again positionally at num_train. ignore_index=True gives every row a unique
# label instead of repeating each frame's own 0..n-1 labels.
df_values = pd.concat([train, test], axis = 0, ignore_index = True)

In [None]:
# as suggested by kostya17
# in https://www.kaggle.com/kostya17/simple-approach-to-handle-missing-values:
# 1. drop these two columns with ~60% and ~40% NaNs, respectively:

df_values = df_values.drop(["ps_car_03_cat", "ps_car_05_cat"], axis=1)

# 2. Replace
#  "cat" - categorical: fill missing values with mode value of particular column
#  "bin" - binary: fill missing values with mode value of particular column
#  all other - (continuous or ordinal): fill with mean value of particular column

cat_cols = [col for col in df_values.columns if 'cat' in col]
bin_cols = [col for col in df_values.columns if 'bin' in col]
con_cols = [col for col in df_values.columns if col not in bin_cols + cat_cols]

# The filled column is assigned back to the frame. Calling
# `df_values[col].fillna(..., inplace=True)` operates on a temporary Series:
# it emits a chained-assignment warning on recent pandas and leaves df_values
# untouched once Copy-on-Write is enabled.
for col in cat_cols + bin_cols:
    # mode() returns the most frequent values sorted ascending; take the first
    df_values[col] = df_values[col].fillna(df_values[col].mode()[0])

for col in con_cols:
    df_values[col] = df_values[col].fillna(df_values[col].mean())

In [None]:
# OneHotEncoding
# as demonstrated by bhavesh302
# in https://www.kaggle.com/bhavesh302/xgb-onehot-encoding-lb-0-281

cat_features = [a for a in df_values.columns if a.endswith('cat')]

# Encode every categorical column in a single call: each listed column is
# replaced by indicator columns named "<column>_<value>", appended after the
# untouched columns in the order of cat_features. This yields the same layout
# as encoding one column at a time, without re-concatenating the whole frame
# on every iteration.
df_values = pd.get_dummies(df_values, columns = cat_features)

In [None]:
# Define the gini metric - from https://www.kaggle.com/c/ClaimPredictionChallenge/discussion/703#5897
def gini(actual, pred, cmpcol = 0, sortcol = 1):
    """Compute the (un-normalized) gini coefficient of a ranking.

    Rows are ordered by `pred` in descending order (ties keep their original
    relative order), then the cumulative share of `actual` captured along that
    ordering is compared with the share expected from a random ordering.

    Parameters
    ----------
    actual : sequence of numbers
        Observed outcomes (e.g. 0/1 labels).
    pred : sequence of numbers
        Scores used to rank the rows; must have the same length as `actual`.
    cmpcol, sortcol : int
        Unused; kept so existing callers keep working.

    Returns
    -------
    float
        The gini value. It is NaN when `actual` sums to zero, because the
        cumulative share is then 0/0.
    """
    assert len(actual) == len(pred)
    n_rows = len(actual)
    # columns: 0 = actual, 1 = prediction, 2 = original position (tie-breaker).
    # The builtin `float` replaces the `np.float` alias removed in NumPy 1.24.
    table = np.asarray(np.c_[actual, pred, np.arange(n_rows)], dtype=float)
    # lexsort uses the LAST key as primary: prediction descending, then position
    order = np.lexsort((table[:, 2], -1 * table[:, 1]))
    table = table[order]
    total_losses = table[:, 0].sum()
    gini_sum = table[:, 0].cumsum().sum() / total_losses

    gini_sum -= (n_rows + 1) / 2.
    return gini_sum / n_rows
 
def gini_normalized(a, p):
    """Return gini(a, p) divided by gini(a, a).

    gini(a, a) is the value obtained when the outcomes `a` are ranked by
    themselves, so the ratio expresses the score of predictions `p` relative
    to that reference.
    """
    achieved = gini(a, p)
    reference = gini(a, a)
    return achieved / reference

def gini_xgb(preds, dtrain):
    """Evaluation callback returning the normalized gini as ('gini', score).

    `preds` are the model predictions and `dtrain` is the object whose
    `get_label()` supplies the matching true labels.
    """
    true_labels = dtrain.get_label()
    return 'gini', gini_normalized(true_labels, preds)

In [None]:
# Split the preprocessed frame back into its train and test parts
# (the first num_train rows are the training rows).
X = df_values[:num_train].values
y = y_train_all.values
test = df_values[num_train:]
features = df_values.columns

# Submission frame: one row per test id with a 'target' column that the
# cross-validation loop accumulates predictions into. It has to be a
# DataFrame: on a copy of the id Series, `sub['target'] = 0` would add an
# element labelled 'target' instead of creating a column.
sub = pd.DataFrame({'id': id_test, 'target': 0})

In [None]:
# Submission frame: the test ids plus a 'target' column initialised to 0.
sub = id_test.to_frame(name = 'id').assign(target = 0)

In [None]:
# xgb parameters:
# Booster settings handed to xgb.train in the cross-validation cell.

params = {
    'eta': 0.02,
    'max_depth': 4,
    'subsample': 0.9,
    'colsample_bytree': 0.9,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'silent': True,
}

In [None]:
nrounds = 200  # need to change to 2000
kfold = 2      # need to change to 5
# No random_state here: the folds are not shuffled, so they are already
# deterministic, and scikit-learn >= 0.24 raises a ValueError when
# random_state is given while shuffle is False.
skf = StratifiedKFold(n_splits = kfold)

# The test matrix is the same for every fold, so build it once.
d_test = xgb.DMatrix(test[features].values)

for i, (train_index, valid_index) in enumerate(skf.split(X, y)):
    print(' xgb kfold: {}  of  {} : '.format(i+1, kfold))
    X_train, X_valid = X[train_index], X[valid_index]
    y_train, y_valid = y[train_index], y[valid_index]
    d_train = xgb.DMatrix(X_train, y_train)
    d_valid = xgb.DMatrix(X_valid, y_valid)
    # 'valid' is listed last so that it is the set early stopping watches
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]
    xgb_model = xgb.train(params, d_train, nrounds, watchlist, early_stopping_rounds=10,
                          feval=gini_xgb, maximize=True, verbose_eval=10)
    # NOTE(review): `ntree_limit` / `best_ntree_limit` appear to have been
    # removed in XGBoost 2.0 in favour of `iteration_range` / `best_iteration`
    # - confirm against the installed version before upgrading.
    # Each fold contributes 1/kfold of the final averaged test prediction.
    sub['target'] += xgb_model.predict(d_test,
                        ntree_limit=xgb_model.best_ntree_limit+50) / (kfold)

In [None]:
# Tall single-panel figure showing the 50 most important features of the
# model trained in the last fold.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 28))
xgb.plot_importance(xgb_model, ax=ax, height=0.5, max_num_features=50)

In [None]:
# Write the averaged predictions to disk without the row index column.
sub.to_csv(path_or_buf = 'sub.csv', index = False)