# Imports

In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import StratifiedKFold
import xgboost as xgb

# Load Data

In [8]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [9]:
features = train.drop(['target', 'id'], axis = 1)
targets = train.target.values

# Define Gini's functions

In [10]:
# Define the gini metric - from https://www.kaggle.com/c/ClaimPredictionChallenge/discussion/703#5897
def gini(actual, pred, cmpcol = 0, sortcol = 1):
    assert( len(actual) == len(pred) )
    all = np.asarray(np.c_[ actual, pred, np.arange(len(actual)) ], dtype=np.float)
    all = all[ np.lexsort((all[:,2], -1*all[:,1])) ]
    totalLosses = all[:,0].sum()
    giniSum = all[:,0].cumsum().sum() / totalLosses
    
    giniSum -= (len(actual) + 1) / 2.
    return giniSum / len(actual)
 
def gini_normalized(a, p):
    return gini(a, p) / gini(a, a)

def gini_xgb(preds, dtrain):
    labels = dtrain.get_label()
    gini_score = gini_normalized(labels, preds)
    return 'gini', gini_score

def calcginiindex(array):
    array = array.flatten()
    array += 0.0000001
    array = np.sort(array)
    index = np.arange(1,array.shape[0]+1)
    n = array.shape[0]
    return ((np.sum((2*index - n - 1)*array))/(n * np.sum(array)))

# Feature Engineering

In [11]:
unwanted = train.columns[train.columns.str.startswith('ps_calc_')]

train = train.drop(unwanted, axis = 1)
test = test.drop(unwanted, axis = 1)

In [12]:
def sum_nan(row):
    return np.sum(np.where(row == -1, 1, 0))

train['sum_nan'] = train.apply(sum_nan, axis = 1)
train['high_nan'] = train['sum_nan'].apply(lambda x: 1 if x > 4 else 0)
train['ps_car_13_ps_reg_03'] = train['ps_car_13'] * train['ps_reg_03']
train['ps_reg_mult'] = train['ps_reg_01'] * train['ps_reg_02'] * train['ps_reg_03']
train['ps_bin_sum'] = train['ps_ind_06_bin'] + train['ps_ind_07_bin'] + train['ps_ind_08_bin'] + \
        train['ps_ind_09_bin'] + train['ps_ind_10_bin'] + train['ps_ind_11_bin'] + \
        train['ps_ind_12_bin'] + train['ps_ind_13_bin'] + train['ps_ind_16_bin'] + \
        train['ps_ind_17_bin'] + train['ps_ind_18_bin']
        
test['sum_nan'] = test.apply(sum_nan, axis = 1)
test['high_nan'] = test['sum_nan'].apply(lambda x: 1 if x > 4 else 0)
test['ps_car_13_ps_reg_03'] = test['ps_car_13'] * test['ps_reg_03']
test['ps_reg_mult'] = test['ps_reg_01'] * test['ps_reg_02'] * test['ps_reg_03']
test['ps_bin_sum'] = test['ps_ind_06_bin'] + test['ps_ind_07_bin'] + test['ps_ind_08_bin'] + \
        test['ps_ind_09_bin'] + test['ps_ind_10_bin'] + test['ps_ind_11_bin'] + \
        test['ps_ind_12_bin'] + test['ps_ind_13_bin'] + test['ps_ind_16_bin'] + \
        test['ps_ind_17_bin'] + test['ps_ind_18_bin']

# Data preparation

In [13]:
X = train.drop(['id', 'target'], axis = 1).values
y = train.target.values

test_id = test.id.values
test = test.drop('id', axis = 1)

In [14]:
sub = pd.DataFrame()
sub['id'] = test_id
sub['target'] = np.zeros_like(test_id)

# Stratified KFold and XGBoost params

In [None]:
kfold = 5
skf = StratifiedKFold(n_splits=kfold, random_state=42)

params = {
    'min_child_weight': 10.0,
    'objective': 'binary:logistic',
    'max_depth': 8,
    'max_delta_step': 2.8,
    'colsample_bytree': 0.4,
    'subsample': 0.8,
    'eta': 0.025,
    'gamma': 0.65,
    'num_boost_round' : 700
    }

# Train the model

In [None]:
sum_valid_gini = 0

for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print('[Fold %d/%d]' % (i + 1, kfold))
    X_train, X_valid = X[train_index], X[test_index]
    y_train, y_valid = y[train_index], y[test_index]
    # Convert our data into XGBoost format
    d_train = xgb.DMatrix(X_train, y_train)
    d_valid = xgb.DMatrix(X_valid, y_valid)
    d_test = xgb.DMatrix(test.values)
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]

    # Train the model! We pass in a max of 2,000 rounds (with early stopping after 100)
    # and the custom metric (maximize=True tells xgb that higher metric is better)
    mdl = xgb.train(params, d_train, 4000, watchlist, early_stopping_rounds=130, feval=gini_xgb, maximize=True, verbose_eval=250,
                   learning_rates = lambda iter, ed: 0.025 * (0.998 ** iter)
                   )

    print('[Fold %d/%d Prediciton:]' % (i + 1, kfold))
    # Predict on our test data
    p_test = mdl.predict(d_test)
    p_valid = mdl.predict(d_valid)
    sub['target'] += p_test/kfold
    sum_valid_gini += gini_normalized(y_valid, p_valid)

[Fold 1/5]
[0]	train-gini:0.184465	valid-gini:0.173283
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 130 rounds.


# Estimated Local Gini

In [None]:
print('EstimatedGini: ', calcginiindex(sub['target'].values))
#sum_valid_gini = 0.279847 + 0.288112 + 0.282968 + 0.291635 + 0.27727
print('Valid gini:', sum_valid_gini/kfold)

# Submition File

In [20]:
sub.to_csv('skfold_dropuseless_newfeats'+ str(sum_valid_gini/kfold) +'.csv', index = False)