# Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb

# Load Data

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
features = train.drop(['target', 'id'], axis = 1)
targets = train.target.values

# Define Gini's functions

In [4]:
def gini(y, pred):
    g = np.asarray(np.c_[y, pred, np.arange(len(y)) ], dtype=np.float)
    g = g[np.lexsort((g[:,2], -1*g[:,1]))]
    gs = g[:,0].cumsum().sum() / g[:,0].sum()
    gs -= (len(y) + 1) / 2.
    return gs / len(y)

def gini_lgb(preds, dtrain):
    y = list(dtrain.get_label())
    score = gini(y, preds) / gini(y, y)
    return 'gini', score, True

def gini_normalized(a, p):
    return gini(a, p) / gini(a, a)


def calcginiindex(array):
    array = array.flatten()
    array += 0.0000001
    array = np.sort(array)
    index = np.arange(1,array.shape[0]+1)
    n = array.shape[0]
    return ((np.sum((2*index - n - 1)*array))/(n * np.sum(array)))

# Feature Engineering

In [5]:
unwanted = train.columns[train.columns.str.startswith('ps_calc_')]

train = train.drop(unwanted, axis = 1)
test = test.drop(unwanted, axis = 1)

In [6]:
def sum_nan(row):
    return np.sum(np.where(row == -1, 1, 0))

train['sum_nan'] = train.apply(sum_nan, axis = 1)
train['high_nan'] = train['sum_nan'].apply(lambda x: 1 if x > 4 else 0)
train['ps_car_13_ps_reg_03'] = train['ps_car_13'] * train['ps_reg_03']
train['ps_reg_mult'] = train['ps_reg_01'] * train['ps_reg_02'] * train['ps_reg_03']
train['ps_bin_sum'] = train['ps_ind_06_bin'] + train['ps_ind_07_bin'] + train['ps_ind_08_bin'] + \
        train['ps_ind_09_bin'] + train['ps_ind_10_bin'] + train['ps_ind_11_bin'] + \
        train['ps_ind_12_bin'] + train['ps_ind_13_bin'] + train['ps_ind_16_bin'] + \
        train['ps_ind_17_bin'] + train['ps_ind_18_bin']
        
test['sum_nan'] = test.apply(sum_nan, axis = 1)
test['high_nan'] = test['sum_nan'].apply(lambda x: 1 if x > 4 else 0)
test['ps_car_13_ps_reg_03'] = test['ps_car_13'] * test['ps_reg_03']
test['ps_reg_mult'] = test['ps_reg_01'] * test['ps_reg_02'] * test['ps_reg_03']
test['ps_bin_sum'] = test['ps_ind_06_bin'] + test['ps_ind_07_bin'] + test['ps_ind_08_bin'] + \
        test['ps_ind_09_bin'] + test['ps_ind_10_bin'] + test['ps_ind_11_bin'] + \
        test['ps_ind_12_bin'] + test['ps_ind_13_bin'] + test['ps_ind_16_bin'] + \
        test['ps_ind_17_bin'] + test['ps_ind_18_bin']

# Data preparation

In [7]:
X = train.drop(['id', 'target'], axis = 1).values
y = train.target.values

test_id = test.id.values
test = test.drop('id', axis = 1)

In [8]:
sub = pd.DataFrame()
sub['id'] = test_id
sub['target'] = np.zeros_like(test_id)

# Stratified KFold and lightGBM params

In [19]:
kfold = 5
skf = StratifiedKFold(n_splits=kfold, random_state=42)


    # specify your configurations as a dict
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'min_sum_hessian_in_leaf': 10,
    'min_gain_to_split': 0.65,
    'poisson_max_delta_step': 1.8,
    'num_leaves': 31,
    #'learning_rate': 0.025,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.9,
    'bagging_freq': 5,
    'max_depth': 7,
    'verbose': 2
}

# Train the model
### Learning Rate Scheduling

In [20]:
sub['target'] = np.zeros_like(test_id)
sum_valid_gini = 0

for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print('[Fold %d/%d]' % (i + 1, kfold))
    X_train, X_valid = X[train_index], X[test_index]
    y_train, y_valid = y[train_index], y[test_index]
    X_test = test.values
    
    # create dataset for lightgbm
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

    print('Start training...')
    # train
    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=4000,
                    valid_sets=lgb_eval,
                    early_stopping_rounds=130,
                    feval=gini_lgb, verbose_eval=250,
                    learning_rates = lambda iter: 0.025 * (0.9975 ** iter)
                    )

    print('Save model...')
    # save model to file
    gbm.save_model('model.txt')

    print('Start predicting...')
    # predict
    p_test = gbm.predict(X_test, num_iteration=gbm.best_iteration)
    p_valid = gbm.predict(X_valid, num_iteration=gbm.best_iteration)
    sub['target'] += p_test/kfold
    sum_valid_gini += gini_normalized(y_valid, p_valid)

[Fold 1/5]
Start training...
Training until validation scores don't improve for 130 rounds.
[250]	valid_0's gini: 0.272521
[500]	valid_0's gini: 0.281131
[750]	valid_0's gini: 0.283033
[1000]	valid_0's gini: 0.283536
[1250]	valid_0's gini: 0.28375
Early stopping, best iteration is:
[1315]	valid_0's gini: 0.283796
Save model...
Start predicting...
[Fold 2/5]
Start training...
Training until validation scores don't improve for 130 rounds.
[250]	valid_0's gini: 0.269838
[500]	valid_0's gini: 0.280572
[750]	valid_0's gini: 0.282832
[1000]	valid_0's gini: 0.283478
[1250]	valid_0's gini: 0.283781
[1500]	valid_0's gini: 0.283961
[1750]	valid_0's gini: 0.284008
[2000]	valid_0's gini: 0.284057
[2250]	valid_0's gini: 0.284083
Early stopping, best iteration is:
[2336]	valid_0's gini: 0.284088
Save model...
Start predicting...
[Fold 3/5]
Start training...
Training until validation scores don't improve for 130 rounds.
[250]	valid_0's gini: 0.27017
[500]	valid_0's gini: 0.280098
[750]	valid_0's gini

# Estimated Local Gini

In [21]:
print('Gini: ', calcginiindex(sub['target'].values))
#sum_valid_gini = 0.29361 + 0.273239 + 0.291849 + 0.273031 + 0.294119 + 0.264506 + 0.292655 + 0.287502 + 0.269919 +0.283226
print('Valid gini:', sum_valid_gini/kfold)

('Gini: ', 0.25948887615338728)
('Valid gini:', 0.28352626610683168)


# Submition file

In [12]:
sub.to_csv('skfold_lightgbm'+ str(sum_valid_gini/kfold) +'.csv', index = False)