# Imports

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb

# Load Data

In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [4]:
features = train.drop(['target', 'id'], axis = 1)
targets = train.target.values

# Define Gini's functions

In [15]:
def gini(y, pred):
    g = np.asarray(np.c_[y, pred, np.arange(len(y)) ], dtype=np.float)
    g = g[np.lexsort((g[:,2], -1*g[:,1]))]
    gs = g[:,0].cumsum().sum() / g[:,0].sum()
    gs -= (len(y) + 1) / 2.
    return gs / len(y)

def gini_lgb(preds, dtrain):
    y = list(dtrain.get_label())
    score = gini(y, preds) / gini(y, y)
    return 'gini', score, True

def gini_normalized(a, p):
    return gini(a, p) / gini(a, a)


def calcginiindex(array):
    array = array.flatten()
    array += 0.0000001
    array = np.sort(array)
    index = np.arange(1,array.shape[0]+1)
    n = array.shape[0]
    return ((np.sum((2*index - n - 1)*array))/(n * np.sum(array)))

# Feature Engineering

In [6]:
unwanted = train.columns[train.columns.str.startswith('ps_calc_')]

train = train.drop(unwanted, axis = 1)
test = test.drop(unwanted, axis = 1)

# Data preparation

In [7]:
X = train.drop(['id', 'target'], axis = 1).values
y = train.target.values

test_id = test.id.values
test = test.drop('id', axis = 1)

In [8]:
sub = pd.DataFrame()
sub['id'] = test_id
sub['target'] = np.zeros_like(test_id)

# Stratified KFold and lightGBM params

In [9]:
kfold = 5
skf = StratifiedKFold(n_splits=kfold, random_state=42)


    # specify your configurations as a dict
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'min_sum_hessian_in_leaf': 10,
    'min_gain_to_split': 0.65,
    'poisson_max_delta_step': 0.8,
    'num_leaves': 31,
    #'learning_rate': 0.025,
    'feature_fraction': 0.4,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'max_depth': 10,
    'verbose': 2
}

# Train the model
### Learning Rate Scheduling

In [16]:
sub['target'] = np.zeros_like(test_id)
sum_valid_gini = 0

for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print('[Fold %d/%d]' % (i + 1, kfold))
    X_train, X_valid = X[train_index], X[test_index]
    y_train, y_valid = y[train_index], y[test_index]
    X_test = test.values
    
    # create dataset for lightgbm
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

    print('Start training...')
    # train
    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=4000,
                    valid_sets=lgb_eval,
                    early_stopping_rounds=130,
                    feval=gini_lgb, verbose_eval=250,
                    learning_rates = lambda iter: 0.025 * (0.999 ** iter)
                    )

    print('Save model...')
    # save model to file
    gbm.save_model('model.txt')

    print('Start predicting...')
    # predict
    p_test = gbm.predict(X_test, num_iteration=gbm.best_iteration)
    p_valid = gbm.predict(X_valid, num_iteration=gbm.best_iteration)
    sub['target'] += p_test/kfold
    sum_valid_gini += gini_normalized(y_valid, p_valid)

[Fold 1/5]
Start training...
Training until validation scores don't improve for 130 rounds.
[250]	valid_0's gini: 0.266366
[500]	valid_0's gini: 0.276455
[750]	valid_0's gini: 0.279409
[1000]	valid_0's gini: 0.280615
Early stopping, best iteration is:
[961]	valid_0's gini: 0.280695
Save model...
Start predicting...
[Fold 2/5]
Start training...
Training until validation scores don't improve for 130 rounds.
[250]	valid_0's gini: 0.271944
[500]	valid_0's gini: 0.280299
[750]	valid_0's gini: 0.282177
[1000]	valid_0's gini: 0.282997
Early stopping, best iteration is:
[1004]	valid_0's gini: 0.283013
Save model...
Start predicting...
[Fold 3/5]
Start training...
Training until validation scores don't improve for 130 rounds.
[250]	valid_0's gini: 0.267982
[500]	valid_0's gini: 0.277694
[750]	valid_0's gini: 0.279391
[1000]	valid_0's gini: 0.279634
Early stopping, best iteration is:
[892]	valid_0's gini: 0.279711
Save model...
Start predicting...
[Fold 4/5]
Start training...
Training until vali

# Estimated Local Gini

In [17]:
print('Gini: ', calcginiindex(sub['target'].values))
#sum_valid_gini = 0.29361 + 0.273239 + 0.291849 + 0.273031 + 0.294119 + 0.264506 + 0.292655 + 0.287502 + 0.269919 +0.283226
print('Valid gini:', sum_valid_gini/kfold)

('Gini: ', 0.26466119935060028)
('Valid gini:', 0.28080992782327074)


# Submition file

In [None]:
sub.to_csv('skfold_lightgbm'+ str(sum_valid_gini/kfold) +'.csv', index = False)