In [59]:
# Load in our libraries
import pandas as pd
import numpy as np
import re
import sklearn
import xgboost as xgb
import lightgbm as lgb
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

import warnings
warnings.filterwarnings('ignore')

# Going to use these 5 base models for the stacking
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import StratifiedKFold


def gini(y, pred):
    g = np.asarray(np.c_[y, pred, np.arange(len(y)) ], dtype=np.float)
    g = g[np.lexsort((g[:,2], -1*g[:,1]))]
    gs = g[:,0].cumsum().sum() / g[:,0].sum()
    gs -= (len(y) + 1) / 2.
    return gs / len(y)

def gini_normalized(estimator,a, p):
    
    curr = gini(p, estimator.predict_proba(a)) / gini(p, p)
    return curr 

def gini_lgb(preds, dtrain):
    y = list(dtrain.get_label())
    score = gini(y, preds) / gini(y, y)
    return 'gini', score, True

def calcginiindex(array):
    array = array.flatten()
    array += 0.0000001
    array = np.sort(array)
    index = np.arange(1,array.shape[0]+1)
    n = array.shape[0]
    return ((np.sum((2*index - n - 1)*array))/(n * np.sum(array)))

def gini_xgb(preds, dtrain):
    labels = dtrain.get_label()
    gini_score = gini_normalized(labels, preds)
    return 'gini', gini_score

# Data

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

features = train.drop(['target', 'id'], axis = 1)
targets = train.target.values

unwanted = train.columns[train.columns.str.startswith('ps_calc_')]

train = train.drop(unwanted, axis = 1)
test = test.drop(unwanted, axis = 1)

# Ensembling and stacking models
### SklearnHelper For RF, ET, AD, GB

In [4]:
# Some useful parameters which will come in handy later on
ntrain = train.shape[0]
ntest = test.shape[0]
SEED = 42 # for reproducibility

# Class to extend the Sklearn classifier
class SklearnHelper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict_proba(self, x):
        return self.clf.predict_proba(x)
    
    def fit(self,x,y):
        return self.clf.fit(x,y)
    
    def feature_importances(self,x,y):
        print(self.clf.fit(x,y).feature_importances_)
    
# Class to extend XGboost classifer

### Get predictions/new features from SklearnHelper

In [5]:
kfold = 5
skf = StratifiedKFold(n_splits=kfold, random_state=42)

def get_oof(clf, x_train, y_train, x_test):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((kfold, ntest))

    
    for i, (train_index, test_index) in enumerate(skf.split(x_train, y_train)):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]
        y_te = y_train[test_index]
        
        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict_proba(x_te)[:, 1]
        train_pred = clf.predict_proba(x_tr)[:, 1]
        oof_test_skf[i, :] = clf.predict_proba(x_test)[:, 1]
        print("Fold :", i,"Train Gini:", gini_normalized(train_pred, y_tr) ,"Valid Gini:", gini_normalized(oof_train[test_index], y_te))
        

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

# Data prep

In [6]:
X = train.drop(['id', 'target'], axis = 1).values
y = train.target.values

test_id = test.id.values
test = test.drop('id', axis = 1)

# SGDClassifier

In [11]:
best_gini = 0
sgd_params = {
    'loss': 'log',
    'penalty': 'l2',
    'alpha': 0.01,
    #'max_iter': 10,
    'n_jobs': 6,
   # 'learning_rate': 'invscaling'.
    'eta0': 0.025,
   # 'power_t': 0.998
}
sgd = SklearnHelper(clf=SGDClassifier, seed=SEED, params=sgd_params)

print('SGDClassifier')
sgd_oof_train, sgd_oof_test = get_oof(sgd,X, y, test) # Random Forest
curr_gini = calcginiindex(sgd_oof_test)
if curr_gini > best_gini:
    print('New best gini:', curr_gini)
    best_gini = curr_gini
    best_params = pd.DataFrame(sgd_params, index=[1])
else:
    print('Curr gini:', curr_gini)

SGDClassifier
('Fold :', 0, 'Train Gini:', 0.02535647019117844, 'Valid Gini:', 0.027761830977935893)
('Fold :', 1, 'Train Gini:', 0.024640211690390892, 'Valid Gini:', 0.024727967978524341)
('Fold :', 2, 'Train Gini:', 0.025038032253114323, 'Valid Gini:', 0.026111183852913156)
('Fold :', 3, 'Train Gini:', 0.027774535345091451, 'Valid Gini:', 0.025509442211765061)
('Fold :', 4, 'Train Gini:', 0.024716977178029786, 'Valid Gini:', 0.028781472736354653)
('New best gini:', 0.18478334553747211)


In [89]:
from sklearn.model_selection import RandomizedSearchCV

sgd_params = {
    'loss': ['modified_huber'], # optimized
    'penalty': ['l2'], # optimized
    'alpha': [0.0006, 0.0001, 0.001, 0.01, 0.1, 1, 10], # optimized
    'n_iter': [10, 50, 100, 250, 500],
    'n_jobs': [6],
    'learning_rate': ['invscaling', 'constant', 'optimal'],
    'eta0': [0.025, 0.020, 0.1, 0.25, 0.5, 0.01],
    'power_t': [0.1, 0.5, 0.65, 0.95, 0.998],
    'epsilon': [0.1, 0.3, 0.5, 0.01, 0.05, 0.001, 0.005]
}


rand_search = RandomizedSearchCV(SGDClassifier(n_jobs=6), param_distributions=sgd_params, 
                                 scoring=gini_normalized, cv = 5, n_iter = 350, verbose = 1)

rand_search.fit(X, y)

Fitting 5 folds for each of 350 candidates, totalling 1750 fits


[Parallel(n_jobs=1)]: Done 1750 out of 1750 | elapsed: 720.1min finished


RandomizedSearchCV(cv=5, error_score='raise',
          estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=6,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False),
          fit_params={}, iid=True, n_iter=350, n_jobs=1,
          param_distributions={'penalty': ['l2'], 'loss': ['modified_huber'], 'n_jobs': [6], 'eta0': [0.025, 0.02, 0.1, 0.25, 0.5, 0.01], 'n_iter': [10, 50, 100, 250, 500], 'alpha': [0.0006, 0.0001, 0.001, 0.01, 0.1, 1, 10], 'learning_rate': ['invscaling', 'constant', 'optimal'], 'epsilon': [0.1, 0.3, 0.5, 0.01, 0.05, 0.001, 0.005], 'power_t': [0.1, 0.5, 0.65, 0.95, 0.998]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score=True,
          scoring=<function gini_normalized at 0x000000001D758BA8>,
          verbose=1)

In [90]:
rand_search.best_params_

{'alpha': 0.0001,
 'epsilon': 0.3,
 'eta0': 0.02,
 'learning_rate': 'invscaling',
 'loss': 'modified_huber',
 'n_iter': 500,
 'n_jobs': 6,
 'penalty': 'l2',
 'power_t': 0.998}

In [91]:
calcginiindex(rand_search.predict_proba(test)[:,1])

0.99097520587585752

In [92]:
rand_search.best_score_

0.023455037851286546