### This notebook is for submitting predictions to kaggle

In [1]:
from __future__ import print_function, division
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from collections import Counter
from sklearn import preprocessing, metrics
from sklearn import ensemble
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn import neighbors
from sklearn import linear_model 
from itertools import combinations
%matplotlib inline



In [2]:
train = pd.read_csv('train.csv')
test= pd.read_csv('test.csv')

In [4]:
print("{} observations and {} features".format(train.shape[0], train.shape[1] - 2)) 

114321 observations and 131 features


### Data Pre-processing

#### Select variables with higher correlations against the target variable

In [3]:
var = ['target', 'v10', 'v12', 'v14', 'v21', 'v22', 'v24', 'v30', 'v31', 'v34', 'v38', 'v40', 'v47', 'v50', 'v52', 'v56', 'v62', 
'v66', 'v72', 'v74', 'v75', 'v79', 'v107','v112', 'v113', 'v114', 'v129']

In [9]:
print('%d duplicates' % sum(train.duplicated()))

0 duplicates


In [4]:
# Select the variables with higher correlations against the target variable
train_X = train.loc[:,var]
test_X = test.loc[:,var[1:]]
train_y = train.target

In [6]:
cat_cols = train_X.select_dtypes(include=['O']).columns
num_cols = train_X.select_dtypes(include=['int', 'float']).columns[1:]

#### Dealing  with null values

In [7]:
train_X.loc[:,num_cols]=train_X.loc[:,num_cols].fillna(-1) # Fill NaN with -1
train_X.loc[:,cat_cols]=train_X.loc[:,cat_cols].fillna('99') # Fill NaN with '99' for categorical columns
test_X.loc[:,num_cols]=test_X.loc[:,num_cols].fillna(-1) 
test_X.loc[:,cat_cols]=test_X.loc[:,cat_cols].fillna('99') 

#### Converting some numerical variables back to integers rounding numerical features

In [8]:
train_X.loc[:,'v10'] = round(train_X.loc[:,'v10'] / 0.02188170970971, 2)
train_X.loc[:,'v40'] = round(train_X.loc[:,'v40'] / 0.000723536569601, 2)
train_X.loc[:,'v50'] = round(train_X.loc[:,'v50'] / 0.00146724324324, 2)
test_X.loc[:,'v10'] = round(test_X.loc[:,'v10'] / 0.02188170970971, 2)
test_X.loc[:,'v40'] = round(test_X.loc[:,'v40'] / 0.000723536569601, 2)
test_X.loc[:,'v50'] = round(test_X.loc[:,'v50'] / 0.00146724324324, 2)

train_X = round(train_X, 2)
test_X = round(test_X, 2)

#### Creating new variables for capturing the interactions among categorical variables

In [9]:
# for c in cat_cols[1:]:
#     train.loc[:, 'v22'+c] = train_X.loc[:, 'v22'] + train_X.loc[:, c]
#     test.loc[:, 'v22'+c] = test_X.loc[:, 'v22'] + test_X.loc[:, c]

In [9]:
for c in combinations(cat_cols, 2):
    train_X.loc[:,c[0]+c[1]] = train_X.loc[:,c[0]] + train_X.loc[:,c[1]]
    test_X.loc[:,c[0]+c[1]] = test_X.loc[:,c[0]] + test_X.loc[:,c[1]]

In [10]:
for c in combinations(cat_cols[1:], 2):
    train_X.loc[:,'v22'+c[0]+c[1]] = train_X.loc[:,'v22'] + train_X.loc[:,c[0]] + train_X.loc[:,c[1]]
    test_X.loc[:,'v22'+c[0]+c[1]] = test_X.loc[:,'v22'] + test_X.loc[:,c[0]] + test_X.loc[:,c[1]]

In [11]:
train_X.loc[:, 'v22'+'v40-v50'] = train_X.loc[:, 'v22'] + (train_X.loc[:,'v40']-train_X.loc[:,'v50']).map(str)
test_X.loc[:, 'v22'+'v40-v50'] = test_X.loc[:, 'v22'] + (test_X.loc[:,'v40']-test_X.loc[:,'v50']).map(str)

### Categorical Encoding Using Target Statistics 
Transform individual value of a categorical attribute $X$ to a scalar, $S_i$, representing the estimate of probability that the target variable equals to 1 given $X=X_i$. $S_i$ is calculated using the formula: $$S_i=\beta(n_i)\frac{n_{iy}}{n_i}+(1-\beta(n_i))\frac{n_y}{n_{TR}}$$ 
where $n_y$ is total number of cases such that $y=1$ 

In [12]:
k = 1.5
f = 0.5
pt = sum(train_X.target)/len(train_X) # the prior probability of the dependent attribute
def beta(x, count, k, f):
    ni = 0 if x not in count else count[x]
    return 1 / (1 + np.exp(-(ni-k)/f))

def transform_to_scalar(x, mean, pt, beta):
    pi = pt if x not in mean else mean[x]
    return (beta * pi + (1 - beta) * pt) # * np.random.normal(1, 0.03)

#### Leave-One-Out Encoding Using Target Mean

In [13]:
cat_cols = train_X.select_dtypes(include=['O']).columns

In [None]:
n_rows = train_X.shape[0]
train_X = train_X.reset_index(drop=True)
train_y = train_y.reset_index(drop=True)

In [None]:
for col in cat_cols:
    mean = train_X.groupby([col])['target'].mean()
    count = train_X.groupby([col])['target'].count()
    test_X.loc[:, col] = test_X.loc[:,col].apply(lambda x: transform_to_scalar(x, mean, pt, beta(x, count, k, f)))
    for i in range(4):
        fold = range(i, n_rows, 4)
        mean = train_X.drop(fold, axis=0).groupby([col])['target'].mean()
        count = train_X.drop(fold, axis=0).groupby([col])['target'].count()
        train_X.loc[fold, col] = train_X.loc[fold,col].apply(lambda x: transform_to_scalar(x, mean, pt, beta(x, count, k, f)))  

In [None]:
train_X = train_X.drop('target', axis=1)

### Model Training

### XGBoost

In [18]:
train_X[cat_cols] = train_X[cat_cols].astype('float')
test_X[cat_cols] = test_X[cat_cols].astype('float')

In [19]:
XGB = XGBClassifier(learning_rate =0.01, n_estimators=2000, max_depth=8,
 min_child_weight=1.5, gamma=0, reg_alpha=0.01, subsample=0.8, colsample_bytree=0.8, objective= 'binary:logistic',
 nthread=4, scale_pos_weight=1)

In [20]:
XGB.fit(train_X, train_y, eval_metric='logloss')

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
       gamma=0, learning_rate=0.01, max_delta_step=0, max_depth=8,
       min_child_weight=1.5, missing=None, n_estimators=2000, nthread=4,
       objective='binary:logistic', reg_alpha=0.01, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.8)

In [26]:
prob_xgb = XGB.predict_proba(test_X)[:,1]; prob_xgb

array([ 0.21000093,  0.96492577,  0.86594748, ...,  0.88996011,
        0.93076468,  0.48586789], dtype=float32)

In [27]:
prob_xgb

array([ 0.21000093,  0.96492577,  0.86594748, ...,  0.88996011,
        0.93076468,  0.48586789], dtype=float32)

### Stacking XGBoost and ExtraTreesClassifier for generalizing the model

In [88]:
np.random.seed(0)

In [89]:
clfs = [XGBClassifier(learning_rate =0.05, n_estimators=1000, max_depth=6, min_child_weight=1.5, gamma=0, reg_alpha=0.01,
                      subsample=0.8, colsample_bytree=0.8, objective= 'binary:logistic', nthread=4, scale_pos_weight=1),
        XGBClassifier(learning_rate =0.01, n_estimators=2000, max_depth=8, min_child_weight=1.5, gamma=0, reg_alpha=0.01,
                      subsample=0.8, colsample_bytree=0.8, objective= 'binary:logistic', nthread=4, scale_pos_weight=1),
        ensemble.ExtraTreesClassifier(n_estimators=1000, max_features= 6, criterion= 'entropy',min_samples_split= 4,
                                      max_depth= 25, min_samples_leaf= 2, n_jobs = -1),
        ensemble.ExtraTreesClassifier(n_estimators=1200, max_features= 8, criterion= 'entropy',min_samples_split= 4,
                                      max_depth= 35, min_samples_leaf= 2, n_jobs = -1)]


In [90]:
# Creating train and test sets for stacking
train_stack = np.zeros((train_X.shape[0], len(clfs)))
test_stack = np.zeros((test_X.shape[0], len(clfs)))


Creating train and test sets for stacking.


In [91]:
sfk = StratifiedKFold(n_splits=5) 
for j, clf in enumerate(clfs):
    print(j+1, clf)
    test_stack_j = np.zeros((test_X.shape[0], 5))
    folds = sfk.split(train_X, train_y)
    for i, (train_idx, test_idx) in enumerate(folds):
        print("Fold", i+1)
        X_train = train_X.iloc[train_idx,:]
        y_train = train_y[train_idx]
        X_test = train_X.iloc[test_idx,:]
        y_test = train_y[test_idx]
        if j == 0: 
            clf.fit(X_train, y_train, eval_metric='logloss')
        else:
            clf.fit(X_train, y_train)
        y_prob = clf.predict_proba(X_test)[:,1]
        train_stack[test_idx, j] = y_prob
        test_stack_j[:, i] = clf.predict_proba(test_X)[:,1]
    test_stack[:,j] = test_stack_j.mean(1)

0 XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
       gamma=0, learning_rate=0.01, max_delta_step=0, max_depth=8,
       min_child_weight=1.5, missing=None, n_estimators=2000, nthread=4,
       objective='binary:logistic', reg_alpha=0.01, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.8)
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4
1 ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='entropy',
           max_depth=35, max_features=8, max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=2,
           min_samples_split=4, min_weight_fraction_leaf=0.0,
           n_estimators=1000, n_jobs=-1, oob_score=False,
           random_state=None, verbose=0, warm_start=False)
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4


In [92]:
print("Stacking...")
#clf = linear_model.LogisticRegression()
clf = XGBClassifier(learning_rate =0.1, n_estimators=150, max_depth=8,
                    min_child_weight=1.5, gamma=0, reg_alpha=0.01, subsample=0.8, 
                    colsample_bytree=0.8, objective= 'binary:logistic',
                    nthread=4, scale_pos_weight=1)
clf.fit(train_stack, train_y, eval_metric='logloss')
prob = clf.predict_proba(test_stack)[:,1]

Stacking...


In [28]:
submission = pd.DataFrame({'ID':test.ID, 'PredictedProb':prob_xgb})

In [None]:
submission = pd.DataFrame({'ID':test.ID, 'PredictedProb':prob})

In [29]:
submission.to_csv('submission.csv', index = False)

### References:
1. A Preprocessing Scheme for High-Cardinality Categorical Attributes in Classification and Prediction Problems 