#### This notebook contains the codes for data preprocessing and model training

In [1]:
from __future__ import print_function, division
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from collections import Counter
from sklearn import preprocessing, metrics
from sklearn import ensemble
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn import neighbors
from sklearn import linear_model 
from itertools import combinations
%matplotlib inline



In [2]:
train = pd.read_csv('train.csv')
test= pd.read_csv('test.csv')

In [4]:
print("{} observations and {} features".format(train.shape[0], train.shape[1] - 2)) 

114321 observations and 131 features


### Data Pre-processing

#### Select variables with higher correlations against the target variable

In [3]:
var = ['target', 'v10', 'v12', 'v14', 'v21', 'v22', 'v24', 'v30', 'v31', 'v34', 'v38', 'v40', 'v47', 'v50', 'v52', 'v56', 'v62', 
'v66', 'v72', 'v74', 'v75', 'v79', 'v107','v112', 'v113', 'v114', 'v129']


In [9]:
print('%d duplicates' % sum(train.duplicated()))

0 duplicates


In [4]:
# Select the variables with higher correlations against the target variable
train_X = train.loc[:,var]
train_y = train.target

In [5]:
cat_cols = train_X.select_dtypes(include=['O']).columns
num_cols = train_X.select_dtypes(include=['int', 'float']).columns[1:]

#### Dealing  with NaN

In [6]:
train_X.loc[:,num_cols]=train_X.loc[:,num_cols].fillna(-1) # Fill NaN with -1
train_X.loc[:,cat_cols]=train_X.loc[:,cat_cols].fillna('99') # Fill NaN with '99' for categorical columns

#### Converting some numerical variables back to integers

In [7]:
train_X.loc[:,'v10'] = round(train_X.loc[:,'v10'] / 0.02188170970971, 2)
train_X.loc[:,'v40'] = round(train_X.loc[:,'v40'] / 0.000723536569601, 2)
train_X.loc[:,'v50'] = round(train_X.loc[:,'v50'] / 0.00146724324324, 2)
train_X = round(train_X, 2)

#### Creating new variables for capturing the interactions among categorical variables

In [9]:
for c in cat_cols[1:]:
    train.loc[:, 'v22'+c] = train_X.loc[:, 'v22'] + train_X.loc[:, c]

In [8]:
for c in combinations(cat_cols, 2):
    train_X.loc[:,c[0]+c[1]] = train_X.loc[:,c[0]] + train_X.loc[:,c[1]]

In [9]:
for c in combinations(cat_cols[1:], 2):
    train_X.loc[:,'v22'+c[0]+c[1]] = train_X.loc[:,'v22'] + train_X.loc[:,c[0]] + train_X.loc[:,c[1]]

In [10]:
train_X.loc[:, 'v22'+'v40-v50'] = train_X.loc[:, 'v22'] + (train_X.loc[:,'v40']-train_X.loc[:,'v50']).map(str)

#### Creating a training set and an evaluation set

In [11]:
train_X, eval_X, train_y, eval_y = train_test_split(train_X, train_y, test_size=0.25, train_size=0.75, random_state=123)


In [8]:
# # Convert v40 and v50 back to integers by dividing a denominator and deduct v50 from v40
# train_X['v40-v50'] = round(train_X['v40']/0.000723536569601, 2) - round(train_X['v50']/0.00146724324324, 2)
# eval_X['v40-v50'] = round(eval_X['v40']/0.000723536569601, 2) - round(eval_X['v50']/0.00146724324324, 2)

### (To be removed) Estimate the target variable for the evaluation set 

In [19]:
cols_with_same_values = []
for col in categorical_cols:
    if set(train_X[col].unique()) == set(eval_X[col].unique()):
        cols_with_same_values.append(col)
cols_with_same_values

NameError: name 'categorical_cols' is not defined

In [12]:
k = 1.5
f = 0.5
pt = sum(train_X.target)/len(train_X) # the prior probability of the dependent attribute
def beta(x, t_count, k, f):
    ni = t_count[x]
    return 1 / (1 + np.exp(-(ni-k)/f))

def transform_to_scalar(x, t_mean, pt, beta):
    pi = t_mean[x]
    return beta * pi + (1 - beta) * pt

In [13]:
for col in cols_with_same_values:
    t_mean = train_X.groupby([col])['target'].mean()
    t_count = train_X.groupby([col])['target'].count()
    train_X[col] = train_X[col].apply(lambda x: transform_to_scalar(x, t_mean, pt, beta(x, t_count, k, f)))
    eval_X[col] = eval_X[col].apply(lambda x: transform_to_scalar(x, t_mean, pt, beta(x, t_count, k, f)))

In [14]:
ext = ensemble.ExtraTreesClassifier(n_estimators=1200, max_features= 8, criterion= 'entropy', min_samples_split= 2,
                            max_depth= 30, min_samples_leaf= 2, n_jobs = -1, random_state=1)    

In [15]:
cols_to_train = list(num_cols) + cols_with_same_values

In [16]:
ext.fit(train_X[cols_to_train], train_y)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='entropy',
           max_depth=30, max_features=30, max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=2,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=1200, n_jobs=-1, oob_score=False, random_state=1,
           verbose=0, warm_start=False)

In [17]:
eval_X['target'] = ext.predict(eval_X[cols_to_train]) 

### Categorical Encoding Using Target Statistics 
Transform individual value of a categorical attribute $X$ to a scalar, $S_i$, representing the estimate of probability that the target variable equals to 1 given $X=X_i$. $S_i$ is calculated using the formula: $$S_i=\beta(n_i)\frac{n_{iy}}{n_i}+(1-\beta(n_i))\frac{n_y}{n_{TR}}$$ 
where $n_y$ is total number of cases such that $y=1$ 

In [12]:
k = 1.5
f = 0.5
pt = sum(train_X.target)/len(train_X) # the prior probability of the dependent attribute
def beta(x, count, k, f):
    ni = 0 if x not in count else count[x]
    return 1 / (1 + np.exp(-(ni-k)/f))

def transform_to_scalar(x, mean, pt, beta):
    pi = pt if x not in mean else mean[x]
    return (beta * pi + (1 - beta) * pt) # * np.random.normal(1, 0.03)

def beta_(x, count, k, f):
    ni = 0 if (x[0], x[1]) not in count else count.loc[x[0], x[1]]
    return 1 / (1 + np.exp(-(ni-k)/f))

def transform_to_scalar_(x, mean, pt, beta):
    pi = pt if (x[0], x[1]) not in mean else mean.loc[x[0], x[1]]
    return (beta * pi + (1 - beta) * pt) # * np.random.normal(1, 0.03)
    

### Leave-One-Out Encoding Using Target Mean

In [13]:
cat_cols = train_X.select_dtypes(include=['O']).columns

In [14]:
n_rows = train_X.shape[0]
train_X = train_X.reset_index(drop=True)
train_y = train_y.reset_index(drop=True)

In [15]:
for col in cat_cols:
    mean = train_X.groupby([col])['target'].mean()
    count = train_X.groupby([col])['target'].count()
    eval_X.loc[:, col] = eval_X.loc[:,col].apply(lambda x: transform_to_scalar(x, mean, pt, beta(x, count, k, f)))
    for i in range(5):
        fold = range(i, n_rows, 5)
        mean = train_X.drop(fold, axis=0).groupby([col])['target'].mean()
        count = train_X.drop(fold, axis=0).groupby([col])['target'].count()
        train_X.loc[fold, col] = train_X.loc[fold,col].apply(lambda x: transform_to_scalar(x, mean, pt, beta(x, count, k, f)))  

In [16]:
train_X = train_X.drop('target', axis=1)
eval_X = eval_X.drop('target', axis=1)

### Model Training

### XGBoost

In [17]:
train_X[cat_cols] = train_X[cat_cols].astype('float')

In [18]:
XGB = XGBClassifier(learning_rate =0.01, n_estimators=2000, max_depth=8,
 min_child_weight=1.5, gamma=0, reg_alpha=0.01, subsample=0.8, colsample_bytree=0.8, objective= 'binary:logistic',
 nthread=4, scale_pos_weight=1)

In [19]:
XGB.fit(train_X, train_y, eval_metric='logloss')

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
       gamma=0, learning_rate=0.01, max_delta_step=0, max_depth=8,
       min_child_weight=1.5, missing=None, n_estimators=2000, nthread=4,
       objective='binary:logistic', reg_alpha=0.01, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.8)

In [20]:
prob_xgb = XGB.predict_proba(eval_X)[:,1]

In [21]:
prob_xgb

array([ 0.71808976,  0.38508865,  0.54427785, ...,  0.9025293 ,
        0.67543477,  0.97460437], dtype=float32)

In [22]:
print(metrics.log_loss(eval_y, prob_xgb)) #  0.443201485761 0.443116811922

0.443116811922


### Extra Trees Classifier

In [32]:
EXT = ensemble.ExtraTreesClassifier(n_estimators=1200, max_features= 8, criterion= 'entropy',min_samples_split= 2,
                            max_depth= 30, min_samples_leaf= 2, n_jobs = -1,random_state=1)    

In [33]:
EXT.fit(train_X, train_y)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='entropy',
           max_depth=30, max_features=8, max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=2,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=1200, n_jobs=-1, oob_score=False, random_state=1,
           verbose=0, warm_start=False)

In [34]:
prob_ext = EXT.predict_proba(eval_X)[:,1]

In [35]:
print(metrics.log_loss(eval_y, prob_ext))

0.470208956498


### Stacking

In [88]:
np.random.seed(0)

In [89]:
clfs = [XGBClassifier(learning_rate =0.01, n_estimators=2000, max_depth=8, min_child_weight=1.5, gamma=0, reg_alpha=0.01,
                      subsample=0.8, colsample_bytree=0.8, objective= 'binary:logistic', nthread=4, scale_pos_weight=1),
        ensemble.ExtraTreesClassifier(n_estimators=1000, max_features= 8, criterion= 'entropy',min_samples_split= 4,
                                      max_depth= 35, min_samples_leaf= 2, n_jobs = -1)]


In [90]:
print("Creating train and test sets for stacking.")
train_stack = np.zeros((train_X.shape[0], len(clfs)))
test_stack = np.zeros((eval_X.shape[0], len(clfs)))


Creating train and test sets for stacking.


In [91]:
sfk = StratifiedKFold(n_splits=5) 
for j, clf in enumerate(clfs):
    print(j, clf)
    test_stack_j = np.zeros((eval_X.shape[0], 5))
    folds = sfk.split(train_X, train_y)
    for i, (train_idx, test_idx) in enumerate(folds):
        print("Fold", i)
        X_train = train_X.iloc[train_idx,:]
        y_train = train_y[train_idx]
        X_test = train_X.iloc[test_idx,:]
        y_test = train_y[test_idx]
        if j == 0: 
            clf.fit(X_train, y_train, eval_metric='logloss')
        else:
            clf.fit(X_train, y_train)
        y_prob = clf.predict_proba(X_test)[:,1]
        train_stack[test_idx, j] = y_prob
        test_stack_j[:, i] = clf.predict_proba(eval_X)[:,1]
    test_stack[:,j] = test_stack_j.mean(1)

0 XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
       gamma=0, learning_rate=0.01, max_delta_step=0, max_depth=8,
       min_child_weight=1.5, missing=None, n_estimators=2000, nthread=4,
       objective='binary:logistic', reg_alpha=0.01, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.8)
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4
1 ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='entropy',
           max_depth=35, max_features=8, max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=2,
           min_samples_split=4, min_weight_fraction_leaf=0.0,
           n_estimators=1000, n_jobs=-1, oob_score=False,
           random_state=None, verbose=0, warm_start=False)
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4


In [92]:
print("Stacking...")
#clf = linear_model.LogisticRegression()
clf = XGBClassifier(learning_rate =0.1, n_estimators=150, max_depth=8,
                    min_child_weight=1.5, gamma=0, reg_alpha=0.01, subsample=0.8, 
                    colsample_bytree=0.8, objective= 'binary:logistic',
                    nthread=4, scale_pos_weight=1)
clf.fit(train_stack, train_y, eval_metric='logloss')
prob = clf.predict_proba(test_stack)[:,1]

Stacking...


In [93]:
print(metrics.log_loss(eval_y, prob))

0.468157543116
