This notebook contains the code for data cleaning and model training.

In [48]:
from __future__ import print_function, division
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from collections import Counter
from sklearn import preprocessing, metrics,cross_validation
from sklearn import ensemble
from sklearn import linear_model
from sklearn.grid_search import GridSearchCV 
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import StratifiedKFold
from sklearn import neighbors
from sklearn import svm
from sklearn import linear_model 
%matplotlib inline

In [3]:
# Read the raw training and test tables from the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [4]:
# Report the size of the training table; two columns are not counted as
# features (presumably ID and target -- confirm).
n_rows, n_columns = train.shape
print("{} observations and {} features".format(n_rows, n_columns - 2))

114321 observations and 131 features


### Data Cleaning

In [10]:
# Count the rows of the training table that exactly duplicate an earlier row
train.duplicated().sum()

0

In [74]:
# Drop the row identifier plus v34, v75, v107 and v129.
# NOTE(review): 'target' is NOT dropped here, so train_X still carries the
# label column alongside the features -- confirm this is intended.
columns_to_drop = ['ID', 'v34', 'v75', 'v107', 'v129']
train_X = train.drop(columns_to_drop, axis=1)
train_y = train['target']

In [75]:
# Impute missing values with the per-column mean. train_X also holds
# object (string) columns; numeric_only=True keeps the mean restricted to
# numeric columns instead of relying on pandas silently skipping the others
# (recent pandas raises TypeError in that case). Non-numeric columns are
# therefore left with their NaNs, exactly as before.
train_X = train_X.fillna(train_X.mean(numeric_only=True))

In [23]:
# Remove the same identifier / feature columns from the test table as were
# removed from the training table.
test_columns_to_drop = ['ID', 'v34', 'v75', 'v107', 'v129']
test_X = test.drop(test_columns_to_drop, axis=1)

In [30]:
# Split the column labels of train_X by dtype: object columns are treated as
# categorical, int/float columns as numerical.
object_part = train_X.select_dtypes(include=['O'])
number_part = train_X.select_dtypes(include=['int', 'float'])
categorical_cols = object_part.columns
numerical_cols = number_part.columns

In [73]:
def beta(n, k, f):
    """Return the logistic weight 1 / (1 + exp(-(n - k) / f)).

    The value is 0.5 when n == k and moves towards 1 as n grows past k;
    f scales how quickly the transition happens.
    """
    scaled_offset = (n - k) / f
    return 1 / (1 + np.exp(-scaled_offset))
beta(1000, 500, 200)

0.92414181997875655

In [76]:
# Preview only the first rows rather than rendering the whole frame.
train_X.head(10)

Unnamed: 0,target,v1,v2,v3,v4,v5,v6,v7,v8,v9,...,v121,v122,v123,v124,v125,v126,v127,v128,v130,v131
0,1,1.335739e+00,8.727474,C,3.921026,7.915266,2.599278,3.176895,0.012941,9.999999,...,0.803572,8.000000,1.989780,3.575369e-02,AU,1.804126,3.113719,2.024285,0.636365,2.857144e+00
1,1,1.630686e+00,7.464411,C,4.145098,9.191265,2.436402,2.483921,2.301630,9.031859,...,2.737596,6.822439,3.549938,5.988956e-01,AF,1.672658,3.239542,1.957825,1.925763,1.739389e+00
2,1,9.438769e-01,5.310079,C,4.410969,5.326159,3.979592,3.928571,0.019645,12.666667,...,2.238806,9.333333,2.477596,1.345191e-02,AE,1.773709,3.922193,1.120468,0.883118,1.176472e+00
3,1,7.974146e-01,8.304757,C,4.225930,11.627438,2.097700,1.987549,0.171947,8.965516,...,1.956521,7.018256,1.812795,2.267384e-03,CJ,1.415230,2.954381,1.990847,1.677108,1.034483e+00
4,1,1.630686e+00,7.464411,C,4.145098,8.742359,2.436402,2.483921,1.496569,9.031859,...,2.737596,6.822439,3.549938,9.198120e-01,Z,1.672658,3.239542,2.030373,1.925763,1.739389e+00
5,0,1.630686e+00,7.464411,C,4.145098,8.856791,2.436402,2.483921,0.359993,9.031859,...,2.737596,6.822439,3.549938,4.986116e-02,X,1.672658,3.239542,1.536222,1.925763,1.739389e+00
6,0,8.998057e-01,7.312995,C,3.494148,9.946200,1.926070,1.770427,0.066251,5.011287,...,2.232558,3.476299,1.992594,8.375832e-02,BJ,3.276100,1.623298,2.266575,2.263736,9.708730e-01
7,1,1.630686e+00,7.464411,C,4.145098,8.742359,2.436402,2.483921,1.496569,9.031859,...,2.737596,6.822439,3.549938,9.198120e-01,BY,1.672658,3.239542,2.030373,1.925763,1.739389e+00
8,0,2.078651e+00,8.462619,,3.739030,5.265636,1.573033,2.303371,0.015869,11.111111,...,1.276595,8.148148,1.875560,1.865950e-02,S,1.159637,5.582865,1.105283,1.170731,3.333334e+00
9,1,1.144802e+00,5.880606,C,3.244469,9.538384,2.500001,1.559405,0.412610,9.977529,...,2.715964,7.325843,4.896617,8.943653e-03,E,1.344550,1.601176,1.928009,3.174603,1.000000e+00


In [17]:
# Group the training rows by v22 together with the row-wise difference v40 - v50.
v40_minus_v50 = train.v40 - train.v50
grouped = train.groupby(['v22', v40_minus_v50])

In [3]:
# Columns to exclude before modelling (list based on a Kaggle script)
removal = [
    'ID', 'target',
    'v8', 'v23', 'v25', 'v31', 'v36', 'v37', 'v46', 'v51',
    'v53', 'v54', 'v63', 'v73', 'v75', 'v79', 'v81', 'v82',
    'v89', 'v92', 'v95', 'v105', 'v107', 'v108', 'v109', 'v110',
    'v116', 'v117', 'v118', 'v119', 'v123', 'v124', 'v128',
]

In [4]:
# Keep the test-set IDs; they are needed later to build the submission frame.
test_id = test['ID']

In [5]:
# Label vector for all training rows
Y_train = train['target']

In [6]:
# Training features: drop every column listed in `removal`. 'target' is
# always dropped explicitly, because a later cell removes it from `removal`;
# re-running this cell after that one would otherwise leave the label in
# the feature matrix.
X_train = train.drop([col for col in removal if col != 'target'] + ['target'], axis=1)

In [7]:
# Take 'target' out of the drop list before the list is applied to `test`
# (presumably because the test table has no target column -- confirm).
# Guarded so that re-running the cell does not raise ValueError once
# 'target' has already been removed.
if 'target' in removal:
    removal.remove('target')

In [8]:
# Test features: drop the columns currently listed in `removal`.
X_test = test.drop(labels=removal, axis=1)

In [9]:
# categorical columns
cat = [
    'v3', 'v22', 'v24', 'v30', 'v47', 'v52', 'v56',
    'v66', 'v71', 'v74', 'v91', 'v112', 'v113', 'v125',
]
# numerical columns: every column of X_train that is not listed in `cat`
num = [col for col in X_train.columns.tolist() if col not in cat]

In [10]:
# Replace missing values in the categorical columns with a single-space
# placeholder string. The result is assigned back to the frame: calling an
# in-place method on the temporary `frame[c]` selection is not guaranteed to
# modify the frame (it does not under pandas copy-on-write), and a regex
# replace is not needed to substitute NaN.
for c in cat:
    X_train[c] = X_train[c].fillna(' ')
    X_test[c] = X_test[c].fillna(' ')

In [11]:
# Fill every remaining missing value with the sentinel -1, in place, in both
# feature tables.
for feature_frame in (X_train, X_test):
    feature_frame.fillna(-1, inplace=True)

In [12]:
# Label-encode each categorical column. The encoder is fitted on the values
# of the train and test column together, so both frames share one mapping.
for col in cat:
    combined_values = list(X_train[col]) + list(X_test[col])
    encoder = preprocessing.LabelEncoder().fit(combined_values)
    X_train[col] = encoder.transform(X_train[col])
    X_test[col] = encoder.transform(X_test[col])

### Feature Selection

#### Randomized Logistic Regression  

In [106]:
# Randomized logistic regression used as a feature selector.
randomized_logistic = linear_model.RandomizedLogisticRegression(
    n_jobs=1,
    n_resampling=200,
    selection_threshold=0.000001,
    random_state=123,
)

In [107]:
# Fit the selector on the full training features and labels.
randomized_logistic.fit(X=X_train, y=Y_train)

RandomizedLogisticRegression(C=1, fit_intercept=True,
               memory=Memory(cachedir=None), n_jobs=1, n_resampling=200,
               normalize=True, pre_dispatch='3*n_jobs', random_state=123,
               sample_fraction=0.75, scaling=0.5,
               selection_threshold=1e-06, tol=0.001, verbose=False)

In [108]:
# Boolean support mask from the fitted selector, then the matching column names.
idx = randomized_logistic.get_support(indices=False)
all_column_names = X_train.columns.values
selected_features1 = all_column_names[idx]  # 32 variables

#### Random Forest

In [110]:
# Random forest whose feature importances drive the second selection pass.
rf = ensemble.RandomForestClassifier(
    n_estimators=50,
    criterion='entropy',
    n_jobs=-1,
    random_state=123,
)

In [111]:
rf.fit(X_train, Y_train)
print ("Features sorted by their score:")
# (rounded importance, column name) pairs, highest importance first. The
# ranking is only stored in `importance`; this cell does not print it.
importance = sorted(
    ((round(score, 8), name)
     for score, name in zip(rf.feature_importances_, X_train.columns.values)),
    reverse=True,
)

Features sorted by their score:


In [112]:
# keep the features whose importance is greater than 0.005
selected_features2 = [name for score, name in importance if score > 0.005]  # 90 variables

In [114]:
# Append the forest-selected features that the randomized logistic
# regression did not already pick, keeping the forest's ordering.
already_selected = set(selected_features1)
more_features = [name for name in selected_features2 if name not in already_selected]
selected_features = list(selected_features1) + more_features

In [115]:
# Show the combined feature list as an array (93 entries)
np.asarray(selected_features)

array(['v3', 'v10', 'v12', 'v14', 'v21', 'v24', 'v33', 'v34', 'v38', 'v39',
       'v40', 'v47', 'v50', 'v55', 'v56', 'v58', 'v62', 'v66', 'v71',
       'v72', 'v74', 'v83', 'v85', 'v88', 'v100', 'v111', 'v112', 'v113',
       'v114', 'v121', 'v129', 'v130', 'v22', 'v125', 'v52', 'v91', 'v30',
       'v98', 'v5', 'v87', 'v70', 'v6', 'v120', 'v1', 'v131', 'v28', 'v57',
       'v16', 'v99', 'v102', 'v78', 'v69', 'v126', 'v18', 'v45', 'v90',
       'v127', 'v115', 'v35', 'v68', 'v122', 'v27', 'v2', 'v103', 'v11',
       'v104', 'v97', 'v4', 'v9', 'v7', 'v43', 'v20', 'v80', 'v19', 'v86',
       'v84', 'v15', 'v42', 'v101', 'v13', 'v44', 'v26', 'v32', 'v60',
       'v49', 'v94', 'v77', 'v59', 'v93', 'v65', 'v61', 'v76', 'v67'], 
      dtype='<U4')

In [13]:
# Hard-coded copy of the selected feature names, so the selection cells do
# not have to be re-run to obtain the list.
selected_features = [
    'v3', 'v10', 'v12', 'v14', 'v21', 'v24', 'v33', 'v34',
    'v38', 'v39', 'v40', 'v47', 'v50', 'v55', 'v56', 'v58',
    'v62', 'v66', 'v71', 'v72', 'v74', 'v83', 'v85', 'v88',
    'v100', 'v111', 'v112', 'v113', 'v114', 'v121', 'v129', 'v130',
    'v22', 'v125', 'v52', 'v91', 'v30', 'v98', 'v5', 'v87',
    'v70', 'v6', 'v120', 'v1', 'v131', 'v28', 'v57', 'v16',
    'v99', 'v102', 'v78', 'v69', 'v126', 'v18', 'v45', 'v90',
    'v127', 'v115', 'v35', 'v68', 'v122', 'v27', 'v2', 'v103',
    'v11', 'v104', 'v97', 'v4', 'v9', 'v7', 'v43', 'v20',
    'v80', 'v19', 'v86', 'v84', 'v15', 'v42', 'v101', 'v13',
    'v44', 'v26', 'v32', 'v60', 'v49', 'v94', 'v77', 'v59',
    'v93', 'v65', 'v61', 'v76', 'v67',
]

### Ensembling

In [15]:
# Seed NumPy's global random number generator before the ensembling cells.
np.random.seed(0)

In [22]:
# Number of stratified folds used to build the out-of-fold blending features.
nfolds = 10

In [20]:
# First-level models for the blend: one XGBoost classifier and one
# extra-trees classifier, in that order.
xgb_member = XGBClassifier(
    learning_rate=0.01,
    n_estimators=2000,
    max_depth=8,
    min_child_weight=1.5,
    gamma=0,
    reg_alpha=0.01,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
)
extra_trees_member = ensemble.ExtraTreesClassifier(
    n_estimators=1000,
    max_features=40,
    criterion='entropy',
    min_samples_split=4,
    max_depth=35,
    min_samples_leaf=2,
    n_jobs=-1,
)
clfs = [xgb_member, extra_trees_member]


In [None]:
# Materialise the stratified folds once, so every first-level model is
# trained and evaluated on the same index splits.
fold_generator = StratifiedKFold(Y_train, n_folds=nfolds)
stf = list(fold_generator)

In [21]:
print ("Creating train and test sets for blending.")

# One column per first-level model; rows follow X_train and X_test respectively.
n_models = len(clfs)
train_blend = np.zeros((X_train.shape[0], n_models))
test_blend = np.zeros((X_test.shape[0], n_models))

Creating train and test sets for blending.


In [23]:
# Build the blending features.
# For each first-level model and each fold: fit on the fold's training rows,
# write the predicted class-1 probabilities for the held-out rows into
# column j of train_blend, and predict the whole test set. The per-fold
# test predictions are averaged into column j of test_blend.
#
# The fold index arrays are named train_idx / valid_idx so the loop does not
# overwrite the `train` and `test` DataFrames, and rows are selected
# positionally with .iloc because the folds hold positions, not labels.
X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]
for j, clf in enumerate(clfs):
    print (j, clf)
    test_blend_j = np.zeros((X_test.shape[0], len(stf)))
    for i, (train_idx, valid_idx) in enumerate(stf):
        print ("Fold", i)
        x_train = X_train_selected.iloc[train_idx, :]
        y_train = Y_train.iloc[train_idx]
        x_test = X_train_selected.iloc[valid_idx, :]
        if isinstance(clf, XGBClassifier):
            # Only the XGBoost model is given an eval_metric at fit time.
            clf.fit(x_train, y_train, eval_metric='logloss')
        else:
            clf.fit(x_train, y_train)
        train_blend[valid_idx, j] = clf.predict_proba(x_test)[:, 1]
        test_blend_j[:, i] = clf.predict_proba(X_test_selected)[:, 1]
    test_blend[:, j] = test_blend_j.mean(1)

0 XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
       gamma=0, learning_rate=0.01, max_delta_step=0, max_depth=8,
       min_child_weight=1.5, missing=None, n_estimators=1800, nthread=4,
       objective='binary:logistic', reg_alpha=0.01, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.8)
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4
1 ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='entropy',
           max_depth=35, max_features=45, max_leaf_nodes=None,
           min_samples_leaf=2, min_samples_split=4,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4


In [33]:
print ("Blending...")
# Second-level model trained on the first-level predictions. A plain
# logistic regression is kept below as a commented-out alternative.
#clf = linear_model.LogisticRegression()
blender_params = dict(
    learning_rate=0.1,
    n_estimators=150,
    max_depth=8,
    min_child_weight=1.5,
    gamma=0,
    reg_alpha=0.01,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
)
clf = XGBClassifier(**blender_params)
clf.fit(train_blend, Y_train, eval_metric='logloss')
blend_proba = clf.predict_proba(test_blend)
y_submission = blend_proba[:, 1]

Blending...


In [34]:
# Inspect the blended class-1 probabilities for the test set.
y_submission

array([ 0.24433048,  0.92933369,  0.80345118, ...,  0.91320717,
        0.95779061,  0.52396667], dtype=float32)

In [28]:
# NOTE(review): second display of y_submission; its recorded output differs
# from the cell above, so it presumably comes from an earlier run -- confirm
# whether this cell is still needed.
y_submission

array([ 0.30858418,  0.90845704,  0.82227165, ...,  0.91441107,
        0.95158339,  0.44549522], dtype=float32)

### Logistic Regression

In [115]:
# Logistic regression with C=0.1
LOG = linear_model.LogisticRegression(
    C=0.1,
    n_jobs=-1,
)

In [116]:
# Fit on the selected features of the full training set.
LOG.fit(X=X_train[selected_features], y=Y_train)

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [117]:
# Class-1 probability for every test row.
Prob_log = LOG.predict_proba(X_test.loc[:, selected_features])[:, 1]

In [174]:
# Class-1 probability for every training row.
# NOTE(review): these are predictions on the same rows LOG was fitted on.
pb_log = LOG.predict_proba(X_train.loc[:, selected_features])[:, 1]

### KNN

In [191]:
# Distance-weighted k-nearest-neighbours classifier with 500 neighbours.
KNN = neighbors.KNeighborsClassifier(
    n_neighbors=500,
    weights='distance',
    n_jobs=-1,
)

In [241]:
# Fit on the selected features of the full training set.
KNN.fit(X=X_train[selected_features], y=Y_train)

TypeError: fit() missing 1 required positional argument: 'y'

In [193]:
# Class-1 probability for every test row.
Prob_knn = KNN.predict_proba(X_test.loc[:, selected_features])[:, 1]

In [195]:
# Class-1 probability for every training row. predict_proba is required
# here: predict() returns a 1-D array of labels, which cannot be indexed
# with [:, 1], and every other pb_* value is a probability.
pb_knn = KNN.predict_proba(X_train[selected_features])[:,1]

In [240]:
# Print the documentation of KNN.fit.
# NOTE(review): looks like a leftover debugging aid -- confirm it can go.
help(KNN.fit)

Help on method fit in module sklearn.neighbors.base:

fit(X, y) method of sklearn.neighbors.classification.KNeighborsClassifier instance
    Fit the model using X as training data and y as target values
    
    Parameters
    ----------
    X : {array-like, sparse matrix, BallTree, KDTree}
        Training data. If array or matrix, shape [n_samples, n_features],
        or [n_samples, n_samples] if metric='precomputed'.
    
    y : {array-like, sparse matrix}
        Target values of shape = [n_samples] or [n_samples, n_outputs]



### XGBoost

In [103]:
# Stand-alone XGBoost classifier.
XGB = XGBClassifier(
    learning_rate=0.01,
    n_estimators=2000,
    max_depth=8,
    min_child_weight=1.5,
    gamma=0,
    reg_alpha=0.01,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
)

In [104]:
# Fit on the selected features, passing log loss as the evaluation metric.
XGB.fit(X=X_train[selected_features], y=Y_train, eval_metric='logloss')

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
       gamma=0, learning_rate=0.01, max_delta_step=0, max_depth=8,
       min_child_weight=1.5, missing=None, n_estimators=2000, nthread=4,
       objective='binary:logistic', reg_alpha=0.01, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.8)

In [105]:
# Class-1 probability for every test row.
Prob_xgb = XGB.predict_proba(X_test.loc[:, selected_features])[:, 1]

In [176]:
# Class-1 probability for every training row (the rows XGB was fitted on).
pb_xgb = XGB.predict_proba(X_train.loc[:, selected_features])[:, 1]

### ExtraTreesClassifier

In [106]:
# Stand-alone extra-trees classifier.
EXT = ensemble.ExtraTreesClassifier(
    n_estimators=1000,
    max_features=45,
    criterion='entropy',
    min_samples_split=4,
    max_depth=35,
    min_samples_leaf=2,
    n_jobs=-1,
)

In [107]:
# Fit on the selected features of the full training set.
EXT.fit(X=X_train[selected_features], y=Y_train)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='entropy',
           max_depth=35, max_features=45, max_leaf_nodes=None,
           min_samples_leaf=2, min_samples_split=4,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [108]:
# Class-1 probability for every test row.
Prob_ext = EXT.predict_proba(X_test.loc[:, selected_features])[:, 1]

In [177]:
# Class-1 probability for every training row (the rows EXT was fitted on).
pb_ext = EXT.predict_proba(X_train.loc[:, selected_features])[:, 1]

### Random Forest

In [109]:
# Stand-alone random forest classifier.
RF = ensemble.RandomForestClassifier(
    n_estimators=1000,
    criterion='entropy',
    n_jobs=-1,
)

In [110]:
# Fit on the selected features of the full training set.
RF.fit(X=X_train[selected_features], y=Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=800, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [111]:
# Class-1 probability for every test row.
Prob_rf = RF.predict_proba(X_test.loc[:, selected_features])[:, 1]

In [178]:
# Class-1 probability for every training row (the rows RF was fitted on).
pb_rf = RF.predict_proba(X_train.loc[:, selected_features])[:, 1]

### GBM

In [112]:
# Gradient boosting classifier with a small learning rate and many trees.
GBM = ensemble.GradientBoostingClassifier(
    learning_rate=0.01,
    n_estimators=2000,
    loss='deviance',
)

In [113]:
# Fit on the selected features of the full training set.
GBM.fit(X=X_train[selected_features], y=Y_train)

GradientBoostingClassifier(init=None, learning_rate=0.01, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=2000,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [114]:
# Class-1 probability for every test row.
Prob_gbm = GBM.predict_proba(X_test.loc[:, selected_features])[:, 1]

In [179]:
# Class-1 probability for every training row (the rows GBM was fitted on).
pb_gbm = GBM.predict_proba(X_train.loc[:, selected_features])[:, 1]

### ExtraTreesRegressor

In [118]:
# Extra-trees regressor fitted directly on the 0/1 target.
ETR = ensemble.ExtraTreesRegressor(
    n_estimators=1000,
    min_samples_split=4,
    min_samples_leaf=2,
    n_jobs=-1,
)

In [119]:
# Fit on the selected features of the full training set.
ETR.fit(X=X_train[selected_features], y=Y_train)

ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=None,
          max_features='auto', max_leaf_nodes=None, min_samples_leaf=2,
          min_samples_split=4, min_weight_fraction_leaf=0.0,
          n_estimators=1000, n_jobs=-1, oob_score=False, random_state=None,
          verbose=0, warm_start=False)

In [120]:
# Regressor output for every test row, used downstream like a probability.
Prob_etr = ETR.predict(X_test.loc[:, selected_features])

In [180]:
# Regressor output for every training row (the rows ETR was fitted on).
pb_etr = ETR.predict(X_train.loc[:, selected_features])

### RandomForestRegressor

In [121]:
# Random forest regressor fitted directly on the 0/1 target.
RFR = ensemble.RandomForestRegressor(
    n_estimators=1000,
    min_samples_split=4,
    min_samples_leaf=2,
    n_jobs=-1,
)

In [122]:
# Fit on the selected features of the full training set.
RFR.fit(X=X_train[selected_features], y=Y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=2,
           min_samples_split=4, min_weight_fraction_leaf=0.0,
           n_estimators=1000, n_jobs=-1, oob_score=False,
           random_state=None, verbose=0, warm_start=False)

In [123]:
# Regressor output for every test row, used downstream like a probability.
Prob_rfr = RFR.predict(X_test.loc[:, selected_features])

In [169]:
# Regressor output for every training row. A regressor exposes predict()
# only (there is no predict_prob method), which matches how Prob_rfr is
# computed for the test rows.
pb_rfr = RFR.predict(X_train[selected_features])

In [200]:
# Second-level training table: one column per first-level model, holding
# that model's prediction for each training row.
# NOTE(review): every pb_* vector was predicted on the rows its model was
# fitted on, so these are in-sample predictions.
meta_columns = {
    'LOG': pb_log,
    'XGB': pb_xgb,
    'EXT': pb_ext,
    'RF': pb_rf,
    'GBM': pb_gbm,
    'ETR': pb_etr,
    'RFR': pb_rfr,
}
meta_features = pd.DataFrame(meta_columns)

In [231]:
# Second-level test table. It must have the same columns as meta_features
# (including 'LOG'), because the meta-model is fitted on meta_features and
# then asked to predict on this frame.
test_features = pd.DataFrame({'LOG':Prob_log,'XGB':Prob_xgb,'EXT':Prob_ext,'RF':Prob_rf,'GBM':Prob_gbm,'ETR':Prob_etr,'RFR':Prob_rfr})

In [217]:
# XGBoost meta-model stacked on top of the first-level predictions.
clf = XGBClassifier(
    learning_rate=0.01,
    n_estimators=1500,
    max_depth=8,
    min_child_weight=1.5,
    gamma=0,
    reg_alpha=0.01,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
)

In [218]:
# Fit the meta-model on the first-level training predictions.
clf.fit(X=meta_features, y=Y_train, eval_metric='logloss')

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
       gamma=0, learning_rate=0.01, max_delta_step=0, max_depth=8,
       min_child_weight=1.5, missing=None, n_estimators=1500, nthread=4,
       objective='binary:logistic', reg_alpha=0.01, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.8)

In [219]:
# Stacked class-1 probability for every test row.
stacked_proba = clf.predict_proba(test_features)
Prob = stacked_proba[:, 1]

In [234]:
# Unweighted average of six first-level test predictions (the logistic
# regression and KNN outputs are not part of it).
prob_total = Prob_xgb + Prob_ext + Prob_rf + Prob_gbm + Prob_etr + Prob_rfr
AVG = 1/6 * prob_total

In [166]:
# Persist the first-level test predictions, without the index column.
test_features.to_csv(path_or_buf='testfeatures.csv', index=False)

In [237]:
# Submission table: test ID next to the averaged probability.
submission_columns = {'ID': test_id, 'PredictedProb': AVG}
submission = pd.DataFrame(submission_columns)

In [238]:
# Write the submission file, without the index column.
submission.to_csv(path_or_buf='submission32.csv', index=False)

In [226]:
# Correlation matrix between the logistic-regression and KNN test predictions.
np.corrcoef(x=Prob_log, y=Prob_knn)

array([[ 1.        ,  0.16503597],
       [ 0.16503597,  1.        ]])