In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from collections import Counter
from sklearn import preprocessing,metrics,cross_validation
from sklearn import ensemble
from sklearn.linear_model import RandomizedLogisticRegression
from sklearn import linear_model
from sklearn.grid_search import GridSearchCV 
from xgboost.sklearn import XGBClassifier
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import StratifiedKFold
%matplotlib inline

In [3]:
train = pd.read_csv('train.csv')
test= pd.read_csv('test.csv')

In [4]:
# variables to remove(based on Kaggle's script)
removal = ['ID','target','v8','v23','v25','v31','v36','v37','v46','v51','v53','v54','v63','v73','v75','v79','v81','v82',
           'v89','v92','v95','v105','v107','v108','v109','v110','v116','v117','v118','v119','v123','v124','v128']

In [None]:
#'v25','v46','v63','v64','v17','v33','v48','v17','v33','v48','v54','v107','v110'

In [5]:
Y_train = train.target

In [6]:
X_train = train.drop(removal, axis=1)

In [7]:
# categorical columns 
cat = ['v3','v22','v24','v30','v47','v52','v56','v66','v71','v74','v91','v112','v113','v125'] 
# numerical columns
num = [v for v in list(X_train.columns.values) if v not in cat] 

In [9]:
for c in cat:
    X_train[c].replace(np.nan,' ', regex=True, inplace= True)

In [10]:
X_train.fillna(-1, inplace = True)

In [11]:
# label encode the categorical variables
for s in cat:
    lbl = preprocessing.LabelEncoder()
    lbl.fit(X_train[s])
    X_train[s] = lbl.transform(X_train[s])

In [12]:
x_train, x_eval, y_train, y_eval= train_test_split(X_train, Y_train, test_size=0.25, train_size=0.75, random_state=123)

### Feature Selection

#### Randomized Logistic Regression  

In [139]:
randomized_logistic = RandomizedLogisticRegression(n_jobs=1,n_resampling=200,selection_threshold=0.000001,random_state=123)

In [140]:
randomized_logistic.fit(x_train,y_train)

RandomizedLogisticRegression(C=1, fit_intercept=True,
               memory=Memory(cachedir=None), n_jobs=1, n_resampling=200,
               normalize=True, pre_dispatch='3*n_jobs', random_state=123,
               sample_fraction=0.75, scaling=0.5,
               selection_threshold=1e-06, tol=0.001, verbose=False)

In [141]:
idx = randomized_logistic.get_support(indices=False)
selected_features1 = x_train.columns.values[idx]

#### Random Forest

In [142]:
rf = ensemble.RandomForestClassifier(n_estimators=50,criterion='entropy',n_jobs=-1,random_state=123)

In [143]:
rf.fit(x_train,y_train)
print ("Features sorted by their score:")
importance = sorted(zip(map(lambda x: round(x, 8), rf.feature_importances_), x_train.columns.values),reverse=True)

Features sorted by their score:


In [172]:
# select the features with importance greater than 0.005
selected_features2 = [importance[i][1] for i in  range(x_train.shape[1]) if importance[i][0] > 0.005] 
len(selected_features2)

90

In [173]:
more_features = [x for x in selected_features2 if x not in selected_features1] 
selected_features = list(selected_features1) + more_features

In [192]:
np.array(selected_features) # 93

array(['v3', 'v10', 'v12', 'v13', 'v14', 'v21', 'v24', 'v33', 'v34', 'v38',
       'v40', 'v47', 'v50', 'v55', 'v56', 'v58', 'v62', 'v66', 'v71',
       'v72', 'v74', 'v83', 'v84', 'v85', 'v91', 'v100', 'v111', 'v112',
       'v113', 'v114', 'v121', 'v129', 'v130', 'v22', 'v125', 'v52', 'v30',
       'v5', 'v98', 'v87', 'v70', 'v6', 'v88', 'v57', 'v131', 'v102',
       'v28', 'v120', 'v39', 'v1', 'v115', 'v16', 'v45', 'v99', 'v68',
       'v126', 'v80', 'v90', 'v18', 'v78', 'v26', 'v2', 'v44', 'v69',
       'v101', 'v127', 'v7', 'v35', 'v122', 'v97', 'v9', 'v27', 'v19',
       'v20', 'v15', 'v86', 'v60', 'v103', 'v43', 'v4', 'v11', 'v104',
       'v59', 'v42', 'v94', 'v61', 'v32', 'v93', 'v65', 'v49', 'v77',
       'v48', 'v41'], 
      dtype='<U4')

In [14]:
selected_features = ['v3', 'v10', 'v12', 'v13', 'v14', 'v21', 'v24', 'v33', 'v34', 'v38',
       'v40', 'v47', 'v50', 'v55', 'v56', 'v58', 'v62', 'v66', 'v71',
       'v72', 'v74', 'v83', 'v84', 'v85', 'v91', 'v100', 'v111', 'v112',
       'v113', 'v114', 'v121', 'v129', 'v130', 'v22', 'v125', 'v52', 'v30',
       'v5', 'v98', 'v87', 'v70', 'v6', 'v88', 'v57', 'v131', 'v102',
       'v28', 'v120', 'v39', 'v1', 'v115', 'v16', 'v45', 'v99', 'v68',
       'v126', 'v80', 'v90', 'v18', 'v78', 'v26', 'v2', 'v44', 'v69',
       'v101', 'v127', 'v7', 'v35', 'v122', 'v97', 'v9', 'v27', 'v19',
       'v20', 'v15', 'v86', 'v60', 'v103', 'v43', 'v4', 'v11', 'v104',
       'v59', 'v42', 'v94', 'v61', 'v32', 'v93', 'v65', 'v49', 'v77',
       'v48', 'v41']

### Model Training

In [19]:
RF = ensemble.RandomForestClassifier(n_estimators=1000,criterion='entropy',n_jobs=-1,random_state=123)

In [20]:
RF.fit(x_train[selected_features],y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=1000, n_jobs=-1, oob_score=False,
            random_state=123, verbose=0, warm_start=False)

In [15]:
EXT1 = ensemble.ExtraTreesClassifier(n_estimators=1200, max_features= 30, criterion= 'entropy',min_samples_split= 2,
                            max_depth= 30, min_samples_leaf= 2, n_jobs = -1,random_state=1)    

In [16]:
EXT1.fit(x_train[selected_features],y_train)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='entropy',
           max_depth=30, max_features=30, max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=2,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=1200, n_jobs=-1, oob_score=False, random_state=1,
           verbose=0, warm_start=False)

In [190]:
EXT2 = ensemble.ExtraTreesClassifier(n_estimators=1200, max_features= 20, criterion= 'entropy',min_samples_split= 2,
                            max_depth= 30, min_samples_leaf= 2, n_jobs = -1,random_state=6)    

In [194]:
EXT2.fit(x_train[selected_features],y_train)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='entropy',
           max_depth=30, max_features=20, max_leaf_nodes=None,
           min_samples_leaf=2, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1200, n_jobs=-1,
           oob_score=False, random_state=3, verbose=0, warm_start=False)

In [13]:
xgboost_params = { 
    "objective": "binary:logistic",
    "booster": "gbtree", 
    "eval_metric": "logloss", 
    "eta": 0.01, 
    "subsample": 0.8,
    "colsample_bytree": 0.8, 
    "max_depth": 8,
    "seed":1
}

In [28]:
xgtrain = xgb.DMatrix(x_train[selected_features].values, y_train.values)
xgeval = xgb.DMatrix(x_eval[selected_features].values)

In [38]:
boost_round = 2000

In [39]:
XGB = xgb.train(xgboost_params, xgtrain, num_boost_round=boost_round, verbose_eval=True)


In [40]:
Prob_xgb = XGB.predict(xgeval, ntree_limit=XGB.best_iteration)

In [14]:
nfolds=10

In [34]:
stf = list(StratifiedKFold(y_train.index, nfolds))



In [54]:
XGB1 = XGBClassifier(learning_rate =0.01, n_estimators=2000, max_depth=8,
 min_child_weight=1.5, gamma=0, reg_alpha=0.01, subsample=0.8, colsample_bytree=0.8, objective= 'binary:logistic',
 nthread=4, scale_pos_weight=1, seed=1)
XGB2 = XGBClassifier(learning_rate =0.01, n_estimators=2000, max_depth=8,
 min_child_weight=1.5, gamma=0, reg_alpha=0.01, subsample=0.8, colsample_bytree=0.8, objective= 'binary:logistic',
 nthread=4, scale_pos_weight=1, seed=1)
XGB3 = XGBClassifier(learning_rate =0.01, n_estimators=2000, max_depth=8,
 min_child_weight=1.5, gamma=0, reg_alpha=0.01, subsample=0.8, colsample_bytree=0.8, objective= 'binary:logistic',
 nthread=4, scale_pos_weight=1, seed=1)

In [44]:
XGB1.fit(x_train[selected_features].iloc[:70000,:], y_train[:70000],eval_metric='logloss')

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
       gamma=0, learning_rate=0.01, max_delta_step=0, max_depth=8,
       min_child_weight=1.5, missing=None, n_estimators=2000, nthread=4,
       objective='binary:logistic', reg_alpha=0.01, reg_lambda=1,
       scale_pos_weight=1, seed=1, silent=True, subsample=0.8)

In [45]:
Prob_xgb1 = XGB1.predict_proba(x_eval[selected_features])[:,1]

In [47]:
XGB2.fit(x_train[selected_features].iloc[-70000:,:], y_train[-70000:],eval_metric='logloss')

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
       gamma=0, learning_rate=0.01, max_delta_step=0, max_depth=8,
       min_child_weight=1.5, missing=None, n_estimators=2000, nthread=4,
       objective='binary:logistic', reg_alpha=0.01, reg_lambda=1,
       scale_pos_weight=1, seed=1, silent=True, subsample=0.8)

In [48]:
Prob_xgb2 = XGB2.predict_proba(x_eval[selected_features])[:,1]

In [58]:
XGB3.fit(x_train[selected_features].iloc[10000:80000,:], y_train[10000:80000],eval_metric='logloss')

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
       gamma=0, learning_rate=0.01, max_delta_step=0, max_depth=8,
       min_child_weight=1.5, missing=None, n_estimators=2000, nthread=4,
       objective='binary:logistic', reg_alpha=0.01, reg_lambda=1,
       scale_pos_weight=1, seed=1, silent=True, subsample=0.8)

In [63]:
Prob_xgb3 = XGB3.predict_proba(x_eval[selected_features])[:,1]

In [73]:
pp = 1/3*(Prob_xgb1+Prob_xgb2+Prob_xgb3)

In [74]:
print(metrics.log_loss(y_eval, pp))

0.463086361467


#### Evaluation

In [21]:
Prob_rf = RF.predict_proba(x_eval[selected_features])[:,1]

In [17]:
Prob_ext1 = EXT1.predict_proba(x_eval[selected_features])[:,1]

In [195]:
Prob_ext2 = EXT2.predict_proba(x_eval[selected_features])[:,1]

In [103]:
Prob_xgb1 = XGB1.predict_proba(x_eval[selected_features])[:,1]

In [207]:
Prob_xgb2 = XGB2.predict_proba(x_eval[selected_features])[:,1]

In [156]:
print(metrics.log_loss(y_eval, Prob_rf))

0.467575158768
0.459134466627
0.46489938548


In [22]:
print(metrics.log_loss(y_eval, Prob_rf)) # 0.467562991128

0.467562991128


In [18]:
print(metrics.log_loss(y_eval, Prob_ext1)) # 0.458669333061

0.458669333061


In [198]:
print(metrics.log_loss(y_eval, Prob_ext2)) # 0.461548874465

0.461548874465


In [104]:
print(metrics.log_loss(y_eval, Prob_xgb1)) # 0.462485055814

0.462242613464


In [75]:
Prob = 1/2 * (Prob_ext1 + Prob_xgb)

NameError: name 'Prob_ext1' is not defined

In [82]:
print(metrics.log_loss(y_eval, Prob)) # 0.456993773837

0.456997963019


In [94]:
np.corrcoef(Prob_xgb,Prob_rf)

array([[ 1.        ,  0.91883161],
       [ 0.91883161,  1.        ]])