# Data Loading

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import fancyimpute as fancy
import warnings
import keras
warnings.filterwarnings('ignore')

Using TensorFlow backend.


In [2]:
def Missing_values(data):
    """Print a per-column summary of the missing values found in ``data``.

    For every column the summary holds the number of null entries
    ('Total') and that number divided by the row count ('Pourcentage',
    i.e. a fraction between 0 and 1). Only columns with at least one
    null entry are printed, ordered from most to least affected.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    None
        The summary is printed, followed by an empty line.
    """
    na_mask = data.isnull()
    na_counts = na_mask.sum()
    # The boolean mask itself has no nulls, so count() is the number of rows.
    na_share = na_counts / na_mask.count()

    total = na_counts.sort_values(ascending=False)
    percent = na_share.sort_values(ascending=False)
    missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Pourcentage'])

    # Show only the variables that actually contain missing values.
    has_missing = percent > 0
    print(missing_data[has_missing], '\n')

In [2]:
# The train and test tables are read from Excel workbooks in the working directory.
train, test = (pd.read_excel(path) for path in ("Train.xlsx", "Test.xlsx"))
# The training targets are read from a semicolon-separated CSV file.
train_out = pd.read_csv("train_output.csv", sep=";")

In [3]:
# Treat `month` and `country` as categorical variables in both feature tables.
for frame in (train, test):
    for cat_col in ("month", "country"):
        frame[cat_col] = pd.Categorical(frame[cat_col])

# Use the "ID" column as the row index of every table ...
train = train.set_index("ID")
test = test.set_index("ID")
train_out = train_out.set_index("ID")

# ... and clear the index name afterwards.
# NOTE: this cell is not idempotent; re-running it fails because "ID" is no
# longer a column once it has been moved into the index.
for frame in (train, test, train_out):
    frame.index.name = None

# Preprocessing

## Reduction

In [4]:
# Attach the target to a copy of the training features (aligned on the index)
# so that its correlation with every variable can be computed.
train_with_target = train.assign(Target=train_out['Target'])
corr = train_with_target.corr()

# Keep the rows of the correlation matrix whose absolute correlation with the
# target is below 0.01; their labels are the variables considered useless.
# Variables with a NaN correlation do not satisfy the comparison and are kept.
z = corr.loc[corr['Target'].abs() < 0.01]
var_inut = z.index.tolist()

In [5]:
# Remove the weakly correlated variables from both the training and test tables.
train_filter, test_filter = (
    frame.drop(var_inut, axis=1) for frame in (train, test)
)

## Completion NA

In [6]:
# Fill the missing entries of the reduced training table with a KNN imputer (k=10).
# NOTE(review): only the training table is imputed here; `test_filter` is used
# later without imputation. Confirm that the test data has no missing values.
knn_imputer = fancy.KNN(k=10)
train_filled = knn_imputer.complete(train_filter)

Imputing row 1/10159 with 0 missing, elapsed time: 60.605
Imputing row 101/10159 with 0 missing, elapsed time: 60.637
Imputing row 201/10159 with 0 missing, elapsed time: 60.637
Imputing row 301/10159 with 0 missing, elapsed time: 60.638
Imputing row 401/10159 with 0 missing, elapsed time: 60.640
Imputing row 501/10159 with 0 missing, elapsed time: 60.642
Imputing row 601/10159 with 0 missing, elapsed time: 60.643
Imputing row 701/10159 with 0 missing, elapsed time: 60.646
Imputing row 801/10159 with 0 missing, elapsed time: 60.646
Imputing row 901/10159 with 0 missing, elapsed time: 60.647
Imputing row 1001/10159 with 0 missing, elapsed time: 60.648
Imputing row 1101/10159 with 0 missing, elapsed time: 60.650
Imputing row 1201/10159 with 0 missing, elapsed time: 60.656
Imputing row 1301/10159 with 0 missing, elapsed time: 60.659
Imputing row 1401/10159 with 0 missing, elapsed time: 60.661
Imputing row 1501/10159 with 0 missing, elapsed time: 60.667
Imputing row 1601/10159 with 0 missi

## Expansion

In [16]:
# Wrap the imputed values in a DataFrame labelled with the original column names.
# NOTE(review): assumes `train_filled` is a plain array, so the rows get a fresh
# default index rather than the original IDs. Confirm.
df_train_filled = pd.DataFrame(train_filled, columns=train_filter.columns)

# Restore the categorical dtype of the two categorical variables.
for cat_col in ('month', 'country'):
    df_train_filled[cat_col] = df_train_filled[cat_col].astype('category')

In [17]:
# One-hot encode the categorical variables of the training and test tables.
# NOTE(review): each table is encoded on its own, so the resulting columns only
# match if both tables contain the same categories. Verify.
train_expand, test_expand = (
    pd.get_dummies(frame) for frame in (df_train_filled, test_filter)
)

# Give the targets the same row labels as the encoded training table.
# This is a positional relabelling: row i of `train_out` is assumed to belong
# to row i of `train_expand`.
train_out.index = train_expand.index

In [18]:
# `train_out.index` was already aligned with `train_expand.index` in the cell
# that builds `train_expand`, so that assignment is not repeated here.

# Relabel the test columns with the training column names.
# NOTE(review): this is purely positional. It is only correct if both one-hot
# encodings produced the same columns in the same order; verify on the data.
if test_expand.shape[1] != train_expand.shape[1]:
    raise ValueError(
        "train/test encodings differ in width: {} vs {} columns".format(
            train_expand.shape[1], test_expand.shape[1]))
test_expand.columns = train_expand.columns

# XGBoost and GridSearch

In [187]:
# Import every cross-validation helper from `sklearn.model_selection`.
# The legacy `sklearn.cross_validation` and `sklearn.grid_search` modules were
# deprecated in scikit-learn 0.18 and removed in 0.20. Importing GridSearchCV
# from `grid_search` after `model_selection` also silently replaced the modern
# class with the deprecated one.
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

xgb_model = xgb.XGBRegressor()

# Hyper-parameter grid explored for the first-level XGBoost regressor.
parameters = {'nthread':[4], #when use hyperthread, xgboost may become slower
              'learning_rate': [0.03,0.05,0.1], #so called `eta` value
              'max_depth': [6,7,8,10],
              'colsample_bytree': [0.7],
              'n_estimators': [5,100,500]}


# 3-fold cross-validated search scored by ROC AUC; the best configuration is
# refitted on the full data (refit=True).
clf = GridSearchCV(xgb_model, parameters, n_jobs=5, 
                   cv=3, 
                   scoring='roc_auc',
                   verbose=2, refit=True)

X = train_expand
y = train_out

clf.fit(X,y)

# Displayed as the cell result: best parameter set and its mean CV score.
clf.best_params_, clf.best_score_

Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=5)]: Done  31 tasks      | elapsed:  1.0min
[Parallel(n_jobs=5)]: Done 108 out of 108 | elapsed:  3.5min finished


({'colsample_bytree': 0.7,
  'learning_rate': 0.03,
  'max_depth': 6,
  'n_estimators': 500,
  'nthread': 4},
 0.8121641334664079)

# Stacking

In [13]:
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor

# define base model
def baseline_model(input_dim=13, hidden_units=13):
    """Build and compile a regression network with one hidden layer.

    Parameters
    ----------
    input_dim : int, optional
        Number of input features. Defaults to 13, the value that was
        previously hard-coded.
    hidden_units : int, optional
        Width of the hidden ReLU layer. Defaults to 13, the value that was
        previously hard-coded.

    Returns
    -------
    keras.models.Sequential
        Model compiled with mean squared error loss and the Adam optimizer.
    """
    # create model
    model = Sequential()
    model.add(Dense(hidden_units, input_dim=input_dim, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    # Compile model
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

# The Keras 2 API (already used above via `kernel_initializer`) names the number
# of training passes `epochs`. The scikit-learn wrapper only forwards arguments
# that `Sequential.fit` declares, so the legacy `nb_epoch` name was not applied.
# NOTE(review): the default input_dim of 13 must match the number of feature
# columns. Pass `input_dim=...` to KerasRegressor if the data is wider.
Keras_reg = KerasRegressor(build_fn=baseline_model, epochs=100, batch_size=5, verbose=0)

In [1]:
from vecstack import stacking
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor
from xgboost import XGBRegressor

# Work on copies so the encoded frames from the preprocessing stage stay intact.
X_train = train_expand.copy()
y_train = train_out.copy()
X_test = test_expand.copy()
X_test = X_test.reset_index(drop = True)

# Convert to NumPy arrays. `.values` replaces `DataFrame.as_matrix()`, which was
# deprecated in pandas 0.23 and removed in pandas 1.0.
# NOTE(review): `train_out` is a DataFrame, so `y_train` becomes a 2-D array.
# Confirm the downstream estimators accept that, or select the 'Target' column.
X_train = X_train.values
y_train = y_train.values
X_test = X_test.values

# First-level models. The two forest models are seeded so that repeated runs give
# the same stacking features (the stacking call itself uses random_state = 0).
# The XGBoost settings are the best parameters found by the grid search.
models = [RandomForestRegressor(n_jobs = -1, random_state = 0),
          ExtraTreesRegressor(n_jobs = -1, random_state = 0),
           XGBRegressor(colsample_bytree= 0.7,learning_rate= 0.03, max_depth= 6, n_estimators= 500,nthread= 4)]
    



NameError: name 'train_expand' is not defined

In [53]:
# Build the stacking features: every first-level model is evaluated with
# shuffled 4-fold cross-validation and scored with ROC AUC.
S_train, S_test = stacking(
    models,
    X_train,
    y_train,
    X_test,
    regression=True,
    metric=roc_auc_score,
    n_folds=4,
    shuffle=True,
    random_state=0,
    verbose=2,
)

task:   [regression]
metric: [roc_auc_score]

model 0: [RandomForestRegressor]
    fold 0: [0.75803361]
    fold 1: [0.76299407]
    fold 2: [0.78033499]
    fold 3: [0.78392433]
    ----
    MEAN:   [0.77125819]

model 1: [BaggingRegressor]
    fold 0: [0.77501552]
    fold 1: [0.77660006]
    fold 2: [0.77705889]
    fold 3: [0.77898915]
    ----
    MEAN:   [0.77704058]

model 2: [XGBRegressor]
    fold 0: [0.80981292]
    fold 1: [0.80880142]
    fold 2: [0.82178481]
    fold 3: [0.81493769]
    ----
    MEAN:   [0.81353325]

model 3: [XGBRegressor]
    fold 0: [0.80981292]
    fold 1: [0.80880142]
    fold 2: [0.82178481]
    fold 3: [0.81493769]
    ----
    MEAN:   [0.81353325]

model 4: [XGBRegressor]
    fold 0: [0.80981292]
    fold 1: [0.80880142]
    fold 2: [0.82178481]
    fold 3: [0.81493769]
    ----
    MEAN:   [0.81353325]



In [54]:
# Initialize the 2nd-level (meta) model: an XGBoost regressor created with its
# default hyper-parameters. It is the estimator tuned by the grid search below.
model = XGBRegressor()

In [55]:
from sklearn.model_selection import GridSearchCV

# Search space for the second-level model.
parameters = dict(
    nthread=[4],                          # hyper-threading may slow xgboost down
    learning_rate=[0.03, 0.05, 0.1],      # also known as `eta`
    max_depth=[6, 7, 8, 10],
    colsample_bytree=[0.7],
    n_estimators=[100, 500, 1000, 2000],
)

# The meta-model is trained on the stacking features of the first-level models.
X, y = S_train, y_train

# 5-fold cross-validated search scored by ROC AUC, using all available cores;
# the best configuration is refitted on the full data.
clf = GridSearchCV(
    estimator=model,
    param_grid=parameters,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    refit=True,
    verbose=2,
)
clf.fit(X, y)

# Displayed as the cell result: best parameter set and its mean CV score.
clf.best_params_, clf.best_score_

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   29.9s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:  4.1min finished


({'colsample_bytree': 0.7,
  'learning_rate': 0.03,
  'max_depth': 6,
  'n_estimators': 100,
  'nthread': 4},
 0.80844022373438795)

# Output

In [51]:
# Predict the test targets from the test stacking features with the tuned meta-model.
y_test = clf.predict(S_test)

# Two-column submission table: the row labels of the encoded test table as
# "ID" and the predictions as "Target".
sub = pd.DataFrame(
    dict(ID=test_expand.index, Target=y_test),
    columns=["ID", "Target"],
)

# Written as a semicolon-separated file without the DataFrame index.
sub.to_csv("sub_2.csv", sep=";", index=False)