# References
- preprocessing : http://scikit-learn.org/stable/modules/preprocessing.html
- extraction : http://scikit-learn.org/stable/modules/feature_extraction.html
- integrate with pipeline: http://scikit-learn.org/stable/modules/pipeline.html
- cross validation : http://scikit-learn.org/stable/modules/cross_validation.html
- grid search over parameters : http://scikit-learn.org/stable/modules/grid_search.html#exhaustive-grid-search

# Build Model
- define a model

In [None]:
%%writefile ./competition/models/impute_xgb.py
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from xgboost.sklearn import XGBModel
from sklearn.ensemble import RandomForestClassifier
import os
# Two-stage pipeline: the imputer replaces 'NaN' entries with the most
# frequent value along axis 0, then the XGBoost model is fitted on the result.
# The step names ('impute', 'xgb') are the prefixes used to address each
# stage's parameters as '<step>__<param>' (e.g. 'xgb__max_depth').
estimators = [
    ('impute', Imputer(missing_values='NaN', strategy='most_frequent', axis=0)),
    ('xgb', XGBModel()),
]

estimator = Pipeline(steps=estimators)

# Name the estimator after this module's file: keep only the text before the
# first '.' of the base name (e.g. 'impute_xgb.py' -> 'impute_xgb').
estimator_name = os.path.basename(__file__).partition('.')[0]

- change the config file

In [None]:
%%writefile configCustom.py
# Working directories, given relative to the current working directory.
# NOTE(review): resultDir is './_result/' here, but the "view result" cell of
# this notebook reads from './_results/...' (with a trailing "s") — confirm
# which location the run scripts actually write to.
dataDir = './_data/'
extractDir = './_extracted/'
resultDir = './_result/'

# File names; presumably resolved inside the directories above — TODO confirm.
dataFile = 'store.db'
infoFile = 'info.pkl'

# extractor
from competition.extractors.sf_partial_2 import extractor,extractor_name

# model
from competition.models.impute_xgb import estimator,estimator_name

# List of parameter grids (a single grid here). Keys follow the
# '<pipeline step>__<parameter>' convention and each value is the list of
# candidates to try; every list holds one candidate, so this grid describes
# exactly one configuration.
# NOTE(review): presumably consumed by run_2_select_models.py — confirm.
tuned_parameters = [
    {
        # 'impute' step
        'impute__copy': [False],

        # 'xgb' step
        'xgb__colsample_bytree': [0.5],
        'xgb__gamma': [0],
        'xgb__learning_rate': [0.1],
        'xgb__max_depth': [3],
        'xgb__min_child_weight': [1],
        'xgb__objective': ['binary:logistic'],
        'xgb__subsample': [1],
    },
]

# other configs
# NOTE(review): meanings inferred from the names only — bShuffle presumably
# toggles shuffling of the data and bProb presumably requests probability
# output; confirm against the run_* scripts that read this config.
bShuffle = True
bProb = True

- run the model

In [None]:
# Execute the model-selection script.
# NOTE(review): presumably it picks up the settings written to configCustom.py — confirm.
%run run_2_select_models.py

# view the model

In [None]:
from model import clf
# Bare expression: the notebook shows the repr of `clf` as the cell output.
clf

In [None]:
# Show the selected parameter combination.
# NOTE(review): assumes `clf` is a fitted scikit-learn search object
# (e.g. GridSearchCV), which is what exposes `best_params_` — confirm.
clf.best_params_

# predict

In [None]:
# Execute the prediction script.
# NOTE(review): presumably this produces the submission.csv inspected in the "view result" cell — confirm.
%run run_3_predict.py

# view result

In [None]:
from config import extractor_name,estimator_name
import pandas as pd
# Folder for this extractor/estimator pair: './_results/<extractor>-<estimator>/'.
# NOTE(review): this path is hard-coded as './_results/' while the config cell
# sets resultDir = './_result/' (no trailing "s") — confirm which one the
# prediction script writes to, otherwise read_csv will not find the file.
destdir = './_results/%s-%s/'%(extractor_name,estimator_name)
sub = pd.read_csv(destdir+'submission.csv')

# Summary statistics of the submission table, shown as the cell output.
sub.describe()