# References
- preprocessing : http://scikit-learn.org/stable/modules/preprocessing.html
- extraction : http://scikit-learn.org/stable/modules/feature_extraction.html
- integrate with pipeline: http://scikit-learn.org/stable/modules/pipeline.html
- cross validation : http://scikit-learn.org/stable/modules/cross_validation.html
- grid search over parameters : http://scikit-learn.org/stable/modules/grid_search.html#exhaustive-grid-search

# Tuning Model Inline

In [None]:
# Load the extracted data set through the project helper.
# NOTE(review): `pd` is not referenced in the visible cells -- confirm whether it is still needed.
import pandas as pd
from util import loadExtracted
extractedFile = 'raw_merge_stats.db'  # file name handed to loadExtracted
extracted = loadExtracted(extractedFile)  # indexed below by 'trX' / 'trY' / 'teX'

In [None]:
# Display the loaded object to inspect its contents.
extracted

In [None]:
# Pull the three entries used for tuning out of the loaded store in one pass.
# (names suggest train features / train labels / test features -- confirm against util.loadExtracted)
trX, trY, teX = [extracted[key] for key in ('trX', 'trY', 'teX')]

In [None]:
# Load the stored cross-validation splits through the project helper and display them.
from util import loadCVStore
cvStore = loadCVStore()  # indexed by split-set name below ('cv_simple')
cvStore

In [None]:
# Base estimator for the parameter search: an XGBModel built with no arguments.
from xgboost.sklearn import XGBModel
estimator = XGBModel()
estimator.get_params()  # show the current parameter values as a reference for the grid

In [None]:
# We use the simple CV split set at first.
# `.values` replaces pandas' removed `as_matrix()` and yields the same array.
cvlist = cvStore['cv_simple'].values
# Sanity check: print the size of every (train, test) pair in the split set.
for tr, te in cvlist:
    # %-formatting prints "<len(tr)> <len(te)>" identically under Python 2 and 3
    print('%d %d' % (len(tr), len(te)))

In [None]:
# Parameter grid for the search: each key maps to its list of candidate values.
# Parameters not searched (listed with these values in the original grid,
# commented out):
#   base_score=0.5, gamma=0, learning_rate=0.1, max_delta_step=0,
#   min_child_weight=1, missing=None, nthread=-1, reg_alpha=0,
#   reg_lambda=1, scale_pos_weight=1, seed=0, silent=True
tuned_parameters = dict(
    colsample_bylevel=[0.8],
    colsample_bytree=[0.5, 0.8],  # the only entry with more than one candidate
    max_depth=[5],
    n_estimators=[50],
    objective=['binary:logistic'],
    subsample=[0.8],
)

In [None]:
from sklearn.model_selection import GridSearchCV
from competition.models import official_score

# Project scoring object handed to GridSearchCV as `scoring`.
score = official_score
# Exhaustive search over `tuned_parameters`, evaluated on the predefined splits.
# return_train_score=True is requested explicitly: newer scikit-learn releases
# default it to False, in which case cv_results_ has no 'mean_train_score'
# entry, yet that entry is read when the results are saved to tune_results.txt.
clf = GridSearchCV(estimator, tuned_parameters, cv=cvlist, scoring=score,
                   return_train_score=True)

In [None]:
%%time
# Run the grid search and append its start time and elapsed time to log.txt.
import datetime
start_tm = datetime.datetime.now()
clf.fit(trX,trY)
end_tm = datetime.datetime.now()
# log line: "[<start time>] ... cost: <elapsed timedelta>"
duration = '[%s] tunning paramter cost: %s'%(start_tm,end_tm-start_tm)
# IPython substitutes $duration from the Python variable before running the shell command
!echo $duration >> log.txt

In [None]:
# Show the full results table recorded by the search.
clf.cv_results_

In [None]:
# Show the parameter combination selected by the search.
clf.best_params_

In [None]:
# Show the estimator the search exposes as its best one.
clf.best_estimator_

In [None]:
# Print the timing log that the fit cell appends to.
!cat log.txt

In [None]:
# Save the search summary (mean train/test scores and the best parameters)
# to tune_results.txt, overwriting any previous file.
# 'mean_train_score' exists only when the search recorded train scores
# (GridSearchCV's return_train_score); otherwise this lookup raises KeyError.
mts = clf.cv_results_['mean_train_score']
mes = clf.cv_results_['mean_test_score']
with open('tune_results.txt', 'w') as f:
    # one write of the newline-joined sections; the file contents are the
    # same as writing each heading and value separately
    f.write('\n'.join([
        'mean_train_score', str(mts),
        'mean_test_score', str(mes),
        'best param', str(clf.best_params_),
    ]))
# a parenthesized single string prints the same under Python 2 and 3
print('done')

In [None]:
# Print the tuning summary file written by the previous cell.
!cat tune_results.txt

# Save Model 

In [None]:
%%writefile ./competition/models/xgb.py
# Model module: exposes a default-constructed XGBModel and its short name.
from xgboost.sklearn import XGBModel
estimator = XGBModel()
estimator_name = 'xgb'

In [None]:
# Print the tuning summary again (presumably as the source for the hard-coded parameters below).
!cat tune_results.txt

In [None]:
# Tag identifying this parameter set; passed to the save/predict helpers later on.
para_name = '_1'
# Chosen parameter values, hard-coded here (presumably copied from tune_results.txt).
bestPara = dict(
    colsample_bytree=0.8,
    colsample_bylevel=0.8,
    n_estimators=50,
    subsample=0.8,
    objective='binary:logistic',
    max_depth=5,
)

In [None]:
# Import the estimator object from the model module and apply the chosen parameters.
# Note: this rebinds the notebook-level `estimator` name that was used for the grid search.
from competition.models.xgb import estimator,estimator_name
estimator.set_params(**bestPara)
estimator  # display the configured estimator

- train on full set and save

In [None]:
# Load the extracted data again under separate names
# (presumably so this section can run without the tuning cells -- confirm).
from util import loadExtracted
extractFile = 'raw_merge_stats.db'  # same file name as `extractedFile` in the tuning section
dset = loadExtracted(extractFile)
dset

In [None]:
# Fit the configured estimator on the 'trX' / 'trY' entries of the loaded data.
estimator.fit(dset['trX'],dset['trY'])

In [None]:
# Echo the identifiers that are passed to the save/predict helpers.
# A single %-formatted string prints the same space-separated line under
# Python 2 and 3.
print('%s %s %s' % (extractFile, estimator_name, para_name))

In [None]:
# Hand the fitted estimator and its identifiers to the project's saveModel helper.
from util import saveModel
saveModel(estimator,extractFile,estimator_name,para_name)

- predict result

In [None]:
# Run the project's predictResult helper with the fitted estimator and the same identifiers.
# NOTE(review): where the predictions are written is decided inside util -- confirm there.
from util import predictResult
predictResult(estimator,extractFile,estimator_name,para_name)

- view result

In [None]:
# Display what the project's getSubmission helper returns for these identifiers.
from util import getSubmission
getSubmission(extractFile,estimator_name,para_name)