# load data

In [None]:
import pandas as pd
# Open the pre-extracted feature store read-only: this notebook only reads
# from it, and pandas' default mode ('a') opens the file for writing and
# would create an empty store if the path were wrong instead of failing here.
extracted = pd.HDFStore('./_extracted/simple_feat.db', mode='r')
# Display the store object as the cell output.
extracted

# build a model

## 1. read data

In [None]:
# import the model-selection helpers, load the train/test data and shuffle the training set
from sklearn.base import BaseEstimator
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Pull the training features/labels and the test features out of the store.
X_train = extracted['trX']
y_train = extracted['trY']
X_test = extracted['teX']

# Fixed seed for the shuffle below so that re-running the notebook yields
# the same row order.
SHUFFLE_SEED = 0

bShuffle = True
if bShuffle:
    N = X_train.shape[0]
    idx = np.random.RandomState(SHUFFLE_SEED).permutation(N)
    # `.values` replaces the deprecated `as_matrix()` (removed in pandas 1.0).
    # The same permutation indexes both X and y so rows stay aligned.
    # Note: from here on X_train / y_train are numpy arrays, while X_test is
    # left exactly as loaded from the store.
    X_train = X_train.values[idx]
    y_train = y_train.values[idx]

## 2. try to build a model
- we can use an sklearn model directly
- we can also add preprocessing or feature-extraction algorithms here
  - preprocessing : http://scikit-learn.org/stable/modules/preprocessing.html
  - extraction : http://scikit-learn.org/stable/modules/feature_extraction.html
  - integrate with pipeline: http://scikit-learn.org/stable/modules/pipeline.html
- select parameters:
  - cross validation : http://scikit-learn.org/stable/modules/cross_validation.html
  - grid search over the parameters : http://scikit-learn.org/stable/modules/grid_search.html#exhaustive-grid-search

In [None]:
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import Imputer
from sklearn.ensemble import RandomForestClassifier

# Step 1: fill 'NaN' entries with the most frequent value (strategy applied along axis 0).
imputer_step = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
# Step 2: random forest classifier with the library's default settings.
forest_step = RandomForestClassifier()

# The step names ('impute', 'clf') become the prefixes of the parameter keys
# reported by get_params() below.
estimators = [('impute', imputer_step), ('clf', forest_step)]
pipe = Pipeline(steps=estimators)

# Show every parameter of the pipeline and of its steps as the cell output.
pipe.get_params()

In [None]:
# TODO: save model! (placeholder -- nothing is persisted by this cell yet)

## tuning parameters

In [None]:
from models import official_score
# Scoring callable handed to GridSearchCV (imported from the project's
# `models` module in this cell).
score = official_score

# A single-point "grid": every parameter lists exactly one candidate value,
# so the search cross-validates just this one configuration.
tuned_parameters = [{
        'impute__copy':[False],
        'clf__n_estimators': [10],
        'clf__min_samples_split':[2],
        'clf__min_samples_leaf':[1],
        'clf__min_weight_fraction_leaf':[0],
        'clf__max_leaf_nodes':[None],
        'clf__min_impurity_split':[1e-7],
        'clf__warm_start':[True],
        'clf__oob_score':[False],
        'clf__class_weight':['balanced']
},]
# Parenthesised single-argument print behaves identically on Python 2 and 3;
# the 1-tuple keeps the %-formatting safe whatever object `score` is.
print("# Tuning hyper-parameters for %s" % (score,))
clf = GridSearchCV(pipe, tuned_parameters, cv=5, scoring=score)

print("fitting")
# Last expression of the cell: the fitted search object is shown as output.
clf.fit(X_train, y_train)

In [None]:
# All prints use the parenthesised single-argument form so the cell runs
# unchanged on Python 2 and Python 3.
print("Best parameters set found on development set: %s" % (clf.best_params_,))
print("Grid scores on development set:")
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    # One configuration per line: mean CV score with a +/- 2*std band.
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
print('')

print("Detailed classification report:")
print("The model is trained on the full development set.")
# The predictions below are made on X_train itself, so the message says so
# rather than claiming a separate evaluation set.
print("The scores are computed on the same development (training) set, not on held-out data.")
y_true, y_pred = y_train, clf.predict(X_train)
print(classification_report(y_true, y_pred))

In [None]:
# Score of the fitted search object on the training data, computed with the
# project's official_score; print(...) form works on Python 2 and 3.
print(official_score(clf, X_train, y_train))

# predict result

In [None]:
import os
import zipfile

# Kept so the name stays defined for any later use; the working directory is
# no longer changed by this cell.
curdir = os.getcwd()
destdir = './_results/rf-no-tune/'
# makedirs (not mkdir) so that a missing parent directory is created as well.
if not os.path.exists(destdir):
    os.makedirs(destdir)

fname = 'submission.csv'
fnamezip = fname+'.zip'
# Explicit paths instead of os.chdir(destdir): an exception part-way through
# can no longer leave the kernel stranded in the results directory.
csv_path = os.path.join(destdir, fname)
zip_path = os.path.join(destdir, fnamezip)

# NOTE(review): the CSV column is called 'prob' but the values come from
# clf.predict -- confirm whether predict_proba was intended.
testY = clf.predict(X_test)
with open(csv_path, 'w') as f:
    f.write('instanceId,prob\n')
    # instanceId is the 1-based position of the row in X_test.
    for i, y in enumerate(testY, 1):
        f.write('%d,%.2f\n' % (i, y))

# Mode 'w' overwrites any archive left over from a previous run, and arcname
# stores the CSV at the archive root. The stdlib zipfile module replaces the
# external `zip` command, whose exit status was being ignored.
with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as archive:
    archive.write(csv_path, arcname=fname)

In [None]:
# List the contents of the submission directory (IPython shell escape; `$destdir` is interpolated).
!ls $destdir

In [None]:
# Read the submission CSV back from destdir so its contents can be inspected.
submission_csv = destdir + 'submission.csv'
sub = pd.read_csv(submission_csv)

In [None]:
# Summary statistics of the submission columns, shown as the cell output.
sub.describe()