# build a model
- we can use a scikit-learn model directly
- we can also add preprocessing or feature extraction algorithms here
  - preprocessing : http://scikit-learn.org/stable/modules/preprocessing.html
  - extraction : http://scikit-learn.org/stable/modules/feature_extraction.html
  - integrate with pipeline: http://scikit-learn.org/stable/modules/pipeline.html
- select parameters:
  - cross validation : http://scikit-learn.org/stable/modules/cross_validation.html
  - grid search over the parameters : http://scikit-learn.org/stable/modules/grid_search.html#exhaustive-grid-search

In [4]:
%%writefile ./competition/models/impute_rf.py
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.ensemble import RandomForestClassifier

# Pipeline steps: fill missing values column-wise with the most frequent
# value, then classify with a random forest.
# NOTE(review): `Imputer` and the `min_impurity_split` option come from older
# scikit-learn releases (newer ones offer `sklearn.impute.SimpleImputer` and
# `min_impurity_decrease`) -- confirm against the installed version.
estimators = [
    ('impute', Imputer(axis=0, strategy='most_frequent', missing_values='NaN')),
    ('clf', RandomForestClassifier()),
]

estimator = Pipeline(steps=estimators)


def _for_step(step, grid):
    """Return *grid* with every key rewritten as ``<step>__<key>``."""
    return {'%s__%s' % (step, name): values for name, values in grid.items()}


# Parameter grid exported alongside `estimator`; each entry lists a single
# candidate value, so the grid describes exactly one configuration.
tuned_parameters = [dict(
    _for_step('impute', {'copy': [False]}),
    **_for_step('clf', {
        'n_estimators': [10],
        'min_samples_split': [2],
        'min_samples_leaf': [1],
        'min_weight_fraction_leaf': [0],
        'max_leaf_nodes': [None],
        'min_impurity_split': [1e-7],
        'warm_start': [False],
        'oob_score': [False],
        'class_weight': ['balanced'],
    })
)]


Overwriting ./competition/models/impute_rf.py


In [None]:
# modify config.py to choose the feature extractor and the model;
# the command below only prints the file's current contents
!cat config.py

# -*- coding:utf-8 -*-

# define as you will
# this file is intended to be used by the scripts in this folder


# convert_data, load_data
# Working directories (relative to the current directory).
# NOTE(review): presumably raw data, extracted features and results
# respectively -- confirm against the scripts that read them.
dataDir = './_data/'
extractDir = './_extracted/'
resultDir = './_result/'

# File names; presumably located inside the directories above -- TODO confirm.
dataFile = 'store.db'
infoFile = 'info.pkl'

# extract_features
# The feature extractor class is exposed under the fixed alias `Extractor`;
# `extractor_name` is the label used when naming the result sub-directory.
from competition.featExtract.simple_features import SimpleFeatures as Extractor
extractor_name = 'simple_features'

# build_model
# `estimator` and `tuned_parameters` come from the model module;
# `estimator_name` is the label used when naming the result sub-directory.
from competition.models.impute_rf import estimator,tuned_parameters
estimator_name = 'impute_rf'

# other configs
# NOTE(review): presumably toggles shuffling of the data -- verify in the
# scripts that read it.
bShuffle = True


## tuning parameters

In [None]:
# Execute select_models.py from this notebook.
# NOTE(review): presumably searches `tuned_parameters` for the `estimator`
# configured in config.py -- verify in the script.
%run select_models.py

# predict result

In [None]:
# Execute save_result.py from this notebook.
# NOTE(review): presumably writes the submission.csv that is loaded in the
# "view result" step -- verify in the script.
%run save_result.py

# view result

In [None]:
# Load the submission produced for the configured extractor/model pair.
import os

from config import extractor_name, estimator_name, resultDir
import pandas as pd

# Derive the location from config.resultDir instead of a hard-coded
# './_results/', which does not match the './_result/' directory declared in
# config.py.
# NOTE(review): assumes save_result.py writes below resultDir -- confirm.
# The trailing '' keeps the path separator at the end of destdir.
destdir = os.path.join(resultDir, '%s-%s' % (extractor_name, estimator_name), '')
sub = pd.read_csv(os.path.join(destdir, 'submission.csv'))

In [None]:
# Show summary statistics for the loaded submission table.
sub.describe()