# load data

In [1]:
import pandas as pd
# Open the extracted-feature store read-only: this notebook only reads from it,
# and mode='r' makes a wrong path raise an error instead of silently creating
# a new empty store (pandas' default mode is 'a', which creates missing files).
extracted = pd.HDFStore('./_extracted/use_raw.db', mode='r')
extracted

<class 'pandas.io.pytables.HDFStore'>
File path: ./_extracted/use_raw
/teX            frame        (shape->[338489,6]) 
/trX            frame        (shape->[3749528,6])
/trY            series       (shape->[3749528])  

# build a model

## 1. read data

In [2]:
# imports for the modelling cells, then load the extracted data (the constant model itself is defined in section 2)
from sklearn.base import BaseEstimator
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Pull the training features/labels and the test features out of the HDF store.
X_train = extracted['trX']
y_train = extracted['trY']
X_test = extracted['teX']

# Fixed seed so the shuffled row order is identical on every run.
SHUFFLE_SEED = 0
bShuffle = True
if bShuffle:
    N = X_train.shape[0]
    idx = np.random.RandomState(SHUFFLE_SEED).permutation(N)
    # .values replaces the deprecated as_matrix(). The same permutation is
    # applied to features and labels so the rows stay aligned.
    # NOTE: after this branch X_train / y_train are NumPy arrays, while X_test
    # is left as loaded from the store.
    X_train = X_train.values[idx]
    y_train = y_train.values[idx]

## 2. try to build a model
- we can use a scikit-learn model directly
- we can also add preprocessing or feature-extraction steps here
  - preprocessing : http://scikit-learn.org/stable/modules/preprocessing.html
  - extraction : http://scikit-learn.org/stable/modules/feature_extraction.html
  - integrate with pipeline: http://scikit-learn.org/stable/modules/pipeline.html
- select parameters:
  - cross validation : http://scikit-learn.org/stable/modules/cross_validation.html
  - grid search the parameter : http://scikit-learn.org/stable/modules/grid_search.html#exhaustive-grid-search

In [3]:
class Constant(BaseEstimator):
    """Baseline estimator that outputs the same value for every sample.

    Parameters
    ----------
    val : number, default 0
        The value returned for each row passed to ``predict``.
    """

    def __init__(self, val=0):
        self.val = val

    def fit(self, X, y):
        """Do nothing with the data and return the estimator itself."""
        return self

    def fit_predict(self, X):
        """Return an array with one entry per row of ``X``, each equal to ``val``."""
        n_samples = X.shape[0]
        unit_column = np.ones([n_samples])
        return unit_column * self.val

    def predict(self, X):
        """Return the constant prediction for each row of ``X``."""
        return self.fit_predict(X)

In [4]:
# show the copy of the model kept in models/constant.py
# (this cell only prints the file; it does not write it)
!cat models/constant.py

from sklearn.base import BaseEstimator
import numpy as np

class Constant(BaseEstimator):
    def __init__(self,val=0):
        self.val = val
    def fit(self,X,y):
        return self
    def predict(self,X):
        return self.fit_predict(X)
    def fit_predict(self,X):
        N = X.shape[0]
        return np.ones([N])*self.val



## tuning parameters

In [5]:
from models import official_score
from models.constant import Constant

estimator = Constant
score = official_score

# Grid of candidate values for the single hyper-parameter of Constant.
tuned_parameters = [{'val': [0, 1]}]

# Print the scorer by name when it has one: the default repr of a function
# contains a memory address that changes on every run.
score_name = getattr(score, '__name__', score)
# Parenthesized single-argument print works on both Python 2 and Python 3.
print("# Tuning hyper-parameters for %s" % (score_name,))
clf = GridSearchCV(estimator(), tuned_parameters, cv=5, scoring=score)

print("fitting")
# Last expression of the cell, so the fitted GridSearchCV is displayed.
clf.fit(X_train, y_train)

# Tuning hyper-parameters for <function official_score at 0x1083deaa0>
fitting


GridSearchCV(cv=5, error_score='raise', estimator=Constant(val=0),
       fit_params={}, iid=True, n_jobs=1, param_grid=[{'val': [0, 1]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=<function official_score at 0x1083deaa0>, verbose=0)

In [6]:
# Single-argument print(...) calls run unchanged on Python 2 and Python 3.
print("Best parameters set found on development set: %s" % (clf.best_params_,))
print("Grid scores on development set:")
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
# One "mean (+/- 2*std) for params" entry per grid point, shown on a single
# line as before.
grid_lines = [
    "%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params)
    for mean, std, params in zip(means, stds, clf.cv_results_['params'])
]
print(" ".join(grid_lines))

print("Detailed classification report:")
print("The model is trained on the full development set.")
# The report below predicts on X_train, i.e. the data the model was fitted on,
# so the message says so instead of claiming a separate evaluation set.
print("The scores are computed on the same development (training) set, not on held-out data.")
y_true, y_pred = y_train, clf.predict(X_train)
print(classification_report(y_true, y_pred))

Best parameters set found on development set: {'val': 0}
Grid scores on development set:
-0.859 (+/-0.008) for {'val': 0} -33.680 (+/-0.008) for {'val': 1} 
Detailed classification report:
The model is trained on the full development set.
The scores are computed on the full evaluation set.
             precision    recall  f1-score   support

          0       0.98      1.00      0.99   3656266
          1       0.00      0.00      0.00     93262

avg / total       0.95      0.98      0.96   3749528



  'precision', 'predicted', average, warn_for)


# predict result

In [13]:
import os
curName = 'constant'
resultdir = './_results/'
curdir = os.path.join(resultdir, curName)
# makedirs also creates ./_results/ itself when it is missing; os.mkdir would
# raise in that case.
if not os.path.isdir(curdir):
    os.makedirs(curdir)

# Predict on the test features with the refit best estimator.
testY = clf.predict(X_test)
fname = os.path.join(curdir, 'submission.csv')
fnamezip = fname + '.zip'
with open(fname, 'w') as f:
    f.write('instanceId,prob\n')
    # NOTE(review): the row position is written as instanceId and the
    # prediction is formatted with %d (integer). Confirm this matches the
    # expected submission format before reusing with a probabilistic model.
    for i, y in enumerate(testY):
        f.write('%d,%d\n' % (i, y))
        

In [None]:
# Build the archive with the standard library instead of shelling out: no
# dependency on an external `zip` binary, no shell string interpolation, and
# failures raise an exception instead of returning a status code.
import zipfile

# Mode 'w' (re)creates the archive, so re-running the cell is idempotent.
with zipfile.ZipFile(fnamezip, 'w', zipfile.ZIP_DEFLATED) as zf:
    zf.write(fname)

In [15]:
# Shell variant of the zip step; IPython substitutes the Python variables
# fnamezip and fname into the command line.
# NOTE(review): the recorded output of this cell is zip's usage text, which
# suggests the command ran without usable arguments. Confirm that fname and
# fnamezip were defined when this cell was executed.
!zip $fnamezip $fname

Copyright (c) 1990-2008 Info-ZIP - Type 'zip "-L"' for software license.
Zip 3.0 (July 5th 2008). Usage:
zip [-options] [-b path] [-t mmddyyyy] [-n suffixes] [zipfile list] [-xi list]
  The default action is to add or replace zipfile entries from list, which
  can include the special name - to compress standard input.
  If zipfile and list are omitted, zip compresses stdin to stdout.
  -f   freshen: only changed files  -u   update: only changed or new files
  -d   delete entries in zipfile    -m   move into zipfile (delete OS files)
  -r   recurse into directories     -j   junk (don't record) directory names
  -0   store only                   -l   convert LF to CR LF (-ll CR LF to LF)
  -1   compress faster              -9   compress better
  -q   quiet operation              -v   verbose operation/print version info
  -c   add one-line comments        -z   add zipfile comment
  -@   read names from stdin        -o   make zipfile as old as latest entry
  -x   exclude the