In [1]:
import pandas as pd
import numpy as np
import pylab as plt
from my_pipeline import Wrangler

In [2]:
from sklearn import cross_validation
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier,GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

In [3]:
# Read both raw CSV tables, using the first column of each file as the row index.
raw_dtrain, raw_dtest = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('data/train.csv', 'data/test.csv')
)

In [8]:
# Preview the first rows of the training feature table (Xtrain) that Wrangler
# builds from the two raw frames.
Wrangler(raw_dtrain, raw_dtest).Xtrain.head()

Unnamed: 0_level_0,SibSp,Parch,Fare,Sex,Age,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,...,Cabin_T,Cabin_n,Title_Army,Title_Dr,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Rev,Title_Upper
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.481104,-0.444829,-0.514509,0,-0.598531,0,0,1,0,0,...,0,1,0,0,0,0,1,0,0,0
2,0.481104,-0.444829,0.722382,1,0.61416,1,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
3,-0.478904,-0.444829,-0.501471,1,-0.295358,0,0,1,0,0,...,0,1,0,0,0,1,0,0,0,0
4,0.481104,-0.444829,0.371147,1,0.386781,0,0,1,1,0,...,0,0,0,0,0,0,0,1,0,0
5,-0.478904,-0.444829,-0.499056,0,0.386781,0,0,1,0,0,...,0,1,0,0,0,0,1,0,0,0


In [13]:
class Searcher(object):
    """Convenience wrapper around a cross-validated grid search.

    Builds a ``GridSearchCV`` for one estimator, fits it lazily on first use,
    and can either print the best-scoring parameter settings or hand back the
    best estimator.

    Attributes:
        searcher: the underlying ``GridSearchCV`` instance.
        X: feature table passed to ``fit``.
        y: target values passed to ``fit``.
        fitted: ``True`` once ``get_fit`` has completed.
    """

    def __init__(self, model, params, X, y, cv=5, n_jobs=-1):
        """Set up (but do not run) the grid search.

        Args:
            model: scikit-learn style estimator to tune.
            params: parameter grid (dict, or list of dicts) handed to
                ``GridSearchCV`` as ``param_grid``.
            X: training features.
            y: training target.
            cv: cross-validation setting forwarded to ``GridSearchCV``.
            n_jobs: parallelism setting forwarded to ``GridSearchCV``.
        """
        # Imported here rather than in the class body: a name bound in a class
        # body becomes a class attribute and is NOT visible from inside the
        # methods, so a class-level import only appears to work when the
        # surrounding module happens to import GridSearchCV as well.
        from sklearn.grid_search import GridSearchCV

        self.searcher = GridSearchCV(estimator=model, param_grid=params,
                                     cv=cv, n_jobs=n_jobs)
        self.X = X
        self.y = y
        self.fitted = False

    def get_fit(self):
        """Run the grid search on the stored data and mark the object fitted."""
        self.searcher.fit(self.X, self.y)
        print('Fitted succeed!\n')
        # Only flagged after fit() returned, so a failed search is retried.
        self.fitted = True

    def report(self, n_top=20):
        """Print the ``n_top`` best parameter settings, best first.

        Fits the search first if that has not happened yet. For every listed
        setting the mean validation score, the standard deviation of the
        per-fold scores and the parameters are printed.
        """
        if not self.fitted:
            self.get_fit()
        ranked = sorted(self.searcher.grid_scores_,
                        key=lambda entry: entry.mean_validation_score,
                        reverse=True)[:n_top]
        for rank, entry in enumerate(ranked, 1):
            print("=====================================================")
            print("Model with rank: {0}".format(rank))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                entry.mean_validation_score,
                np.std(entry.cv_validation_scores)))
            print("Parameters: {0}".format(entry.parameters))
            print('')

    def get_best_model(self):
        """Return the search's ``best_estimator_``, fitting first if needed."""
        if not self.fitted:
            self.get_fit()
        return self.searcher.best_estimator_

In [55]:
class Pipeliner(object):
    """Ties a data wrangler to a model: search, fit, and write a submission.

    Relies on the ``Searcher`` class for hyper-parameter search.

    Attributes:
        wrangler: the object supplying ``Xtrain``, ``Xtest`` and ``ytrain``.
        modeler: estimator used by ``get_fit`` / ``report`` (may be ``None``).
        fitted: ``True`` once the current ``modeler`` has been fitted.
        searcher: the most recent ``Searcher`` built by ``grid_search``.
        searchers: every ``Searcher`` built by ``grid_search``, keyed by the
            model name given in the search list.
    """

    def __init__(self, data_wrangler, modeler=None):
        """Store the wrangler's train/test tables and an optional model.

        Args:
            data_wrangler: object exposing ``Xtrain``, ``Xtest`` and ``ytrain``
                (expected to be a ``Wrangler``).
            modeler: scikit-learn style estimator, or ``None`` to set it later
                through ``set_modeler``.
        """
        self.fitted = False

        self.wrangler = data_wrangler
        self.modeler = modeler

        self.Xtrain = self.wrangler.Xtrain
        self.Xtest = self.wrangler.Xtest
        self.ytrain = self.wrangler.ytrain

        self.searcher = None
        self.searchers = {}

    def grid_search(self, search_list, n_top=20, cv=5, n_jobs=-1):
        """Grid-search each listed model on the training data and print results.

        Args:
            search_list: iterable of ``(model_name, model_to_search,
                params_to_search)`` tuples.
            n_top: number of best settings printed per model.
            cv: cross-validation setting forwarded to ``Searcher``.
            n_jobs: parallelism setting forwarded to ``Searcher``.
        """
        for model_name, model_to_search, params_to_search in search_list:
            print('\n\n\n========================= Working on %s==========================' % model_name)
            self.searcher = Searcher(model=model_to_search, params=params_to_search,
                                     X=self.Xtrain, y=self.ytrain, cv=cv, n_jobs=n_jobs)
            # Keep every search reachable; self.searcher alone only ever holds
            # the last one of a multi-model run.
            self.searchers[model_name] = self.searcher
            self.searcher.report(n_top=n_top)

    def set_modeler(self, modeler):
        """Attach a new estimator and invalidate any previous fit."""
        self.modeler = modeler
        self.fitted = False

    def _has_modeler(self):
        """Return True if a modeler is attached; otherwise print a hint and return False."""
        # Identity test on purpose: neither truthiness nor ``==`` is a reliable
        # "is it set?" check for arbitrary estimator objects.
        # NOTE(review): the recorded "hasn't prepared well" outputs right after
        # set_modeler(GradientBoostingClassifier(...)) suggest an earlier
        # truthiness-style check misfired -- confirm.
        if self.modeler is None:
            print('This pipeline hasn\'t prepared well for the model!')
            return False
        return True

    def get_fit(self):
        """Fit the modeler on the training data and print its training accuracy.

        Prints a hint and returns without fitting when no modeler is attached.
        """
        if not self._has_modeler():
            return
        self.modeler.fit(self.Xtrain, self.ytrain)
        self.fitted = True
        print('Training accuracy: %s' % self.modeler.score(self.Xtrain, self.ytrain))
        print('')

    def report(self, filename='submit'):
        """Predict on the test table and write ``<filename>.csv``.

        The file has two columns, ``PassengerId`` (the test table's index) and
        ``Survived`` (the predictions). The modeler is fitted first if needed.
        Prints a hint and writes nothing when no modeler is attached.
        """
        if not self._has_modeler():
            return
        if not self.fitted:
            self.get_fit()
        submit = pd.DataFrame()
        submit['PassengerId'] = self.Xtest.index
        submit['Survived'] = self.modeler.predict(self.Xtest)
        submit.to_csv(filename + '.csv', index=False)

In [47]:
# Build a fresh pipeline straight from the CSV files, then grid-search SVC
# over four kernels and print the ten best settings.
pipeline = Pipeliner(
    data_wrangler=Wrangler(
        raw_dtrain=pd.read_csv('data/train.csv', index_col=0),
        raw_dtest=pd.read_csv('data/test.csv', index_col=0),
    ),
)
search_list = [
    ('SVC', SVC(), [
        dict(kernel=['linear'], C=0.01 * np.logspace(0, 5, 10, base=4), cache_size=[2000]),
        dict(kernel=['sigmoid'], C=0.01 * np.logspace(0, 5, 10, base=4), cache_size=[2000]),
        # poly additionally sweeps the polynomial degree
        dict(kernel=['poly'], C=0.01 * np.logspace(0, 5, 10, base=4),
             degree=np.arange(1, 10), cache_size=[2000]),
        # rbf additionally sweeps gamma
        dict(kernel=['rbf'], C=0.01 * np.logspace(0, 5, 10, base=4),
             gamma=np.arange(0, 0.3, 0.006), cache_size=[2000]),
    ]),
]
pipeline.grid_search(search_list, n_top=10)




Fitted succeed!

Model with rank: 1
Mean validation score: 0.837 (std: 0.022)
Parameters: {'cache_size': 2000, 'kernel': 'rbf', 'C': 1.015936673259648, 'gamma': 0.114}

Model with rank: 2
Mean validation score: 0.837 (std: 0.022)
Parameters: {'cache_size': 2000, 'kernel': 'rbf', 'C': 1.015936673259648, 'gamma': 0.12}

Model with rank: 3
Mean validation score: 0.837 (std: 0.022)
Parameters: {'cache_size': 2000, 'kernel': 'rbf', 'C': 2.1945445961038677, 'gamma': 0.059999999999999998}

Model with rank: 4
Mean validation score: 0.836 (std: 0.029)
Parameters: {'cache_size': 2000, 'kernel': 'poly', 'C': 10.24, 'degree': 3}

Model with rank: 5
Mean validation score: 0.836 (std: 0.023)
Parameters: {'cache_size': 2000, 'kernel': 'rbf', 'C': 1.015936673259648, 'gamma': 0.108}

Model with rank: 6
Mean validation score: 0.836 (std: 0.023)
Parameters: {'cache_size': 2000, 'kernel': 'rbf', 'C': 1.015936673259648, 'gamma': 0.126}

Model with rank: 7
Mean validation score: 0.836 (std: 0.021)
Parame

#  Test: 0.79904

In [48]:
# Fit an RBF SVC with the rank-3 setting from the search output above and
# write the predictions to svc_pipeline.csv.
pipeline.set_modeler(SVC(kernel='rbf', C=2.1945445961038677, gamma=0.06, cache_size=2000))
pipeline.report('svc_pipeline')

Training accuracy: 0.841750841751



In [49]:
# Display the estimator currently attached to the pipeline.
pipeline.modeler

SVC(C=2.1945445961, cache_size=2000, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.06, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

# Test all models, then go to sleep :)

In [50]:
# Long-running cell: rebuild the pipeline and grid-search SVC, random forest
# and gradient boosting in turn, printing the 30 best settings of each.
pipeline = Pipeliner(
    data_wrangler=Wrangler(
        raw_dtrain=pd.read_csv('data/train.csv', index_col=0),
        raw_dtest=pd.read_csv('data/test.csv', index_col=0),
    ),
)
search_list = [
    # Same SVC grid as the SVC-only search cell earlier in the notebook.
    ('SVC', SVC(), [
        dict(kernel=['linear'], C=0.01 * np.logspace(0, 5, 10, base=4), cache_size=[2000]),
        dict(kernel=['sigmoid'], C=0.01 * np.logspace(0, 5, 10, base=4), cache_size=[2000]),
        dict(kernel=['poly'], C=0.01 * np.logspace(0, 5, 10, base=4),
             degree=np.arange(1, 10), cache_size=[2000]),
        dict(kernel=['rbf'], C=0.01 * np.logspace(0, 5, 10, base=4),
             gamma=np.arange(0, 0.3, 0.006), cache_size=[2000]),
    ]),
    ('RF', RandomForestClassifier(), [
        dict(n_estimators=[30, 80, 150, 300, 500],
             criterion=['gini', 'entropy'],
             min_samples_leaf=[1, 2, 3, 4, 5],
             max_features=['sqrt', 'log2', None],
             max_depth=np.arange(5, 20, 1)),
    ]),
    ('GBDT', GradientBoostingClassifier(), [
        dict(loss=['deviance', 'exponential'],
             n_estimators=np.arange(100, 2000, 100),
             min_samples_leaf=[1, 2, 3, 4, 5],
             max_features=['sqrt', 'log2', None],
             max_depth=[1, 2, 3, 4, 5]),
    ]),
]
pipeline.grid_search(search_list, n_top=30)




Fitted succeed!

Model with rank: 1
Mean validation score: 0.837 (std: 0.022)
Parameters: {'cache_size': 2000, 'kernel': 'rbf', 'C': 1.015936673259648, 'gamma': 0.114}

Model with rank: 2
Mean validation score: 0.837 (std: 0.022)
Parameters: {'cache_size': 2000, 'kernel': 'rbf', 'C': 1.015936673259648, 'gamma': 0.12}

Model with rank: 3
Mean validation score: 0.837 (std: 0.022)
Parameters: {'cache_size': 2000, 'kernel': 'rbf', 'C': 2.1945445961038677, 'gamma': 0.059999999999999998}

Model with rank: 4
Mean validation score: 0.836 (std: 0.029)
Parameters: {'cache_size': 2000, 'kernel': 'poly', 'C': 10.24, 'degree': 3}

Model with rank: 5
Mean validation score: 0.836 (std: 0.023)
Parameters: {'cache_size': 2000, 'kernel': 'rbf', 'C': 1.015936673259648, 'gamma': 0.108}

Model with rank: 6
Mean validation score: 0.836 (std: 0.023)
Parameters: {'cache_size': 2000, 'kernel': 'rbf', 'C': 1.015936673259648, 'gamma': 0.126}

Model with rank: 7
Mean validation score: 0.836 (std: 0.021)
Parame

In [51]:
# Attach a gradient-boosting model, write gbdt_pipeline.csv, then display the
# estimator.
# NOTE(review): the recorded output shows the "no model" message although a
# modeler was just set; presumably this ran against an earlier Pipeliner
# definition -- confirm.
pipeline.set_modeler(GradientBoostingClassifier(
    loss='deviance',
    n_estimators=200,
    max_depth=3,
    min_samples_leaf=3,
    max_features=None,
))
pipeline.report('gbdt_pipeline')
pipeline.modeler

This pipeline hasn't prepared well for the model!


GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=3, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=200,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [52]:
# Retry of the submission write for the gradient-boosting model.
# NOTE(review): the recorded output is again the "no model" message;
# presumably executed against an earlier Pipeliner definition -- confirm.
pipeline.report('gbdt_pipeline')

This pipeline hasn't prepared well for the model!


In [54]:
# Call the fit step directly to narrow down why report() refused to run.
# NOTE(review): the recorded output is still the "no model" message -- confirm
# which Pipeliner definition was live when this ran.
pipeline.get_fit()

This pipeline hasn't prepared well for the model!


In [57]:
# Debug probe: prints 'xas' only when the attached modeler evaluates as truthy.
# NOTE(review): no output was recorded for this cell, i.e. the modeler was
# falsy at the time; looks like leftover debugging of the "no model" message --
# confirm whether the cell is still needed.
if pipeline.modeler:
    # Parenthesised single-argument form prints the same under Python 2 and 3
    # and matches the print style used in Searcher.
    print('xas')

In [58]:
# Start over with a fresh pipeline and fit the RBF SVC on the training data
# (prints the training accuracy; no submission file is written here).
pipeline = Pipeliner(
    data_wrangler=Wrangler(
        raw_dtrain=pd.read_csv('data/train.csv', index_col=0),
        raw_dtest=pd.read_csv('data/test.csv', index_col=0),
    ),
)
pipeline.set_modeler(SVC(kernel='rbf', C=2.1945445961038677, gamma=0.06, cache_size=2000))
pipeline.get_fit()

Training accuracy: 0.841750841751



#  testing: 0.75120

In [61]:
# Gradient boosting, 200 trees of depth 3, at least 3 samples per leaf;
# writes gbdt2_pipeline.csv and displays the estimator.
pipeline.set_modeler(GradientBoostingClassifier(
    loss='deviance',
    n_estimators=200,
    max_depth=3,
    min_samples_leaf=3,
    max_features=None,
))
pipeline.report('gbdt2_pipeline')
pipeline.modeler

Training accuracy: 0.918069584736



GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=3, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=200,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

#  testing: 0.73206

In [60]:
# Gradient boosting, 300 trees of depth 3, at least 2 samples per leaf;
# writes gbdt1_pipeline.csv and displays the estimator.
pipeline.set_modeler(GradientBoostingClassifier(
    loss='deviance',
    n_estimators=300,
    max_depth=3,
    min_samples_leaf=2,
    max_features=None,
))
pipeline.report('gbdt1_pipeline')
pipeline.modeler

Training accuracy: 0.949494949495



GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=2, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=300,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

#  testing: 0.74641

In [62]:
# Gradient boosting, 300 trees of depth 3, at least 3 samples per leaf;
# writes gbdt3_pipeline.csv and displays the estimator.
pipeline.set_modeler(GradientBoostingClassifier(
    loss='deviance',
    n_estimators=300,
    max_depth=3,
    min_samples_leaf=3,
    max_features=None,
))
pipeline.report('gbdt3_pipeline')
pipeline.modeler

Training accuracy: 0.945005611672



GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=3, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=300,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

# testing: 0.76555

In [64]:
# Gradient boosting, 600 shallower trees (depth 2) with log2 feature sampling;
# writes gbdt8_pipeline.csv and displays the estimator.
pipeline.set_modeler(GradientBoostingClassifier(
    loss='deviance',
    n_estimators=600,
    max_depth=2,
    min_samples_leaf=3,
    max_features='log2',
))
pipeline.report('gbdt8_pipeline')
pipeline.modeler

Training accuracy: 0.885521885522



GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=2, max_features='log2', max_leaf_nodes=None,
              min_samples_leaf=3, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=600,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

#  testing: 0.79426

In [66]:
# RBF SVC with the rank-2 setting from the search output above;
# writes svc2_pipeline.csv and displays the estimator.
pipeline.set_modeler(SVC(kernel='rbf', C=1.015936673259648, gamma=0.12, cache_size=2000))
pipeline.report('svc2_pipeline')
pipeline.modeler

Training accuracy: 0.842873176207



SVC(C=1.01593667326, cache_size=2000, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.12, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

#  testing: 0.77990

In [67]:
# Random forest, 150 gini trees of depth 8, at least 3 samples per leaf;
# writes rf4_pipeline.csv and displays the estimator.
pipeline.set_modeler(RandomForestClassifier(
    n_estimators=150,
    criterion='gini',
    max_depth=8,
    min_samples_leaf=3,
    max_features=None,
))
pipeline.report('rf4_pipeline')
pipeline.modeler

Training accuracy: 0.905723905724



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=8, max_features=None, max_leaf_nodes=None,
            min_samples_leaf=3, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=150, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

#  testing: 0.78947

In [68]:
# Random forest, 30 gini trees of depth 8, at least 4 samples per leaf;
# writes rf6_pipeline.csv and displays the estimator.
pipeline.set_modeler(RandomForestClassifier(
    n_estimators=30,
    criterion='gini',
    max_depth=8,
    min_samples_leaf=4,
    max_features=None,
))
pipeline.report('rf6_pipeline')
pipeline.modeler

Training accuracy: 0.8911335578



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=8, max_features=None, max_leaf_nodes=None,
            min_samples_leaf=4, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [None]:
# Scratch cell (never executed): the same RandomForest parameter set that the
# rf6_pipeline cell above passes to the classifier.
{'max_features': None, 'n_estimators': 30, 'criterion': 'gini', 'max_depth': 8, 'min_samples_leaf': 4}