In [1]:
import pandas as pd
import numpy as np
import pylab as plt

In [2]:
from sklearn import cross_validation
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier,GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

In [3]:
class Wrangler(object):
    """Build model-ready feature matrices from the raw train and test frames.

    The two raw frames are concatenated so that imputation, scaling and
    one-hot encoding are computed once over the combined data, then the
    result is split back by index label.

    Parameters
    ----------
    raw_dtrain : pandas.DataFrame
        Training rows. Must provide the columns ``SibSp``, ``Parch``,
        ``Pclass``, ``Fare``, ``Name``, ``Cabin``, ``Sex``, ``Age``,
        ``Embarked`` and ``Survived``.
    raw_dtest : pandas.DataFrame
        Test rows with the same feature columns (``Survived`` is not read).

    Attributes
    ----------
    raw : the concatenation of ``raw_dtrain`` and ``raw_dtest``.
    processed : the engineered features for every row of ``raw``.
    mean, std : per-column statistics used to scale the numeric features
        (computed over train and test together).
    Xtrain, Xtest : rows of ``processed`` selected by each raw frame's index.
    ytrain : the ``Survived`` column of ``raw_dtrain``.

    Raises
    ------
    ValueError
        If the combined index labels are not unique; the train/test split is
        label based and would silently mix rows otherwise.
    """

    # Age used to fill a missing ``Age``, keyed by the title parsed from
    # ``Name`` (per the original comment: the average age for that title).
    TITLE_AGE = {'Capt': 70.0,
                 'Col': 54.0,
                 'Don': 40.0,
                 'Dona': 39.0,
                 'Dr': 43.571428571428569,
                 'Jonkheer': 38.0,
                 'Lady': 48.0,
                 'Major': 48.5,
                 'Master': 5.4826415094339627,
                 'Miss': 21.774238095238097,
                 'Mlle': 24.0,
                 'Mme': 24.0,
                 'Mr': 32.252151462994838,
                 'Mrs': 36.994117647058822,
                 'Ms': 28.0,
                 'Rev': 41.25,
                 'Sir': 49.0,
                 'the Countess': 33.0}

    # Rare titles merged into common ones before one-hot encoding.
    TITLE = {'Major': 'Army',
             'the Countess': 'Upper',
             'Don': 'Mr',
             'Sir': 'Upper',
             'Mlle': 'Upper',
             'Capt': 'Upper',
             'Ms': 'Miss',
             'Jonkheer': 'Upper',
             'Col': 'Army',
             'Lady': 'Upper',
             'Mme': 'Upper',
             'Dona': 'Upper'}

    _NUMERIC = ['Age', 'SibSp', 'Parch', 'Fare']
    _CATEGORICAL = ['Embarked', 'Pclass', 'Cabin', 'Title']

    @staticmethod
    def _extract_title(name):
        """Return the text between the first ', ' and the following '.' of a name."""
        return name.split(',')[1].split('.')[0][1:]

    def __init__(self, raw_dtrain, raw_dtest):
        self.raw_dtrain = raw_dtrain
        self.raw_dtest = raw_dtest
        self.raw = pd.concat([raw_dtrain, raw_dtest])
        if not self.raw.index.is_unique:
            raise ValueError('raw_dtrain and raw_dtest must have unique, '
                             'non-overlapping index labels')

        raw = self.raw
        processed = raw[['SibSp', 'Parch', 'Pclass', 'Fare']].copy()
        processed['Title'] = raw['Name'].map(self._extract_title)
        # First character of the cabin code; a missing cabin (NaN) becomes 'n'.
        processed['Cabin'] = raw['Cabin'].map(lambda cabin: str(cabin)[0])
        # male: 0, anything else: 1
        processed['Sex'] = raw['Sex'].map(lambda sex: 0 if sex == 'male' else 1)

        # --- deal with NaN and 0 -------------------------------------------
        # Missing ages take the per-title value; titles absent from the table
        # fall back to the mean of the observed ages instead of raising.
        title_age = processed['Title'].map(self.TITLE_AGE)
        title_age = title_age.fillna(raw['Age'].mean())
        processed['Age'] = raw['Age'].fillna(title_age)

        processed['Embarked'] = raw['Embarked'].fillna('S')  # the most frequent item

        # Missing fares, then zero fares, are replaced by the mean fare of the
        # passenger's Pclass. The second mean is taken after the first fill
        # and still includes the zero fares themselves.
        pclass = processed['Pclass']
        fare = processed['Fare']
        fare = fare.fillna(fare.groupby(pclass).transform('mean'))
        fare = fare.where(fare != 0, fare.groupby(pclass).transform('mean'))
        processed['Fare'] = fare

        # --- normalization ---------------------------------------------------
        self.mean = processed[self._NUMERIC].mean()
        self.std = processed[self._NUMERIC].std()
        # A constant column has std 0; divide by 1 there so it scales to 0
        # rather than NaN. ``self.std`` itself keeps the raw statistic.
        safe_std = self.std.where(self.std != 0, 1.0)
        processed[self._NUMERIC] = (processed[self._NUMERIC] - self.mean) / safe_std

        # then also merge some rare titles into common ones
        processed['Title'] = processed['Title'].map(
            lambda title: self.TITLE.get(title, title))

        # --- categorical features -> dummy features -------------------------
        # Cast to int so the columns are 0/1 regardless of the pandas version
        # (newer releases return booleans by default).
        dummies = [pd.get_dummies(processed[col], prefix=col).astype(int)
                   for col in self._CATEGORICAL]
        processed = pd.concat(
            [processed.drop(self._CATEGORICAL, axis=1)] + dummies, axis=1)
        self.processed = processed

        # --- export X, y ------------------------------------------------------
        # Label-based selection; ``.loc`` replaces the removed ``.ix`` indexer.
        self.Xtrain = processed.loc[raw_dtrain.index, :]
        self.Xtest = processed.loc[raw_dtest.index, :]
        self.ytrain = raw_dtrain['Survived']

In [4]:
# Read both raw CSV files, using the first column of each as the index.
raw_dtrain, raw_dtest = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('data/train.csv', 'data/test.csv')
)

In [5]:
# Preview the engineered test features; show only the first rows rather than
# rendering the whole frame.
Wrangler(raw_dtrain, raw_dtest).Xtest.head(10)

Unnamed: 0_level_0,SibSp,Parch,Fare,Sex,Age,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,...,Cabin_T,Cabin_n,Title_Army,Title_Dr,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Rev,Title_Upper
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
892,-0.478904,-0.444829,-0.503321,0,0.348884,0,1,0,0,0,...,0,1,0,0,0,0,1,0,0,0
893,0.481104,-0.444829,-0.519338,1,1.296298,0,0,1,0,0,...,0,1,0,0,0,0,0,1,0,0
894,-0.478904,-0.444829,-0.467425,0,2.433196,0,1,0,0,1,...,0,1,0,0,0,0,1,0,0,0
895,-0.478904,-0.444829,-0.487225,0,-0.219565,0,0,1,0,0,...,0,1,0,0,0,0,1,0,0,0
896,0.481104,0.710492,-0.417203,1,-0.598531,0,0,1,0,0,...,0,1,0,0,0,0,0,1,0,0
897,-0.478904,-0.444829,-0.476359,0,-1.204876,0,0,1,0,0,...,0,1,0,0,0,0,1,0,0,0
898,-0.478904,-0.444829,-0.507184,1,0.007815,0,1,0,0,0,...,0,1,0,0,0,1,0,0,0,0
899,0.481104,0.710492,-0.094378,0,-0.295358,0,0,1,0,1,...,0,1,0,0,0,0,1,0,0,0
900,-0.478904,-0.444829,-0.514911,1,-0.901703,1,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
901,1.441112,-0.444829,-0.188062,0,-0.674324,0,0,1,0,0,...,0,1,0,0,0,0,1,0,0,0


In [6]:
class Pipeliner(object):
    """Pair prepared data with an estimator: fit on demand, then write predictions.

    Parameters
    ----------
    data : object
        Must expose ``Xtrain``, ``Xtest`` and ``ytrain`` attributes.
    model : object
        Must expose ``fit(X, y)``, ``score(X, y)`` and ``predict(X)``.
    """

    def __init__(self, data, model):
        self.fitted = False

        self.wrangler = data
        self.modeler = model

        self.Xtrain = self.wrangler.Xtrain
        self.Xtest = self.wrangler.Xtest
        self.ytrain = self.wrangler.ytrain

    def get_fit(self):
        """Fit the model on the training data and print its training accuracy."""
        self.modeler.fit(self.Xtrain, self.ytrain)
        self.fitted = True
        accuracy = self.modeler.score(self.Xtrain, self.ytrain)
        # Single-argument print calls produce the same text on Python 2 and 3
        # (the former print statement is a SyntaxError on Python 3).
        print('Training accuracy: {0}'.format(accuracy))
        print('')

    def report(self, filename='submit'):
        """Write test-set predictions to ``filename + '.csv'``.

        The model is fitted first if ``get_fit`` has not been called yet.
        The CSV has two columns: ``PassengerId`` (the index of ``Xtest``)
        and ``Survived`` (the model's predictions), without a row index.
        """
        if not self.fitted:
            self.get_fit()
        submit = pd.DataFrame()
        submit['PassengerId'] = self.Xtest.index
        submit['Survived'] = self.modeler.predict(self.Xtest)
        submit.to_csv(filename + '.csv', index=False)

In [7]:
# Assemble the pipeline: freshly loaded and wrangled data plus a gradient
# boosting classifier with explicit hyper-parameters.
pipeline = Pipeliner(
    data=Wrangler(
        raw_dtrain=pd.read_csv('data/train.csv', index_col=0),
        raw_dtest=pd.read_csv('data/test.csv', index_col=0)
    ),
    model=GradientBoostingClassifier(
        loss='deviance',
        n_estimators=200,
        max_depth=3,
        min_samples_leaf=3,
        max_features=None
    )
)

In [8]:
# Fit the model if it has not been fitted yet, then write the predictions
# to 'submit_gbc.csv'.
pipeline.report(filename='submit_gbc')

Training accuracy: 0.918069584736



In [9]:
# Display the estimator held by the pipeline; its repr lists the
# hyper-parameters in effect.
pipeline.modeler

GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=3, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=200,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)