In [2]:
import graphlab
import pandas as pd
import numpy as np
import pylab as plt

In [3]:
class Wrangler(object):
    """Turn the raw Titanic train/test frames into model-ready feature matrices.

    Both frames are concatenated first, so the imputation statistics, the
    normalisation mean/std and the set of dummy columns are computed over
    train and test together. The processed frame is then split back into a
    train part and a test part.

    Parameters
    ----------
    raw_dtrain : pandas.DataFrame
        Training rows. The columns ``Name``, ``Sex``, ``Age``, ``SibSp``,
        ``Parch``, ``Pclass``, ``Fare``, ``Cabin``, ``Embarked`` and
        ``Survived`` are read.
    raw_dtest : pandas.DataFrame
        Test rows with the same feature columns (``Survived`` is not read).

    Attributes
    ----------
    raw_dtrain, raw_dtest : pandas.DataFrame
        The inputs, stored unchanged.
    raw : pandas.DataFrame
        ``raw_dtrain`` stacked on top of ``raw_dtest``.
    processed : pandas.DataFrame
        Engineered features for all rows (train first, then test).
    mean, std : pandas.Series
        Per-column statistics used to standardise ``Age``, ``SibSp``,
        ``Parch`` and ``Fare``.
    Xtrain, Xtest : pandas.DataFrame
        The rows of ``processed`` that came from ``raw_dtrain`` / ``raw_dtest``.
    ytrain : pandas.Series
        ``raw_dtrain['Survived']``.
    """

    # Redundant with the module-level import (names bound in a class body are
    # not visible inside methods); kept only so ``Wrangler.pd`` keeps existing.
    import pandas as pd

    def __init__(self, raw_dtrain, raw_dtest):
        # Age used to impute a missing ``Age``, keyed by the title parsed
        # from ``Name``.
        TITLE_AGE = {'Capt': 70.0,
                     'Col': 54.0,
                     'Don': 40.0,
                     'Dona': 39.0,
                     'Dr': 43.571428571428569,
                     'Jonkheer': 38.0,
                     'Lady': 48.0,
                     'Major': 48.5,
                     'Master': 5.4826415094339627,
                     'Miss': 21.774238095238097,
                     'Mlle': 24.0,
                     'Mme': 24.0,
                     'Mr': 32.252151462994838,
                     'Mrs': 36.994117647058822,
                     'Ms': 28.0,
                     'Rev': 41.25,
                     'Sir': 49.0,
                     'the Countess': 33.0}

        # Rare titles are merged into a broader group before one-hot encoding.
        TITLE = {'Major': 'Army',
                 'the Countess': 'Upper',
                 'Don': 'Mr',
                 'Sir': 'Upper',
                 'Mlle': 'Upper',
                 'Capt': 'Upper',
                 'Ms': 'Miss',
                 'Jonkheer': 'Upper',
                 'Col': 'Army',
                 'Lady': 'Upper',
                 'Mme': 'Upper',
                 'Dona': 'Upper'}

        numeric_cols = ['Age', 'SibSp', 'Parch', 'Fare']

        self.raw_dtrain = raw_dtrain
        self.raw_dtest = raw_dtest
        self.raw = pd.concat([raw_dtrain, raw_dtest])

        # Start from a copy of the pass-through columns instead of filling an
        # empty DataFrame through a multi-column assignment.
        self.processed = self.raw[['SibSp', 'Parch', 'Pclass', 'Fare']].copy()
        # "Braund, Mr. Owen Harris" -> "Mr": text between the comma and the
        # first period, minus the leading space.
        self.processed['Title'] = self.raw['Name'].map(
            lambda x: x.split(',')[1].split('.')[0][1:])
        # First character of the cabin string; a missing cabin is stringified
        # first, so it ends up as the first letter of that string.
        self.processed['Cabin'] = self.raw['Cabin'].map(lambda x: str(x)[0])
        self.processed['Sex'] = self.raw['Sex'].map(
            lambda x: 0 if x == 'male' else 1)  # male: 0, female: 1

        # --- missing values and zero fares ---------------------------------
        # Missing ages take the tabulated age of the passenger's title. This
        # is done with index-aligned fills rather than ``groupby().apply()``,
        # whose result index depends on the pandas version. Titles missing
        # from TITLE_AGE fall back to the mean of the observed ages instead of
        # raising KeyError.
        raw_age = self.raw['Age']
        age_by_title = self.processed['Title'].map(TITLE_AGE)
        self.processed['Age'] = raw_age.fillna(age_by_title).fillna(raw_age.mean())

        self.processed['Embarked'] = self.raw['Embarked'].fillna('S')  # the most frequent item

        # Missing fares become the mean fare of the passenger's Pclass ...
        pclass = self.processed['Pclass']
        fare = self.processed['Fare']
        fare = fare.fillna(fare.groupby(pclass).transform('mean'))
        # ... and so do zero fares. The mean is recomputed after the fill and
        # still includes the zeros, exactly as ``g.replace(0, g.mean())`` did.
        fare = fare.mask(fare == 0, fare.groupby(pclass).transform('mean'))
        self.processed['Fare'] = fare

        # --- standardisation -----------------------------------------------
        # Statistics are taken over train and test rows together.
        self.mean = self.processed[numeric_cols].mean()
        self.std = self.processed[numeric_cols].std()
        self.processed[numeric_cols] = (self.processed[numeric_cols] - self.mean) / self.std

        # Merge rare titles into common ones; unknown titles are kept as-is.
        self.processed['Title'] = self.processed['Title'].map(lambda x: TITLE.get(x, x))

        # --- one-hot encode the categorical features -------------------------
        category_Embarked = pd.get_dummies(self.processed['Embarked'], prefix='Embarked')
        category_Pclass = pd.get_dummies(self.processed['Pclass'], prefix='Pclass')
        category_Cabin = pd.get_dummies(self.processed['Cabin'], prefix='Cabin')
        category_Title = pd.get_dummies(self.processed['Title'], prefix='Title')
        self.processed = pd.concat(
            [self.processed, category_Embarked, category_Pclass, category_Cabin, category_Title],
            axis=1)
        # The source columns are redundant once encoded.
        self.processed = self.processed.drop(['Embarked', 'Pclass', 'Cabin', 'Title'], axis=1)

        # --- export X, y -----------------------------------------------------
        # ``.ix`` no longer exists in pandas >= 1.0. Train rows were
        # concatenated first, so a positional split returns the same rows
        # without relying on the index labels.
        n_train = len(raw_dtrain)
        self.Xtrain = self.processed.iloc[:n_train, :]
        self.Xtest = self.processed.iloc[n_train:, :]
        self.ytrain = self.raw_dtrain['Survived']

In [4]:
# pandas copies of both data splits; the first CSV column is used as the row index.
# A generator expression is used so no loop variable is left in the namespace.
raw_dtrain, raw_dtest = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('data/train.csv', 'data/test.csv')
)

In [8]:
# Load the training CSV into a GraphLab SFrame (column types are inferred, see the log below).
raw_train = graphlab.SFrame('data/train.csv')
# Preview the first rows as this cell's output.
raw_train.head()

------------------------------------------------------
Inferred types from first line of file as 
column_type_hints=[long,long,long,str,str,float,long,long,str,float,str,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs ...",female,38.0,1,0,PC 17599
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel) ...",female,35.0,1,0,113803
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450
6,0,3,"Moran, Mr. James",male,,0,0,330877
7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463
8,0,3,"Palsson, Master. Gosta Leonard ...",male,2.0,3,1,349909
9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina ...",female,27.0,0,2,347742
10,1,2,"Nasser, Mrs. Nicholas (Adele Achem) ...",female,14.0,1,0,237736

Fare,Cabin,Embarked
7.25,,S
71.2833,C85,C
7.925,,S
53.1,C123,S
8.05,,S
8.4583,,Q
51.8625,E46,S
21.075,,S
11.1333,,S
30.0708,,C


In [9]:
# Baseline model: boosted trees fitted on the raw, unprocessed columns of raw_train.
naive_gbm = graphlab.boosted_trees_classifier.create(
    raw_train,
    target='Survived',
    features=['Pclass', 'Sex', 'Age', 'Cabin', 'Embarked', 'Fare', 'SibSp', 'Parch'],
    max_iterations=88,
)

PROGRESS: Creating a validation set from 5 percent of training data. This may take a while.
          You can set ``validation_set=None`` to disable validation tracking.



In [10]:
# Load the test CSV into an SFrame so the baseline model can score it.
raw_test = graphlab.SFrame('data/test.csv')

------------------------------------------------------
Inferred types from first line of file as 
column_type_hints=[long,long,str,str,float,long,long,str,float,str,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


In [11]:
# Empty SFrame that collects the columns of the submission file.
submit = graphlab.SFrame()

In [12]:
# Copy the passenger ids over from the test set.
submit['PassengerId'] = raw_test['PassengerId']

In [15]:
# Store the baseline model's predictions for the test rows as the 'Survived' column.
submit['Survived'] = naive_gbm.predict(raw_test)

In [16]:
# Write the submission frame to naive_gbm.csv (path relative to the working directory).
submit.export_csv('naive_gbm.csv')

# 0.74163
The naive model trained on the raw columns scores 0.74163, so data wrangling is clearly worthwhile : )