# Titanic: Machine Learning from Disaster
## Data exploration section

In [91]:
#import various libraries which we will use
import pandas as pd
import numpy as np

#scikit-learn modules
from sklearn import preprocessing, cross_validation, ensemble

In [92]:
# Read the training dataset (the path is relative to the notebook's working directory)
titanic = pd.read_csv("data/train.csv")

In [93]:
#display the first rows of the dataframe to get a feel for the columns
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S


In [94]:
#quick description of the df.
#only numerics will be displayed.
#and the data types inferred by pandas
def show_titanic(df=None):
    """Print summary statistics and column dtypes of a dataframe.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame to describe. When omitted, the notebook-level ``titanic``
        frame is used, so existing ``show_titanic()`` calls keep working.

    Returns
    -------
    None
        The ``describe()`` table, a blank line and the dtypes are written
        to stdout.
    """
    if df is None:
        df = titanic
    #single-argument print(...) behaves the same on Python 2 and Python 3
    print(df.describe())
    print("")
    print(df.dtypes)
    
#print the summary statistics and dtypes of the training data
show_titanic()

       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.699118    0.523008   
std     257.353842    0.486592    0.836071   14.526497    1.102743   
min       1.000000    0.000000    1.000000    0.420000    0.000000   
25%     223.500000    0.000000    2.000000   20.125000    0.000000   
50%     446.000000    0.000000    3.000000   28.000000    0.000000   
75%     668.500000    1.000000    3.000000   38.000000    1.000000   
max     891.000000    1.000000    3.000000   80.000000    8.000000   

            Parch        Fare  
count  891.000000  891.000000  
mean     0.381594   32.204208  
std      0.806057   49.693429  
min      0.000000    0.000000  
25%      0.000000    7.910400  
50%      0.000000   14.454200  
75%      0.000000   31.000000  
max      6.000000  512.329200  

PassengerId      int64
Survived         int64
Pclass           int64
Name       

In [95]:
#Do we have any missing values?
#count the nulls per column; print(...) with one argument works on Python 2 and 3
print(titanic.isnull().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [96]:
#which ports of embarkation appear in the data (missing values included)?
titanic['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [97]:
#show the passengers whose port of embarkation is missing
titanic[titanic['Embarked'].isnull()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
61,62,1,1,"Icard, Miss. Amelie",female,38,0,0,113572,80,B28,
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62,0,0,113572,80,B28,


In [98]:
#We might want to look at the cabin also. The leading letter might denote a "floor" (deck).
#Lowest floor probably didn't get to the top based on the movie :)
#print(...) with one argument works on Python 2 and 3
print(titanic.Cabin.unique())

#extract the first letter in cabin
#rows without a cabin stay NaN; multi-cabin entries such as 'F G73' keep only the first letter
titanic['Cabin_prefix'] = titanic.Cabin.str[0]

[nan 'C85' 'C123' 'E46' 'G6' 'C103' 'D56' 'A6' 'C23 C25 C27' 'B78' 'D33'
 'B30' 'C52' 'B28' 'C83' 'F33' 'F G73' 'E31' 'A5' 'D10 D12' 'D26' 'C110'
 'B58 B60' 'E101' 'F E69' 'D47' 'B86' 'F2' 'C2' 'E33' 'B19' 'A7' 'C49' 'F4'
 'A32' 'B4' 'B80' 'A31' 'D36' 'D15' 'C93' 'C78' 'D35' 'C87' 'B77' 'E67'
 'B94' 'C125' 'C99' 'C118' 'D7' 'A19' 'B49' 'D' 'C22 C26' 'C106' 'C65'
 'E36' 'C54' 'B57 B59 B63 B66' 'C7' 'E34' 'C32' 'B18' 'C124' 'C91' 'E40'
 'T' 'C128' 'D37' 'B35' 'E50' 'C82' 'B96 B98' 'E10' 'E44' 'A34' 'C104'
 'C111' 'C92' 'E38' 'D21' 'E12' 'E63' 'A14' 'B37' 'C30' 'D20' 'B79' 'E25'
 'D46' 'B73' 'C95' 'B38' 'B39' 'B22' 'C86' 'C70' 'A16' 'C101' 'C68' 'A10'
 'E68' 'B41' 'A20' 'D19' 'D50' 'D9' 'A23' 'B50' 'A26' 'D48' 'E58' 'C126'
 'B71' 'B51 B53 B55' 'D49' 'B5' 'B20' 'F G63' 'C62 C64' 'E24' 'C90' 'C45'
 'E8' 'B101' 'D45' 'C46' 'D30' 'E121' 'D11' 'E77' 'F38' 'B3' 'D6' 'B82 B84'
 'D17' 'A36' 'B102' 'B69' 'E49' 'C47' 'D28' 'E17' 'A24' 'C50' 'B42' 'C148']


## Feature creation

In [99]:
#Age has a lot of missing values (177 of the 891 rows).
#As a simple baseline we impute the overall median age into a new column,
#leaving the original Age column untouched.
titanic["Age_imputed_median"] = titanic["Age"].fillna(titanic["Age"].median())

In [100]:
#We can probably do better by looking at the names of people (specifically their title)
#Names look like "Surname, Title. Given names", so the title is the text
#between the comma and the first period.
titanic['Title'] = (
    titanic['Name']
    .str.split(',').str[1]   #everything after the surname
    .str.split('.').str[0]   #everything before the first period
    .str.strip()             #drop the leading space left by the comma split
)

#This will get us all the unique titles
#print(...) with one argument works on Python 2 and 3
print(titanic.Title.unique())

['Mr' 'Mrs' 'Miss' 'Master' 'Don' 'Rev' 'Dr' 'Mme' 'Ms' 'Major' 'Lady'
 'Sir' 'Mlle' 'Col' 'Capt' 'the Countess' 'Jonkheer']


In [101]:
#Median age per title, computed only from the rows where Age is known.
#The aggregated column is renamed to better_age so it can serve as a lookup
#without clashing with the original Age column.
title_groupby = (
    titanic[titanic['Age'].notnull()]
    .groupby('Title', as_index=False)
    .Age.median()
    .rename(columns={'Age': 'better_age'})
)

#show the lookup table
title_groupby

Unnamed: 0,Title,better_age
0,Capt,70.0
1,Col,58.0
2,Don,40.0
3,Dr,46.5
4,Jonkheer,38.0
5,Lady,48.0
6,Major,48.5
7,Master,3.5
8,Miss,21.0
9,Mlle,24.0


In [102]:
#Attach the title-based median age to every passenger.
#Series.map overwrites the better_age column instead of adding a new one, so
#re-running this cell does not create better_age_x / better_age_y duplicates
#the way a repeated merge would. Row order and index are left unchanged.
titanic['better_age'] = titanic['Title'].map(
    title_groupby.set_index('Title')['better_age']
)

In [103]:
#set the missing ages to the title-based median;
#if a title has no known ages at all (better_age is NaN) fall back to the
#overall median so that no NaN is left in the model feature
titanic["better_age_imputed_median"] = (
    titanic["Age"]
    .fillna(titanic["better_age"])
    .fillna(titanic["Age"].median())
)

#compare both imputations on the rows where Age was originally missing
titanic.loc[titanic.Age.isnull(), ['Age','Age_imputed_median','better_age_imputed_median']].head()

Unnamed: 0,Age,Age_imputed_median,better_age_imputed_median
5,,28,30
17,,28,30
19,,28,35
26,,28,30
28,,28,21


In [104]:
#We should do some label encoding to ensure we have all numerics
list_to_encode = ['Sex', 'Embarked', 'Cabin_prefix', 'Title']

for feature in list_to_encode:
    le = preprocessing.LabelEncoder()
    #Embarked and Cabin_prefix contain NaN. Mixing NaN (a float) with strings makes
    #the encoder's sort order undefined on Python 2 and an error on Python 3, so
    #missing values are turned into an explicit 'Unknown' category first.
    #On a re-run the column is already numeric without NaN and fillna is a no-op.
    titanic[feature] = le.fit_transform(titanic[feature].fillna('Unknown'))

In [105]:
#3-fold cross validation over the row positions, shuffled with a fixed seed
kf = cross_validation.KFold(
    len(titanic),
    n_folds=3,
    shuffle=True,
    random_state=13,
)

#column names of the model inputs (X) and of the target (y)
X = [
    "Pclass",
    "Sex",
    "better_age_imputed_median",
    "SibSp",
    "Parch",
    "Fare",
    "Embarked",
    "Cabin_prefix",
    'Title',
]
y = ["Survived"]

#collects one list of (actual, predicted) pairs per fold once we score
results = []

In [106]:
#create our classifier with n trees in the Forest
#a fixed random_state makes the fold scores reproducible between runs
rfc = ensemble.RandomForestClassifier(n_estimators=200, random_state=13)

#start from an empty list so re-running this cell does not append duplicate folds
results = []

#loop through the kfold for train and testing
#split them out and run the Random Forest
#at the end we want to zip our results together so we can compare
for train, test in kf:
    #train / test are arrays of row positions, hence .iloc
    X_train = titanic[X].iloc[train]
    y_train = titanic[y].iloc[train]
    X_test = titanic[X].iloc[test]
    y_test = titanic[y].iloc[test]

    rfc.fit(X_train.values, y_train.Survived.values)
    predi = rfc.predict(X_test.values)
    #list(...) keeps the (actual, predicted) pairs reusable even where zip
    #returns a one-shot iterator (Python 3)
    results.append(list(zip(y_test.Survived.values, predi)))

In [107]:
#function to loop through the results and output the accuracy
#expects a list of folds, each an iterable of (actual, predicted) tuples (zipped)
def cf_validation(results):
    """Print the accuracy of every cross-validation fold.

    Parameters
    ----------
    results : iterable
        One entry per fold; each entry is an iterable of
        ``(actual, predicted)`` pairs, e.g. the output of ``zip``.

    Returns
    -------
    None
        One line per fold is written to stdout. A fold without any pairs
        is reported as such instead of raising ``ZeroDivisionError``.
    """
    for idx, fold in enumerate(results):
        #materialise the fold so one-shot iterators (zip on Python 3) can be counted
        pairs = list(fold)
        if not pairs:
            print("CrossFold {0} has no predictions to score".format(idx))
            continue
        hits = sum(1 for actual, predicted in pairs if actual == predicted)
        #float(...) keeps true division on Python 2 as well
        print("CrossFold {0} Accuracy is: {1:.2%}".format(idx, float(hits) / len(pairs)))
    
#report the accuracy of each fold collected in results
cf_validation(results)

CrossFold 0 Accuracy is: 81.48%
CrossFold 1 Accuracy is: 80.13%
CrossFold 2 Accuracy is: 82.49%


In [108]:
#Next steps:
#Create a pipeline which will use an RF, LinReg, and GB.
#See if an ensemble of ensembles will do better?