In [1]:
# Determine whether a Titanic passenger would have survived, given their age, passenger class, and sex

import csv
import os
import numpy as np

# Load the raw CSV into NumPy string arrays: one row per passenger in
# titanic_X, and the matching 'survived' value in titanic_y.
import sys  # local import: only needed to pick the right file mode below

# The csv module needs a binary handle on Python 2, but a text handle opened
# with newline='' on Python 3.
if sys.version_info[0] < 3:
    csvfile = open('../data/titanic.csv', 'rb')
else:
    csvfile = open('../data/titanic.csv', 'r', newline='')

with csvfile:
    titanic_reader = csv.reader(csvfile, delimiter=',', quotechar='"')

    # First row has headers. The next() builtin works on Python 2.6+ and 3,
    # whereas the reader's .next() method exists only on Python 2.
    row = next(titanic_reader)
    feature_names = np.array(row)

    # Load dataset, target classes
    titanic_X, titanic_y = [], []
    for row in titanic_reader:
        titanic_X.append(row)
        titanic_y.append(row[2])  # column 2 is 'survived' (see printed header)

    titanic_X = np.array(titanic_X)
    titanic_y = np.array(titanic_y)

print(feature_names)
print(titanic_X[0], titanic_y[0])

['row.names' 'pclass' 'survived' 'name' 'age' 'embarked' 'home.dest' 'room'
 'ticket' 'boat' 'sex']
(array(['1', '1st', '1', 'Allen, Miss Elisabeth Walton', '29.0000',
       'Southampton', 'St Louis, MO', 'B-5', '24160 L221', '2', 'female'], 
      dtype='|S62'), '1')


In [2]:
# Just taking a look at the data in pandas

import pandas as pd

# Read the same CSV into a DataFrame and preview its first five rows.
titanic_csv_path = '../data/titanic.csv'
df = pd.read_csv(titanic_csv_path)
df.head(5)

Unnamed: 0,row.names,pclass,survived,name,age,embarked,home.dest,room,ticket,boat,sex
0,1,1st,1,"Allen, Miss Elisabeth Walton",29.0,Southampton,"St Louis, MO",B-5,24160 L221,2,female
1,2,1st,0,"Allison, Miss Helen Loraine",2.0,Southampton,"Montreal, PQ / Chesterville, ON",C26,,,female
2,3,1st,0,"Allison, Mr Hudson Joshua Creighton",30.0,Southampton,"Montreal, PQ / Chesterville, ON",C26,,(135),male
3,4,1st,0,"Allison, Mrs Hudson J.C. (Bessie Waldo Daniels)",25.0,Southampton,"Montreal, PQ / Chesterville, ON",C26,,,female
4,5,1st,1,"Allison, Master Hudson Trevor",0.9167,Southampton,"Montreal, PQ / Chesterville, ON",C22,,11,male


In [3]:
# Preprocessing data: keep only the three attributes used as features.
selected_features = ['pclass', 'age', 'sex']

# Look the columns up by header name instead of hard-coding [1, 4, 10].
# With the header printed by the loading cell this yields the same indices,
# and re-running this cell is harmless: the names then resolve to [0, 1, 2].
# A missing name raises ValueError rather than silently picking a wrong column.
selected_columns = [list(feature_names).index(name) for name in selected_features]
titanic_X = titanic_X[:, selected_columns]
feature_names = feature_names[selected_columns]

print(feature_names)

# The age of this record is missing and has 'NA' instead
print(titanic_X[12], titanic_y[12])

['pclass' 'age' 'sex']
(array(['1st', 'NA', 'female'], 
      dtype='|S62'), '1')


In [4]:
# Let's fix the age data then: replace every 'NA' age with the mean of the
# ages that are present.
ages = titanic_X[:, 1]
missing_age = ages == 'NA'  # computed once, before the column is modified

# Use the builtin float: the np.float alias was deprecated in NumPy 1.20 and
# removed in NumPy 1.24.
mean_age = np.mean(ages[~missing_age].astype(float))

# titanic_X is still a string array, so the mean is stored as text here.
titanic_X[missing_age, 1] = mean_age

# Now let's see this record that we know to have 'NA'
print(titanic_X[12], titanic_y[12])

(array(['1st', '31.1941810427', 'female'], 
      dtype='|S62'), '1')


In [5]:
# Let's fix the data for sex to be numerical
from sklearn.preprocessing import LabelEncoder
# Fit a label encoder on the 'sex' column (index 2) and show the mapping
# between the category strings and their integer codes.
sex_encoder = LabelEncoder().fit(titanic_X[:, 2])
print("Categorical classes:", sex_encoder.classes_)
print("Integer classes:", sex_encoder.transform(sex_encoder.classes_))

# Overwrite the column with its integer codes; titanic_X is still a string
# array, so the codes are stored as text.
sex_codes = sex_encoder.transform(titanic_X[:, 2])
titanic_X[:, 2] = sex_codes

print(feature_names)
print(titanic_X[12], titanic_y[12])

('Categorical classes:', array(['female', 'male'], 
      dtype='|S62'))
('Integer classes:', array([0, 1]))
['pclass' 'age' 'sex']
(array(['1st', '31.1941810427', '0'], 
      dtype='|S62'), '1')


In [8]:
# One-hot encode the categorical 'pclass' attribute so that every feature is numerical

from sklearn.preprocessing import OneHotEncoder
# Step 1: map the 'pclass' strings (column 0) to integer codes.
enc = LabelEncoder()
label_encoder = enc.fit(titanic_X[:, 0])
print("Categorical classes:", label_encoder.classes_)
# reshape(-1, 1) builds the column vector whatever the number of distinct
# classes is, instead of hard-coding three.
integer_classes = label_encoder.transform(label_encoder.classes_).reshape(-1, 1)
print("Integer classes:", integer_classes)

# Step 2: one-hot encode the integer codes, one output column per class.
enc = OneHotEncoder()
one_hot_encoder = enc.fit(integer_classes)
num_of_rows = titanic_X.shape[0]
t = label_encoder.transform(titanic_X[:, 0]).reshape(num_of_rows, 1)
new_features = one_hot_encoder.transform(t)

# Step 3: drop 'pclass' and append the one-hot columns. The kept columns
# (age, sex) are converted to float first, so the float one-hot block is not
# pushed through the string array and parsed back.
remaining_features = titanic_X[:, 1:].astype(float)
titanic_X = np.concatenate([remaining_features, new_features.toarray()], axis=1)
feature_names = ['age', 'sex', 'first_class', 'second_class', 'third_class']
titanic_y = titanic_y.astype(float)

# The names above assume exactly three passenger classes; fail loudly if the
# data disagrees.
assert titanic_X.shape[1] == len(feature_names), "feature_names does not match the encoded columns"

print(feature_names)
print(titanic_X[0], titanic_y[0])

('Categorical classes:', array(['1st', '2nd', '3rd'], 
      dtype='|S62'))
('Integer classes:', array([[0],
       [1],
       [2]]))
['age', 'sex', 'first_class', 'second_class', 'third_class']
(array([ 29.,   0.,   1.,   0.,   0.]), 1.0)
