# Column Definition
Variable	Definition	Key
survival	Survival	0 = No, 1 = Yes
pclass		Ticket class	1 = 1st = Upper, 2 = 2nd = Middle, 3 = 3rd = Lower
sex			Sex	
Age			Age in years (fractional if less than 1. If the age is estimated, is it in the form of xx.5)
sibsp		# of siblings / spouses aboard the Titanic
	Sibling = brother, sister, stepbrother, stepsister
	Spouse = husband, wife (mistresses and fiancés were ignored)
parch		# of parents / children aboard the Titanic	
	Parent = mother, father
	Child = daughter, son, stepdaughter, stepson
	Some children travelled only with a nanny, therefore parch=0 for them.
ticket		Ticket number	
fare		Passenger fare	
cabin		Cabin number	
embarked	Port of Embarkation	C = Cherbourg, Q = Queenstown, S = Southampton

# Import Packages

In [1]:
# import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
import re
import os
plt.style.use('ggplot')

# Load File

In [2]:
# Load titanic data frame, create new columns 'dataset' then concatenate the 2 data
titanic_train = pd.read_csv('train.csv')
titanic_test = pd.read_csv('test.csv')

titanic_train['dataset'] = 'train'
titanic_test['dataset'] = 'test'

df = [titanic_train, titanic_test]
titanic = pd.concat(df)

# EDA and Cleaning

In [3]:
def change_col(data):
    new_col = [i.lower().replace(' ','_').replace('#','no') for i in data.columns]
    data.columns = new_col
    return data


In [4]:
titanic = change_col(titanic)
titanic.columns

Index(['age', 'cabin', 'embarked', 'fare', 'name', 'parch', 'passengerid',
       'pclass', 'sex', 'sibsp', 'survived', 'ticket', 'dataset'],
      dtype='object')

In [5]:
# Create new columns based on names (LastName, FirstName and Title)
titanic['name_split'] = titanic.name.str.split('\, ')
titanic['lname'] = titanic.name_split.str.get(0)
titanic['fname'] = titanic.name_split.str.get(1)
titanic['fname'] = titanic.fname.str.split('\. ')
titanic['title'] = titanic.fname.str.get(0)
titanic['fname'] = titanic.fname.str.get(1)
del titanic['name_split']

Check for NULL fare since info above stated that there is 1 missing fare

In [6]:
# Dropping NULL fares since it will not affect our analysis
titanic = titanic.loc[titanic.fare.notnull()]

Since there are many values that are missing for Cabin, let us forget about it for now and focus on other columns. Let us create a Gender column separating the Male, Female and Child passengers.

In [7]:
def male_female_child(passenger):
    age, sex = passenger
    if age < 14:
        return 'child'
    else:
        return sex    

In [8]:
titanic['gender'] = titanic[['age','sex']].apply(male_female_child,axis=1)

In [9]:
# Getting value for embarked, checking as there are 2 NULLs found.
tfare = titanic.loc[titanic.embarked.isnull(),'fare'].mean()

# Setting value to C as found in above results
titanic.loc[titanic.embarked.isnull(),'embarked'] = 'C'

Adding new column for those people with Family and those that travelled alone.

In [10]:
titanic['w_family'] = titanic.parch + titanic.sibsp
titanic.loc[titanic.w_family > 0,'w_family'] = 1
titanic.loc[titanic.w_family == 0,'w_family'] = 0

In [11]:
titanic['cabin_letter'] = titanic.cabin.str[0]
titanic['cabin_number'] = titanic.cabin.str[-1]

def cabin_side(number):
    if number in ('1','3','5','7','9'):
        return 'right'
    elif number in ('2','4','6','8','0'):
        return 'left'
    else:
        return 'none'

titanic['cabin_side'] = titanic['cabin_number'].apply(cabin_side)

In [12]:
title_new = titanic.title.copy()

#Change Ms and Mlle to Miss
filt = ['Mr','Mrs','Miss','Master']
title_new[title_new.isin(['Ms','Mlle'])] = 'Miss' 
title_new[title_new == 'Mme'] = 'Mrs'
title_new[~title_new.isin(filt)] = 'Honorifics'

titanic.loc[:,'new_title'] = title_new
titanic.loc[np.logical_and(titanic.new_title == 'Honorifics',
                                 titanic.sex == 'male'),'new_title'] = 'Honorific_male'
titanic.loc[np.logical_and(titanic.new_title == 'Honorifics',
                                 titanic.sex == 'female'),'new_title'] = 'Honorific_female'

In [13]:
#Separate Actual Data with Holdout Data
titanic_train = titanic.loc[titanic['dataset'] == 'train']
titanic_test = titanic.loc[titanic['dataset'] == 'test']

# removed changed columns
data = titanic_train
data = data.drop(['name','parch','sibsp','sex','title','cabin'], axis = 1)


In [14]:
# Preparing to make the Prediction Model
data = data.loc[data.age.notnull()]

In [15]:
# Define combine_text_columns()
NUMERIC_COLUMNS = ['age','fare','survived','passengerid','pclass','dataset','w_family','fname','cabin_letter','cabin_side']

def combine_text_columns(data_frame, to_drop=NUMERIC_COLUMNS):
    """ converts all text in each row of data_frame to single vector """
    # Drop non-text columns that are in the df
    to_drop = set(to_drop) & set(data_frame.columns.tolist())
    text_data = data_frame.drop(to_drop, axis = 1)
    # Replace nans with blanks
    text_data.fillna("", inplace = True)
    # Join all text items in a row that have a space in between
    return text_data.apply(lambda x: " ".join(x), axis=1)


In [16]:
# Import functional utilities
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import FeatureUnion

# Modifying columns and Preprocessings
change_cols = FunctionTransformer(change_col, validate=False)


# Preprocessing before Classifier
get_text_data = FunctionTransformer(combine_text_columns, validate=False)
get_numeric_data = FunctionTransformer(lambda x: x.loc[:,['age','fare','w_family','pclass']], validate=False)
get_cabin_data = FunctionTransformer(lambda x: pd.get_dummies(x[['cabin_letter','cabin_side']]), validate=False)

# Create the token pattern: TOKENS_ALPHANUMERIC
TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)'

In [17]:
data_test = titanic_test.loc[titanic_test.age.notnull()]

data_test.info()

data_test = data_test.drop(['parch','sibsp','sex','title','cabin'], axis = 1)
data_test = data_test.sort_values(by='name')
data_test = data_test.reset_index()

# loading survival data acquired online, this will be used to check the holdout data
titanicOnline = pd.read_csv('test_set.csv')
titanicOnline = titanicOnline.loc[np.logical_and(titanicOnline.Age.notnull(),titanicOnline.Fare.notnull()), ['PassengerID', 'Pclass','Name','Sex','Age','Fare','Embarked','Survived']]

titanic_val = titanicOnline.loc[:,['Name','Survived']].sort_values(by='Name')
titanic_val = titanic_val.reset_index()

data_test = data_test.sort_values(by='name')
data_test = data_test.reset_index()

validate = np.empty(len(data_test))
for x in np.arange(len(data_test)):
    data_test.loc[x,'survived'] = titanic_val.loc[x,'Survived']

data_test = data_test.drop(['level_0','index','name'], axis = 1)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 331 entries, 0 to 415
Data columns (total 22 columns):
age             331 non-null float64
cabin           87 non-null object
embarked        331 non-null object
fare            331 non-null float64
name            331 non-null object
parch           331 non-null int64
passengerid     331 non-null int64
pclass          331 non-null int64
sex             331 non-null object
sibsp           331 non-null int64
survived        0 non-null float64
ticket          331 non-null object
dataset         331 non-null object
lname           331 non-null object
fname           331 non-null object
title           331 non-null object
gender          331 non-null object
w_family        331 non-null int64
cabin_letter    87 non-null object
cabin_number    87 non-null object
cabin_side      331 non-null object
new_title       331 non-null object
dtypes: float64(3), int64(5), object(14)
memory usage: 59.5+ KB


In [18]:
from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data.drop('survived',axis = 1), data.survived, test_size = 0.3, random_state=2)
holdout_X = data_test

## Creating the actual prediction model for kNN

In [19]:
# Import classifiers
from sklearn.neighbors import KNeighborsClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

# Import pipeline
from sklearn.pipeline import Pipeline

# Import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

# Import other preprocessing modules
from sklearn.preprocessing import Imputer

In [20]:
# Using K-fold Cross validation
param_grid = {'n_neighbors': np.arange(1,50)}

# Using GridSearchCV to get the best Neighbor #
knn = KNeighborsClassifier()
pl1 = Pipeline([('union', FeatureUnion(transformer_list = [
                ('numeric_features', Pipeline([('selector', get_numeric_data),
                                            ('imputer', Imputer())])),
                ('cabin', get_cabin_data),
                ('text_features', Pipeline([('selector', get_text_data),
                                            ('vectorizer', HashingVectorizer(token_pattern=TOKENS_ALPHANUMERIC))
                                           ]))
                                    ])),    
        ('clf', GridSearchCV(knn, param_grid, cv = 5))])

pl1.fit(X_train, y_train)

Pipeline(steps=[('union', FeatureUnion(n_jobs=1,
       transformer_list=[('numeric_features', Pipeline(steps=[('selector', FunctionTransformer(accept_sparse=False,
          func=<function <lambda> at 0x000000000B12B400>, pass_y=False,
          validate=False)), ('imputer', Imputer(axis=0, copy=True, missi...3, 44, 45, 46, 47, 48, 49])},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0))])

In [21]:
pl = Pipeline([('union', FeatureUnion(transformer_list = [
                ('numeric_features', Pipeline([('selector', get_numeric_data),
                                            ('imputer', Imputer())])),
                ('cabin', get_cabin_data),
                ('text_features', Pipeline([('selector', get_text_data),
                                            ('vectorizer', HashingVectorizer(token_pattern=TOKENS_ALPHANUMERIC))
                                           ]))
                                    ])),    
        ('clf', OneVsRestClassifier(LogisticRegression()))])

# pl1.named_steps['clf'].best_params_['n_neighbors']

In [22]:
pl.fit(X_train,y_train)

Pipeline(steps=[('union', FeatureUnion(n_jobs=1,
       transformer_list=[('numeric_features', Pipeline(steps=[('selector', FunctionTransformer(accept_sparse=False,
          func=<function <lambda> at 0x000000000B12B400>, pass_y=False,
          validate=False)), ('imputer', Imputer(axis=0, copy=True, missi...=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          n_jobs=1))])

In [23]:
predict = pl.predict(X_test)
p_holdout = pl.predict(data_test.drop('survived',axis = 1))

In [24]:
pl.score(X_test, y_test)

0.83255813953488367

In [25]:
pl.score(data_test.drop('survived',axis = 1),data_test.loc[:,'survived'])

0.68277945619335345

In [26]:
data_test.loc[:,'survived_pred'] = p_holdout

In [27]:
data_test.loc[:,['fname','lname','survived_pred','survived']].head(10)

Unnamed: 0,fname,lname,survived_pred,survived
0,Eugene Joseph,Abbott,1.0,0.0
1,Karen Marie,Abelseth,1.0,1.0
2,Olaus Jorgensen,Abelseth,0.0,1.0
3,Abraham August Johannes,Abrahamsson,0.0,0.0
4,Joseph (Sophie Halaut Easu),Abrahim,1.0,0.0
5,Philip Frank,Aks,1.0,1.0
6,Charles Augustus,Aldworth,0.0,0.0
7,Hudson Joshua Creighton,Allison,0.0,0.0
8,Albert Karvin,Andersen,0.0,0.0
9,Ida Augusta Margareta,Andersson,0.0,0.0


In [28]:
data_test.loc[np.logical_and(data_test.survived_pred == 1.0,data_test.survived == 0.0),'fname'].count()

55

In [29]:
data_test.loc[np.logical_and(data_test.survived_pred == 0.0,data_test.survived == 1.0),'fname'].count()

28

In [30]:
data_test.loc[data_test.survived_pred == data_test.survived,'fname'].count()

226