In [None]:
# import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
import re
import os
plt.style.use('ggplot')

In [None]:
# Read the Titanic train/test CSVs and build working copies tagged with their
# origin, so the two sets can be told apart once they are combined.
train = pd.read_csv('../datasets/titanic/train.csv')
test = pd.read_csv('../datasets/titanic/test.csv')

# assign() returns a new frame, so the raw `train` / `test` stay untouched
titanic_train = train.assign(Dataset='train')
titanic_test = test.assign(Dataset='test')


In [None]:
# Peek at the column labels of the tagged test frame
titanic_test.columns

In [None]:
# Class balance of the target. The column is passed as x= because recent
# seaborn releases interpret the first positional argument as `data`.
sns.countplot(x='Survived', data=train)

In [None]:
# Mean survival rate within each passenger class
train.groupby('Pclass')['Survived'].mean()

In [None]:
# Age distribution per class with survival overlaid: many passengers aged 20+
# did not make it, and a lot of third-class passengers did not survive.
sns.violinplot(data=train, x='Pclass', y='Age', color='lightgray', inner=None)
sns.stripplot(data=train, x='Pclass', y='Age', hue='Survived', jitter=True, size=4)
plt.show()

In [None]:
def change_col(data):
    """Normalise the column labels of `data` in place and return the same frame.

    Each label is lower-cased, spaces become underscores and '#' becomes 'no'.
    """
    renamed = []
    for label in data.columns:
        renamed.append(label.lower().replace(' ', '_').replace('#', 'no'))
    data.columns = renamed
    return data


# Normalise column names on both working frames
titanic_train, titanic_test = change_col(titanic_train), change_col(titanic_test)

# Keep the target aside; the test frame gets a placeholder value so that both
# frames expose a 'survived' column
y = titanic_train['survived']
titanic_test['survived'] = 0

In [None]:
# Stack train on top of test; row labels of both frames are kept as they are
titanic = pd.concat((titanic_train, titanic_test), sort=False)
titanic.head()

In [None]:
# Split "Last, Title. First ..." names into last name, title and first name.
# Every derived column is written to the combined `titanic` frame: that is the
# frame each right-hand side reads from and the one later cells query for
# lname / fname / title.
titanic['name_split'] = titanic.name.str.split(r'\, ')
titanic['lname'] = titanic.name_split.str.get(0)
titanic['fname'] = titanic.name_split.str.get(1)
# "Title. First ..." -> ['Title', 'First ...']; fname briefly holds the list
titanic['fname'] = titanic.fname.str.split(r'\. ')
titanic['title'] = titanic.fname.str.get(0)
titanic['fname'] = titanic.fname.str.get(1)
# The intermediate list column is no longer needed
del titanic['name_split']

In [None]:
# title_new = titanic.title.copy()
# titanic.title.value_counts()

In [None]:
# filt = ['Mr','Mrs','Miss','Master']
# titanic.loc[~titanic.title.isin(filt),'title'].value_counts()

#Change Ms and Mlle to Miss
# title_new[title_new.isin(['Ms','Mlle','Miss'])] = 'Miss'
# title_new[title_new == 'Mme'] = 'Mrs'
# title_new[~title_new.isin(filt)] = 'Honorific'
# title_new.value_counts()

In [None]:
"""
titanic.loc[:,'new_title'] = title_new
titanic.loc[np.logical_and(titanic.new_title == 'Honorific',
                                 titanic.sex == 'male'),'new_title'] = 'Honorific_male'
titanic.loc[np.logical_and(titanic.new_title == 'Honorific',
                                 titanic.sex == 'female'),'new_title'] = 'Honorific_female'
                                 
"""

In [None]:
from collections import defaultdict

# Number of passengers travelling on each ticket, keyed by ticket string
tickets = titanic.loc[:, 'ticket'].value_counts()
ticket_cnt = defaultdict(int, zip(tickets.index, tickets.values))


In [None]:
# Dividing joint ticket fare within all passengers for that ticket
def get_new_fare(passenger):
    """Return the per-person share of a ticket's fare, rounded to 2 decimals.

    `passenger` unpacks to (ticket, fare). A ticket that is missing from
    `ticket_cnt` yields None.
    """
    ticket, fare = passenger
    if ticket not in ticket_cnt:
        return None
    return round(fare / ticket_cnt[ticket], 2)

# Creating new column for number of passengers in joint ticket
def joint_ticket_check(passenger):
    """Return how many passengers share the given ticket, or None if unknown.

    Despite the parameter name, the argument is the ticket string itself.
    """
    # Membership is tested first so the defaultdict never inserts a 0 entry
    return ticket_cnt[passenger] if passenger in ticket_cnt else None
    
# Per-person fare for every passenger's ticket
titanic['new_fare'] = titanic[['ticket','fare']].apply(get_new_fare, axis =1)
# Group size per ticket; the next cell inspects this column, so it has to be
# created here rather than left commented out
titanic['passengers_in_ticket'] = titanic['ticket'].apply(joint_ticket_check)

In [None]:
# Inspect the passengers-per-ticket column
titanic.passengers_in_ticket
#titanic.loc[titanic.title == 'Master', 'survived'].value_counts()



In [None]:
# `passengers` is a list of tuples with the details of passengers whose age is NULL,
# looked up on the Titanic Encyclopedia website.
# NOTE(review): the loop below reads each tuple as (ticket, first name, age) — confirm against passengers.py
from passengers import passengers

# Doctors without a recorded age get a fixed age of 46
titanic.loc[(titanic.age.isnull()) & (titanic.title == 'Dr'), 'age'] = 46

# Apply every looked-up age from passengers.py, matching rows on ticket and first name
for x in passengers:
    titanic.loc[(titanic.ticket == x[0]) & (titanic.fname == x[1]), 'age'] = x[2]

# Remaining 'Miss' passengers with a missing age get 24, restricted to rows whose
# fare equals the per-person fare and who have sibsp == 0 and parch == 0.
# NOTE(review): the author describes 24 as a median; that is not derivable from this cell
titanic.loc[(round(titanic.fare, 2) == titanic.new_fare) & (titanic.sibsp == 0) & (titanic.parch == 0) 
            & (titanic.age.isnull()) & (titanic.new_title == 'Miss'), 'age'] = 24

# 'Mrs' passengers with a missing age get a fixed age per pclass (author: class-wise mean);
# only classes 1 and 3 are handled here
titanic.loc[(titanic.age.isnull()) & (titanic.title == 'Mrs') & (titanic.pclass == 1), 'age'] = 45
titanic.loc[(titanic.age.isnull()) & (titanic.title == 'Mrs') & (titanic.pclass == 3), 'age'] = 31
# 'Mr' passengers with a missing age, same fare / sibsp / parch restriction as above, get 29
titanic.loc[(round(titanic.fare, 2) == titanic.new_fare) & (titanic.sibsp == 0) & (titanic.parch == 0) 
            & (titanic.age.isnull()) & (titanic.title == 'Mr'), 'age'] = 29


In [None]:
# Fill new_fare with 7.8 on rows whose fare is NULL (the raw `fare` column itself stays NULL)
titanic.loc[titanic.fare.isnull(), 'new_fare'] = 7.8

# Passengers with a NULL embarked were looked up and found to have embarked at S
titanic.loc[titanic.embarked.isnull(), 'embarked'] = 'S'

# Correcting the relationships on joint ticket 2662 (they were father and son):
# first everyone on the ticket gets sibsp=1 / parch=1, then the row with fname 'Hanna' is overridden
titanic.loc[titanic.ticket == '2662',['sibsp','parch']] = [1,1]
titanic.loc[(titanic.ticket == '2662') & (titanic.fname == 'Hanna'),['sibsp','parch']] = [0,2]

# Setting the cabin for the Peter family (ticket 2668) to match Anna's
titanic.loc[(titanic.ticket == '2668'), 'cabin'] = 'F E69'

# Changing Bourke's age together with sibsp and parch (only the row whose age is still NULL)
titanic.loc[(titanic.lname == 'Bourke') & (titanic.age.isnull()), ['sibsp', 'parch', 'age']] = [1, 0, 40]

# Correcting the fare for Hagland, which is erroneous: 19.9667 split three ways
titanic.loc[titanic.lname == 'Hagland', 'new_fare'] = round(19.9667/3,2)
# Titles of the passengers whose age is still missing after all of the fixes above
titanic.loc[(titanic.age.isnull()), 'title'].value_counts()



In [None]:
def male_female_child(passenger):
    """Label a passenger as 'child' or by their sex.

    `passenger` unpacks to (age, sex, title). Anyone younger than 15 whose
    title is not 'Mrs' is a 'child'; everybody else (including a NaN age,
    for which the comparison is False) keeps their sex.
    """
    age, sex, title = passenger
    is_child = age < 15 and title != 'Mrs'
    return 'child' if is_child else sex

# Row-wise child/male/female label built from age, sex and the grouped title
titanic['gender'] = titanic[['age','sex','new_title']].apply(male_female_child,axis=1)

In [None]:
def age_group(passenger):
    """Bucket an age into 'child', 'adult', 'mid_age' or 'senior'.

    Boundaries: <15 child, 15-35 adult, (35, 55] mid_age, >55 senior.
    A NaN age fails every comparison and yields None.
    """
    age = passenger
    if age < 15:
        return 'child'
    if age <= 35:
        return 'adult'
    if age <= 55:
        return 'mid_age'
    if age > 55:
        return 'senior'
    return None

# Age bucket for every passenger (rows with a missing age get no bucket)
titanic['age_group'] = titanic['age'].apply(age_group)

In [None]:
# Last names carried by at least one passenger labelled 'child'
last_names = np.unique(titanic.loc[titanic.gender == 'child', 'lname'].values)

def with_child(passenger):
    """Return 1 when the given last name appears in `last_names`, else 0."""
    return 1 if passenger in last_names else 0

titanic['w_child'] = titanic['lname'].apply(with_child)

# Observation 1: Check Survivor based on companion in ticket, 
# Added number of people in joint ticket


In [None]:
# Per-last-name tallies: df1 sums 'survived', df2 counts rows
df1 = titanic.pivot_table(index=['lname'], values=['survived'], aggfunc=np.sum)
df2 = titanic.pivot_table(index=['lname'], values=['survived'], aggfunc=len)

# Side by side, with explicit column names and the derived number of deaths
df3 = pd.concat([df1, df2], axis=1)
df3.columns = ['survived', 'count']
df3['died'] = df3['count'].sub(df3['survived'])

df3.index

def how_many_died(passenger):
    """Return the 'survived' tally stored in `df3` for a last name (0 if absent).

    Despite its name, this reads the 'survived' column, not 'died'.
    """
    if passenger not in df3.index:
        return 0
    return df3.loc[passenger, 'survived']

def did_any_survived(passenger):
    """Return 1 if `df3` records at least one survivor for this last name, else 0.

    A last name that is absent from `df3` also yields 0 (matching
    how_many_died) instead of falling through and returning None, which
    would put missing values into the resulting feature column.
    """
    if passenger not in df3.index:
        return 0
    return 1 if df3.loc[passenger, 'survived'] != 0 else 0
    

# Attach the per-last-name survivor tally and the any-survivor flag to every passenger.
# NOTE(review): both are derived from the 'survived' column of `titanic`, i.e. they include
# each passenger's own outcome and the placeholder 0 on test rows — confirm this is intended.
titanic['how_many_survived'] = titanic['lname'].apply(how_many_died)
titanic['any_survivors'] = titanic['lname'].apply(did_any_survived)


In [None]:
# Distribution of the per-last-name survivor tally
titanic['how_many_survived'].value_counts()

In [None]:
# Distribution of the any-survivor flag
titanic['any_survivors'].value_counts()

In [None]:
def remove_punctuation(input_string):
    """Return `input_string` with every character other than 0-9 stripped out."""
    return re.sub(r'[^0-9]', '', input_string)


In [None]:
# Binary flag: 1 when parch + sibsp is positive, 0 otherwise
titanic['w_family'] = titanic.parch + titanic.sibsp
titanic['w_family'] = titanic['w_family'].mask(titanic['w_family'] > 0, 1)
titanic['w_family'].value_counts()

In [None]:
# Number of passengers per first character of the cabin string
titanic.cabin.str[0].groupby(titanic.cabin.str[0]).count()

In [None]:
# Column dtypes and non-null counts of the combined frame after feature engineering
titanic.info()

In [None]:
# Separate the train rows from the test rows again; the placeholder target
# column is removed from the test part
titanic_train = titanic[titanic['dataset'] == 'train']
titanic_test = titanic[titanic['dataset'] == 'test'].drop(columns=['survived'])


In [None]:
# First rows of the engineered training frame
titanic_train.head()

In [None]:
# First rows of the engineered test frame
titanic_test.head()

In [None]:
# Check the cleaned age column first: histogram of training ages in 20 bins
plt.hist(x=titanic_train['age'], bins=20)
plt.show()

In [None]:
# Linear fit of survival against age, one line per sex. The author's reading:
# the female line slopes upward and the male line downward, suggesting that
# survival chances fall with age for males and rise with age for females.
sns.lmplot(data=titanic_train, x='age', y='survived', hue='sex')
plt.show()

In [None]:
# Age group and Survived plot.
# The column goes in as x=: with data= also given, a positional first
# argument is rejected by recent seaborn releases.
sns.catplot(x='age_group', hue='survived', data=titanic_train, kind='count', aspect=1.25)
plt.show()

In [None]:
# Gender and Survived plot.
# The column goes in as x=: with data= also given, a positional first
# argument is rejected by recent seaborn releases.
sns.catplot(x='gender', hue='survived', data=titanic_train, kind='count', aspect=1.25)
plt.show()

In [None]:
# Per-person fare by sex with survival overlaid. The author's reading: fare
# may be somewhat related to survival, as higher fares show more survivors.
sns.violinplot(data=titanic_train, x='sex', y='new_fare', color='lightgray', inner=None)
sns.stripplot(data=titanic_train, x='sex', y='new_fare', hue='survived', jitter=True, size=4)
plt.show()

In [None]:
# Linear fit of survival against the any-survivor flag, one line per sex
sns.lmplot(data=titanic_train, x='any_survivors', y='survived', hue='sex')
plt.show()

In [None]:
# Any-survivor flag and Survived plot.
# The column goes in as x=: with data= also given, a positional first
# argument is rejected by recent seaborn releases.
sns.catplot(x='any_survivors', hue='survived', data=titanic_train, kind='count', aspect=1.25)
plt.show()

In [None]:
def pearson_r(x, y):
    """Return the Pearson correlation coefficient between `x` and `y`.

    np.corrcoef produces the 2x2 correlation matrix of the two inputs; the
    off-diagonal entry [0, 1] is the coefficient between them.
    """
    return np.corrcoef(x, y)[0, 1]

In [None]:
# Correlation of selected features with the target, printed one per line
for feature in ('any_survivors', 'pclass', 'new_fare'):
    print(pearson_r(x=titanic_train[feature], y=titanic_train['survived']))

In [None]:
# Survival counts per gender label; the column goes in as x= because recent
# seaborn releases reject a positional first argument when data= is also given
sns.catplot(x='gender', hue='survived', data=titanic_train, kind='count', aspect=1.25)
plt.show()

In [None]:
# Survival counts per passenger class; the column goes in as x= because recent
# seaborn releases reject a positional first argument when data= is also given
sns.catplot(x='pclass', hue='survived', data=titanic_train, kind='count', aspect=1.25)
plt.show()

In [None]:
# The origin tag has served its purpose; remove it from both frames
titanic_test = titanic_test.drop(columns='dataset')
titanic_train = titanic_train.drop(columns='dataset')

In [None]:
# Column dtypes and non-null counts of the training frame before modelling
titanic_train.info()

In [None]:
# Target vector
y = titanic_train['survived']

# Training features: everything except the listed columns and the target itself
data = titanic_train.drop(columns=[
    'sibsp', 'parch', 'how_many_survived', 'passengerid', 'name', 'sex', 'ticket',
    'fare', 'cabin', 'lname', 'fname', 'title', 'age_group',
    'survived',
])
# Test features: same list, minus the target (already removed from the test frame)
data_test = titanic_test.drop(columns=[
    'sibsp', 'parch', 'how_many_survived', 'passengerid', 'name', 'sex', 'ticket',
    'fare', 'cabin', 'lname', 'fname', 'title', 'age_group',
])



In [None]:
# One-hot encode the training features, dropping the first level of each
data_dummy = pd.get_dummies(data, drop_first=True)
data_dummy.info()

In [None]:
# One-hot encode the test features, then align them to the training design
# matrix: dummy columns that do not occur in the test set are added as all-zero,
# test-only columns are discarded and the column order is made identical, so the
# models fitted on `data_dummy` can predict on this frame.
data_test_dummy = pd.get_dummies(data_test, drop_first = True)
data_test_dummy = data_test_dummy.reindex(columns = data_dummy.columns, fill_value = 0)
data_test_dummy.info()

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Short aliases for the encoded train / test feature matrices
X = data_dummy
test_X = data_test_dummy

In [None]:
from sklearn.model_selection import train_test_split

# Hold out part of the training data for validation; the seed is fixed so the split is repeatable
train_X, val_X, train_y, val_y = train_test_split(data_dummy, y, random_state=22)

In [None]:
import pandas
import numpy
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# Univariate chi-squared scoring of the training-split features, keeping k=4.
# The selector gets its own name so the raw `test` DataFrame loaded at the top
# of the notebook is not overwritten.
selector = SelectKBest(score_func=chi2, k=4)
fit = selector.fit(train_X, train_y)
# summarize scores
numpy.set_printoptions(precision=3)
print(fit.scores_)
features = fit.transform(train_X)
# summarize selected features
print(features)


In [None]:
# Using K-fold Cross validation (5 folds) over n_neighbors = 1..49
knn = KNeighborsClassifier()
param_grid = {'n_neighbors': np.arange(1,50)}
knn_cv = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5)

In [None]:
# Grid search on the training split; report the best setting and its CV score
knn_cv.fit(train_X, train_y)
print(knn_cv.best_params_, knn_cv.best_score_, sep='\n')

In [None]:
# KNN fitted on the training split with a hard-coded n_neighbors.
# NOTE(review): presumably 1 is the value the grid search above reported — confirm
knn_2 = KNeighborsClassifier(n_neighbors = 1)
knn_2.fit(train_X,train_y)

In [None]:
# Score of knn_2 on the held-out validation split
knn_2.score(val_X, val_y)

In [None]:
# Repeat the grid search on the full training data
knn_cv.fit(X, y)
print(knn_cv.best_params_, knn_cv.best_score_, sep='\n')

In [None]:
# Final KNN fitted on the full training data; rebinds `knn`.
# NOTE(review): n_neighbors is hard-coded to 1 rather than read from knn_cv.best_params_ — confirm
knn = KNeighborsClassifier(n_neighbors = 1)
knn.fit(X,y)

In [None]:
# Random forest on the full training data. random_state is fixed (same seed as
# the validation split) so the model and the submission built from it are
# reproducible across runs.
model = RandomForestClassifier(n_estimators = 10, random_state = 22)
model.fit(X,y)

In [None]:
# Predictions on the encoded test features from both fitted models
predict = knn.predict(test_X)
rf_predict = model.predict(test_X)

In [None]:
# Build the KNN submission (passenger id + predicted label) and write it to disk
output = (
    titanic_test[['passengerid']]
    .rename(columns={'passengerid': 'PassengerId'})
    .assign(Survived=predict)
)
output.to_csv('../datasets/titanic/submission.csv', index=False)

In [None]:
# Build the random-forest submission (passenger id + predicted label) and write it to disk
output = (
    titanic_test[['passengerid']]
    .rename(columns={'passengerid': 'PassengerId'})
    .assign(Survived=rf_predict)
)
output.to_csv('../datasets/titanic/rf_submission.csv', index=False)