In [1]:
# NOTE(review): absolute path into one user's home directory; the notebook only
# runs elsewhere after this is pointed at a local copy of titanicdata.htm.
datafile = "/Users/curtismitchell/Projects/tree-based-models/titanicdata.htm"

In [2]:
from bs4 import BeautifulSoup
# Parse the saved HTML page with BeautifulSoup's "html.parser" backend.
# The with-block closes the file handle as soon as parsing is done.
with open(datafile,"r") as f:
    soup = BeautifulSoup(f,"html.parser")

In [3]:
# Keep the first <table> element found in the parsed page.
table = soup.find('table')

In [4]:
import pandas as pd
# html5lib is never referenced by name in this cell; presumably imported so a
# missing parser dependency for flavor='bs4' fails here - TODO confirm.
import html5lib
# read_html returns a list of DataFrames; [0] keeps the first one built from
# the table's HTML markup.
data = pd.read_html(str(table),flavor='bs4')[0]

In [5]:
# Preview the first rows of the table exactly as parsed, before any cleanup.
data.head()

Unnamed: 0,Name,Age,Class/Dept,Ticket,Joined,Job,Boat [Body],Unnamed: 7
0,"ABÄ«-AL-MUNÃ , Mr NÄ�sÄ«f QÄ�sim",27,3rd Class Passenger,2699Â£18 15s 9d,Cherbourg,Â,15Â,
1,"ABBING, Mr Anthony",42,3rd Class Passenger,5547Â£7 11s,Southampton,Blacksmith Â,Â Â,
2,"ABBOTT, Mrs Rhoda Mary 'Rosa'",39,3rd Class Passenger,CA2673Â£20 5s,Southampton,Â,AÂ,
3,"ABBOTT, Mr Rossmore Edward",16,3rd Class Passenger,CA2673Â£20 5s,Southampton,Jeweller Â,Â [190],
4,"ABBOTT, Mr Eugene Joseph",13,3rd Class Passenger,CA2673Â£20 5s,Southampton,Scholar Â,Â Â,


In [6]:
def cleanup(value):
    """Return ``value`` with every non-ASCII character replaced by a space.

    The text is encoded to ASCII with ``errors='replace'``, which turns each
    character that has no ASCII representation into ``'?'``; those markers are
    then swapped for spaces.

    Note that a literal ``'?'`` already present in ``value`` is also turned
    into a space, because it is indistinguishable from a replacement marker.

    Args:
        value: Text string to sanitise.

    Returns:
        The ASCII-only text, same length as the input.
    """
    # encode() produces bytes on Python 3, so decode back to text before the
    # str-based replace; calling bytes.replace('?', ' ') raises TypeError.
    ascii_text = value.encode('ascii', errors='replace').decode('ascii')
    return ascii_text.replace('?', ' ')

# Replace non-ASCII characters in the two text columns used by later cells.
data['Name'] = data['Name'].apply(cleanup)
data['Boat [Body]'] = data['Boat [Body]'].apply(cleanup)
# Convert the whole column in one vectorised call instead of invoking
# pd.to_numeric once per row; values that cannot be parsed become NaN.
data['Age'] = pd.to_numeric(data['Age'], errors='coerce')
# Keep only the columns the analysis needs. The explicit copy makes the result
# an independent frame, so columns added in later cells are written to it alone.
data = data[['Name','Age','Class/Dept','Boat [Body]']].copy()

data.head()

Unnamed: 0,Name,Age,Class/Dept,Boat [Body]
0,"AB -AL-MUN , Mr N s f Q sim",27.0,3rd Class Passenger,15
1,"ABBING, Mr Anthony",42.0,3rd Class Passenger,
2,"ABBOTT, Mrs Rhoda Mary 'Rosa'",39.0,3rd Class Passenger,A
3,"ABBOTT, Mr Rossmore Edward",16.0,3rd Class Passenger,[190]
4,"ABBOTT, Mr Eugene Joseph",13.0,3rd Class Passenger,


In [7]:
def checkPass(class_type):
    """Classify a Class/Dept label as 'Passenger' or 'Crew'.

    Args:
        class_type: Text of the Class/Dept field.

    Returns:
        'Passenger' when the label contains the substring 'Passenger',
        otherwise 'Crew'.
    """
    is_passenger = 'Passenger' in class_type
    return 'Passenger' if is_passenger else 'Crew'
    
# New column: 'Passenger' or 'Crew', derived row by row from the Class/Dept text.
data['Crew/Pass'] = data['Class/Dept'].apply(checkPass)
data.head()

Unnamed: 0,Name,Age,Class/Dept,Boat [Body],Crew/Pass
0,"AB -AL-MUN , Mr N s f Q sim",27.0,3rd Class Passenger,15,Passenger
1,"ABBING, Mr Anthony",42.0,3rd Class Passenger,,Passenger
2,"ABBOTT, Mrs Rhoda Mary 'Rosa'",39.0,3rd Class Passenger,A,Passenger
3,"ABBOTT, Mr Rossmore Edward",16.0,3rd Class Passenger,[190],Passenger
4,"ABBOTT, Mr Eugene Joseph",13.0,3rd Class Passenger,,Passenger


In [8]:
def checkClass(class_type):
    """Reduce a Class/Dept label to a travel class, or 'Crew'.

    Args:
        class_type: Text of the Class/Dept field.

    Returns:
        For labels containing 'Passenger', the text before the first space
        (e.g. '3rd' for '3rd Class Passenger'); 'Crew' for everything else.
    """
    # Guard clause: anything not marked as a passenger is treated as crew.
    if 'Passenger' not in class_type:
        return 'Crew'
    leading_word, _, _ = class_type.partition(' ')
    return leading_word
    
# New column: travel class for passengers, 'Crew' for everyone else.
data['Class'] = data['Class/Dept'].apply(checkClass)
data.head()

Unnamed: 0,Name,Age,Class/Dept,Boat [Body],Crew/Pass,Class
0,"AB -AL-MUN , Mr N s f Q sim",27.0,3rd Class Passenger,15,Passenger,3rd
1,"ABBING, Mr Anthony",42.0,3rd Class Passenger,,Passenger,3rd
2,"ABBOTT, Mrs Rhoda Mary 'Rosa'",39.0,3rd Class Passenger,A,Passenger,3rd
3,"ABBOTT, Mr Rossmore Edward",16.0,3rd Class Passenger,[190],Passenger,3rd
4,"ABBOTT, Mr Eugene Joseph",13.0,3rd Class Passenger,,Passenger,3rd


In [9]:
def checkAdult(age):
    """Label an age as 'Adult' (18 or older) or 'Child'.

    Args:
        age: Numeric age.

    Returns:
        'Adult' when ``age >= 18``; 'Child' in every other case. A NaN age
        compares False against 18 and is therefore also labelled 'Child'.
    """
    # Keep the test as ">= 18": flipping it to "< 18" would move NaN ages
    # from 'Child' to 'Adult'.
    return 'Adult' if age >= 18 else 'Child'
    
# New column: 'Adult' / 'Child' label derived from the numeric Age column.
data['Adult/Child'] = data['Age'].apply(checkAdult)
data.head()

Unnamed: 0,Name,Age,Class/Dept,Boat [Body],Crew/Pass,Class,Adult/Child
0,"AB -AL-MUN , Mr N s f Q sim",27.0,3rd Class Passenger,15,Passenger,3rd,Adult
1,"ABBING, Mr Anthony",42.0,3rd Class Passenger,,Passenger,3rd,Adult
2,"ABBOTT, Mrs Rhoda Mary 'Rosa'",39.0,3rd Class Passenger,A,Passenger,3rd,Adult
3,"ABBOTT, Mr Rossmore Edward",16.0,3rd Class Passenger,[190],Passenger,3rd,Child
4,"ABBOTT, Mr Eugene Joseph",13.0,3rd Class Passenger,,Passenger,3rd,Child


In [10]:
def checkGender(name, male_titles=('Mr', 'Master')):
    """Infer 'Male' or 'Female' from the salutation in a "SURNAME, Title Given" name.

    The salutation is the first whitespace-separated word after the first
    comma, with any trailing '.' removed, so "X, Mr Y", "X,  Mr Y", "X,Mr Y"
    and "X, Mr. Y" are all read as 'Mr'.

    Any salutation that is not listed in ``male_titles`` is reported as
    'Female'. With the default titles this includes other titles such as
    'Dr' or 'Rev'; pass a wider ``male_titles`` to change that.

    Args:
        name: Full name containing a comma between surname and salutation.
        male_titles: Salutations that map to 'Male'.

    Returns:
        'Male' or 'Female'.

    Raises:
        ValueError: If ``name`` contains no comma.
    """
    _, comma, after_comma = name.partition(',')
    if not comma:
        raise ValueError("no ',' between surname and salutation in name: %r" % (name,))
    # split() with no argument tolerates missing or repeated whitespace after
    # the comma, which a fixed "+2" offset does not.
    words = after_comma.split()
    salutation = words[0].rstrip('.') if words else ''
    if salutation in male_titles:
        return 'Male'
    else:
        return 'Female'

# New column: gender inferred from the salutation that follows the surname.
data['Gender'] = data['Name'].apply(checkGender)
data.head()

Unnamed: 0,Name,Age,Class/Dept,Boat [Body],Crew/Pass,Class,Adult/Child,Gender
0,"AB -AL-MUN , Mr N s f Q sim",27.0,3rd Class Passenger,15,Passenger,3rd,Adult,Male
1,"ABBING, Mr Anthony",42.0,3rd Class Passenger,,Passenger,3rd,Adult,Male
2,"ABBOTT, Mrs Rhoda Mary 'Rosa'",39.0,3rd Class Passenger,A,Passenger,3rd,Adult,Female
3,"ABBOTT, Mr Rossmore Edward",16.0,3rd Class Passenger,[190],Passenger,3rd,Child,Male
4,"ABBOTT, Mr Eugene Joseph",13.0,3rd Class Passenger,,Passenger,3rd,Child,Male


In [11]:
def checkSurvival(boat):
    """Turn a 'Boat [Body]' entry into a 0/1 survival flag.

    Args:
        boat: Text of the 'Boat [Body]' field.

    Returns:
        0 when the entry is blank (empty or whitespace only) or contains '[';
        1 otherwise.
    """
    blank_entry = not boat.strip()
    has_bracket = '[' in boat
    return 0 if (blank_entry or has_bracket) else 1
    
# New column: 0/1 survival flag derived from the 'Boat [Body]' text.
data['Survival'] = data['Boat [Body]'].apply(checkSurvival)
data.head()

Unnamed: 0,Name,Age,Class/Dept,Boat [Body],Crew/Pass,Class,Adult/Child,Gender,Survival
0,"AB -AL-MUN , Mr N s f Q sim",27.0,3rd Class Passenger,15,Passenger,3rd,Adult,Male,1
1,"ABBING, Mr Anthony",42.0,3rd Class Passenger,,Passenger,3rd,Adult,Male,0
2,"ABBOTT, Mrs Rhoda Mary 'Rosa'",39.0,3rd Class Passenger,A,Passenger,3rd,Adult,Female,1
3,"ABBOTT, Mr Rossmore Edward",16.0,3rd Class Passenger,[190],Passenger,3rd,Child,Male,0
4,"ABBOTT, Mr Eugene Joseph",13.0,3rd Class Passenger,,Passenger,3rd,Child,Male,0


In [13]:
# Survival rate in percent for crew versus passengers: survivors * 100 / head count.
survival_by_role = data.groupby(['Crew/Pass'])['Survival']
survival_by_role.sum() * 100 / survival_by_role.count()

Crew/Pass
Crew         13.586957
Passenger    35.798817
Name: Survival, dtype: float64

In [15]:
# Attempt to find variable with largest differentiation to build decision tree
def compare(group, data):
    """Survival rate in percent for each distinct value of one column.

    Args:
        group: Name of the column to group by.
        data: DataFrame holding that column and a 'Survival' column.

    Returns:
        Series indexed by the group values: the sum of 'Survival' times 100,
        divided by the count of 'Survival' entries in that group.
    """
    survival_by_group = data.groupby([group])['Survival']
    survivors = survival_by_group.sum()
    headcount = survival_by_group.count()
    return survivors * 100 / headcount

# Survival rate (%) per travel class, with crew as its own group.
compare('Class', data)

Class
1st     57.428571
2nd     37.883959
3rd     24.259520
Crew    13.586957
Name: Survival, dtype: float64

In [16]:
# Survival rate (%) per inferred gender.
compare('Gender', data)

Gender
Female    59.272097
Male      15.540181
Name: Survival, dtype: float64

In [17]:
# Survival rate (%) for the 'Adult' and 'Child' labels.
compare('Adult/Child', data)

Adult/Child
Adult    25.078370
Child    33.183857
Name: Survival, dtype: float64