In [244]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

import re

In [245]:
train = pd.read_csv('train.csv')
test  = pd.read_csv('test.csv')
full_data = [train, test]

train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


First try to understand the data. There are 12 columns in our dataframe:

**"Survived"** - dependent variable and what we are trying to predict, binary datatype of 1 for survived and 0 for did not survive

**"PassengerID"** and **"Ticket"** - unique identifiers, probably don't help predict outcome and will therefore be dropped 

**"Pclass"** - ticket class with 1 = upper class, 2 = middle class, and 3 = lower class

**"Name"** - represents passenger name, could potentially be parsed to get useful features, so I'll keep it

**"Sex"** - categorical variable, either male or female, could be converted into numerical variable for analysis

**"Age"** and **"Fare"** - both continuous variables

**"SibSp"** - number of related siblings/spouse aboard

**"Parch"** - number of related parents/children aboard

**"Cabin"** - approximate position on ship when incident occured

**"Embarked"** - categorical variable for port where passenger embarked, C = Cherbourg, Q = Queenstown, S = Southampton, could be converted into numerical variable for analysis

In [246]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    int64  
 1   Name      418 non-null    object 
 2   Sex       418 non-null    object 
 3   Age       332 non-null    float64
 4   SibSp     418 non-null    int64  
 5   Parch     418 non-null    int64  
 6   Fare      417 non-null    float64
 7   Embarked  418 non-null    object 
dtypes: float64(2), int64(3), object(3)
memory usage: 26.2+ KB


Create a new feature called FamilySize that is sum of SibSp and Parch

In [247]:
for dataset in full_data:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
    
print (train[['FamilySize', 'Survived']].groupby(['FamilySize'], as_index=False).mean())

   FamilySize  Survived
0           1  0.303538
1           2  0.552795
2           3  0.578431
3           4  0.724138
4           5  0.200000
5           6  0.136364
6           7  0.333333
7           8  0.000000
8          11  0.000000


The size of the family seems to have an impact on the survival rates

From this I can create a further feature that I'll call 'IsAlone' that will be 1 if the passenger was alone on the ship and 0 otherwise

In [248]:
for dataset in full_data:
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1
    
print (train[['IsAlone', 'Survived']].groupby(['IsAlone'], as_index=False).mean())

   IsAlone  Survived
0        0  0.505650
1        1  0.303538


Fill in any missing observations for 'Fare' with the median fare, then divide the category up into 5 different buckets for later analysis

In [249]:
for dataset in full_data:
    dataset['Fare'] = dataset['Fare'].fillna(train['Fare'].median())
    
train['CategoricalFare'] = pd.qcut(train['Fare'], 4)
print (train[['CategoricalFare', 'Survived']].groupby(['CategoricalFare'], as_index=False).mean())

   CategoricalFare  Survived
0   (-0.001, 7.91]  0.197309
1   (7.91, 14.454]  0.303571
2   (14.454, 31.0]  0.454955
3  (31.0, 512.329]  0.581081


There seems to be a pretty clear connection between fare price and survival rate. Passengers who paid more have much high survival rates

Since there are many missing values for Age, fill in the missing values by generating random numbers that are within a standard deviation of the mean. Then categorize age into 5 buckets

In [250]:
for dataset in full_data:
    age_avg = dataset['Age'].mean()
    age_std = dataset['Age'].std()
    age_null_count = dataset['Age'].isnull().sum()
    
    age_null_random_list = np.random.randint(age_avg - age_std, age_avg + age_std, size=age_null_count)
    dataset['Age'][np.isnan(dataset['Age'])] = age_null_random_list
    dataset['Age'] = dataset['Age'].astype(int)
    
train['CategoricalAge'] = pd.cut(train['Age'], 5)

print (train[['CategoricalAge', 'Survived']].groupby(['CategoricalAge'], as_index=False).mean())

  CategoricalAge  Survived
0  (-0.08, 16.0]  0.523364
1   (16.0, 32.0]  0.359091
2   (32.0, 48.0]  0.367424
3   (48.0, 64.0]  0.434783
4   (64.0, 80.0]  0.090909


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['Age'][np.isnan(dataset['Age'])] = age_null_random_list


Survival is a lot higher for children than for other groups, and the elderly have by far the lowest survival rates 

While the names themselves probably won't tell us much, perhaps we can extract the titles from the names which might help

In [251]:
def title(name):
    title_search = re.search(' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""

for dataset in full_data:
    dataset['Title'] = dataset['Name'].apply(title)

print(pd.crosstab(train['Title'], train['Sex']))

Sex       female  male
Title                 
Capt           0     1
Col            0     2
Countess       1     0
Don            0     1
Dr             1     6
Jonkheer       0     1
Lady           1     0
Major          0     2
Master         0    40
Miss         182     0
Mlle           2     0
Mme            1     0
Mr             0   517
Mrs          125     0
Ms             1     0
Rev            0     6
Sir            0     1


Now try to categorize these. Mlle is an abbreviation for Mademoiselle so should be grouped in with Miss. Mme is an abbreviation for Madame so should be grouped in with Mrs. Then create a broader category of high status titles, like Countess, Don, Dr, Jonkheer, etc. Then anything that is either not common (like Mr. or Mrs.) or doesn't fit into these groupings should go into a misc category (like Rev or Major)

In [252]:
for dataset in full_data:
    dataset['Title'] = dataset['Title'].replace(['Countess','Don', 'Dr', 'Jonkheer', 'Lady', 'Sir'], 'HighStatus')
    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace(['Capt', 'Col', 'Major', 'Ms', 'Rev'], 'Misc')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')
    
train['Title'].value_counts()

Mr            517
Miss          184
Mrs           126
Master         40
Misc           12
HighStatus     12
Name: Title, dtype: int64

Seems like whether someone was alone also has an impact on survival rates

Since 'S' is the most common value for Embarked, I'll just fill in the null values here with 'S'

In [253]:
for dataset in full_data:
    dataset['Embarked'] = dataset['Embarked'].fillna('S')
print (train[['Embarked', 'Survived']].groupby(['Embarked'], as_index=False).mean())

  Embarked  Survived
0        C  0.553571
1        Q  0.389610
2        S  0.339009


Map some of the features into numerical variables

In [254]:
for dataset in full_data:
    # Mapping Sex
    dataset['Sex'] = dataset['Sex'].map({'female': 0, 'male': 1}).astype(int)
    
    # Mapping titles
    title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Misc": 5, "HighStatus": 6}
    dataset['Title'] = dataset['Title'].map(title_mapping)
    dataset['Title'] = dataset['Title'].fillna(0)
    
    
    # Mapping Embarked
    dataset['Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)
    
    # Mapping Fare
    dataset.loc[ dataset['Fare'] <= 7.91, 'Fare'] = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare']   = 2
    dataset.loc[ dataset['Fare'] > 31, 'Fare'] = 3
    dataset['Fare'] = dataset['Fare'].astype(int)
    
    # Mapping Age
    dataset.loc[ dataset['Age'] <= 16, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
    dataset.loc[ dataset['Age'] > 64, 'Age'] = 4

In [255]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FamilySize,IsAlone,CategoricalFare,CategoricalAge,Title
0,1,0,3,"Braund, Mr. Owen Harris",1,1,1,0,A/5 21171,0,,0,2,0,"(-0.001, 7.91]","(16.0, 32.0]",1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,0,PC 17599,3,C85,1,2,0,"(31.0, 512.329]","(32.0, 48.0]",3
2,3,1,3,"Heikkinen, Miss. Laina",0,1,0,0,STON/O2. 3101282,1,,0,1,1,"(7.91, 14.454]","(16.0, 32.0]",2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,2,1,0,113803,3,C123,0,2,0,"(31.0, 512.329]","(32.0, 48.0]",3
4,5,0,3,"Allen, Mr. William Henry",1,2,0,0,373450,1,,0,1,1,"(7.91, 14.454]","(32.0, 48.0]",1


Now, I'll remove any features that are not useful. "PassengerID" and "Ticket" likely aren't relevant and most of the values in the 'Cabin' are null so I'll drop all three. "SibSp" and "Parch" have been combined to get "FamilySize" so drop those two as well. Then since "Name" was used to get "Title", it too can be dropped 

In [256]:
drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp','Parch']
train = train.drop(drop_elements, axis = 1)

train = train.drop(['CategoricalAge', 'CategoricalFare'], axis = 1)
test  = test.drop(drop_elements, axis = 1)

In [257]:
#train = train.values
#test  = test.values