In [54]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

In [55]:
# Load train.csv (path relative to the working directory) and preview the first five rows.
df = pd.read_csv("train.csv")
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


First try to understand the data. There are 12 columns in our dataframe:

**"Survived"** - dependent variable and what we are trying to predict, binary datatype of 1 for survived and 0 for did not survive

**"PassengerId"** and **"Ticket"** - unique identifiers that probably don't help predict the outcome and will therefore be dropped

**"Pclass"** - ticket class with 1 = upper class, 2 = middle class, and 3 = lower class

**"Name"** - represents passenger name, could potentially be parsed to get useful features, so I'll keep it

**"Sex"** - categorical variable, either male or female, could be one-hot encoded for later analysis

**"Age"** and **"Fare"** - both continuous variables

**"SibSp"** - number of siblings/spouses aboard

**"Parch"** - number of parents/children aboard

**"Cabin"** - approximate position on ship when incident occurred

**"Embarked"** - categorical variable for port where passenger embarked, C = Cherbourg, Q = Queenstown, S = Southampton, could also be one-hot encoded

Start by cleaning the data: remove "PassengerId" and "Ticket" because they likely aren't relevant, then remove the NaNs and anomalies.

In [56]:
# Drop the identifier columns that the notes above judge unlikely to help prediction.
# errors="ignore" keeps this cell safe to re-run: without it, a second run raises
# KeyError because the columns have already been removed from `df`.
df = df.drop(columns=["PassengerId", "Ticket"], errors="ignore")

In [66]:
# Number of rows whose Fare is missing.
int(df["Fare"].isna().sum())

0

Most of the values in the "Cabin" column appear to be null, so drop that column as well

In [58]:
# Drop the Cabin column, which the note above describes as mostly null.
# errors="ignore" keeps this cell safe to re-run: without it, a second run raises
# KeyError because "Cabin" has already been removed from `df`.
df = df.drop(columns=["Cabin"], errors="ignore")

In [67]:
# Number of rows whose Embarked port is missing.
int(df["Embarked"].isna().sum())

2

In [43]:
# One frame per ticket class: Pclass 1 = upper, 2 = middle, 3 = lower.
df_upper = df.loc[df["Pclass"].eq(1)]
df_middle = df.loc[df["Pclass"].eq(2)]
df_lower = df.loc[df["Pclass"].eq(3)]

In [44]:
# Survival tally for upper-class passengers (1 = survived, 0 = did not survive).
df_upper["Survived"].value_counts()

1    136
0     80
Name: Survived, dtype: int64

In [45]:
# Survival tally for middle-class passengers (1 = survived, 0 = did not survive).
df_middle["Survived"].value_counts()

0    97
1    87
Name: Survived, dtype: int64

In [46]:
# Survival tally for lower-class passengers (1 = survived, 0 = did not survive).
df_lower["Survived"].value_counts()

0    372
1    119
Name: Survived, dtype: int64

In [38]:
# Summary statistics for Age. describe() counts only non-null entries, so a count
# below the total number of rows indicates missing ages.
df["Age"].describe()

count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64

In [48]:
# Upper-class female passengers, followed by their survival tally.
upper_women = df_upper.loc[df_upper["Sex"].eq("female")]
upper_women["Survived"].value_counts()

1    91
0     3
Name: Survived, dtype: int64

In [49]:
# Lower-class male passengers, followed by their survival tally.
lower_men = df_lower.loc[df_lower["Sex"].eq("male")]
lower_men["Survived"].value_counts()

0    300
1     47
Name: Survived, dtype: int64

In [39]:
# Summary statistics for Fare.
df["Fare"].describe()

count    891.000000
mean      32.204208
std       49.693429
min        0.000000
25%        7.910400
50%       14.454200
75%       31.000000
max      512.329200
Name: Fare, dtype: float64

In [40]:
# Passengers whose recorded fare is exactly zero.
df_free = df.loc[df["Fare"].eq(0)]

In [41]:
# Display the zero-fare passengers for manual inspection.
df_free

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
179,0,3,"Leonard, Mr. Lionel",male,36.0,0,0,0.0,S
263,0,1,"Harrison, Mr. William",male,40.0,0,0,0.0,S
271,1,3,"Tornquist, Mr. William Henry",male,25.0,0,0,0.0,S
277,0,2,"Parkes, Mr. Francis ""Frank""",male,,0,0,0.0,S
302,0,3,"Johnson, Mr. William Cahoone Jr",male,19.0,0,0,0.0,S
413,0,2,"Cunningham, Mr. Alfred Fleming",male,,0,0,0.0,S
466,0,2,"Campbell, Mr. William",male,,0,0,0.0,S
481,0,2,"Frost, Mr. Anthony Wood ""Archie""",male,,0,0,0.0,S
597,0,3,"Johnson, Mr. Alfred",male,49.0,0,0,0.0,S
633,0,1,"Parr, Mr. William Henry Marsh",male,,0,0,0.0,S
