In [1]:
# Find the NaNs and remove the columns which won't be required

In [2]:
# Import all the required packages first

import pandas as pd
import numpy as np
from numpy.random import randn
from pandas import Series,DataFrame

from scipy import stats

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [3]:
# Read the training data set from disk into a DataFrame

train = pd.read_csv(filepath_or_buffer='train.csv')

In [4]:
# Take a first look at the data set (first five rows)

train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
# Start by examining the missing values

# Count the nulls in each column of the training data, then show only the affected columns
missing_val_count_by_column = train.isnull().sum()
print(missing_val_count_by_column.loc[missing_val_count_by_column.gt(0)])

Age         177
Cabin       687
Embarked      2
dtype: int64


In [7]:
# Remove the Cabin column (687 missing values).
# Reassigning the result, together with errors='ignore', keeps this cell safe to re-run:
# a second in-place drop would raise KeyError because 'Cabin' is already gone.

train = train.drop('Cabin', axis=1, errors='ignore')

# Check

train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


In [8]:
# Decide how to handle the missing Embarked values by looking at the category frequencies

# NOTE(review): this Series is built from the existing train['Embarked'] column; whether
# later in-place edits to embark_ser also change `train` depends on the pandas version
# (Copy-on-Write) -- confirm before relying on it.
embark_ser = Series(train['Embarked'])
# value_counts() leaves NaN out by default, so the missing entries are not listed
embark_ser.value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [10]:
# The best option seems to be imputing 'S', the mode, for the two missing entries.
# Assign the filled column back to `train` explicitly: an in-place replace on a derived
# Series only reaches `train` through shared memory, which pandas does not guarantee
# (and which Copy-on-Write prevents), so the fix could silently fail to reach `train`.

train['Embarked'] = train['Embarked'].fillna('S')
embark_ser = train['Embarked']  # refresh the handle so the check reflects `train`

# Check

embark_ser.value_counts()

S    646
C    168
Q     77
Name: Embarked, dtype: int64

In [11]:
# Confirm the changes took effect: list the columns that still contain nulls

missing_val_count_by_column = train.isnull().sum()
still_missing = missing_val_count_by_column.gt(0)
print(missing_val_count_by_column[still_missing])

Age    177
dtype: int64


In [21]:
# Last, deal with the missing Age data. As this is a first attempt we simply impute it,
# but we first record which entries were originally missing.

train_plus = train.copy()

# Names of every column that contains at least one null
cols_with_missing = train.columns[train.isnull().any()].tolist()

# Add one boolean indicator column per affected column
for col in cols_with_missing:
    was_missing = train_plus[col].isnull()
    train_plus[col + '_was_missing'] = was_missing

train_plus.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Age_was_missing
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,False
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,False
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,False
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,False
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,False


In [25]:
# Null counts per column of train_plus, restricted to columns with any nulls
missing_val_count_by_column = train_plus.isnull().sum()
print(missing_val_count_by_column.loc[missing_val_count_by_column.gt(0)])

Age    177
dtype: int64


In [26]:
# Handle on the Age column of train_plus
age_ser = train_plus['Age']

In [27]:
# Mean age; NaN entries are skipped
age_ser.mean(skipna=True)

29.69911764705882

In [31]:
# Impute the mean age (computed over the non-missing entries) for the missing values.
# Assign back to train_plus explicitly: an in-place replace on `age_ser` only changes the
# DataFrame if the two share memory, which pandas does not guarantee (Copy-on-Write
# prevents it), so the imputation could silently fail to reach train_plus.
train_plus['Age'] = train_plus['Age'].fillna(train_plus['Age'].mean())
age_ser = train_plus['Age']  # keep the handle in sync with the imputed column

In [32]:
# Preview the first five rows of train_plus
train_plus.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Age_was_missing
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,False
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,False
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,False
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,False
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,False


In [33]:
# Final check: an empty result means no column of train_plus has nulls left
missing_val_count_by_column = train_plus.isnull().sum()
print(missing_val_count_by_column.loc[lambda counts: counts > 0])

Series([], dtype: int64)


In [36]:
# There are now no missing values. For simplicity we will drop a few more columns.
# Reassigning the result, together with errors='ignore', keeps this cell safe to re-run:
# a second in-place drop would raise KeyError because the columns are already gone.

train_plus = train_plus.drop(['Name','PassengerId','Ticket','Fare'], axis=1, errors='ignore')

In [37]:
# Preview the columns that remain
train_plus.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked,Age_was_missing
0,0,3,male,22.0,1,0,S,False
1,1,1,female,38.0,1,0,C,False
2,1,3,female,26.0,0,0,S,False
3,1,1,female,35.0,1,0,S,False
4,0,3,male,35.0,0,0,S,False


In [None]:
# The categorical data will have to be converted before we proceed. Let us first start to
# analyse the data in the next step.