# Import packages

In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.linear_model import LinearRegression
from statistics import mean
import inspect

# Import dataset

In [51]:
# Load the training and test splits from CSV files in the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [7]:
# Print column dtypes and non-null counts of the training set.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [8]:
# Print column dtypes and non-null counts of the test set.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


# Missing value imputation

In [77]:
def summary_NaN(dataset):
    for variable in dataset.columns.tolist():
        print('{}: {} missing values ({}%)'.format(variable, dataset[variable].isna().sum(), round(dataset[variable].isna().sum() * 100 / len(dataset[variable])),3))
    print('\n')

def get_name(var):
    callers_local_vars = inspect.currentframe().f_back.f_locals.items()
    names = [var_name for var_name, var_val in callers_local_vars if var_val is var]
    if len(names) > 0:
        return names[0]


# Report missing values for both splits, training set first.
for frame in (train, test):
    summary_NaN(frame)

PassengerId: 0 missing values (0.0%)
Survived: 0 missing values (0.0%)
Pclass: 0 missing values (0.0%)
Name: 0 missing values (0.0%)
Sex: 0 missing values (0.0%)
Age: 177 missing values (20.0%)
Ticket: 0 missing values (0.0%)
Fare: 0 missing values (0.0%)
Cabin: 687 missing values (77.0%)
Embarked: 0 missing values (0.0%)
Relatives: 0 missing values (0.0%)


PassengerId: 0 missing values (0.0%)
Pclass: 0 missing values (0.0%)
Name: 0 missing values (0.0%)
Sex: 0 missing values (0.0%)
Age: 86 missing values (21.0%)
Ticket: 0 missing values (0.0%)
Fare: 0 missing values (0.0%)
Cabin: 327 missing values (78.0%)
Embarked: 0 missing values (0.0%)
Relatives: 0 missing values (0.0%)




In [78]:
# Absolute pairwise correlations of the numeric columns only. The object
# columns are excluded up front because DataFrame.corr() raises on
# non-numeric data in pandas >= 2.0 instead of silently skipping it.
train.select_dtypes(include='number').corr().abs()

Unnamed: 0,PassengerId,Survived,Pclass,Age,Fare,Relatives
PassengerId,1.0,0.005007,0.035144,0.036847,0.012658,0.040143
Survived,0.005007,1.0,0.338481,0.077221,0.257307,0.016639
Pclass,0.035144,0.338481,1.0,0.369226,0.5495,0.065997
Age,0.036847,0.077221,0.369226,1.0,0.096067,0.301914
Fare,0.012658,0.257307,0.5495,0.096067,1.0,0.217138
Relatives,0.040143,0.016639,0.065997,0.301914,0.217138,1.0


In [79]:
# Absolute pairwise correlations of the numeric columns only. The object
# columns are excluded up front because DataFrame.corr() raises on
# non-numeric data in pandas >= 2.0 instead of silently skipping it.
test.select_dtypes(include='number').corr().abs()

Unnamed: 0,PassengerId,Pclass,Age,Fare,Relatives
PassengerId,1.0,0.026751,0.034102,0.00873,0.030087
Pclass,0.026751,1.0,0.492143,0.577444,0.012736
Age,0.034102,0.492143,1.0,0.332159,0.093632
Fare,0.00873,0.577444,0.332159,1.0,0.250385
Relatives,0.030087,0.012736,0.093632,0.250385,1.0


The correlation coefficient between SibSp and Parch is 0.3068 (train) and 0.4148 (test). By definition, SibSp is the number of siblings/spouses aboard and Parch is the number of parents/children aboard, so both count how many members of a passenger's family were on the Titanic.

Hence, SibSp and Parch can each be regarded as a component of a single variable, 'Relatives'. To avoid multicollinearity, SibSp and Parch are merged into 'Relatives'.

In [52]:
def _merge_relatives(frame):
    """Return ``frame`` with SibSp and Parch replaced by their sum, 'Relatives'.

    A frame that no longer has both source columns is returned unchanged, so
    re-running this cell after the merge does not fail.
    """
    if not {'SibSp', 'Parch'}.issubset(frame.columns):
        return frame
    return (
        frame
        .assign(Relatives=frame['SibSp'] + frame['Parch'])
        .drop(columns=['SibSp', 'Parch'])
    )


train = _merge_relatives(train)
test = _merge_relatives(test)
train.info()
print('\n')
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
Relatives      891 non-null int64
dtypes: float64(2), int64(4), object(5)
memory usage: 76.6+ KB


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 10 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
Relatives      418 non-null 

Next, we deal with Fare first, because this column has just 1 missing value (in the test set).

In [65]:
# Show the test-set row(s) whose Fare is missing.
test.loc[test['Fare'].isna()]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,Ticket,Fare,Cabin,Embarked,Relatives
152,1044,3,"Storey, Mr. Thomas",male,60.5,3701,,,S,0


From the correlation coefficient matrix, it is clear that Pclass is correlated with Fare. According to online materials, Pclass is the passenger's travel class: Pclass = 1 is the upper class, Pclass = 2 is the middle class and Pclass = 3 is the lower class. Therefore, Fare decreases as Pclass increases.

In the table above, we notice that Relatives = 0 and Pclass = 3, which means this is a passenger travelling alone in the lower class. We can therefore fill the missing value with the mean Fare of the 'Pclass = 3' group, restricted to passengers with 'Relatives = 0' and 'Embarked = S'.

In [69]:
# Pool both splits (without the target column) to form the reference group.
total_df = pd.concat([train.drop(columns=['Survived']), test])
# Passengers comparable to the one with the missing fare: third class,
# no relatives aboard, embarked at 'S'.
similar_passengers = (
    (total_df['Pclass'] == 3)
    & (total_df['Relatives'] == 0)
    & (total_df['Embarked'] == 'S')
)
# Named `fare_fill_value` rather than `mean` so that the `mean` function
# imported from `statistics` in the import cell is not overwritten.
fare_fill_value = total_df.loc[similar_passengers, 'Fare'].mean()
test.loc[test['Fare'].isna(), 'Fare'] = fare_fill_value

In [76]:
# Show the training-set rows whose Embarked value is missing.
train.loc[train['Embarked'].isna()]

As for the 2 missing values in the Embarked column, we can easily fill them in with the true values found in online materials.

In [None]:
# Set Embarked to 'S' for the two passengers selected by PassengerId.
train.loc[train['PassengerId'].isin([62, 830]), 'Embarked'] = 'S'

Pclass reflects where a passenger lived on the Titanic: each class is expected to correspond to a specific cabin area, so the Pclass column already carries some of the important information in the Cabin column. On the other hand, most of the Cabin values are missing in both datasets (77% in 'train' and 78% in 'test'), which makes the column difficult to repair. Therefore, we drop this column.

In [85]:
# Remove the Cabin column from both splits.
train = train.drop('Cabin', axis='columns')
test = test.drop('Cabin', axis='columns')

The next part is Age. About 20% of the Age values are missing in both the train and the test dataset. From the correlation coefficient matrix, we find that the correlation between Age and Pclass is relatively high: 0.3692 in 'train' and 0.4921 in 'test'. On the other hand, 'Name' and 'Ticket' are attributes specific to each PassengerId. Hence, these two columns are assumed to have no influence on our prediction, so they are dropped.

Now, we group the data by Pclass and Sex and compute the Age statistics of each group.

In [105]:
# Pool both splits (without the target column) and drop the per-passenger
# identifier columns.
total_df = pd.concat([train.drop(columns='Survived'), test])
total_df = total_df.drop(columns=['Name', 'Ticket'])
# Select Age before aggregating: a frame-wide median() raises on the
# remaining object columns in pandas >= 2.0. median() already skips NaN,
# so no explicit notna() mask is needed.
median_age = total_df.groupby(['Pclass', 'Sex'])['Age'].median()
median_age

Pclass  Sex   
1       female    36.0
        male      42.0
2       female    28.0
        male      29.5
3       female    22.0
        male      25.0
Name: Age, dtype: float64

In [104]:
# Select Age before aggregating: a frame-wide mean() raises on object
# columns in pandas >= 2.0. mean() already skips NaN, so no explicit
# notna() mask is needed.
mean_age = total_df.groupby(['Pclass', 'Sex'])['Age'].mean()
mean_age

Pclass  Sex   
1       female    37.037594
        male      41.029272
2       female    27.499223
        male      30.815380
3       female    22.185329
        male      25.962264
Name: Age, dtype: float64

Comparing the mean and the median of each group, there is little difference between them. So, we consider that the missing values have little influence on the data set, and we use the median value of each group to replace the missing values in that group.

In [106]:
# Show the pooled rows whose Age is missing.
total_df.loc[total_df['Age'].isna()]

Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare,Embarked,Relatives
5,6,3,male,,8.4583,Q,0
17,18,2,male,,13.0000,S,0
19,20,3,female,,7.2250,C,0
26,27,3,male,,7.2250,C,0
28,29,3,female,,7.8792,Q,0
29,30,3,male,,7.8958,S,0
31,32,1,female,,146.5208,C,1
32,33,3,female,,7.7500,Q,0
36,37,3,male,,7.2292,C,0
42,43,3,male,,7.8958,C,0
