# Importing Libraries

In [32]:
import pandas as pd
import numpy as np
import altair as alt

from sklearn.model_selection import train_test_split

# Reading Data

In [4]:
# Load the raw passenger records, using PassengerId as the row index.
data_df = pd.read_csv(
    filepath_or_buffer="data/train.csv",
    index_col="PassengerId",
)
data_df

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


# Keeping aside test set

In [8]:
# Hold out 20% of the rows as a test set before any exploration;
# the fixed random_state makes the split repeatable across runs.
train_df, test_df = train_test_split(
    data_df,
    test_size=0.2,
    random_state=2020,
)
# Print both shapes, one per line, as a quick sanity check on the split sizes.
print(train_df.shape, test_df.shape, sep="\n")

(712, 11)
(179, 11)


# Basic EDA

## High level summary

In [10]:
# Print each column's dtype and non-null count to spot columns with missing data.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 132 to 865
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  712 non-null    int64  
 1   Pclass    712 non-null    int64  
 2   Name      712 non-null    object 
 3   Sex       712 non-null    object 
 4   Age       575 non-null    float64
 5   SibSp     712 non-null    int64  
 6   Parch     712 non-null    int64  
 7   Ticket    712 non-null    object 
 8   Fare      712 non-null    float64
 9   Cabin     160 non-null    object 
 10  Embarked  710 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 66.8+ KB


In [12]:
# Summary statistics for every column; include="all" adds the non-numeric
# columns (unique / top / freq) alongside the numeric ones.
train_df.describe(include="all")

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,712.0,712.0,712,712,575.0,712.0,712.0,712,712.0,160,710
unique,,,712,2,,,,565,,121,3
top,,,"Smith, Miss. Marion Elsie",male,,,,CA. 2343,,B96 B98,S
freq,,,1,461,,,,6,,4,513
mean,0.376404,2.307584,,,29.913478,0.526685,0.411517,,32.493392,,
std,0.484824,0.838288,,,14.782496,1.118501,0.835588,,48.544058,,
min,0.0,1.0,,,0.67,0.0,0.0,,0.0,,
25%,0.0,2.0,,,20.0,0.0,0.0,,7.925,,
50%,0.0,3.0,,,28.0,0.0,0.0,,14.5,,
75%,1.0,3.0,,,39.0,1.0,1.0,,31.275,,


## Analyzing Nulls

In [30]:
# Per-column missing-value report: keep only columns that have at least one
# null and express the count as a percentage of the training rows.
na_summary = (
    train_df.isna()
    .sum()
    .reset_index(name="na_count")
    .query("na_count != 0")
    .assign(perc=lambda counts: counts["na_count"] / len(train_df) * 100)
)
na_summary

Unnamed: 0,index,na_count,perc
4,Age,137,19.241573
9,Cabin,552,77.52809
10,Embarked,2,0.280899


Observations:
- We have 77% nulls in Cabin. Hence we will not use this feature
- We only have 2 missing values in `Embarked`. Creating a new category called `missing` is unlikely to be useful, as there would be only 2 records to learn from. We will impute these with the most frequent value.
- We have around 20% nulls in `Age`. We will look into this column in detail to choose an imputation strategy.

## Feature Types

In [31]:
# Peek at the first few training rows to judge what kind of value each column holds.
train_df.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
132,0,3,"Coelho, Mr. Domingos Fernandeo",male,20.0,0,0,SOTON/O.Q. 3101307,7.05,,S
71,0,2,"Jenkin, Mr. Stephen Curnow",male,32.0,0,0,C.A. 33111,10.5,,S
782,1,1,"Dick, Mrs. Albert Adrian (Vera Gillespie)",female,17.0,1,0,17474,57.0,B20,S
509,0,3,"Olsen, Mr. Henry Margido",male,28.0,0,0,C 4001,22.525,,S
117,0,3,"Connors, Mr. Patrick",male,70.5,0,0,370369,7.75,,Q


Based on the column descriptions and a quick look at the values, we group the features into the following categories:

In [19]:
# Columns treated as numeric.
numeric_features = [
    "Age",
    "SibSp",
    "Parch",
    "Fare",
]

# Columns treated as unordered categories.
categorical_features = [
    "Sex",
    "Embarked",
]

# Columns treated as ordered categories.
ordinal_features = [
    "Pclass",
]

# Columns to pass through unchanged (none at the moment).
remainder_features = []

# Columns excluded from modelling.
drop_features = [
    "Name",
    "Ticket",
    "Cabin",
]

We are dropping:
- `Name`, as it's a free-text field
- `Ticket`, as it's a free-text field
- `Cabin` because most of the values are `na`

In [80]:
# Step histogram of Age (at most 20 bins), one line per Survived class.
age_plot = (
    alt.Chart(train_df)
    .mark_line(interpolate="step")
    .encode(
        x=alt.X("Age", bin=alt.Bin(maxbins=20)),
        y="count()",
        color="Survived:N",
    )
    .properties(width=400, height=200)
)

# Vertical reference lines for the mean (red) and median (green) age.
# A rule that encodes only x spans the full plot height, so the markers always
# match the histogram's y-range instead of relying on hard-coded 0-99 y-values.
# Series.mean() / Series.median() skip NaN by default, so no dropna() is needed.
mean_rule = (
    alt.Chart(pd.DataFrame({"Age": [train_df["Age"].mean()]}))
    .mark_rule(color="red")
    .encode(x="Age:Q")
)
median_rule = (
    alt.Chart(pd.DataFrame({"Age": [train_df["Age"].median()]}))
    .mark_rule(color="green")
    .encode(x="Age:Q")
)

(age_plot + mean_rule + median_rule).properties(
    title="Age distribution by survival (red = mean, green = median)"
)

In [67]:
# Median age of the training passengers (missing ages are skipped by default).
train_df["Age"].median()

28.0