In [43]:
# library imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [44]:
# autoreload module
%load_ext autoreload
% autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [45]:
# import cleaner script
import cleaning_helper

In [46]:
# Read data/train.csv (path relative to the notebook) into the working frame.
dataframe = pd.read_csv(filepath_or_buffer="data/train.csv")

In [92]:
# Per-column count of missing values.
# NOTE(review): this cell's execution count (92) is later than that of the cells below, and
# the saved output lists the final dis_* columns. Re-run it right after loading to see the
# raw Cabin/Age/Embarked gaps discussed next.
dataframe.isna().sum()

Survived        0
Pclass          0
SibSp           0
Parch           0
dis_Age         0
dis_Fare        0
dis_Embarked    0
dis_Sex         0
dis_name        0
dtype: int64

## Expected problems
    1. Cabin has NaN values. There is no need to fix them: the Tableau analysis showed that this field is not important, so the column is dropped.
    2. Age has NaN values, and this field is important for modelling the data, so we need a way to fill these values without distorting its distribution.
    3. Embarked has two missing values. We can simply fill these with the most frequent value.

In [48]:
# Remove the columns this notebook does not carry forward.
dataframe.drop(columns=['Cabin', 'Ticket', 'PassengerId'], inplace=True)

In [49]:
# List the rows where Embarked is missing.
dataframe.loc[dataframe['Embarked'].isna()]

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
61,1,1,"Icard, Miss. Amelie",female,38.0,0,0,80.0,
829,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,80.0,


In [50]:
# Fill the missing Embarked entries via the helper module. The return value is not used,
# and the next null count shows Embarked at 0, so the helper is expected to modify
# `dataframe` in place — TODO confirm against cleaning_helper.fix_embarked.
cleaning_helper.fix_embarked(dataframe)

In [51]:
# Re-count missing values per column after the Embarked fix.
dataframe.isna().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      0
dtype: int64

## Working with the [Age](https://public.tableau.com/profile/gauscian#!/vizhome/tab-wkb/TitanicDataSetAnalysis?publish=yes) field
    1. Problem 1 : Convert the continuous variable into a nominal (binned) one.
    2. Problem 2 : Replace the NaN values with values drawn from roughly the same distribution.

![alt text](tableau-exports/AgeSurvived.png)

In [52]:
# Bin the continuous Age column with the shared helper; the following preview shows a new
# dis_Age column. NOTE(review): 10 is presumably the number of bins, and the helper
# apparently also deals with the 177 missing ages (dis_Age has no nulls later) — confirm
# both against cleaning_helper.cont_discrete.
cleaning_helper.cont_discrete(dataframe, 10, 'Age')

In [53]:
# Peek at the first five rows; the frame now carries a dis_Age column.
dataframe.head(n=5)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,dis_Age
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,0
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,1
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S,2
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S,1
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S,1


## Working with the [Fare](https://public.tableau.com/profile/gauscian#!/vizhome/tab-wkb/TitanicDataSetAnalysis?publish=yes) field
    1. As we know from the Tableau analysis, this field has a direct impact on the number of people surviving.

In [54]:
# Reuse the binning helper written for Age on the Fare column.
# NOTE(review): 23 is presumably the bin count, as with Age — confirm in cleaning_helper.
cleaning_helper.cont_discrete(dataframe, 23, 'Fare')

![alt text](tableau-exports/Fare.png)

#### Cleaning up the older fields.

In [55]:
# The binned dis_Age / dis_Fare columns replace the continuous originals.
dataframe.drop(columns=['Age', 'Fare'], inplace=True)

In [56]:
# First five rows: Age and Fare are gone, dis_Age and dis_Fare remain.
dataframe.head(n=5)

Unnamed: 0,Survived,Pclass,Name,Sex,SibSp,Parch,Embarked,dis_Age,dis_Fare
0,0,3,"Braund, Mr. Owen Harris",male,1,0,S,0,0
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,1,0,C,1,1
2,1,3,"Heikkinen, Miss. Laina",female,0,0,S,2,0
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,1,0,S,1,2
4,0,3,"Allen, Mr. William Henry",male,0,0,S,1,0


In [57]:
# Checkpoint: every remaining column should now report zero missing values.
dataframe.isna().sum()

Survived    0
Pclass      0
Name        0
Sex         0
SibSp       0
Parch       0
Embarked    0
dis_Age     0
dis_Fare    0
dtype: int64

### Moving forward, let's quickly convert the nominal-string fields into nominal-integer fields
This will enable us to use these fields for data modelling.

In [58]:
# Integer-code the two remaining string columns; the results land in new dis_Embarked and
# dis_Sex columns (the original string columns are dropped in a later cell).
cleaning_helper.discretize_field(dataframe, 'Embarked')
cleaning_helper.discretize_field(dataframe, 'Sex')
# Resulting codes — Sex: male = 0, female = 1; Embarked: S = 0, C = 1, Q = 2

![alt text](tableau-exports/Class1SurvivedMore.png)

![alt text](tableau-exports/C'sPassengerSurviveMore.png)

![alt text](tableau-exports/FemalesSurvivedMore.png)

In [59]:
# First five rows: dis_Embarked and dis_Sex now sit alongside the original string columns.
dataframe.head(n=5)

Unnamed: 0,Survived,Pclass,Name,Sex,SibSp,Parch,Embarked,dis_Age,dis_Fare,dis_Embarked,dis_Sex
0,0,3,"Braund, Mr. Owen Harris",male,1,0,S,0,0,0,0
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,1,0,C,1,1,1,1
2,1,3,"Heikkinen, Miss. Laina",female,0,0,S,2,0,0,1
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,1,0,S,1,2,0,1
4,0,3,"Allen, Mr. William Henry",male,0,0,S,1,0,0,0


In [60]:
# Keep only the integer-coded versions of Embarked and Sex.
dataframe.drop(columns=['Embarked', 'Sex'], inplace=True)

In [61]:
# First five rows: Name is the only string column left.
dataframe.head(n=5)

Unnamed: 0,Survived,Pclass,Name,SibSp,Parch,dis_Age,dis_Fare,dis_Embarked,dis_Sex
0,0,3,"Braund, Mr. Owen Harris",1,0,0,0,0,0
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,0,1,1,1,1
2,1,3,"Heikkinen, Miss. Laina",0,0,2,0,0,1
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,0,1,2,0,1
4,0,3,"Allen, Mr. William Henry",0,0,1,0,0,0


## Working with the name column.
    1. The salutation (title) in each name can help infer the status of the person.
    2. The status of the person is directly related to their survival.

In [None]:
# Derive a category from each passenger's name; the dis_name column used by the following
# cells is presumably created here — TODO confirm in cleaning_helper.working_with_names.
# NOTE(review): this cell has no execution count in the saved notebook, so re-run it before
# relying on dis_name.
cleaning_helper.working_with_names(dataframe)

In [79]:
# How many passengers fall into each dis_name category.
dataframe['dis_name'].value_counts()

1    517
0    308
2     40
3     26
Name: dis_name, dtype: int64

In [80]:
# The raw Name strings are no longer needed once dis_name exists.
dataframe.drop(columns=['Name'], inplace=True)

In [82]:
# First five rows: Name has been replaced by dis_name.
dataframe.head(n=5)

Unnamed: 0,Survived,Pclass,SibSp,Parch,dis_Age,dis_Fare,dis_Embarked,dis_Sex,dis_name
0,0,3,1,0,0,0,0,0,1
1,1,1,1,0,1,1,1,1,0
2,1,3,0,0,2,0,0,1,0
3,1,1,1,0,1,2,0,1,0
4,0,3,0,0,1,0,0,0,1


## Working with the [SibSp and Parch](https://public.tableau.com/profile/gauscian#!/vizhome/tab-wkb/TitanicDataSetAnalysis?publish=yes).
    1. People travelling with someone are more likely to survive, since they stick together.

In [95]:
# Summarise SibSp/Parch with the helper. The preview that follows shows a new hasSomeOne
# column; it looks like 1 when either count is non-zero and 0 otherwise — confirm against
# cleaning_helper.have_siblings_not.
cleaning_helper.have_siblings_not(dataframe)

![alt text](tableau-exports/RelationshipEffect.png)

In [96]:
# First five rows: hasSomeOne has been appended next to SibSp and Parch.
dataframe.head(n=5)

Unnamed: 0,Survived,Pclass,SibSp,Parch,dis_Age,dis_Fare,dis_Embarked,dis_Sex,dis_name,hasSomeOne
0,0,3,1,0,0,0,0,0,1,1
1,1,1,1,0,1,1,1,1,0,1
2,1,3,0,0,2,0,0,1,0,0
3,1,1,1,0,1,2,0,1,0,1
4,0,3,0,0,1,0,0,0,1,0


In [97]:
# hasSomeOne stands in for the two family-count columns from here on.
dataframe.drop(columns=['SibSp', 'Parch'], inplace=True)
dataframe.head(n=5)

Unnamed: 0,Survived,Pclass,dis_Age,dis_Fare,dis_Embarked,dis_Sex,dis_name,hasSomeOne
0,0,3,0,0,0,0,1,1
1,1,1,1,1,1,1,0,1
2,1,3,2,0,0,1,0,0
3,1,1,1,2,0,1,0,1
4,0,3,1,0,0,0,1,0


In [99]:
# Final schema check: column dtypes, non-null counts and memory footprint of the cleaned frame.
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
Survived        891 non-null int64
Pclass          891 non-null int64
dis_Age         891 non-null int32
dis_Fare        891 non-null int32
dis_Embarked    891 non-null int64
dis_Sex         891 non-null int64
dis_name        891 non-null int64
hasSomeOne      891 non-null int64
dtypes: int32(2), int64(6)
memory usage: 48.8 KB


# Modelling the Data
## At this point we can conclude that the data is now ready for making predictions