# kaggle Titanic competition

In [1835]:
# import required pakages
import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib as mpl
from matplotlib import pyplot as plt
import seaborn as sns

Load the training and testing data.

In [1836]:
train_data = pd.read_csv("data/train.csv")
test_data = pd.read_csv("data/test.csv")
index = test_data.PassengerId
index.head()

0    892
1    893
2    894
3    895
4    896
Name: PassengerId, dtype: int64

### Get insights into the training & tasting data
Use `train_data.head(10)` for a tabular overview of the training data.

In [1837]:
train_data.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


Use `train_data.info()` to get further insight into the training data.

In [1838]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [1839]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


Display an overview of the missing values.

In [1840]:
def displayMissingValues(data):
    for attribute in data.columns.tolist():
        print('Missing values within {}: {}'.format(attribute, data[attribute].isnull().sum()))

In [1841]:
displayMissingValues(train_data)

Missing values within PassengerId: 0
Missing values within Survived: 0
Missing values within Pclass: 0
Missing values within Name: 0
Missing values within Sex: 0
Missing values within Age: 177
Missing values within SibSp: 0
Missing values within Parch: 0
Missing values within Ticket: 0
Missing values within Fare: 0
Missing values within Cabin: 687
Missing values within Embarked: 2


In [1842]:
displayMissingValues(test_data)

Missing values within PassengerId: 0
Missing values within Pclass: 0
Missing values within Name: 0
Missing values within Sex: 0
Missing values within Age: 86
Missing values within SibSp: 0
Missing values within Parch: 0
Missing values within Ticket: 0
Missing values within Fare: 1
Missing values within Cabin: 327
Missing values within Embarked: 0


`train_data.info()` and `displayMissingValues` reveal some missing data within the columns `Age`, `Cabin`and `Embarked` for the training data. For the testing data there is in addition one missing value at the `Fare` attribute.
<br><br>
In general there are four possibilities to **handle missing values**:
* delete the entries with the missing values
* delete the hole attribute with the missing values
* fill the missing values with the median value of that attribute
* try to get the missing values afterwards

Furthermore `train_data.info()` reveals that the attributes have three data types: 
* `int64` -> which is okay for further processing.
* `float64` -> which is okay for further processing.
* `object` -> these attributes have to be adopted for further processing.
<br>

First let's have a look at these object attributes and their importance for the upcoming machine-learning process:
* `Name` is an individual value with no valuable information but the title may inherit some valuable information for machine learning and will be undergo further preprocessing.
* `Sex` is important for further processing. We will use one-hot-encoding to make this attribute processable.
* `Ticket` is an individual number with no valuable information and therefore not important for further processing.
* `Cabin` is a valuable information source but there are just 204 out of 981 values available. 
* `Embarked` can be interesting for further processing. We will use one-hot-encoding to make this attribute processable.

Use `train_data.describe()` to get an insight of the numerical attributes especially their variance.

In [1843]:
train_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


There are two attributes that should undergo additional preprocessing:
* `Age` with a standard deviation of 14.52 and a max / min difference of 79.58.
* `Fare` with a standard deviation of 49.69 and a max / min difference of about 512.
<br>

Both attributes will undergo a standardization according the following formula:

$$ x_j^{(i)} = \frac{x_j^{(i)} - \mu_j}{s_j} \\ $$

Here $x_j^{(i)}$ is the value of attribute $j$ of the $i's$ example. $\mu_j$ is the mean of attribute $j$ and $s_j$ is the standard deviation of attribute $j$.

Check the correlation between the numerical values:

In [1844]:
train_data.corr()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
PassengerId,1.0,-0.005007,-0.035144,0.036847,-0.057527,-0.001652,0.012658
Survived,-0.005007,1.0,-0.338481,-0.077221,-0.035322,0.081629,0.257307
Pclass,-0.035144,-0.338481,1.0,-0.369226,0.083081,0.018443,-0.5495
Age,0.036847,-0.077221,-0.369226,1.0,-0.308247,-0.189119,0.096067
SibSp,-0.057527,-0.035322,0.083081,-0.308247,1.0,0.414838,0.159651
Parch,-0.001652,0.081629,0.018443,-0.189119,0.414838,1.0,0.216225
Fare,0.012658,0.257307,-0.5495,0.096067,0.159651,0.216225,1.0


In [1845]:
train_data.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


### Prepare the training and testing data

##### Embarked - attribute
Lets have a closer look at the entries with the missing `Embarked` values.

In [1846]:
train_data.loc[train_data['Embarked'].isnull()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,


Miss Amelie Icard and Mrs. George Nelson (Martha Evelyn) Stone had the same ticket number and lived in the same cabin. 
Searching for Mrs. George Nelson (Martha Evelyn) Stone on google led to the folowing link [encyclopedia-titanica.org](https://www.encyclopedia-titanica.org/titanic-survivor/martha-evelyn-stone.html). 
<br> 

Here you can find the following information:"Mrs Stone boarded the Titanic in **Southampton** on 10 April 1912 and was travelling in first class with her maid Amelie Icard. She occupied cabin B-28."
<br>

Now we know, that both women enbarked in Southampton which is abbreviated with an `S`. Lets fill this in:

In [1847]:
train_data.loc[train_data['Embarked'].isnull(), 'Embarked'] = 'S'

Check the insertion

In [1848]:
train_data.loc[train_data.PassengerId == 62]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,S


In [1849]:
displayMissingValues(train_data)

Missing values within PassengerId: 0
Missing values within Survived: 0
Missing values within Pclass: 0
Missing values within Name: 0
Missing values within Sex: 0
Missing values within Age: 177
Missing values within SibSp: 0
Missing values within Parch: 0
Missing values within Ticket: 0
Missing values within Fare: 0
Missing values within Cabin: 687
Missing values within Embarked: 0


##### Fare - attribute (just testing data)
Lets have a closer look at the entry with the missing Value for the `Fare` attribute.

In [1850]:
test_data.loc[test_data.Fare.isnull()]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
152,1044,3,"Storey, Mr. Thomas",male,60.5,0,0,3701,,,S


Calculate the median of the `Fare` attribute for the 3rd class and substitute it for the missing value. 

In [1851]:
test_data.loc[(test_data.Fare.isnull()), 'Fare'] = test_data.loc[(test_data.Pclass == 3)]['Fare'].median()

In [1852]:
test_data.loc[(test_data.PassengerId == 1044)]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
152,1044,3,"Storey, Mr. Thomas",male,60.5,0,0,3701,7.8958,,S


##### PassengerId & Ticket - attribute
Remove the attributes `PassengerId` and `Ticket` from the training and the test set.

In [1853]:
train_data = train_data.drop(['PassengerId', 'Ticket'], axis=1)
test_data = test_data.drop(['PassengerId', 'Ticket'], axis=1)
train_data.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,C123,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,,S


In [1854]:
test_data.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,3,"Kelly, Mr. James",male,34.5,0,0,7.8292,,Q
1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,7.0,,S
2,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,9.6875,,Q
3,3,"Wirz, Mr. Albert",male,27.0,0,0,8.6625,,S
4,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,12.2875,,S


In [1855]:
displayMissingValues(train_data)

Missing values within Survived: 0
Missing values within Pclass: 0
Missing values within Name: 0
Missing values within Sex: 0
Missing values within Age: 177
Missing values within SibSp: 0
Missing values within Parch: 0
Missing values within Fare: 0
Missing values within Cabin: 687
Missing values within Embarked: 0


##### Age - attribute
###### Replace missing values (by hand)

There is also the possibility to replace the missing values with the sklearn.impute.SimpleImputer class.
Use [`sklearn.impute.SimpleImputer`](https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html) class for replacing missing values.
<br>
We replace the missing values here by hand because we want to distinguish between the classes and within the classes between the gender. 

In [1856]:
sample_age_incomplete_rows_train = train_data[train_data['Age'].isnull()].head(10)
sample_age_incomplete_rows_train

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
5,0,3,"Moran, Mr. James",male,,0,0,8.4583,,Q
17,1,2,"Williams, Mr. Charles Eugene",male,,0,0,13.0,,S
19,1,3,"Masselmani, Mrs. Fatima",female,,0,0,7.225,,C
26,0,3,"Emir, Mr. Farred Chehab",male,,0,0,7.225,,C
28,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,7.8792,,Q
29,0,3,"Todoroff, Mr. Lalio",male,,0,0,7.8958,,S
31,1,1,"Spencer, Mrs. William Augustus (Marie Eugenie)",female,,1,0,146.5208,B78,C
32,1,3,"Glynn, Miss. Mary Agatha",female,,0,0,7.75,,Q
36,1,3,"Mamee, Mr. Hanna",male,,0,0,7.2292,,C
42,0,3,"Kraeff, Mr. Theodor",male,,0,0,7.8958,,C


In [1857]:
sample_age_incomplete_rows_test = test_data[test_data['Age'].isnull()].head(10)
sample_age_incomplete_rows_test

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
10,3,"Ilieff, Mr. Ylio",male,,0,0,7.8958,,S
22,1,"Flegenheim, Mrs. Alfred (Antoinette)",female,,0,0,31.6833,,S
29,3,"Samaan, Mr. Elias",male,,2,0,21.6792,,C
33,3,"Johnston, Mrs. Andrew G (Elizabeth Lily"" Watson)""",female,,1,2,23.45,,S
36,3,"Roth, Miss. Sarah A",female,,0,0,8.05,,S
39,3,"Hee, Mr. Ling",male,,0,0,56.4958,,S
41,1,"Franklin, Mr. Thomas Parham",male,,0,0,26.55,D34,S
47,3,"Shaughnessy, Mr. Patrick",male,,0,0,7.75,,Q
54,2,"Mangiavacchi, Mr. Serafino Emilio",male,,0,0,15.5792,,C
58,3,"Davison, Mr. Thomas Henry",male,,1,0,16.1,,S


Calculate the median values depending on the class and the gender.

In [1858]:
medianAges = np.zeros((2,3))

for i in range(0, 2):
    for j in range(0, 3):
        if(i == 0):
            medianAges[i, j] = train_data.loc[(train_data.Pclass == j+1) 
                                              & (train_data.Sex == 'female')]['Age'].median()
        else:
            medianAges[i, j] = train_data.loc[(train_data.Pclass == j+1) 
                                              & (train_data.Sex == 'male')]['Age'].median()

medianAges

array([[35. , 28. , 21.5],
       [40. , 30. , 25. ]])

In [1859]:
def addMissingAgeValues(dataset, medianAges):
    for i in range(medianAges.shape[0]):
        for j in range(medianAges.shape[1]):
            if(i == 0):
                dataset.loc[(dataset.Pclass == j+1) & (dataset.Age.isnull()) 
                            & (dataset.Sex == 'female'), 'Age'] = medianAges[i, j]
            else:
                dataset.loc[(dataset.Pclass == j+1) & (dataset.Age.isnull()) 
                            & (dataset.Sex == 'male'), 'Age'] = medianAges[i, j]

Add the missing `Age` values to train_data.

In [1860]:
addMissingAgeValues(train_data, medianAges)

Check if the replacement was successful.

In [1861]:
train_data.loc[sample_age_incomplete_rows_train.index.values]

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
5,0,3,"Moran, Mr. James",male,25.0,0,0,8.4583,,Q
17,1,2,"Williams, Mr. Charles Eugene",male,30.0,0,0,13.0,,S
19,1,3,"Masselmani, Mrs. Fatima",female,21.5,0,0,7.225,,C
26,0,3,"Emir, Mr. Farred Chehab",male,25.0,0,0,7.225,,C
28,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,21.5,0,0,7.8792,,Q
29,0,3,"Todoroff, Mr. Lalio",male,25.0,0,0,7.8958,,S
31,1,1,"Spencer, Mrs. William Augustus (Marie Eugenie)",female,35.0,1,0,146.5208,B78,C
32,1,3,"Glynn, Miss. Mary Agatha",female,21.5,0,0,7.75,,Q
36,1,3,"Mamee, Mr. Hanna",male,25.0,0,0,7.2292,,C
42,0,3,"Kraeff, Mr. Theodor",male,25.0,0,0,7.8958,,C


Add the missing `Age` values to test_data.

In [1862]:
addMissingAgeValues(test_data, medianAges)

Check if the replacement was successful.

In [1863]:
test_data.loc[sample_age_incomplete_rows_test.index.values]

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
10,3,"Ilieff, Mr. Ylio",male,25.0,0,0,7.8958,,S
22,1,"Flegenheim, Mrs. Alfred (Antoinette)",female,35.0,0,0,31.6833,,S
29,3,"Samaan, Mr. Elias",male,25.0,2,0,21.6792,,C
33,3,"Johnston, Mrs. Andrew G (Elizabeth Lily"" Watson)""",female,21.5,1,2,23.45,,S
36,3,"Roth, Miss. Sarah A",female,21.5,0,0,8.05,,S
39,3,"Hee, Mr. Ling",male,25.0,0,0,56.4958,,S
41,1,"Franklin, Mr. Thomas Parham",male,40.0,0,0,26.55,D34,S
47,3,"Shaughnessy, Mr. Patrick",male,25.0,0,0,7.75,,Q
54,2,"Mangiavacchi, Mr. Serafino Emilio",male,30.0,0,0,15.5792,,C
58,3,"Davison, Mr. Thomas Henry",male,25.0,1,0,16.1,,S


##### Cabin - attribute
Replace the cabin attribute by the more general deck attribute which encapsulates lots of cabins. There where 9 decks on the Titanic named A to G. A was the uppermost one and G was the lowermost one. The deck assignment is taken from the fist letter of the cabin. If the cabin has no value than we substitute it by MISSING.

In [1864]:
train_data['Deck'] = train_data.Cabin.str.get(0)
train_data['Deck'] = train_data['Deck'].fillna('MIS_DECK')
test_data['Deck'] = test_data.Cabin.str.get(0)
test_data['Deck'] = test_data['Deck'].fillna('MIS_DECK')

Check the assigned values for train_data.

In [1865]:
a = train_data['Deck'].unique()
np.sort(a)

array(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'MIS_DECK', 'T'], dtype=object)

Check the assigned values for test_data.

In [1866]:
b = test_data['Deck'].unique()
np.sort(b)

array(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'MIS_DECK'], dtype=object)

In [1867]:
train_data['Deck'].value_counts()

MIS_DECK    687
C            59
B            47
D            33
E            32
A            15
F            13
G             4
T             1
Name: Deck, dtype: int64

In [1868]:
test_data['Deck'].value_counts()

MIS_DECK    327
C            35
B            18
D            13
E             9
F             8
A             7
G             1
Name: Deck, dtype: int64

There is a T Deck within the training set which is not a correct Deck! Check where this comes from.

In [1869]:
train_data[train_data.Deck == 'T']

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Deck
339,0,1,"Blackwell, Mr. Stephen Weart",male,45.0,0,0,35.5,T,S,T


A Search on google led to the following result again from [encyclopedia-titanica.org](https://www.encyclopedia-titanica.org/titanic-victim/stephen-weart-blackwell.html): Blackwell occupied cabin "T" on the **Boat Deck** (ticket number 113784 which cost £35, 10s). And from [wikipedia](https://en.wikipedia.org/wiki/RMS_Titanic) you get the following information: "The Boat Deck, on which the lifeboats were housed..." Thus we know, that cabin T is located closest to deck A and therefore we replace T by A.  

In [1870]:
train_data.Deck.replace('T', 'A', inplace=True)
train_data['Deck'].value_counts()

MIS_DECK    687
C            59
B            47
D            33
E            32
A            16
F            13
G             4
Name: Deck, dtype: int64

Add -Deck to each deck description for better column naming after one-hot encoded this attribute.

In [1871]:
train_data.Deck.replace(['A','B','C','D','E','F','G'], 
                        ['A-Deck','B-Deck','C-Deck','D-Deck','E-Deck','F-Deck','G-Deck'], inplace=True)

In [1872]:
test_data.Deck.replace(['A','B','C','D','E','F','G'], 
                        ['A-Deck','B-Deck','C-Deck','D-Deck','E-Deck','F-Deck','G-Deck'], inplace=True)

Finally delete the `Cabin` attribute.

In [1873]:
train_data = train_data.drop('Cabin', axis=1)
test_data = test_data.drop('Cabin', axis=1)
train_data.head(2)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Deck
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,MIS_DECK
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,C-Deck


In [1874]:
test_data.head(3)

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Deck
0,3,"Kelly, Mr. James",male,34.5,0,0,7.8292,Q,MIS_DECK
1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,7.0,S,MIS_DECK
2,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,9.6875,Q,MIS_DECK


##### Name - attribute
Extract the titles out of the name attribute and create a new attribute `Title`.

In [1875]:
#Title = train_data['Name'].str.split(',').str.get(1).str.split('.').str.get(0).str.strip()

In [1876]:
train_data['Title'] = train_data['Name'].str.split(',').str.get(1).str.split('.').str.get(0).str.strip()
test_data['Title'] = test_data['Name'].str.split(',').str.get(1).str.split('.').str.get(0).str.strip()
train_data.Title.unique()

array(['Mr', 'Mrs', 'Miss', 'Master', 'Don', 'Rev', 'Dr', 'Mme', 'Ms',
       'Major', 'Lady', 'Sir', 'Mlle', 'Col', 'Capt', 'the Countess',
       'Jonkheer'], dtype=object)

In [1877]:
test_data.Title.unique()

array(['Mr', 'Mrs', 'Miss', 'Master', 'Ms', 'Col', 'Rev', 'Dr', 'Dona'],
      dtype=object)

In [1878]:
train_data['Title'].value_counts()

Mr              517
Miss            182
Mrs             125
Master           40
Dr                7
Rev               6
Major             2
Col               2
Mlle              2
Mme               1
Jonkheer          1
Ms                1
Sir               1
Lady              1
Don               1
the Countess      1
Capt              1
Name: Title, dtype: int64

In [1879]:
train_data[train_data['Title'] == 'Ms']

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Deck,Title
443,1,2,"Reynaldo, Ms. Encarnacion",female,28.0,0,0,13.0,S,MIS_DECK,Ms


Title substitution to reduce the amounts of attributes:
* The titles `Miss`, `Mrs`, `Ms` `Mme`(Madam) and `Mlle`(Mademoiselle) will be substituted by `Miss\Ms`.
* The titles `Mr`, `Master` will be substituted by `Mr\Master`.
* The titles `Dr`, `Rev`, `Major`, `Col`, `Jonkheer`, `Sir`, `Lady`, `Don`, `the Countess` and `Capt` will be substituted by `ImpPers`.

In [1880]:
train_data.Title.replace(['Miss', 'Mrs', 'Ms', 'Mme', 'Mlle'], 'Miss\Ms', inplace=True)
train_data.Title.replace(['Mr', 'Master'], 'Mr\Master', inplace=True)
train_data.Title.replace(['Dr', 'Rev', 'Major', 'Col', 'Jonkheer', 'Sir', 
                          'Lady', 'Don', 'the Countess', 'Capt'], 'ImpPers', inplace=True)
train_data['Title'].value_counts()

Mr\Master    557
Miss\Ms      311
ImpPers       23
Name: Title, dtype: int64

In [1881]:
test_data.Title.replace(['Miss', 'Mrs', 'Ms'], 'Miss\Ms', inplace=True)
test_data.Title.replace(['Mr', 'Master'], 'Mr\Master', inplace=True)
test_data.Title.replace(['Dr', 'Rev', 'Col', 'Dona'], 'ImpPers', inplace=True)
test_data['Title'].value_counts()

Mr\Master    261
Miss\Ms      151
ImpPers        6
Name: Title, dtype: int64

Finally delete the `Name` attribute.

In [1882]:
train_data = train_data.drop('Name', axis=1)
test_data = test_data.drop('Name', axis=1)
train_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Deck,Title
0,0,3,male,22.0,1,0,7.25,S,MIS_DECK,Mr\Master
1,1,1,female,38.0,1,0,71.2833,C,C-Deck,Miss\Ms
2,1,3,female,26.0,0,0,7.925,S,MIS_DECK,Miss\Ms
3,1,1,female,35.0,1,0,53.1,S,C-Deck,Miss\Ms
4,0,3,male,35.0,0,0,8.05,S,MIS_DECK,Mr\Master


##### SibSp & Parch - attribute
Get some insight into the siblings / spouses and patents / children attributes

In [1883]:
train_data['SibSp'].value_counts()

0    608
1    209
2     28
4     18
3     16
8      7
5      5
Name: SibSp, dtype: int64

In [1884]:
train_data.Parch.value_counts()

0    678
1    118
2     80
5      5
3      5
4      4
6      1
Name: Parch, dtype: int64

In [1885]:
train_data.loc[(train_data.SibSp >= 1)]['Survived'].sum()

132

In [1886]:
train_data.loc[(train_data.Parch >= 1)]['Survived'].sum()

109

In [1887]:
train_data.loc[(train_data.SibSp >= 1) | (train_data.Parch >= 1)]['Survived'].sum()

179

A combination of the two attributes `SibSp` and `Parch` seems to be useful. We make a distinction between traveling with family or traveling alone. Family is sibling, spouse, parents or children.

In [1888]:
train_data['Family'] = ((train_data.SibSp >= 1) | (train_data.Parch >= 1))
test_data['Family'] = ((test_data.SibSp >= 1) | (test_data.Parch >= 1))
train_data.head(3)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Deck,Title,Family
0,0,3,male,22.0,1,0,7.25,S,MIS_DECK,Mr\Master,True
1,1,1,female,38.0,1,0,71.2833,C,C-Deck,Miss\Ms,True
2,1,3,female,26.0,0,0,7.925,S,MIS_DECK,Miss\Ms,False


Use [`OrdinalEncoder`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html#sklearn.preprocessing.OrdinalEncoder) to encode True to 1 and False to 0.

In [1889]:
from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder()
train_data['Family'] = ordinal_encoder.fit_transform(train_data[['Family']])
test_data['Family'] = ordinal_encoder.fit_transform(test_data[['Family']])
train_data.head(3)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Deck,Title,Family
0,0,3,male,22.0,1,0,7.25,S,MIS_DECK,Mr\Master,1.0
1,1,1,female,38.0,1,0,71.2833,C,C-Deck,Miss\Ms,1.0
2,1,3,female,26.0,0,0,7.925,S,MIS_DECK,Miss\Ms,0.0


In [1890]:
test_data.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Deck,Title,Family
0,3,male,34.5,0,0,7.8292,Q,MIS_DECK,Mr\Master,0.0
1,3,female,47.0,1,0,7.0,S,MIS_DECK,Miss\Ms,1.0
2,2,male,62.0,0,0,9.6875,Q,MIS_DECK,Mr\Master,0.0
3,3,male,27.0,0,0,8.6625,S,MIS_DECK,Mr\Master,0.0
4,3,female,22.0,1,1,12.2875,S,MIS_DECK,Miss\Ms,1.0


Delete the attributes `SibSp` and `Parch` from the training  and test data.

In [1891]:
train_data = train_data.drop(['SibSp', 'Parch'], axis=1)
test_data = test_data.drop(['SibSp', 'Parch'], axis=1)
train_data.head(3)

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Deck,Title,Family
0,0,3,male,22.0,7.25,S,MIS_DECK,Mr\Master,1.0
1,1,1,female,38.0,71.2833,C,C-Deck,Miss\Ms,1.0
2,1,3,female,26.0,7.925,S,MIS_DECK,Miss\Ms,0.0


In [1892]:
test_data.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,Deck,Title,Family
0,3,male,34.5,7.8292,Q,MIS_DECK,Mr\Master,0.0
1,3,female,47.0,7.0,S,MIS_DECK,Miss\Ms,1.0
2,2,male,62.0,9.6875,Q,MIS_DECK,Mr\Master,0.0
3,3,male,27.0,8.6625,S,MIS_DECK,Mr\Master,0.0
4,3,female,22.0,12.2875,S,MIS_DECK,Miss\Ms,1.0


##### Age - attribute

In [1893]:
np.count_nonzero(train_data.Age.unique())

89

##### Fare - Attribute

##### Survived - attribute
Drop the label attribute `Survived` from the training data set and move it into the label set `y_train`.

In [1894]:
y_train = train_data['Survived'].copy()
X_train = train_data.drop('Survived', axis=1)
X_test = test_data
X_train.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,Deck,Title,Family
0,3,male,22.0,7.25,S,MIS_DECK,Mr\Master,1.0
1,1,female,38.0,71.2833,C,C-Deck,Miss\Ms,1.0
2,3,female,26.0,7.925,S,MIS_DECK,Miss\Ms,0.0
3,1,female,35.0,53.1,S,C-Deck,Miss\Ms,1.0
4,3,male,35.0,8.05,S,MIS_DECK,Mr\Master,0.0


### Preprocessing categorical attributes
Extract the categorical attributes `Sex`, `Embarked`, `Deck`and `Title` and store them within cat_attributes. 

In [1895]:
cat_attributes_train = X_train[['Sex', 'Embarked', 'Deck', 'Title']]
cat_attributes_test = X_test[['Sex', 'Embarked', 'Deck', 'Title']]
cat_attributes.head()

Unnamed: 0,Sex,Embarked,Deck,Title
0,male,S,MIS_DECK,Mr\Master
1,female,C,C-Deck,Miss\Ms
2,female,S,MIS_DECK,Miss\Ms
3,female,S,C-Deck,Miss\Ms
4,male,S,MIS_DECK,Mr\Master


#### One-Hot-Encoder
Use [`OneHotEncoder`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html) to transform the categorical values of the attributes `Sex` and `Embarked` into numerical values. The result is a matrix with $n$ columns representing the $n$ different categorical values. In this case $n = 5$. We do not want a sparse matrix but an array, therefore we use `sparse=False`.

In [1896]:
from sklearn.preprocessing import OneHotEncoder

cat_encoder = OneHotEncoder(sparse=False)
X_train_1hot = cat_encoder.fit_transform(cat_attributes_train)
X_test_1hot = cat_encoder.fit_transform(cat_attributes_test)
X_train_1hot

array([[0., 1., 0., ..., 0., 0., 1.],
       [1., 0., 1., ..., 0., 1., 0.],
       [1., 0., 0., ..., 0., 1., 0.],
       ...,
       [1., 0., 0., ..., 0., 1., 0.],
       [0., 1., 1., ..., 0., 0., 1.],
       [0., 1., 0., ..., 0., 0., 1.]])

Get the category names out of the cat_encoder and store them within a list.

In [1897]:
categories_1hot = cat_encoder.categories_
categories_1hot_list = []
for i in range(len(categories_1hot)):
    for j in range(len(categories_1hot[i])):
        categories_1hot_list.append(categories_1hot[i][j])
    
categories_1hot_list

['female',
 'male',
 'C',
 'Q',
 'S',
 'A-Deck',
 'B-Deck',
 'C-Deck',
 'D-Deck',
 'E-Deck',
 'F-Deck',
 'G-Deck',
 'MIS_DECK',
 'ImpPers',
 'Miss\\Ms',
 'Mr\\Master']

Delete the categorical attributes from X-train. They will be replaced by the one-hot encoded values.

In [1898]:
X_train = X_train.drop(['Sex', 'Embarked', 'Deck', 'Title'], axis = 1)
X_test = X_test.drop(['Sex', 'Embarked', 'Deck', 'Title'], axis = 1)
X_train.head()

Unnamed: 0,Pclass,Age,Fare,Family
0,3,22.0,7.25,1.0
1,1,38.0,71.2833,1.0
2,3,26.0,7.925,0.0
3,1,35.0,53.1,1.0
4,3,35.0,8.05,0.0


Convert the array of one-hot encoder into a pandas DataFrame.

In [1899]:
X_train_1hot = pd.DataFrame(X_train_1hot, columns=categories_1hot_list)
X_test_1hot = pd.DataFrame(X_test_1hot, columns=categories_1hot_list)
X_train_1hot.head(10)

Unnamed: 0,female,male,C,Q,S,A-Deck,B-Deck,C-Deck,D-Deck,E-Deck,F-Deck,G-Deck,MIS_DECK,ImpPers,Miss\Ms,Mr\Master
0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
5,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
6,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
7,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
8,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
9,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [1900]:
X_train_1hot.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 16 columns):
female       891 non-null float64
male         891 non-null float64
C            891 non-null float64
Q            891 non-null float64
S            891 non-null float64
A-Deck       891 non-null float64
B-Deck       891 non-null float64
C-Deck       891 non-null float64
D-Deck       891 non-null float64
E-Deck       891 non-null float64
F-Deck       891 non-null float64
G-Deck       891 non-null float64
MIS_DECK     891 non-null float64
ImpPers      891 non-null float64
Miss\Ms      891 non-null float64
Mr\Master    891 non-null float64
dtypes: float64(16)
memory usage: 111.5 KB


Concatenate the one-hot encoded categories within `X_train_1hot` to `X_train` 

In [1901]:
X_train = pd.concat([X_train, X_train_1hot], axis=1)
X_test = pd.concat([X_test, X_test_1hot], axis=1)
X_train.head()

Unnamed: 0,Pclass,Age,Fare,Family,female,male,C,Q,S,A-Deck,B-Deck,C-Deck,D-Deck,E-Deck,F-Deck,G-Deck,MIS_DECK,ImpPers,Miss\Ms,Mr\Master
0,3,22.0,7.25,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,1,38.0,71.2833,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,3,26.0,7.925,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,1,35.0,53.1,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,3,35.0,8.05,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [1902]:
X_test.head()

Unnamed: 0,Pclass,Age,Fare,Family,female,male,C,Q,S,A-Deck,B-Deck,C-Deck,D-Deck,E-Deck,F-Deck,G-Deck,MIS_DECK,ImpPers,Miss\Ms,Mr\Master
0,3,34.5,7.8292,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,3,47.0,7.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,2,62.0,9.6875,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,3,27.0,8.6625,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,3,22.0,12.2875,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [1903]:
classes_cat_train = X_train['Pclass']
classes_cat_test = X_test['Pclass']
classes_cat_train = classes_cat_train.as_matrix()
classes_cat_test = classes_cat_test.as_matrix()

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [1904]:
classes_cat_train = classes_cat_train.reshape(-1, 1) 
classes_cat_test = classes_cat_test.reshape(-1, 1)

classes_cat_train = cat_encoder.fit_transform(classes_cat_train)
classes_cat_test = cat_encoder.fit_transform(classes_cat_test)

In [1905]:
classes_cat_1hot_train = pd.DataFrame(classes_cat_train, columns=['class1', 'class2', 'class3'])
classes_cat_1hot_test = pd.DataFrame(classes_cat_test, columns=['class1', 'class2', 'class3'])
classes_cat_1hot_train.head(10)

Unnamed: 0,class1,class2,class3
0,0.0,0.0,1.0
1,1.0,0.0,0.0
2,0.0,0.0,1.0
3,1.0,0.0,0.0
4,0.0,0.0,1.0
5,0.0,0.0,1.0
6,1.0,0.0,0.0
7,0.0,0.0,1.0
8,0.0,0.0,1.0
9,0.0,1.0,0.0


In [1906]:
classes_cat_1hot_test.head()

Unnamed: 0,class1,class2,class3
0,0.0,0.0,1.0
1,0.0,0.0,1.0
2,0.0,1.0,0.0
3,0.0,0.0,1.0
4,0.0,0.0,1.0


In [1907]:
X_train.head()

Unnamed: 0,Pclass,Age,Fare,Family,female,male,C,Q,S,A-Deck,B-Deck,C-Deck,D-Deck,E-Deck,F-Deck,G-Deck,MIS_DECK,ImpPers,Miss\Ms,Mr\Master
0,3,22.0,7.25,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,1,38.0,71.2833,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,3,26.0,7.925,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,1,35.0,53.1,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,3,35.0,8.05,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [1908]:
X_train.drop('Pclass', axis=1, inplace=True)
X_test.drop('Pclass', axis=1, inplace=True)
X_train.head()

Unnamed: 0,Age,Fare,Family,female,male,C,Q,S,A-Deck,B-Deck,C-Deck,D-Deck,E-Deck,F-Deck,G-Deck,MIS_DECK,ImpPers,Miss\Ms,Mr\Master
0,22.0,7.25,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,38.0,71.2833,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,26.0,7.925,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,35.0,53.1,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,35.0,8.05,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [1909]:
X_train = pd.concat([X_train, classes_cat_1hot_train], axis=1)
X_test = pd.concat([X_test, classes_cat_1hot_test], axis=1)

X_train.head()

Unnamed: 0,Age,Fare,Family,female,male,C,Q,S,A-Deck,B-Deck,...,E-Deck,F-Deck,G-Deck,MIS_DECK,ImpPers,Miss\Ms,Mr\Master,class1,class2,class3
0,22.0,7.25,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
1,38.0,71.2833,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,26.0,7.925,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
3,35.0,53.1,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4,35.0,8.05,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0


In [1910]:
X_test.head()

Unnamed: 0,Age,Fare,Family,female,male,C,Q,S,A-Deck,B-Deck,...,E-Deck,F-Deck,G-Deck,MIS_DECK,ImpPers,Miss\Ms,Mr\Master,class1,class2,class3
0,34.5,7.8292,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
1,47.0,7.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
2,62.0,9.6875,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
3,27.0,8.6625,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
4,22.0,12.2875,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0


In [1911]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 22 columns):
Age          891 non-null float64
Fare         891 non-null float64
Family       891 non-null float64
female       891 non-null float64
male         891 non-null float64
C            891 non-null float64
Q            891 non-null float64
S            891 non-null float64
A-Deck       891 non-null float64
B-Deck       891 non-null float64
C-Deck       891 non-null float64
D-Deck       891 non-null float64
E-Deck       891 non-null float64
F-Deck       891 non-null float64
G-Deck       891 non-null float64
MIS_DECK     891 non-null float64
ImpPers      891 non-null float64
Miss\Ms      891 non-null float64
Mr\Master    891 non-null float64
class1       891 non-null float64
class2       891 non-null float64
class3       891 non-null float64
dtypes: float64(22)
memory usage: 153.2 KB


In [1912]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 22 columns):
Age          418 non-null float64
Fare         418 non-null float64
Family       418 non-null float64
female       418 non-null float64
male         418 non-null float64
C            418 non-null float64
Q            418 non-null float64
S            418 non-null float64
A-Deck       418 non-null float64
B-Deck       418 non-null float64
C-Deck       418 non-null float64
D-Deck       418 non-null float64
E-Deck       418 non-null float64
F-Deck       418 non-null float64
G-Deck       418 non-null float64
MIS_DECK     418 non-null float64
ImpPers      418 non-null float64
Miss\Ms      418 non-null float64
Mr\Master    418 non-null float64
class1       418 non-null float64
class2       418 non-null float64
class3       418 non-null float64
dtypes: float64(22)
memory usage: 71.9 KB


Use StandardScaler from sklearn to scale the attribute values.
StandardScaler uses the method:
$$ x_j^{(i)} = \frac{x_j^{(i)} - \mu_j}{s_j} \\ $$

Again $x_j^{(i)}$ is the value of attribute $j$ of the $i's$ example. $\mu_j$ is the mean of attribute $j$ and $s_j$ is the standard deviation of attribute $j$.

In [1913]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

X_train_scaled

array([[-0.53489116, -0.50244517,  1.2316449 , ..., -0.56568542,
        -0.51015154,  0.90258736],
       [ 0.66839176,  0.78684529,  1.2316449 , ...,  1.76776695,
        -0.51015154, -1.10792599],
       [-0.23407043, -0.48885426, -0.81192233, ..., -0.56568542,
        -0.51015154,  0.90258736],
       ...,
       [-0.57249375, -0.17626324,  1.2316449 , ..., -0.56568542,
        -0.51015154,  0.90258736],
       [-0.23407043, -0.04438104, -0.81192233, ...,  1.76776695,
        -0.51015154, -1.10792599],
       [ 0.21716066, -0.49237783, -0.81192233, ..., -0.56568542,
        -0.51015154,  0.90258736]])

**Now the training and testing data has been prepared for machine learning!**

* The data has been loaded.
* The data has been studied. 
* The correlation between the numerical attributed has been performed.
* The "label" have been separated from the training data.
* We have checked for and handled missing (numerical) values within the attributes.
* Categorical attributes have been preprocessed by one-hot encoding.
* Scale the date to fit into the machine learning algorithms

**Results of the preprocessing steps:**
* `X_train`: the prepared training data
* `y_train`: the labeled data for training

## Train and evaluate different classification models
Use [`GridSearchCV`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html#sklearn.model_selection.GridSearchCV) for model evaluation.

In [1914]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

**Logistic Regression model:** [`LogisticRegression`](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)

In [1915]:
from sklearn.linear_model import LogisticRegression

parameters = {'solver':('newton-cg', 'lbfgs', 'liblinear', 'sag'), 'C':[0.1, 1, 10], 'max_iter':[100, 1000]}
log_reg = LogisticRegression(random_state=0, max_iter=100)
grid_log_reg = GridSearchCV(log_reg, parameters, cv=5, return_train_score=False)
grid_log_reg.fit(X_train_scaled, y_train)

grid_log_results = grid_log_reg.cv_results_
print('Best estimator:', grid_log_reg.best_estimator_)
print('Best score:', grid_log_reg.best_score_)
#pd.DataFrame(grid_log_results)



Best estimator: LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=0, solver='newton-cg',
          tol=0.0001, verbose=0, warm_start=False)
Best score: 0.7968574635241302


**SVC:**[`SVC`](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC)

In [1916]:
from sklearn.svm import SVC, LinearSVC

parameters = {'kernel':('linear', 'poly', 'rbf', 'sigmoid'), 'C':[0.1, 1, 2, 3, 4, 5, 6, 7, 8, 10],}
svc_clas = SVC(gamma='scale')
grid_svc = GridSearchCV(svc_clas, parameters, cv=5, return_train_score=False)
grid_svc.fit(X_train_scaled, y_train)

grid_svc_results = grid_svc.cv_results_
print('Best estimator:', grid_svc.best_estimator_)
print('Best score:', grid_svc.best_score_)

Best estimator: SVC(C=6, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='poly',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
Best score: 0.8181818181818182


**LinearSVC:**[`LinearSVC`](https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html#sklearn.svm.LinearSVC)

In [1917]:
parameters = {'C':[0.1, 1, 2, 3, 4, 5, 6, 7, 8, 10],}
lsvc_clas = LinearSVC(max_iter=100000)
grid_lsvc = GridSearchCV(lsvc_clas, parameters, cv=5, return_train_score=False)
grid_lsvc.fit(X_train_scaled, y_train)

grid_lsvc_results = grid_lsvc.cv_results_
print('Best estimator:', grid_lsvc.best_estimator_)
print('Best score:', grid_lsvc.best_score_)

Best estimator: LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=100000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
Best score: 0.792368125701459


**SGDClassifier:**[`SGDClassifier`](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html#sklearn.linear_model.SGDClassifier)

In [1918]:
from sklearn.linear_model import SGDClassifier

parameters = {}

sgd_clas = SGDClassifier(max_iter=1000, tol=1e-3)
grid_sgd = GridSearchCV(sgd_clas, parameters, cv=5, return_train_score=False)
grid_sgd.fit(X_train_scaled, y_train)

grid_sgd_results = grid_sgd.cv_results_
print('Best estimator:', grid_sgd.best_estimator_)
print('Best score:', grid_sgd.best_score_)

Best estimator: SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=1000,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
       power_t=0.5, random_state=None, shuffle=True, tol=0.001,
       validation_fraction=0.1, verbose=0, warm_start=False)
Best score: 0.6734006734006734




**DecisionTreeClassifier:** [`DecisionTreeClassifier`](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier)

In [1919]:
from sklearn.tree import DecisionTreeClassifier

tree_clas = DecisionTreeClassifier()

parameters = {'min_samples_split':[2, 4, 6, 8, 10, 20, 30, 35, 40, 45], 'min_samples_leaf':[1, 2, 3, 4, 6, 8]}
grid_tree = GridSearchCV(tree_clas, parameters, cv=5, return_train_score=False)
grid_tree.fit(X_train_scaled, y_train)

grid_tree_results = grid_tree.cv_results_
print('Best estimator:', grid_tree.best_estimator_)
print('Best score:', grid_tree.best_score_)

Best estimator: DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=6, min_samples_split=4,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
Best score: 0.8294051627384961


**RandomForestClassifier:**[`RandomForestClassifier`](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier)

In [1920]:
from sklearn.ensemble import RandomForestClassifier

forest_clas = RandomForestClassifier(n_estimators=100)

parameters = {'min_samples_split':[2, 4, 6, 8, 10, 20, 30, 35, 40, 45], 'min_samples_leaf':[1, 2, 3, 4, 6, 8]}
grid_forest = GridSearchCV(forest_clas, parameters, cv=5, return_train_score=False)
grid_forest.fit(X_train_scaled, y_train)

grid_forest_results = grid_forest.cv_results_
print('Best estimator:', grid_forest.best_estimator_)
print('Best score:', grid_forest.best_score_)

Best estimator: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=4,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Best score: 0.8204264870931538


**KNeighborsClassifier:**[`KNeighborsClassifier`](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html#sklearn.neighbors.KNeighborsClassifier)

In [1921]:
from sklearn.neighbors import KNeighborsClassifier

neighbor_clas = KNeighborsClassifier()

parameters = {'algorithm':('auto', 'ball_tree', 'kd_tree', 'brute')}
grid_neighbor = GridSearchCV(neighbor_clas, parameters, cv=5, return_train_score=False)
grid_neighbor.fit(X_train_scaled, y_train)

grid_neighbor_results = grid_neighbor.cv_results_
print('Best estimator:', grid_neighbor.best_estimator_)
print('Best score:', grid_neighbor.best_score_)

Best estimator: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')
Best score: 0.8035914702581369


**GaussianNB:**[`GaussianNB`](https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html#sklearn.naive_bayes.GaussianNB)

In [1922]:
from sklearn.naive_bayes import GaussianNB

gauss_clas = GaussianNB()

parameters = {}
grid_gauss = GridSearchCV(gauss_clas, parameters, cv=5, return_train_score=False)
grid_gauss.fit(X_train_scaled, y_train)

grid_gauss_results = grid_gauss.cv_results_
print('Best estimator:', grid_gauss.best_estimator_)
print('Best score:', grid_gauss.best_score_)

Best estimator: GaussianNB(priors=None, var_smoothing=1e-09)
Best score: 0.7485970819304153


**Perceptron:**[`Perceptron`](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Perceptron.html#sklearn.linear_model.Perceptron)

In [1923]:
from sklearn.linear_model import Perceptron

perceptron_clas = Perceptron(max_iter=1000, tol=1e-3)

parameters = {'penalty':(None, 'l2', 'l1', 'elasticnet'), 
              'alpha':[1, 0.3, 0.1, 0.03, 0.01, 0.003, 0.001, 0.0003, 0.0001], 
              'max_iter':[100, 300, 1000]}
grid_perceptron = GridSearchCV(perceptron_clas, parameters, cv=5, return_train_score=False)
grid_perceptron.fit(X_train_scaled, y_train)

grid_perceptron_results = grid_perceptron.cv_results_
print('Best estimator:', grid_perceptron.best_estimator_)
print('Best score:', grid_perceptron.best_score_)

Best estimator: Perceptron(alpha=1, class_weight=None, early_stopping=False, eta0=1.0,
      fit_intercept=True, max_iter=100, n_iter=None, n_iter_no_change=5,
      n_jobs=None, penalty=None, random_state=0, shuffle=True, tol=0.001,
      validation_fraction=0.1, verbose=0, warm_start=False)
Best score: 0.7609427609427609




**MLPClassifier:**[`MLPClassifier`](https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html#sklearn.neural_network.MLPClassifier)

In [1924]:
from sklearn.neural_network import MLPClassifier
activateNN = False

nn_clas = MLPClassifier(hidden_layer_sizes=(200, 100, 50), max_iter=1000, random_state=1)

parameters = {'activation':('identity', 'logistic', 'tanh', 'relu'), 
              'alpha':[1, 0.3, 0.1, 0.03, 0.01, 0.003, 0.001, 0.0003, 0.0001], 
              'solver':('lbfgs', 'sgd', 'adam')}
grid_nn = GridSearchCV(nn_clas, parameters, cv=5, return_train_score=False)

if activateNN == True:
    grid_nn.fit(X_train_scaled, y_train)

    grid_nn_results = grid_nn.cv_results_
    print('Best estimator:', grid_nn.best_estimator_)
    print('Best score:', grid_nn.best_score_)

In [1925]:
from sklearn.neural_network import MLPClassifier

nn_clas = MLPClassifier(hidden_layer_sizes=(200, 100, 50), max_iter=1000, random_state=1)

parameters = {'activation':['tanh'], 
              'alpha':[0.1], 
              'solver':['sgd']}
grid_nn = GridSearchCV(nn_clas, parameters, cv=5, return_train_score=False)
grid_nn.fit(X_train_scaled, y_train)

grid_nn_results = grid_nn.cv_results_
print('Best estimator:', grid_nn.best_estimator_)
print('Best score:', grid_nn.best_score_)

Best estimator: MLPClassifier(activation='tanh', alpha=0.1, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(200, 100, 50), learning_rate='constant',
       learning_rate_init=0.001, max_iter=1000, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=1, shuffle=True, solver='sgd', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)
Best score: 0.8215488215488216


In [1926]:
tree_clas = DecisionTreeClassifier(min_samples_leaf=6, min_samples_split=8)
tree_clas.fit(X_train_scaled, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=6, min_samples_split=8,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [1927]:
pred = tree_clas.predict(X_test_scaled)
pred

array([0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1,
       1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,

In [1928]:
submission = pd.DataFrame({'PassengerId' : index , 'Survived': pred})
submission.to_csv('submit.csv', index = False)
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
