# kaggle Titanic competition

In [274]:
# import required packages
import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib as mpl
from matplotlib import pyplot as plt
import seaborn as sns

Load the training and testing data.

In [275]:
# Read the train and test CSV files; the paths are relative to the current
# working directory.
train_data = pd.read_csv("data/train.csv")
test_data = pd.read_csv("data/test.csv")
# Disabled: gender_submission.csv is not loaded by this notebook.
#gender_data = pd.read_csv("data/gender_submission.csv")

### Get insights into the training data
Use `train_data.head(10)` for a tabular overview of the training data.

In [276]:
train_data.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


Use `train_data.info()` to get further insight into the training data.

In [277]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


Display an overview of the missing values.

In [278]:
def displayMissingValues(data):
    """Print the number of missing (null) entries for every column of ``data``.

    One line of the form ``Missing values within <column>: <count>`` is
    written to standard output per column, in column order. Returns None.
    """
    # Count the nulls of all columns in one pass, then report them one by one.
    missing_per_column = data.isnull().sum()
    for attribute, n_missing in missing_per_column.items():
        print('Missing values within {}: {}'.format(attribute, n_missing))

In [279]:
displayMissingValues(train_data)

Missing values within PassengerId: 0
Missing values within Survived: 0
Missing values within Pclass: 0
Missing values within Name: 0
Missing values within Sex: 0
Missing values within Age: 177
Missing values within SibSp: 0
Missing values within Parch: 0
Missing values within Ticket: 0
Missing values within Fare: 0
Missing values within Cabin: 687
Missing values within Embarked: 2


`train_data.info()` and `displayMissingValues` reveal some missing data within the columns `Age`, `Cabin` and `Embarked`.
<br><br>
In general there are four possibilities to **handle missing values**:
* delete the entries with the missing values
* delete the whole attribute with the missing values
* fill the missing values with the median value of that attribute
* try to obtain the missing values afterwards from another source

Furthermore `train_data.info()` reveals that the attributes have three data types: 
* `int64` -> which is okay for further processing.
* `float64` -> which is okay for further processing.
* `object` -> these attributes have to be adapted for further processing.
<br>

First let's have a look at these object attributes and their importance for the upcoming machine-learning process:
* `Name` is an individual value with no valuable information by itself, but the title it contains may carry valuable information for machine learning, so it will undergo further preprocessing.
* `Sex` is important for further processing. We will use one-hot-encoding to make this attribute processable.
* `Ticket` is an individual number with no valuable information and therefore not important for further processing.
* `Cabin` is a valuable information source but there are just 204 out of 891 values available. 
* `Embarked` can be interesting for further processing. We will use one-hot-encoding to make this attribute processable.

Use `train_data.describe()` to get an insight of the numerical attributes especially their variance.

In [280]:
train_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


There are two attributes that should undergo additional preprocessing:
* `Age` with a standard deviation of 14.53 and a max / min difference of 79.58.
* `Fare` with a standard deviation of 49.69 and a max / min difference of about 512.
<br>

Both attributes will undergo a standardization according to the following formula:

$$ x_j^{(i)} = \frac{x_j^{(i)} - \mu_j}{s_j} \\ $$

Here $x_j^{(i)}$ is the value of attribute $j$ of the $i$-th example. $\mu_j$ is the mean of attribute $j$ and $s_j$ is the standard deviation of attribute $j$.

Check the correlation between the numerical values:

In [281]:
# Correlate the numeric columns only. Selecting them explicitly keeps this
# working on recent pandas releases, where DataFrame.corr() no longer silently
# skips text columns such as Name or Ticket.
train_data.select_dtypes(include=['number']).corr()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
PassengerId,1.0,-0.005007,-0.035144,0.036847,-0.057527,-0.001652,0.012658
Survived,-0.005007,1.0,-0.338481,-0.077221,-0.035322,0.081629,0.257307
Pclass,-0.035144,-0.338481,1.0,-0.369226,0.083081,0.018443,-0.5495
Age,0.036847,-0.077221,-0.369226,1.0,-0.308247,-0.189119,0.096067
SibSp,-0.057527,-0.035322,0.083081,-0.308247,1.0,0.414838,0.159651
Parch,-0.001652,0.081629,0.018443,-0.189119,0.414838,1.0,0.216225
Fare,0.012658,0.257307,-0.5495,0.096067,0.159651,0.216225,1.0


In [282]:
train_data.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


### Prepare the training data

##### Survived - attribute
Drop the label attribute `Survived` from the training data set and move it into the label set `y_train`.

In [283]:
# Split off the target: X_train holds every column except Survived, y_train is
# an independent copy of the Survived column. train_data itself is unchanged.
X_train = train_data.drop(columns=['Survived'])
y_train = train_data['Survived'].copy()
X_train.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


##### Embarked - attribute
Let's have a closer look at the entries with the missing `Embarked` values.

In [284]:
X_train.loc[X_train['Embarked'].isnull()]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
61,62,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,
829,830,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,


Miss Amelie Icard and Mrs. George Nelson (Martha Evelyn) Stone had the same ticket number and lived in the same cabin. 
Searching for Mrs. George Nelson (Martha Evelyn) Stone on Google led to the following link [encyclopedia-titanica.org](https://www.encyclopedia-titanica.org/titanic-survivor/martha-evelyn-stone.html). 
<br> 

Here you can find the following information:"Mrs Stone boarded the Titanic in **Southampton** on 10 April 1912 and was travelling in first class with her maid Amelie Icard. She occupied cabin B-28."
<br>

Now we know that both women embarked in Southampton, which is abbreviated with an `S`. Let's fill this in:

In [285]:
# Every passenger without an Embarked value is assigned 'S'.
X_train['Embarked'] = X_train['Embarked'].fillna('S')

Check the insertion

In [286]:
X_train.loc[X_train.PassengerId == 62]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
61,62,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,S


In [287]:
displayMissingValues(X_train)

Missing values within PassengerId: 0
Missing values within Pclass: 0
Missing values within Name: 0
Missing values within Sex: 0
Missing values within Age: 177
Missing values within SibSp: 0
Missing values within Parch: 0
Missing values within Ticket: 0
Missing values within Fare: 0
Missing values within Cabin: 687
Missing values within Embarked: 0


##### PassengerId & Ticket - attribute
Remove the attributes `PassengerId` and `Ticket` from the training set.

In [288]:
# Remove the two identifier-like columns from the feature frame.
X_train = X_train.drop(columns=['PassengerId', 'Ticket'])
X_train.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C85,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,C123,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,,S


In [289]:
displayMissingValues(X_train)

Missing values within Pclass: 0
Missing values within Name: 0
Missing values within Sex: 0
Missing values within Age: 177
Missing values within SibSp: 0
Missing values within Parch: 0
Missing values within Fare: 0
Missing values within Cabin: 687
Missing values within Embarked: 0


##### Age - attribute
###### Replace missing values (by hand)

Missing values could also be replaced with the [`sklearn.impute.SimpleImputer`](https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html) class.
<br>
We replace the missing values here by hand because we want to distinguish between the classes and within the classes between the gender. 

In [290]:
# Keep the first ten rows whose Age is missing; they are used again after the
# imputation to verify the filled-in values.
sample_age_incomplete_rows = X_train.loc[X_train['Age'].isnull()].iloc[:10]
sample_age_incomplete_rows

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
5,3,"Moran, Mr. James",male,,0,0,8.4583,,Q
17,2,"Williams, Mr. Charles Eugene",male,,0,0,13.0,,S
19,3,"Masselmani, Mrs. Fatima",female,,0,0,7.225,,C
26,3,"Emir, Mr. Farred Chehab",male,,0,0,7.225,,C
28,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,7.8792,,Q
29,3,"Todoroff, Mr. Lalio",male,,0,0,7.8958,,S
31,1,"Spencer, Mrs. William Augustus (Marie Eugenie)",female,,1,0,146.5208,B78,C
32,3,"Glynn, Miss. Mary Agatha",female,,0,0,7.75,,Q
36,3,"Mamee, Mr. Hanna",male,,0,0,7.2292,,C
42,3,"Kraeff, Mr. Theodor",male,,0,0,7.8958,,C


Calculate the median values depending on the class and the gender.

In [291]:
# medianAges[row, col]: median Age per group, with row 0 = female and
# row 1 = male, and col 0..2 = passenger class 1..3.
medianAges = np.zeros((2, 3))

for row, sex in enumerate(('female', 'male')):
    for col, pclass in enumerate((1, 2, 3)):
        in_group = (X_train.Pclass == pclass) & (X_train.Sex == sex)
        medianAges[row, col] = X_train.loc[in_group]['Age'].median()

medianAges

array([[35. , 28. , 21.5],
       [40. , 30. , 25. ]])

Add the missing `Age` values to X_train

In [292]:
def addMissingAgeValues(dataset, medianAges):
    """Fill the missing ``Age`` values of ``dataset`` in place with group medians.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with ``Pclass``, ``Sex`` and ``Age`` columns. It is modified in
        place.
    medianAges : numpy.ndarray
        Median ages indexed as ``[sex, class]``: row 0 is used for female
        passengers, any other row for male passengers; column ``j`` belongs to
        passenger class ``j + 1``.

    Returns
    -------
    None
    """
    for i in range(medianAges.shape[0]):
        sex = 'female' if i == 0 else 'male'
        for j in range(medianAges.shape[1]):
            needs_fill = (
                (dataset['Pclass'] == j + 1)
                & dataset['Age'].isnull()
                & (dataset['Sex'] == sex)
            )
            # Write to the frame that was passed in, not to a notebook global,
            # so the function also works for other frames (e.g. the test set).
            dataset.loc[needs_fill, 'Age'] = medianAges[i, j]

In [293]:
addMissingAgeValues(X_train, medianAges)

Check if the replacement was successful.

In [294]:
X_train.loc[sample_age_incomplete_rows.index.values]

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
5,3,"Moran, Mr. James",male,25.0,0,0,8.4583,,Q
17,2,"Williams, Mr. Charles Eugene",male,30.0,0,0,13.0,,S
19,3,"Masselmani, Mrs. Fatima",female,21.5,0,0,7.225,,C
26,3,"Emir, Mr. Farred Chehab",male,25.0,0,0,7.225,,C
28,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,21.5,0,0,7.8792,,Q
29,3,"Todoroff, Mr. Lalio",male,25.0,0,0,7.8958,,S
31,1,"Spencer, Mrs. William Augustus (Marie Eugenie)",female,35.0,1,0,146.5208,B78,C
32,3,"Glynn, Miss. Mary Agatha",female,21.5,0,0,7.75,,Q
36,3,"Mamee, Mr. Hanna",male,25.0,0,0,7.2292,,C
42,3,"Kraeff, Mr. Theodor",male,25.0,0,0,7.8958,,C


##### Cabin - attribute
Replace the cabin attribute by the more general deck attribute, which groups many cabins together. The cabin decks of the Titanic were lettered A to G (seven decks); A was the uppermost one and G was the lowermost one. The deck assignment is taken from the first letter of the cabin. If the cabin has no value, then we substitute it by MISSING.

In [295]:
# Deck = first character of the cabin label; passengers without a cabin entry
# get the placeholder 'MISSING'.
X_train['Deck'] = X_train['Cabin'].str[0].fillna('MISSING')

Check the assigned values

In [296]:
# Collect the distinct deck labels and show them in sorted order.
a = X_train['Deck'].unique()
np.sort(a)

array(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'MISSING', 'T'], dtype=object)

In [297]:
X_train['Deck'].value_counts()

MISSING    687
C           59
B           47
D           33
E           32
A           15
F           13
G            4
T            1
Name: Deck, dtype: int64

There is a T Deck which is not a correct Deck! Check where this comes from.

In [298]:
X_train[X_train.Deck == 'T']

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Deck
339,1,"Blackwell, Mr. Stephen Weart",male,45.0,0,0,35.5,T,S,T


A Search on google led to the following result again from [encyclopedia-titanica.org](https://www.encyclopedia-titanica.org/titanic-victim/stephen-weart-blackwell.html): Blackwell occupied cabin "T" on the **Boat Deck** (ticket number 113784 which cost £35, 10s). And from [wikipedia](https://en.wikipedia.org/wiki/RMS_Titanic) you get the following information: "The Boat Deck, on which the lifeboats were housed..." Thus we know, that cabin T is located closest to deck A and therefore we replace T by A.  

In [302]:
# Assign the result back to the column instead of calling
# replace(..., inplace=True) on the attribute-accessed Series: that chained
# in-place form is deprecated in recent pandas and leaves X_train unchanged
# when Copy-on-Write is enabled.
X_train['Deck'] = X_train['Deck'].replace('T', 'A')
X_train['Deck'].value_counts()

MISSING    687
C           59
B           47
D           33
E           32
A           16
F           13
G            4
Name: Deck, dtype: int64

### Preprocessing categorical attributes
Extract the categorical attributes `Sex` and `Embarked` and store them within cat_attributes. 

In [26]:
# The two text columns that are one-hot encoded in the next step.
cat_attributes = X_train.loc[:, ['Sex', 'Embarked']]
cat_attributes.head()

Unnamed: 0,Sex,Embarked
0,male,S
1,female,C
2,female,S
3,female,S
4,male,S


#### One-Hot-Encoder
Use [`OneHotEncoder`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html) to transform the categorical values of the attributes `Sex` and `Embarked` into numerical values. The result is a matrix with $n$ columns representing the $n$ different categorical values. In this case $n = 5$. We do not want a sparse matrix but an array, therefore we use `sparse=False`.

In [27]:
from sklearn.preprocessing import OneHotEncoder

# Ask for a dense array rather than a sparse matrix. The keyword for this was
# renamed from `sparse` to `sparse_output` in scikit-learn 1.2 and the old
# name was removed later, so try the new spelling first and fall back to the
# old one on releases that do not know it yet.
try:
    cat_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    cat_encoder = OneHotEncoder(sparse=False)
X_train_1hot = cat_encoder.fit_transform(cat_attributes)
X_train_1hot

array([[0., 1., 0., 0., 1.],
       [1., 0., 1., 0., 0.],
       [1., 0., 0., 0., 1.],
       ...,
       [1., 0., 0., 0., 1.],
       [0., 1., 1., 0., 0.],
       [0., 1., 0., 1., 0.]])

Get the category names out of the cat_encoder and store them within a list.

In [28]:
# Flatten the per-attribute category arrays of the fitted encoder into one
# list, keeping the encoder's column order.
categories_1hot = cat_encoder.categories_
categories_1hot_list = [label for group in categories_1hot for label in group]

categories_1hot_list

['female', 'male', 'C', 'Q', 'S']

Delete the categorical attributes from `X_train`. They will be replaced by the one-hot encoded values.

In [29]:
# Sex and Embarked are replaced by their one-hot encoded columns. Name, Cabin
# and Deck are text columns as well; if they stayed in X_train the
# StandardScaler step further down would fail on the strings, so they are
# removed here too. Deck is not encoded by any later cell visible here -
# encode it before this point if it should be kept as a feature.
X_train = X_train.drop(columns=['Sex', 'Embarked', 'Name', 'Cabin', 'Deck'])
X_train.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
0,3,22.0,1,0,7.25
1,1,38.0,1,0,71.2833
2,3,26.0,0,0,7.925
3,1,35.0,1,0,53.1
4,3,35.0,0,0,8.05


Convert the array of one-hot encoder into a pandas DataFrame.

In [30]:
# Reuse the row index of the frame the encoding was computed from. The
# column-wise concat with X_train aligns rows on the index, so a fresh 0..n-1
# index would misalign rows whenever X_train's index is not exactly 0..n-1.
X_train_1hot = pd.DataFrame(
    X_train_1hot,
    columns=categories_1hot_list,
    index=cat_attributes.index,
)
X_train_1hot.head(10)

Unnamed: 0,female,male,C,Q,S
0,0.0,1.0,0.0,0.0,1.0
1,1.0,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,1.0
4,0.0,1.0,0.0,0.0,1.0
5,0.0,1.0,0.0,1.0,0.0
6,0.0,1.0,0.0,0.0,1.0
7,0.0,1.0,0.0,0.0,1.0
8,1.0,0.0,0.0,0.0,1.0
9,1.0,0.0,1.0,0.0,0.0


In [31]:
X_train_1hot.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 889 entries, 0 to 888
Data columns (total 5 columns):
female    889 non-null float64
male      889 non-null float64
C         889 non-null float64
Q         889 non-null float64
S         889 non-null float64
dtypes: float64(5)
memory usage: 34.8 KB


Concatenate the one-hot encoded categories within `X_train_1hot` to `X_train` 

In [32]:
# Append the one-hot columns to the right of the existing features (axis=1);
# pandas matches the rows of both frames by index label.
X_train = pd.concat([X_train, X_train_1hot], axis=1)
X_train.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,female,male,C,Q,S
0,3,22.0,1,0,7.25,0.0,1.0,0.0,0.0,1.0
1,1,38.0,1,0,71.2833,1.0,0.0,1.0,0.0,0.0
2,3,26.0,0,0,7.925,1.0,0.0,0.0,0.0,1.0
3,1,35.0,1,0,53.1,1.0,0.0,0.0,0.0,1.0
4,3,35.0,0,0,8.05,0.0,1.0,0.0,0.0,1.0


In [33]:
# Series.as_matrix() was deprecated and then removed in pandas 1.0;
# .values returns the same NumPy array and works on old and new releases.
classes_cat = X_train['Pclass'].values
type(classes_cat)

  


numpy.ndarray

In [34]:
# The encoder expects a 2-D input: one column, one row per passenger.
classes_cat = classes_cat.reshape(-1, 1)
# Use a dedicated encoder for Pclass. Refitting cat_encoder here would throw
# away the Sex/Embarked categories it learned, which are needed to encode any
# further data (e.g. the test set) the same way.
pclass_encoder = OneHotEncoder()
# The default output is a sparse matrix; convert it to a dense array.
classes_cat = pclass_encoder.fit_transform(classes_cat).toarray()

In [35]:
# Give the encoded class columns the same row index as X_train so that the
# later column-wise concat lines the rows up even if X_train's index is not a
# plain 0..n-1 range.
classes_cat_1hot = pd.DataFrame(
    classes_cat,
    columns=['class1', 'class2', 'class3'],
    index=X_train.index,
)
classes_cat_1hot.head(10)

Unnamed: 0,class1,class2,class3
0,0.0,0.0,1.0
1,1.0,0.0,0.0
2,0.0,0.0,1.0
3,1.0,0.0,0.0
4,0.0,0.0,1.0
5,0.0,0.0,1.0
6,1.0,0.0,0.0
7,0.0,0.0,1.0
8,0.0,0.0,1.0
9,0.0,1.0,0.0


In [36]:
X_train.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,female,male,C,Q,S
0,3,22.0,1,0,7.25,0.0,1.0,0.0,0.0,1.0
1,1,38.0,1,0,71.2833,1.0,0.0,1.0,0.0,0.0
2,3,26.0,0,0,7.925,1.0,0.0,0.0,0.0,1.0
3,1,35.0,1,0,53.1,1.0,0.0,0.0,0.0,1.0
4,3,35.0,0,0,8.05,0.0,1.0,0.0,0.0,1.0


In [37]:
# The integer class column is superseded by its one-hot encoded columns.
X_train = X_train.drop(columns=['Pclass'])
X_train.head()

Unnamed: 0,Age,SibSp,Parch,Fare,female,male,C,Q,S
0,22.0,1,0,7.25,0.0,1.0,0.0,0.0,1.0
1,38.0,1,0,71.2833,1.0,0.0,1.0,0.0,0.0
2,26.0,0,0,7.925,1.0,0.0,0.0,0.0,1.0
3,35.0,1,0,53.1,1.0,0.0,0.0,0.0,1.0
4,35.0,0,0,8.05,0.0,1.0,0.0,0.0,1.0


In [38]:
# Append the class1..class3 columns to the right of the existing features
# (axis=1); pandas matches the rows of both frames by index label.
X_train = pd.concat([X_train, classes_cat_1hot], axis=1)
X_train.head()

Unnamed: 0,Age,SibSp,Parch,Fare,female,male,C,Q,S,class1,class2,class3
0,22.0,1,0,7.25,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
1,38.0,1,0,71.2833,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,26.0,0,0,7.925,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,35.0,1,0,53.1,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
4,35.0,0,0,8.05,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0


In [39]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 889 entries, 0 to 888
Data columns (total 12 columns):
Age       889 non-null float64
SibSp     889 non-null int64
Parch     889 non-null int64
Fare      889 non-null float64
female    889 non-null float64
male      889 non-null float64
C         889 non-null float64
Q         889 non-null float64
S         889 non-null float64
class1    889 non-null float64
class2    889 non-null float64
class3    889 non-null float64
dtypes: float64(10), int64(2)
memory usage: 83.4 KB


Use StandardScaler from sklearn to scale the attribute values.
StandardScaler uses the method:
$$ x_j^{(i)} = \frac{x_j^{(i)} - \mu_j}{s_j} \\ $$

Again $x_j^{(i)}$ is the value of attribute $j$ of the $i$-th example. $\mu_j$ is the mean of attribute $j$ and $s_j$ is the standard deviation of attribute $j$.

In [40]:
from sklearn.preprocessing import StandardScaler
# Standardize every column of X_train and wrap the result in a DataFrame that
# keeps X_train's column names and row index.
# NOTE(review): df is not referenced by any later cell visible here; the next
# cell repeats the scaling into scaled_values - confirm whether this cell is
# still needed.
scale = StandardScaler()
df = pd.DataFrame(scale.fit_transform(X_train.values), columns=X_train.columns, index=X_train.index)
#df

In [41]:
from sklearn.preprocessing import StandardScaler

# Learn the per-column mean and standard deviation from X_train, then apply
# them to obtain the standardized feature matrix used for model training.
scaler = StandardScaler()
scaler.fit(X_train)
scaled_values = scaler.transform(X_train)
scaled_values.shape

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


(889, 12)

**Now the training data has been prepared for machine learning!**

* The data has been loaded.
* The data has been studied. 
* The correlation between the numerical attributes has been examined.
* The labels have been separated from the training data.
* We have checked for and handled missing (numerical) values within the attributes.
* Categorical attributes have been preprocessed by one-hot encoding.
* The data has been scaled to fit the machine-learning algorithms.

**Results of the preprocessing steps:**
* `X_train`: the prepared training data
* `y_train`: the labeled data for training

## Train and evaluate different classification models
Use [`GridSearchCV`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html#sklearn.model_selection.GridSearchCV) for model evaluation.

In [42]:
# NOTE(review): keeps only the first 7 feature columns and discards the rest
# before any model is trained. Going by the X_train preview shown earlier that
# would be Age..C, leaving out Q, S and class1-class3 - confirm this is
# intended and not a leftover experiment.
scaled_values = scaled_values[:, 0:7]

In [43]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

**Logistic Regression model:** [`LogisticRegression`](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)

In [44]:
from sklearn.linear_model import LogisticRegression

# Every solver / C / max_iter combination is scored with 5-fold cross-validation.
log_reg = LogisticRegression(random_state=0, max_iter=100)
parameters = {
    'solver': ('newton-cg', 'lbfgs', 'liblinear', 'sag'),
    'C': [0.1, 1, 10],
    'max_iter': [100, 1000],
}
grid_log_reg = GridSearchCV(estimator=log_reg, param_grid=parameters, cv=5, return_train_score=False)
grid_log_reg.fit(scaled_values, y_train)

grid_log_results = grid_log_reg.cv_results_
print('Best estimator:', grid_log_reg.best_estimator_)
print('Best score:', grid_log_reg.best_score_)
#pd.DataFrame(grid_log_results)

Best estimator: LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=0, solver='newton-cg',
          tol=0.0001, verbose=0, warm_start=False)
Best score: 0.7885264341957255


**SVC:**[`SVC`](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC)

In [45]:
from sklearn.svm import SVC, LinearSVC

# Compare four kernels at three regularisation strengths with 5-fold CV.
svc_clas = SVC(gamma='scale')
parameters = {
    'kernel': ('linear', 'poly', 'rbf', 'sigmoid'),
    'C': [0.1, 1, 10],
}
grid_svc = GridSearchCV(estimator=svc_clas, param_grid=parameters, cv=5, return_train_score=False)
grid_svc.fit(scaled_values, y_train)

grid_svc_results = grid_svc.cv_results_
print('Best estimator:', grid_svc.best_estimator_)
print('Best score:', grid_svc.best_score_)

Best estimator: SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
Best score: 0.8233970753655793


**LinearSVC:**[`LinearSVC`](https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html#sklearn.svm.LinearSVC)

In [46]:
# 5-fold grid search over the regularisation strength C of a linear SVM.
parameters = {'C': [0.1, 1, 10]}
# random_state pins the solver's internal shuffling so repeated runs give the
# same scores, as the LogisticRegression and MLP cells already do.
lsvc_clas = LinearSVC(max_iter=100000, random_state=0)
grid_lsvc = GridSearchCV(lsvc_clas, parameters, cv=5, return_train_score=False)
grid_lsvc.fit(scaled_values, y_train)

grid_lsvc_results = grid_lsvc.cv_results_
print('Best estimator:', grid_lsvc.best_estimator_)
print('Best score:', grid_lsvc.best_score_)

KeyboardInterrupt: 

**SGDClassifier:**[`SGDClassifier`](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html#sklearn.linear_model.SGDClassifier)

In [None]:
from sklearn.linear_model import SGDClassifier

# Empty grid: GridSearchCV is used only to get a 5-fold cross-validated score
# for this single configuration.
parameters = {}

# SGD shuffles the training data; random_state makes the score reproducible,
# as the LogisticRegression and MLP cells already do.
sgd_clas = SGDClassifier(max_iter=1000, tol=1e-3, random_state=0)
grid_sgd = GridSearchCV(sgd_clas, parameters, cv=5, return_train_score=False)
grid_sgd.fit(scaled_values, y_train)

grid_sgd_results = grid_sgd.cv_results_
print('Best estimator:', grid_sgd.best_estimator_)
print('Best score:', grid_sgd.best_score_)

**DecisionTreeClassifier:** [`DecisionTreeClassifier`](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# random_state fixes how ties between equally good splits are broken, so the
# grid search returns the same result on every run.
tree_clas = DecisionTreeClassifier(random_state=0)

# 10 x 6 combinations of the two split/leaf size limits, 5-fold CV each.
parameters = {
    'min_samples_split': [2, 4, 6, 8, 10, 20, 30, 35, 40, 45],
    'min_samples_leaf': [1, 2, 3, 4, 6, 8],
}
grid_tree = GridSearchCV(tree_clas, parameters, cv=5, return_train_score=False)
grid_tree.fit(scaled_values, y_train)

grid_tree_results = grid_tree.cv_results_
print('Best estimator:', grid_tree.best_estimator_)
print('Best score:', grid_tree.best_score_)

**RandomForestClassifier:**[`RandomForestClassifier`](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# A random forest draws bootstrap samples and random feature subsets;
# random_state makes the cross-validated scores reproducible.
forest_clas = RandomForestClassifier(n_estimators=100, random_state=0)

# 10 x 6 combinations of the two split/leaf size limits, 5-fold CV each.
parameters = {
    'min_samples_split': [2, 4, 6, 8, 10, 20, 30, 35, 40, 45],
    'min_samples_leaf': [1, 2, 3, 4, 6, 8],
}
grid_forest = GridSearchCV(forest_clas, parameters, cv=5, return_train_score=False)
grid_forest.fit(scaled_values, y_train)

grid_forest_results = grid_forest.cv_results_
print('Best estimator:', grid_forest.best_estimator_)
print('Best score:', grid_forest.best_score_)

**KNeighborsClassifier:**[`KNeighborsClassifier`](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html#sklearn.neighbors.KNeighborsClassifier)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Only the neighbour-search algorithm is varied; each option gets 5-fold CV.
parameters = {
    'algorithm': ('auto', 'ball_tree', 'kd_tree', 'brute'),
}
neighbor_clas = KNeighborsClassifier()
grid_neighbor = GridSearchCV(estimator=neighbor_clas, param_grid=parameters, cv=5, return_train_score=False)
grid_neighbor.fit(scaled_values, y_train)

grid_neighbor_results = grid_neighbor.cv_results_
print('Best estimator:', grid_neighbor.best_estimator_)
print('Best score:', grid_neighbor.best_score_)

**GaussianNB:**[`GaussianNB`](https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html#sklearn.naive_bayes.GaussianNB)

In [None]:
from sklearn.naive_bayes import GaussianNB

# Empty grid: the search evaluates the default model once with 5-fold CV.
parameters = {}
gauss_clas = GaussianNB()
grid_gauss = GridSearchCV(estimator=gauss_clas, param_grid=parameters, cv=5, return_train_score=False)
grid_gauss.fit(scaled_values, y_train)

grid_gauss_results = grid_gauss.cv_results_
print('Best estimator:', grid_gauss.best_estimator_)
print('Best score:', grid_gauss.best_score_)

**Perceptron:**[`Perceptron`](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Perceptron.html#sklearn.linear_model.Perceptron)

In [None]:
from sklearn.linear_model import Perceptron

# The perceptron shuffles the training data between epochs; random_state makes
# the cross-validated scores reproducible, as the LogisticRegression and MLP
# cells already do.
perceptron_clas = Perceptron(max_iter=1000, tol=1e-3, random_state=0)

# 4 penalties x 9 alphas x 3 iteration limits, 5-fold CV each.
parameters = {
    'penalty': (None, 'l2', 'l1', 'elasticnet'),
    'alpha': [1, 0.3, 0.1, 0.03, 0.01, 0.003, 0.001, 0.0003, 0.0001],
    'max_iter': [100, 300, 1000],
}
grid_perceptron = GridSearchCV(perceptron_clas, parameters, cv=5, return_train_score=False)
grid_perceptron.fit(scaled_values, y_train)

grid_perceptron_results = grid_perceptron.cv_results_
print('Best estimator:', grid_perceptron.best_estimator_)
print('Best score:', grid_perceptron.best_score_)

**MLPClassifier:**[`MLPClassifier`](https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html#sklearn.neural_network.MLPClassifier)

In [None]:
from sklearn.neural_network import MLPClassifier

# Three hidden layers (200, 100, 50 units); random_state pins the weight
# initialisation so runs are repeatable.
nn_clas = MLPClassifier(hidden_layer_sizes=(200, 100, 50), max_iter=1000, random_state=1)

# 4 activations x 9 alphas x 3 solvers = 108 candidates, each fitted on 5
# folds, so this cell trains 540 networks plus the final refit.
parameters = {'activation':('identity', 'logistic', 'tanh', 'relu'), 
              'alpha':[1, 0.3, 0.1, 0.03, 0.01, 0.003, 0.001, 0.0003, 0.0001], 
              'solver':('lbfgs', 'sgd', 'adam')}
grid_nn = GridSearchCV(nn_clas, parameters, cv=5, return_train_score=False)
grid_nn.fit(scaled_values, y_train)

grid_nn_results = grid_nn.cv_results_
print('Best estimator:', grid_nn.best_estimator_)
print('Best score:', grid_nn.best_score_)

In [None]:
from sklearn.neural_network import MLPClassifier

nn_clas = MLPClassifier(hidden_layer_sizes=(200, 100, 50), max_iter=1000, random_state=1)

# Single-candidate grid (relu / alpha=1 / sgd): the search only produces a
# 5-fold cross-validated score for this one configuration.
# Note that this rebinds nn_clas, grid_nn and grid_nn_results.
parameters = {'activation':['relu'], 
              'alpha':[1], 
              'solver':['sgd']}
grid_nn = GridSearchCV(nn_clas, parameters, cv=5, return_train_score=False)
grid_nn.fit(scaled_values, y_train)

grid_nn_results = grid_nn.cv_results_
print('Best estimator:', grid_nn.best_estimator_)
print('Best score:', grid_nn.best_score_)

In [None]:
from sklearn.neural_network import MLPClassifier

# Train one network directly (no grid search) on the full training matrix.
nn_clas = MLPClassifier(hidden_layer_sizes=(200, 100, 50), alpha=0.0001, max_iter=1000, random_state=1)
nn_clas.fit(scaled_values, y_train)

In [None]:
# Predictions are made for scaled_values / y_train, i.e. the training data
# itself, so this is the training accuracy rather than an estimate of how the
# model performs on unseen passengers.
y_pred = nn_clas.predict(scaled_values)
accuracy_score(y_train, y_pred)