# Kaggle Titanic competition

In [2465]:
# import required packages
import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib as mpl
from matplotlib import pyplot as plt

Load the training and testing data.

In [2466]:
# Load the three competition CSV files into DataFrames.
train_data, test_data, gender_data = (
    pd.read_csv(csv_path)
    for csv_path in ("data/train.csv", "data/test.csv", "data/gender_submission.csv")
)

### Get insights into the training data
Use `train_data.head(10)` for a tabular overview of the training data.

In [2467]:
# Show the first ten rows of the training data.
train_data.iloc[:10]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


Use `train_data.info()` to get further insight into the training data.

In [2468]:
# Print per-column dtypes and non-null counts to reveal which attributes have missing values.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


`train_data.info()` reveals some missing data within the columns `Age`, `Cabin` and `Embarked`.
<br><br>
There are three possibilities to **handle missing values**:
* delete the entries with the missing values
* delete the whole attribute with the missing values
* fill the missing values with the median value of that attribute
<br>

In this case we will use all three different methods to handle the missing values:
* The `Cabin` attribute will be deleted.
* The two entries with the missing `Embarked` value will be deleted.
* The missing values within the `Age` attribute will be filled by the median of the other values of that attribute.
<br>

Furthermore `train_data.info()` reveals that the attributes have three data types: 
* `int64` -> which is okay for further processing.
* `float64` -> which is okay for further processing.
* `object` -> these attributes have to be adapted for further processing.
<br>

First let's have a look at these object attributes and their importance for the upcoming machine-learning process:
* `Name` is an individual value with no valuable information and therefore not important for further processing.
* `Sex` is important for further processing. We will use one-hot-encoding to make this attribute processable.
* `Ticket` is an individual number with no valuable information and therefore not important for further processing.
* `Cabin` will be deleted because only 204 out of 891 values are present.
* `Embarked` can be interesting for further processing. We will use one-hot-encoding to make this attribute processable.

Use `train_data.describe()` to get an insight of the numerical attributes especially their variance.

In [2469]:
# Summary statistics of the numerical attributes (quartiles spelled out explicitly).
train_data.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


There are two attributes that should undergo additional preprocessing:
* `Age` with a standard deviation of 14.53 and a max / min difference of 79.58.
* `Fare` with a standard deviation of 49.69 and a max / min difference of about 512.
<br>

Both attributes will undergo a standardization according to the following formula:

$$ x_j^{(i)} = \frac{x_j^{(i)} - \mu_j}{s_j} \\ $$

Here $x_j^{(i)}$ is the value of attribute $j$ of the $i$-th example, $\mu_j$ is the mean of attribute $j$ and $s_j$ is the standard deviation of attribute $j$.

Check the correlation between the numerical attributes:

In [2470]:
# Pairwise correlation of the numerical attributes only.
# The frame still contains object columns (Name, Sex, Ticket, Cabin, Embarked);
# selecting the numeric columns explicitly keeps this cell working on pandas
# versions where DataFrame.corr() no longer drops non-numeric columns silently.
train_data.select_dtypes(include="number").corr()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
PassengerId,1.0,-0.005007,-0.035144,0.036847,-0.057527,-0.001652,0.012658
Survived,-0.005007,1.0,-0.338481,-0.077221,-0.035322,0.081629,0.257307
Pclass,-0.035144,-0.338481,1.0,-0.369226,0.083081,0.018443,-0.5495
Age,0.036847,-0.077221,-0.369226,1.0,-0.308247,-0.189119,0.096067
SibSp,-0.057527,-0.035322,0.083081,-0.308247,1.0,0.414838,0.159651
Parch,-0.001652,0.081629,0.018443,-0.189119,0.414838,1.0,0.216225
Fare,0.012658,0.257307,-0.5495,0.096067,0.159651,0.216225,1.0


In [2471]:
# Show the last five rows of the training data.
train_data.iloc[-5:]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


### Prepare the training data

Delete the two entries with the missing `Embarked` attribute.

In [2472]:
# Remove the rows whose `Embarked` value is missing, then renumber the rows.
# reset_index() keeps the previous row labels in a new `index` column,
# which is dropped again further down in the notebook.
train_data = train_data.dropna(subset=["Embarked"]).reset_index()
train_data.tail()

Unnamed: 0,index,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
884,886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
885,887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
886,888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
887,889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
888,890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


Drop the label attribute `Survived` from the training data set and move it into the label set `y_train`.

In [2473]:
# Separate the features from the target: everything except `Survived`
# becomes X_train, an independent copy of `Survived` becomes y_train.
X_train = train_data.drop(labels='Survived', axis='columns')
y_train = train_data['Survived'].copy()
X_train.head()

Unnamed: 0,index,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Remove the attributes `PassengerId`, `Name`, `Ticket` and `Cabin` from the training set.

In [2474]:
# Discard the identifier-like and sparsely populated attributes.
X_train = X_train.drop(
    labels=['PassengerId', 'Name', 'Ticket', 'Cabin'],
    axis='columns',
)
X_train.head()

Unnamed: 0,index,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,2,3,female,26.0,0,0,7.925,S
3,3,1,female,35.0,1,0,53.1,S
4,4,3,male,35.0,0,0,8.05,S


In [2475]:
# Re-check dtypes and non-null counts after dropping the unused columns.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 889 entries, 0 to 888
Data columns (total 8 columns):
index       889 non-null int64
Pclass      889 non-null int64
Sex         889 non-null object
Age         712 non-null float64
SibSp       889 non-null int64
Parch       889 non-null int64
Fare        889 non-null float64
Embarked    889 non-null object
dtypes: float64(2), int64(4), object(2)
memory usage: 55.6+ KB


In [2476]:
# Keep the first ten rows that contain at least one missing value;
# they are used later to verify the imputation.
sample_incomplete_rows = X_train.loc[X_train.isna().any(axis="columns")].iloc[:10]
sample_incomplete_rows

Unnamed: 0,index,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
5,5,3,male,,0,0,8.4583,Q
17,17,2,male,,0,0,13.0,S
19,19,3,female,,0,0,7.225,C
26,26,3,male,,0,0,7.225,C
28,28,3,female,,0,0,7.8792,Q
29,29,3,male,,0,0,7.8958,S
31,31,1,female,,1,0,146.5208,C
32,32,3,female,,0,0,7.75,Q
36,36,3,male,,0,0,7.2292,C
42,42,3,male,,0,0,7.8958,C


#### Replace missing values (by hand)

Here the missing `Age` values are replaced by hand with the median of that attribute.
Alternatively, the [`sklearn.impute.SimpleImputer`](https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html) class can be used for replacing missing values.

In [2477]:
# Fill the missing `Age` values with the median of the known ages.
# The filled column is assigned back to X_train instead of calling
# fillna(inplace=True) on the column selection: the in-place form operates on
# an intermediate object, triggers a chained-assignment warning in recent
# pandas and leaves X_train unchanged when Copy-on-Write is enabled.
median = X_train["Age"].median()
X_train["Age"] = X_train["Age"].fillna(median)

Check if the replacement was successful.

In [2478]:
# Look up the previously incomplete rows by their row labels.
X_train.loc[sample_incomplete_rows.index]

Unnamed: 0,index,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
5,5,3,male,28.0,0,0,8.4583,Q
17,17,2,male,28.0,0,0,13.0,S
19,19,3,female,28.0,0,0,7.225,C
26,26,3,male,28.0,0,0,7.225,C
28,28,3,female,28.0,0,0,7.8792,Q
29,29,3,male,28.0,0,0,7.8958,S
31,31,1,female,28.0,1,0,146.5208,C
32,32,3,female,28.0,0,0,7.75,Q
36,36,3,male,28.0,0,0,7.2292,C
42,42,3,male,28.0,0,0,7.8958,C


In [2479]:
# Confirm that `Age` no longer contains missing values.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 889 entries, 0 to 888
Data columns (total 8 columns):
index       889 non-null int64
Pclass      889 non-null int64
Sex         889 non-null object
Age         889 non-null float64
SibSp       889 non-null int64
Parch       889 non-null int64
Fare        889 non-null float64
Embarked    889 non-null object
dtypes: float64(2), int64(4), object(2)
memory usage: 55.6+ KB


### Preprocessing categorical attributes
Extract the categorical attributes `Sex` and `Embarked` and store them within cat_attributes. 

In [2480]:
# Select the two categorical columns that will be one-hot encoded.
cat_attributes = X_train.loc[:, ['Sex', 'Embarked']]
cat_attributes.head()

Unnamed: 0,Sex,Embarked
0,male,S
1,female,C
2,female,S
3,female,S
4,male,S


#### One-Hot-Encoder
Use [`OneHotEncoder`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html) to transform the categorical values of the attributes `Sex` and `Embarked` into numerical values. The result is a matrix with $n$ columns representing the $n$ different categorical values. In this case $n = 5$. We do not want a sparse matrix but an array, therefore we use `sparse=False`.

In [2481]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode `Sex` and `Embarked`.
# The encoder is left at its default (sparse) output and the result is
# densified with .toarray(). This yields the same dense float array as the
# `sparse=False` option, but also works on newer scikit-learn releases where
# that keyword was renamed to `sparse_output` and later removed.
cat_encoder = OneHotEncoder()
X_train_1hot = cat_encoder.fit_transform(cat_attributes).toarray()
X_train_1hot

array([[0., 1., 0., 0., 1.],
       [1., 0., 1., 0., 0.],
       [1., 0., 0., 0., 1.],
       ...,
       [1., 0., 0., 0., 1.],
       [0., 1., 1., 0., 0.],
       [0., 1., 0., 1., 0.]])

Get the category names out of the cat_encoder and store them within a list.

In [2482]:
# cat_encoder.categories_ holds one array of category names per encoded
# attribute; flatten them into a single list, preserving their order.
categories_1hot = cat_encoder.categories_
categories_1hot_list = [
    category
    for feature_categories in categories_1hot
    for category in feature_categories
]

categories_1hot_list

['female', 'male', 'C', 'Q', 'S']

Delete the categorical attributes from `X_train`. They will be replaced by the one-hot encoded values.

In [2483]:
# The raw categorical columns are no longer needed once they are encoded.
X_train = X_train.drop(labels=['Sex', 'Embarked'], axis='columns')
X_train.head()

Unnamed: 0,index,Pclass,Age,SibSp,Parch,Fare
0,0,3,22.0,1,0,7.25
1,1,1,38.0,1,0,71.2833
2,2,3,26.0,0,0,7.925
3,3,1,35.0,1,0,53.1
4,4,3,35.0,0,0,8.05


Convert the array of one-hot encoder into a pandas DataFrame.

In [2484]:
# Wrap the encoded array in a DataFrame whose columns are the category names.
X_train_1hot = pd.DataFrame(data=X_train_1hot, columns=categories_1hot_list)
X_train_1hot.iloc[:10]

Unnamed: 0,female,male,C,Q,S
0,0.0,1.0,0.0,0.0,1.0
1,1.0,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,1.0
4,0.0,1.0,0.0,0.0,1.0
5,0.0,1.0,0.0,1.0,0.0
6,0.0,1.0,0.0,0.0,1.0
7,0.0,1.0,0.0,0.0,1.0
8,1.0,0.0,0.0,0.0,1.0
9,1.0,0.0,1.0,0.0,0.0


In [2485]:
# Inspect dtypes and non-null counts of the one-hot encoded columns.
X_train_1hot.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 889 entries, 0 to 888
Data columns (total 5 columns):
female    889 non-null float64
male      889 non-null float64
C         889 non-null float64
Q         889 non-null float64
S         889 non-null float64
dtypes: float64(5)
memory usage: 34.8 KB


Concatenate the one-hot encoded categories within `X_train_1hot` to `X_train` 

In [2486]:
# Append the one-hot columns side by side. pd.concat aligns rows by index
# label, so both frames must share the same 0..n-1 row index.
X_train = pd.concat((X_train, X_train_1hot), axis="columns")
X_train.head()

Unnamed: 0,index,Pclass,Age,SibSp,Parch,Fare,female,male,C,Q,S
0,0,3,22.0,1,0,7.25,0.0,1.0,0.0,0.0,1.0
1,1,1,38.0,1,0,71.2833,1.0,0.0,1.0,0.0,0.0
2,2,3,26.0,0,0,7.925,1.0,0.0,0.0,0.0,1.0
3,3,1,35.0,1,0,53.1,1.0,0.0,0.0,0.0,1.0
4,4,3,35.0,0,0,8.05,0.0,1.0,0.0,0.0,1.0


In [2487]:
# Remove the helper `index` column left behind by reset_index().
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it no longer raises a KeyError once the
# column is already gone.
X_train = X_train.drop(['index'], axis=1, errors='ignore')
X_train.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,female,male,C,Q,S
0,3,22.0,1,0,7.25,0.0,1.0,0.0,0.0,1.0
1,1,38.0,1,0,71.2833,1.0,0.0,1.0,0.0,0.0
2,3,26.0,0,0,7.925,1.0,0.0,0.0,0.0,1.0
3,1,35.0,1,0,53.1,1.0,0.0,0.0,0.0,1.0
4,3,35.0,0,0,8.05,0.0,1.0,0.0,0.0,1.0


In [2488]:
# Final check of the prepared feature frame: dtypes and non-null counts per column.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 889 entries, 0 to 888
Data columns (total 10 columns):
Pclass    889 non-null int64
Age       889 non-null float64
SibSp     889 non-null int64
Parch     889 non-null int64
Fare      889 non-null float64
female    889 non-null float64
male      889 non-null float64
C         889 non-null float64
Q         889 non-null float64
S         889 non-null float64
dtypes: float64(7), int64(3)
memory usage: 69.5 KB


Use StandardScaler from sklearn to scale the attribute values.
StandardScaler uses the method:
$$ x_j^{(i)} = \frac{x_j^{(i)} - \mu_j}{s_j} \\ $$

Again $x_j^{(i)}$ is the value of attribute $j$ of the $i$-th example, $\mu_j$ is the mean of attribute $j$ and $s_j$ is the standard deviation of attribute $j$.

In [2489]:
from sklearn.preprocessing import StandardScaler

# Learn each column's mean and standard deviation from X_train, then apply
# the standardization to the same data.
scaler = StandardScaler()
scaler.fit(X_train)
scaled_values = scaler.transform(X_train)
scaled_values.shape

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


(889, 10)

**Now the training data has been prepared for machine learning!**

* The data has been loaded.
* The data has been studied. 
* The correlation between the numerical attributes has been computed.
* The labels have been separated from the training data.
* We have checked for and handled missing (numerical) values within the attributes.
* Categorical attributes have been preprocessed by one-hot encoding.
* The data has been scaled to suit the machine-learning algorithms.

**Results of the preprocessing steps:**
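* `X_train`: the prepared (not yet scaled) training data
* `scaled_values`: the standardized version of `X_train` that the models below are trained on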
* `y_train`: the labeled data for training

### Select and train a model

**Logistic Regression model:** [`LogisticRegression`](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)
<br>
Train the logistic regression model.

In [2490]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Logistic regression with the lbfgs solver in its multinomial formulation.
# C is the inverse regularization strength, so C=0.01 regularizes heavily.
log_reg = LogisticRegression(
    solver='lbfgs',
    multi_class='multinomial',
    C=0.01,
    max_iter=100,
    random_state=0,
)
log_reg.fit(scaled_values, y_train)

LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=None, penalty='l2', random_state=0, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

Predict the first 20 values of the training set and compare it with the labels.

In [2491]:
#some_data_prepared = scaled_values[:20]
#some_labels = y_train[:20]
#print('Predictions:', log_reg.predict(some_data_prepared))

In [2492]:
#print('Labels: ', list(some_labels))

Get the mean accuracy on the training data and the labels.

In [2493]:
# Accuracy on the training set itself (not a held-out estimate).
y_pred = log_reg.predict(scaled_values)
accuracy_score(y_true=y_train, y_pred=y_pred)

0.8020247469066367

In [2494]:
# Same training accuracy, obtained through the estimator's own score() method.
log_reg.score(X=scaled_values, y=y_train)

0.8020247469066367

**Neural network:** [`MLPClassifier`](https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html#sklearn.neural_network.MLPClassifier)

In [2495]:
# Keep only the first seven scaled columns (Pclass, Age, SibSp, Parch, Fare,
# female, male); the Embarked one-hot columns C, Q and S are left out.
# NOTE: this rebinds `scaled_values`, so the logistic-regression cells above
# cannot be re-run afterwards without re-executing the scaling cell.
scaled_values = scaled_values[:, :7]

In [2496]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron with three hidden layers of 200, 100 and 50 units.
nn_clas = MLPClassifier(
    hidden_layer_sizes=(200, 100, 50),
    alpha=0.0001,
    max_iter=1000,
    random_state=1,
)
nn_clas.fit(scaled_values, y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(200, 100, 50), learning_rate='constant',
       learning_rate_init=0.001, max_iter=1000, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=1, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [2497]:
# Accuracy of the neural network on the training set itself (not a held-out estimate).
y_pred = nn_clas.predict(scaled_values)
accuracy_score(y_true=y_train, y_pred=y_pred)

0.8852643419572553