In [92]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt

In [93]:
# Load the training data from train.csv (path relative to the working directory)
# and preview the first rows.
df_train = pd.read_csv('train.csv')
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [94]:
# We don't really need the name or the ticket number.
# Rebinding the result (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: re-running it no longer raises a KeyError for
# columns that were already removed.
df_train = df_train.drop(columns=['Name', 'Ticket'], errors='ignore')
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,1,0,3,male,22.0,1,0,7.25,,S
1,2,1,1,female,38.0,1,0,71.2833,C85,C
2,3,1,3,female,26.0,0,0,7.925,,S
3,4,1,1,female,35.0,1,0,53.1,C123,S
4,5,0,3,male,35.0,0,0,8.05,,S


In [95]:
# Non-null count per column; a count below the number of rows means the column has missing values.
df_train.count()

PassengerId    891
Survived       891
Pclass         891
Sex            891
Age            714
SibSp          891
Parch          891
Fare           891
Cabin          204
Embarked       889
dtype: int64

In [96]:
# Summary statistics of the training columns.
df_train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [97]:
# Summary of the Cabin column: non-null count, distinct values and the most frequent entry
df_train["Cabin"].describe()

count         204
unique        147
top       B96 B98
freq            4
Name: Cabin, dtype: object

In [98]:
# Show the survivors whose Cabin entry is missing
is_survivor = df_train["Survived"] == 1
cabin_missing = df_train["Cabin"].isna()
df_train[is_survivor & cabin_missing]

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
2,3,1,3,female,26.0,0,0,7.9250,,S
8,9,1,3,female,27.0,0,2,11.1333,,S
9,10,1,2,female,14.0,1,0,30.0708,,C
15,16,1,2,female,55.0,0,0,16.0000,,S
17,18,1,2,male,,0,0,13.0000,,S
...,...,...,...,...,...,...,...,...,...,...
866,867,1,2,female,27.0,1,0,13.8583,,C
869,870,1,3,male,4.0,1,1,11.1333,,S
874,875,1,2,female,28.0,1,0,24.0000,,C
875,876,1,3,female,15.0,0,0,7.2250,,C


In [99]:
# Only 204 of 891 data points carry Cabin information, but the NaN rows contain
# too many Survived cases to simply throw the column away.
# So we try to encode Cabin instead of removing it completely.
Cabin_unique_values = df_train["Cabin"].unique()
# Each distinct value (NaN included) is mapped to the position of its first appearance
Cabin_encode_table = {value: index for index, value in enumerate(Cabin_unique_values)}

df_train["Cabin_encoded"] = df_train["Cabin"].map(Cabin_encode_table)
df_train["Cabin_encoded"].value_counts()

Cabin_encoded
0      687
8        4
4        4
73       4
54       3
      ... 
61       1
60       1
58       1
57       1
147      1
Name: count, Length: 148, dtype: int64

In [100]:
# Encode the other non-numerical columns, Sex and Embarked, the same way:
# every distinct value is mapped to the position of its first appearance.
Embarked_unique_values = df_train["Embarked"].unique()
Embarked_encode_table = dict(zip(Embarked_unique_values, range(len(Embarked_unique_values))))

Sex_unique_values = df_train["Sex"].unique()
Sex_encode_table = dict(zip(Sex_unique_values, range(len(Sex_unique_values))))

df_train["Embarked_encoded"] = df_train["Embarked"].map(Embarked_encode_table)
df_train["Sex_encoded"] = df_train["Sex"].map(Sex_encode_table)

In [101]:
# Distribution of the Embarked codes in the training data
df_train["Embarked_encoded"].value_counts()

Embarked_encoded
0    644
1    168
2     77
3      2
Name: count, dtype: int64

In [102]:
# Distribution of the Sex codes in the training data
df_train["Sex_encoded"].value_counts()

Sex_encoded
0    577
1    314
Name: count, dtype: int64

In [103]:
# Age has NaN values as well and needs to be filled; look at its distribution first
df_train["Age"].describe()

count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64

In [104]:
# So let's fill the missing ages with the mean age.
# The filled column is assigned back instead of calling fillna(inplace=True) on
# df_train.Age: that is a chained in-place update, which pandas warns about and
# which leaves df_train unchanged once Copy-on-Write is active.
df_train["Age"] = df_train["Age"].fillna(df_train["Age"].mean())

In [105]:
# One last look at the training data before scaling: the raw text columns
# (Sex, Cabin, Embarked) now sit next to their *_encoded counterparts.
df_train.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Cabin_encoded,Embarked_encoded,Sex_encoded
0,1,0,3,male,22.0,1,0,7.25,,S,0,0,0
1,2,1,1,female,38.0,1,0,71.2833,C85,C,1,1,1
2,3,1,3,female,26.0,0,0,7.925,,S,0,0,1
3,4,1,1,female,35.0,1,0,53.1,C123,S,2,0,1
4,5,0,3,male,35.0,0,0,8.05,,S,0,0,0
5,6,0,3,male,29.699118,0,0,8.4583,,Q,0,2,0
6,7,0,1,male,54.0,0,0,51.8625,E46,S,3,0,0
7,8,0,3,male,2.0,3,1,21.075,,S,0,0,0
8,9,1,3,female,27.0,0,2,11.1333,,S,0,0,1
9,10,1,2,female,14.0,1,0,30.0708,,C,0,1,1


In [106]:
# Remove PassengerId and the raw text columns that now have encoded counterparts
obsolete_columns = ["PassengerId", "Sex", "Cabin", "Embarked"]
df_train_data = df_train.drop(columns=obsolete_columns)
df_train_data.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Cabin_encoded,Embarked_encoded,Sex_encoded
0,0,3,22.0,1,0,7.25,0,0,0
1,1,1,38.0,1,0,71.2833,1,1,1
2,1,3,26.0,0,0,7.925,0,0,1
3,1,1,35.0,1,0,53.1,2,0,1
4,0,3,35.0,0,0,8.05,0,0,0


In [107]:
# Separate the target from the features and scale every feature to [0, 1].
df_train_labels = df_train_data["Survived"]
train_features = df_train_data.drop(columns=["Survived"])

scaler = MinMaxScaler()
df_train_scaled = pd.DataFrame(
    scaler.fit_transform(train_features),
    # Take the names from the frame that was actually scaled rather than
    # slicing columns[1:], which silently assumed Survived is the first column.
    columns=train_features.columns,
    # Keep the original row index so the features stay aligned with the labels.
    index=train_features.index,
)

In [108]:
# Preview the scaled training features
df_train_scaled.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Cabin_encoded,Embarked_encoded,Sex_encoded
0,1.0,0.271174,0.125,0.0,0.014151,0.0,0.0,0.0
1,0.0,0.472229,0.125,0.0,0.139136,0.006803,0.333333,1.0
2,1.0,0.321438,0.0,0.0,0.015469,0.0,0.0,1.0
3,0.0,0.434531,0.125,0.0,0.103644,0.013605,0.0,1.0
4,1.0,0.434531,0.0,0.0,0.015713,0.0,0.0,0.0


In [109]:
# Check the value ranges after scaling; the scaler was fitted on exactly this data
df_train_scaled.describe()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Cabin_encoded,Embarked_encoded,Sex_encoded
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.654321,0.367921,0.065376,0.063599,0.062858,0.105629,0.122709,0.352413
std,0.418036,0.163383,0.137843,0.134343,0.096995,0.238073,0.215867,0.47799
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.5,0.271174,0.0,0.0,0.01544,0.0,0.0,0.0
50%,1.0,0.367921,0.0,0.0,0.028213,0.0,0.0,0.0
75%,1.0,0.434531,0.125,0.0,0.060508,0.0,0.333333,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [110]:
# Let's train the model now: a random forest on the scaled features.
# random_state is fixed so that repeated runs build the same forest.
model = RandomForestClassifier(random_state=42)
model.fit(df_train_scaled, df_train_labels)

In [111]:
# Print every feature name together with the importance the forest assigned to it
for feature_name, importance in zip(df_train_scaled.columns, model.feature_importances_):
    print(f"{feature_name}:{importance}")

Pclass:0.06918017863455934
Age:0.23610622269181258
SibSp:0.04647740370130991
Parch:0.03717060852431524
Fare:0.2343029472451507
Cabin_encoded:0.09628262417583099
Embarked_encoded:0.03291349151813235
Sex_encoded:0.24756652350888889


In [112]:
# Looking at the score.
# NOTE: this is computed on the same data the model was fitted on, so it
# measures fit to the training set, not performance on unseen passengers.
model.score(df_train_scaled, df_train_labels)

0.9865319865319865

In [113]:
# Confusion matrix of the model's predictions on the training data
train_predictions = model.predict(df_train_scaled)
conf_mtx = confusion_matrix(df_train_labels, train_predictions)
conf_mtx

array([[547,   2],
       [ 10, 332]], dtype=int64)

In [114]:
# Now testing the model: load the test data from test.csv
# (path relative to the working directory) and preview the first rows.
df_test = pd.read_csv('test.csv')
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [115]:
# Summary statistics of the test columns; counts below the row total reveal missing values
df_test.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,3.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


In [116]:
# Summary of the Cabin column in the test data
df_test["Cabin"].describe()

count                  91
unique                 76
top       B57 B59 B63 B66
freq                    3
Name: Cabin, dtype: object

In [117]:
# We need to encode again. The test data can contain values that never showed
# up in the training data, so the encode tables are extended first.
def _append_unseen_codes(encode_table, values, first_free_code):
    """Give each value missing from encode_table the next consecutive code, starting at first_free_code."""
    next_code = first_free_code
    for value in values:
        if value not in encode_table:
            encode_table[value] = next_code
            next_code += 1


# New codes continue right after the codes handed out for the training data
_append_unseen_codes(Cabin_encode_table, df_test["Cabin"].unique(), len(Cabin_unique_values))
_append_unseen_codes(Embarked_encode_table, df_test["Embarked"].unique(), len(Embarked_unique_values))

df_test["Cabin_encoded"] = df_test["Cabin"].map(Cabin_encode_table)
df_test["Embarked_encoded"] = df_test["Embarked"].map(Embarked_encode_table)
df_test["Sex_encoded"] = df_test["Sex"].map(Sex_encode_table)

In [118]:
# Distribution of the Cabin codes in the test data
df_test["Cabin_encoded"].value_counts()

Cabin_encoded
0      327
59       3
174      2
160      2
173      2
      ... 
169      1
170      1
22       1
120      1
186      1
Name: count, Length: 77, dtype: int64

In [119]:
# Distribution of the Embarked codes in the test data
df_test["Embarked_encoded"].value_counts()

Embarked_encoded
0    270
1    102
2     46
Name: count, dtype: int64

In [120]:
# Distribution of the Sex codes in the test data
df_test["Sex_encoded"].value_counts()

Sex_encoded
0    266
1    152
Name: count, dtype: int64

In [121]:
# Let's fill Age with the mean again; first look at its distribution in the test data
df_test["Age"].describe()

count    332.000000
mean      30.272590
std       14.181209
min        0.170000
25%       21.000000
50%       27.000000
75%       39.000000
max       76.000000
Name: Age, dtype: float64

In [122]:
# Compare the mean age of the training data with that of the test data
df_train["Age"].mean(), df_test["Age"].mean()

(29.69911764705882, 30.272590361445783)

In [123]:
# We fill it with the mean from the training data.
# The filled column is assigned back instead of calling fillna(inplace=True) on
# df_test.Age: that chained in-place update leaves df_test unchanged once
# Copy-on-Write is active, and the blanket fill below would then turn the
# missing ages into 0.
df_test["Age"] = df_test["Age"].fillna(df_train["Age"].mean())

# We should also fill the NaN values in other columns with 0
df_test = df_test.fillna(0)
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Cabin_encoded,Embarked_encoded,Sex_encoded
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,0,Q,0,2,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,0,S,0,0,1
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,0,Q,0,2,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,0,S,0,0,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,0,S,0,0,1


In [124]:
# Now let's scale the test data with the scaler fitted on the training data.
# The features are selected by the training column names instead of dropping
# a list of unwanted columns: this guarantees the same columns in the same
# order as during fitting, and the cell still works when it is re-run after
# further columns (e.g. the predictions) have been added to df_test.
feature_columns = list(df_train_scaled.columns)
df_test_scaled = pd.DataFrame(
    scaler.transform(df_test[feature_columns]),
    columns=feature_columns,
    index=df_test.index,
)
df_test_scaled.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Cabin_encoded,Embarked_encoded,Sex_encoded
0,1.0,0.428248,0.0,0.0,0.015282,0.0,0.666667,0.0
1,1.0,0.585323,0.125,0.0,0.013663,0.0,0.0,1.0
2,0.5,0.773813,0.0,0.0,0.018909,0.0,0.666667,0.0
3,1.0,0.334004,0.0,0.0,0.016908,0.0,0.0,0.0
4,1.0,0.271174,0.125,0.166667,0.023984,0.0,0.0,1.0


In [125]:
# Value ranges of the scaled test features. Values can fall outside [0, 1]
# because the scaler was fitted on the training data only.
df_test_scaled.describe()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Cabin_encoded,Embarked_encoded,Sex_encoded
count,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0
mean,0.632775,0.373644,0.055921,0.065391,0.069373,0.170605,0.154705,0.363636
std,0.420919,0.158792,0.112095,0.163571,0.109046,0.374628,0.228505,0.481622
min,0.0,-0.003141,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.28374,0.0,0.0,0.015412,0.0,0.0,0.0
50%,1.0,0.367921,0.0,0.0,0.028213,0.0,0.0,0.0
75%,1.0,0.443956,0.125,0.0,0.061429,0.0,0.333333,1.0
max,1.0,0.949736,1.0,1.5,1.0,1.265306,0.666667,1.0


In [126]:
# Distinct Fare values in the test data
df_test["Fare"].unique()

array([  7.8292,   7.    ,   9.6875,   8.6625,  12.2875,   9.225 ,
         7.6292,  29.    ,   7.2292,  24.15  ,   7.8958,  26.    ,
        82.2667,  61.175 ,  27.7208,  12.35  ,   7.225 ,   7.925 ,
        59.4   ,   3.1708,  31.6833,  61.3792, 262.375 ,  14.5   ,
        61.9792,  30.5   ,  21.6792,  31.5   ,  20.575 ,  23.45  ,
        57.75  ,   8.05  ,   9.5   ,  56.4958,  13.4167,  26.55  ,
         7.85  ,  13.    ,  52.5542,  29.7   ,   7.75  ,  76.2917,
        15.9   ,  60.    ,  15.0333,  23.    , 263.    ,  15.5792,
        29.125 ,   7.65  ,  16.1   ,  13.5   ,   7.725 ,  21.    ,
         7.8792,  42.4   ,  28.5375, 211.5   ,  25.7   ,  15.2458,
       221.7792,  10.7083,  14.4542,  13.9   ,   7.775 ,  52.    ,
         7.7958,  78.85  ,   7.8542,  55.4417,   8.5167,  22.525 ,
         7.8208,   8.7125,  15.0458,   7.7792,  31.6792,   7.2833,
         6.4375,  16.7   ,  75.2417,  15.75  ,   7.25  ,  23.25  ,
        28.5   ,  25.4667,  46.9   , 151.55  ,  18.    ,  51.8

In [127]:
# Class probabilities for every test row: one row per passenger, one column per class
prediction_probas = model.predict_proba(df_test_scaled)
prediction_probas

array([[0.98825758, 0.01174242],
       [0.75      , 0.25      ],
       [0.74      , 0.26      ],
       [0.26      , 0.74      ],
       [0.59      , 0.41      ],
       [0.85      , 0.15      ],
       [0.67111111, 0.32888889],
       [0.91      , 0.09      ],
       [0.17      , 0.83      ],
       [0.97      , 0.03      ],
       [1.        , 0.        ],
       [0.73      , 0.27      ],
       [0.07      , 0.93      ],
       [0.87      , 0.13      ],
       [0.07      , 0.93      ],
       [0.04      , 0.96      ],
       [0.87      , 0.13      ],
       [0.45      , 0.55      ],
       [0.715     , 0.285     ],
       [0.52      , 0.48      ],
       [0.73      , 0.27      ],
       [0.48      , 0.52      ],
       [0.05      , 0.95      ],
       [0.69      , 0.31      ],
       [0.09      , 0.91      ],
       [0.98      , 0.02      ],
       [0.01      , 0.99      ],
       [0.34      , 0.66      ],
       [0.37      , 0.63      ],
       [0.79      , 0.21      ],
       [0.

In [128]:
# A passenger is predicted to survive when the probability in column 1
# strictly exceeds the one in column 0; a tie counts as not survived.
not_survived_proba = prediction_probas[:, 0]
survived_proba = prediction_probas[:, 1]
prediction_survived = (survived_proba > not_survived_proba).astype(int)

df_test["NotSurvivedProba"] = not_survived_proba
df_test["SurvivedProba"] = survived_proba
df_test["Survived"] = prediction_survived

In [129]:
# The submission only needs the passenger id and the predicted label
submission_columns = ["PassengerId", "Survived"]
submission = df_test[submission_columns]
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,1
4,896,0


In [132]:
# Write the submission file; index=False keeps the DataFrame index out of the CSV
submission.to_csv("submission_erennakdag.csv", index=False)