# Libraries

In [219]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import time

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

%matplotlib inline

# Download Dataset

In [220]:
df = pd.read_csv("train.csv")

In [221]:
# check dataset
df.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


# Run basic diagnostics

In [222]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [223]:
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [224]:
# check dataframe's size
df.shape

(891, 12)

In [225]:
# check how many died/lived
df.Survived.value_counts()

0    549
1    342
Name: Survived, dtype: int64

In [226]:
# check survivability based on sex
df.groupby(['Sex','Survived']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,PassengerId,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Sex,Survived,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
female,0,81,81,81,64,81,81,81,81,6,81
female,1,233,233,233,197,233,233,233,233,91,231
male,0,468,468,468,360,468,468,468,468,62,468
male,1,109,109,109,93,109,109,109,109,45,109


In [227]:
# check types of variables
df.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [228]:
# check variables
df.count()

PassengerId    891
Survived       891
Pclass         891
Name           891
Sex            891
Age            714
SibSp          891
Parch          891
Ticket         891
Fare           891
Cabin          204
Embarked       889
dtype: int64

# Keep passenger ID column

In [229]:
ID = df.PassengerId

# Drop useless variables

## ID

In [230]:
df = df.drop(["PassengerId"], axis=1)

## Name

In [231]:
# Name is clearly a useless variable
df = df.drop(["Name"], axis=1)

## Ticket

In [232]:
df = df.drop(["Ticket"], axis=1)

# Fix missing values

In [233]:
# search for missing values
pd.isnull(df).sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Cabin       687
Embarked      2
dtype: int64

## Embarked column

In [234]:
df.Embarked.head()

0    S
1    C
2    S
3    S
4    S
Name: Embarked, dtype: object

In [235]:
pd.unique(df.Embarked)

array(['S', 'C', 'Q', nan], dtype=object)

In [236]:
df.Embarked.value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [237]:
df.Embarked.fillna('S', inplace = True)

In [238]:
pd.value_counts(df.Embarked)

S    646
C    168
Q     77
Name: Embarked, dtype: int64

## Age column

In [239]:
pd.unique(df.Age)

array([22.  , 38.  , 26.  , 35.  ,   nan, 54.  ,  2.  , 27.  , 14.  ,
        4.  , 58.  , 20.  , 39.  , 55.  , 31.  , 34.  , 15.  , 28.  ,
        8.  , 19.  , 40.  , 66.  , 42.  , 21.  , 18.  ,  3.  ,  7.  ,
       49.  , 29.  , 65.  , 28.5 ,  5.  , 11.  , 45.  , 17.  , 32.  ,
       16.  , 25.  ,  0.83, 30.  , 33.  , 23.  , 24.  , 46.  , 59.  ,
       71.  , 37.  , 47.  , 14.5 , 70.5 , 32.5 , 12.  ,  9.  , 36.5 ,
       51.  , 55.5 , 40.5 , 44.  ,  1.  , 61.  , 56.  , 50.  , 36.  ,
       45.5 , 20.5 , 62.  , 41.  , 52.  , 63.  , 23.5 ,  0.92, 43.  ,
       60.  , 10.  , 64.  , 13.  , 48.  ,  0.75, 53.  , 57.  , 80.  ,
       70.  , 24.5 ,  6.  ,  0.67, 30.5 ,  0.42, 34.5 , 74.  ])

In [240]:
age_nan = df[df.Age.isna()]
age_nan.head(10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
5,0,3,male,,0,0,8.4583,,Q
17,1,2,male,,0,0,13.0,,S
19,1,3,female,,0,0,7.225,,C
26,0,3,male,,0,0,7.225,,C
28,1,3,female,,0,0,7.8792,,Q
29,0,3,male,,0,0,7.8958,,S
31,1,1,female,,1,0,146.5208,B78,C
32,1,3,female,,0,0,7.75,,Q
36,1,3,male,,0,0,7.2292,,C
42,0,3,male,,0,0,7.8958,,C


In [241]:
np.mean(df.Age)

29.69911764705882

In [242]:
df.Age.fillna('30', inplace = True)

In [243]:
pd.isnull(df).sum()

Survived      0
Pclass        0
Sex           0
Age           0
SibSp         0
Parch         0
Fare          0
Cabin       687
Embarked      0
dtype: int64

## Cabin Column (dropped)

In [244]:
pd.unique(df.Cabin)

array([nan, 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
       'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
       'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',
       'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',
       'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',
       'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',
       'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',
       'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44',
       'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14',
       'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',
       'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68',
       'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48',
       'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',
       'C62 C64',

In [245]:
# keep only the first character (C85 -> C)
df.Cabin = df.Cabin.str[0]

In [246]:
df.head(10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22,1,0,7.25,,S
1,1,1,female,38,1,0,71.2833,C,C
2,1,3,female,26,0,0,7.925,,S
3,1,1,female,35,1,0,53.1,C,S
4,0,3,male,35,0,0,8.05,,S
5,0,3,male,30,0,0,8.4583,,Q
6,0,1,male,54,0,0,51.8625,E,S
7,0,3,male,2,3,1,21.075,,S
8,1,3,female,27,0,2,11.1333,,S
9,1,2,female,14,1,0,30.0708,,C


In [247]:
pd.unique(df.Cabin)

array([nan, 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [248]:
df[df.Cabin=='T']

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
339,0,1,male,45,0,0,35.5,T,S


In [249]:
df.loc[(df["Cabin"].notna()) & (df["Sex"]=="female")]

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
1,1,1,female,38,1,0,71.2833,C,C
3,1,1,female,35,1,0,53.1000,C,S
10,1,3,female,4,1,1,16.7000,G,S
11,1,1,female,58,0,0,26.5500,C,S
31,1,1,female,30,1,0,146.5208,B,C
52,1,1,female,49,1,0,76.7292,D,C
61,1,1,female,38,0,0,80.0000,B,S
66,1,2,female,29,0,0,10.5000,F,S
88,1,1,female,23,3,2,263.0000,C,S
123,1,2,female,32.5,0,0,13.0000,E,S


In [250]:
df.loc[(df["Cabin"].notna()) & (df["Sex"]=="male")]

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
6,0,1,male,54,0,0,51.8625,E,S
21,1,2,male,34,0,0,13.0000,D,S
23,1,1,male,28,0,0,35.5000,A,S
27,0,1,male,19,3,2,263.0000,C,S
54,0,1,male,65,0,1,61.9792,B,C
55,1,1,male,30,0,0,35.5000,C,S
62,0,1,male,45,1,0,83.4750,C,S
75,0,3,male,25,0,0,7.6500,F,S
92,0,1,male,46,1,0,61.1750,E,S
96,0,1,male,71,0,0,34.6542,A,C


In [251]:
df = df.drop(["Cabin"], axis=1)

In [252]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22,1,0,7.25,S
1,1,1,female,38,1,0,71.2833,C
2,1,3,female,26,0,0,7.925,S
3,1,1,female,35,1,0,53.1,S
4,0,3,male,35,0,0,8.05,S


# convert categorical variables to numeric

In [253]:
df.dtypes

Survived      int64
Pclass        int64
Sex          object
Age          object
SibSp         int64
Parch         int64
Fare        float64
Embarked     object
dtype: object

## Age column

In [254]:
df.Age.head()

0    22
1    38
2    26
3    35
4    35
Name: Age, dtype: object

In [255]:
pd.unique(df.Age)

array([22.0, 38.0, 26.0, 35.0, '30', 54.0, 2.0, 27.0, 14.0, 4.0, 58.0,
       20.0, 39.0, 55.0, 31.0, 34.0, 15.0, 28.0, 8.0, 19.0, 40.0, 66.0,
       42.0, 21.0, 18.0, 3.0, 7.0, 49.0, 29.0, 65.0, 28.5, 5.0, 11.0,
       45.0, 17.0, 32.0, 16.0, 25.0, 0.83, 30.0, 33.0, 23.0, 24.0, 46.0,
       59.0, 71.0, 37.0, 47.0, 14.5, 70.5, 32.5, 12.0, 9.0, 36.5, 51.0,
       55.5, 40.5, 44.0, 1.0, 61.0, 56.0, 50.0, 36.0, 45.5, 20.5, 62.0,
       41.0, 52.0, 63.0, 23.5, 0.92, 43.0, 60.0, 10.0, 64.0, 13.0, 48.0,
       0.75, 53.0, 57.0, 80.0, 70.0, 24.5, 6.0, 0.67, 30.5, 0.42, 34.5,
       74.0], dtype=object)

In [256]:
df = df.astype({'Age': 'Float64'})
df.dtypes

Survived      int64
Pclass        int64
Sex          object
Age         float64
SibSp         int64
Parch         int64
Fare        float64
Embarked     object
dtype: object

In [257]:
df.Age = np.ceil(df.Age)

In [258]:
df.Age.head()

0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
Name: Age, dtype: float64

In [259]:
df.dtypes

Survived      int64
Pclass        int64
Sex          object
Age         float64
SibSp         int64
Parch         int64
Fare        float64
Embarked     object
dtype: object

In [260]:
pd.to_numeric(df.Age).head()

0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
Name: Age, dtype: float64

In [261]:
df = df.astype({'Age': 'int64'})
df.dtypes

Survived      int64
Pclass        int64
Sex          object
Age           int64
SibSp         int64
Parch         int64
Fare        float64
Embarked     object
dtype: object

## Sex column (One Hot Encoding)

In [262]:
pd.unique(df.Sex)

array(['male', 'female'], dtype=object)

In [263]:
df.head(10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22,1,0,7.25,S
1,1,1,female,38,1,0,71.2833,C
2,1,3,female,26,0,0,7.925,S
3,1,1,female,35,1,0,53.1,S
4,0,3,male,35,0,0,8.05,S
5,0,3,male,30,0,0,8.4583,Q
6,0,1,male,54,0,0,51.8625,S
7,0,3,male,2,3,1,21.075,S
8,1,3,female,27,0,2,11.1333,S
9,1,2,female,14,1,0,30.0708,C


In [264]:
# replace "male"->1, "female"->2
#df.replace({"Sex":{'male' : 1, 'female': 2}}, inplace=True)

In [265]:
# one hot encoding pandas get_dummies
df = pd.get_dummies(df, columns=['Sex'], prefix = ['Sex'], drop_first=True)

In [266]:
df.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Embarked,Sex_male
0,0,3,22,1,0,7.25,S,1
1,1,1,38,1,0,71.2833,C,0
2,1,3,26,0,0,7.925,S,0
3,1,1,35,1,0,53.1,S,0
4,0,3,35,0,0,8.05,S,1


In [267]:
df.dtypes

Survived      int64
Pclass        int64
Age           int64
SibSp         int64
Parch         int64
Fare        float64
Embarked     object
Sex_male      uint8
dtype: object

## Embarked column (one hot encoding via pandas get_dummies

In [268]:
df.Embarked.head()

0    S
1    C
2    S
3    S
4    S
Name: Embarked, dtype: object

In [269]:
pd.unique(df.Embarked)

array(['S', 'C', 'Q'], dtype=object)

In [270]:
# one hot encoding pandas get_dummies
df = pd.get_dummies(df, columns=['Embarked'], prefix = ['Embarked'], drop_first=True)

In [271]:
df.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,0,3,22,1,0,7.25,1,0,1
1,1,1,38,1,0,71.2833,0,0,0
2,1,3,26,0,0,7.925,0,0,1
3,1,1,35,1,0,53.1,0,0,1
4,0,3,35,0,0,8.05,1,0,1


# Normalize/Standardize variables

In [272]:
df.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,0,3,22,1,0,7.25,1,0,1
1,1,1,38,1,0,71.2833,0,0,0
2,1,3,26,0,0,7.925,0,0,1
3,1,1,35,1,0,53.1,0,0,1
4,0,3,35,0,0,8.05,1,0,1


## Split to x, y variables

In [273]:
y_df = df.Survived
y = y_df.values

In [274]:
x_df = df.drop(["Survived"], axis=1)
x = x_df.values

## Standardization

In [275]:
# standardization scaler
st_scaler = StandardScaler() 

In [276]:
# standardize x
st_x = st_scaler.fit_transform(x)

In [277]:
st_df = pd.DataFrame(st_x, columns = x_df.columns)

In [278]:
# merge x,y
st_df = pd.concat([st_df, pd.DataFrame(y, columns = ['Survived'])], axis = 1)
st_df.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S,Survived
0,0.827377,-0.597994,0.432793,-0.473674,-0.502445,0.737695,-0.307562,0.615838,0
1,-1.566107,0.633232,0.432793,-0.473674,0.786845,-1.355574,-0.307562,-1.623803,1
2,0.827377,-0.290188,-0.474545,-0.473674,-0.488854,-1.355574,-0.307562,0.615838,1
3,-1.566107,0.402377,0.432793,-0.473674,0.42073,-1.355574,-0.307562,0.615838,1
4,0.827377,0.402377,-0.474545,-0.473674,-0.486337,0.737695,-0.307562,0.615838,0


## Normalization

In [279]:
# normalization scaler
n_scaler = MinMaxScaler()

In [280]:
# normalize x
n_x = n_scaler.fit_transform(x)

In [281]:
n_df = pd.DataFrame(n_x, columns = x_df.columns)

In [282]:
# merge x,y
n_df = pd.concat([n_df, pd.DataFrame(y, columns = ['Survived'])], axis = 1)
n_df.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S,Survived
0,1.0,0.265823,0.125,0.0,0.014151,1.0,0.0,1.0,0
1,0.0,0.468354,0.125,0.0,0.139136,0.0,0.0,0.0,1
2,1.0,0.316456,0.0,0.0,0.015469,0.0,0.0,1.0,1
3,0.0,0.43038,0.125,0.0,0.103644,0.0,0.0,1.0,1
4,1.0,0.43038,0.0,0.0,0.015713,1.0,0.0,1.0,0


# check correlations

In [283]:
n_df.dtypes

Pclass        float64
Age           float64
SibSp         float64
Parch         float64
Fare          float64
Sex_male      float64
Embarked_Q    float64
Embarked_S    float64
Survived        int64
dtype: object

In [284]:
st_df.dtypes

Pclass        float64
Age           float64
SibSp         float64
Parch         float64
Fare          float64
Sex_male      float64
Embarked_Q    float64
Embarked_S    float64
Survived        int64
dtype: object

In [285]:
st_df.corr()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S,Survived
Pclass,1.0,-0.3293,0.083081,0.018443,-0.5495,0.1319,0.221009,0.074053,-0.338481
Age,-0.3293,1.0,-0.232683,-0.180337,0.090291,0.084989,-0.010564,-0.022069,-0.070982
SibSp,0.083081,-0.232683,1.0,0.414838,0.159651,-0.114631,-0.026354,0.068734,-0.035322
Parch,0.018443,-0.180337,0.414838,1.0,0.216225,-0.245489,-0.081228,0.060814,0.081629
Fare,-0.5495,0.090291,0.159651,0.216225,1.0,-0.182333,-0.117216,-0.162184,0.257307
Sex_male,0.1319,0.084989,-0.114631,-0.245489,-0.182333,1.0,-0.074115,0.119224,-0.543351
Embarked_Q,0.221009,-0.010564,-0.026354,-0.081228,-0.117216,-0.074115,1.0,-0.499421,0.00365
Embarked_S,0.074053,-0.022069,0.068734,0.060814,-0.162184,0.119224,-0.499421,1.0,-0.149683
Survived,-0.338481,-0.070982,-0.035322,0.081629,0.257307,-0.543351,0.00365,-0.149683,1.0


In [286]:
n_df.corr()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S,Survived
Pclass,1.0,-0.3293,0.083081,0.018443,-0.5495,0.1319,0.221009,0.074053,-0.338481
Age,-0.3293,1.0,-0.232683,-0.180337,0.090291,0.084989,-0.010564,-0.022069,-0.070982
SibSp,0.083081,-0.232683,1.0,0.414838,0.159651,-0.114631,-0.026354,0.068734,-0.035322
Parch,0.018443,-0.180337,0.414838,1.0,0.216225,-0.245489,-0.081228,0.060814,0.081629
Fare,-0.5495,0.090291,0.159651,0.216225,1.0,-0.182333,-0.117216,-0.162184,0.257307
Sex_male,0.1319,0.084989,-0.114631,-0.245489,-0.182333,1.0,-0.074115,0.119224,-0.543351
Embarked_Q,0.221009,-0.010564,-0.026354,-0.081228,-0.117216,-0.074115,1.0,-0.499421,0.00365
Embarked_S,0.074053,-0.022069,0.068734,0.060814,-0.162184,0.119224,-0.499421,1.0,-0.149683
Survived,-0.338481,-0.070982,-0.035322,0.081629,0.257307,-0.543351,0.00365,-0.149683,1.0


In [287]:
st_df.corr()[abs(st_df.corr())>0.3]

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S,Survived
Pclass,1.0,-0.3293,,,-0.5495,,,,-0.338481
Age,-0.3293,1.0,,,,,,,
SibSp,,,1.0,0.414838,,,,,
Parch,,,0.414838,1.0,,,,,
Fare,-0.5495,,,,1.0,,,,
Sex_male,,,,,,1.0,,,-0.543351
Embarked_Q,,,,,,,1.0,-0.499421,
Embarked_S,,,,,,,-0.499421,1.0,
Survived,-0.338481,,,,,-0.543351,,,1.0


In [348]:
n_df.corr()[abs(n_df.corr())>0.2]

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S,Survived
Pclass,1.0,-0.3293,,,-0.5495,,0.221009,,-0.338481
Age,-0.3293,1.0,-0.232683,,,,,,
SibSp,,-0.232683,1.0,0.414838,,,,,
Parch,,,0.414838,1.0,0.216225,-0.245489,,,
Fare,-0.5495,,,0.216225,1.0,,,,0.257307
Sex_male,,,,-0.245489,,1.0,,,-0.543351
Embarked_Q,0.221009,,,,,,1.0,-0.499421,
Embarked_S,,,,,,,-0.499421,1.0,
Survived,-0.338481,,,,0.257307,-0.543351,,,1.0


In [289]:
# Sex and ticket class seem to have the higher correlation

# Run ML algorithms

In [290]:
accuracy_list = []

## Logistic Regretion

In [291]:
from sklearn.linear_model import LogisticRegression

In [388]:
def logistic(X,y):
    xtrain, xvalid, ytrain, yvalid = train_test_split(X, y, random_state=42, test_size=0.1)
    
    model = LogisticRegression(random_state=None, 
                               solver='liblinear',
                               #C=1,
                               #class_weight='balanced',
                               multi_class = 'ovr',
                               max_iter = 10)
    
    start_time = time.time()
    model.fit(xtrain, ytrain)
    elapsed_time = time.time() - start_time
    
    y_pred = model.predict(xvalid)
    acc = metrics.accuracy_score(yvalid, y_pred)
    print ('Accuracy:',acc*100, '%')
    print ('Confusion matrix:\n',confusion_matrix(yvalid, y_pred)) 
    
    print ('\nElapsed time:', elapsed_time, 'sec')
    accuracy_list.append(acc)

### Keep only most important variables

In [393]:
temp_df = st_df[['Sex_male','Fare','Pclass']]

In [394]:
logistic(temp_df, st_df.iloc[:,-1])

Accuracy: 81.11111111111111 %
Confusion matrix:
 [[45  9]
 [ 8 28]]

Elapsed time: 0.001998424530029297 sec


### standardized model

In [395]:
logistic(st_df.iloc[:,:-1], st_df.iloc[:,-1])

Accuracy: 84.44444444444444 %
Confusion matrix:
 [[46  8]
 [ 6 30]]

Elapsed time: 0.0019974708557128906 sec


### normalized model


In [396]:
logistic(n_df.iloc[:,:-1], n_df.iloc[:,-1])

Accuracy: 82.22222222222221 %
Confusion matrix:
 [[45  9]
 [ 7 29]]

Elapsed time: 0.0029976367950439453 sec


## SVM

In [77]:
from sklearn.svm import SVC 

In [466]:
def svm(X,y):
    
    xtrain, xvalid, ytrain, yvalid = train_test_split(X, y, random_state=32, test_size=0.2)
    
    model = SVC(kernel = 'rbf'
               )
    
    start_time = time.time()
    model.fit(xtrain, ytrain)
    elapsed_time = time.time() - start_time
    
    y_pred = model.predict(xvalid)
    acc = metrics.accuracy_score(yvalid, y_pred)
    print ('Accuracy:',acc*100, '%')
    print ('Confusion matrix:\n',confusion_matrix(yvalid, y_pred)) 
    
    print ('\nElapsed time:', elapsed_time, 'sec')
    accuracy_list.append(acc)

In [467]:
svm(st_df.iloc[:,:-1], st_df.iloc[:,-1])

Accuracy: 82.12290502793296 %
Confusion matrix:
 [[98 10]
 [22 49]]

Elapsed time: 0.014990568161010742 sec




In [468]:
svm(n_df.iloc[:,:-1], n_df.iloc[:,-1])

Accuracy: 79.3296089385475 %
Confusion matrix:
 [[94 14]
 [23 48]]

Elapsed time: 0.01826786994934082 sec




## Decision Tree

In [81]:
from sklearn.tree import DecisionTreeClassifier

In [82]:
def dectree(X,y):
    xtrain, xvalid, ytrain, yvalid = train_test_split(X, y, test_size=0.3)
    
    model = DecisionTreeClassifier(criterion = 'entropy')
    
    start_time = time.time()
    model.fit(xtrain, ytrain)
    elapsed_time = time.time() - start_time
    
    y_pred = model.predict(xvalid)
    acc = metrics.accuracy_score(yvalid, y_pred)
    print ('Accuracy:', acc*100, '%')
    print ('Confusion matrix:\n',confusion_matrix(yvalid, y_pred)) 
    
    print ('\nElapsed time:', elapsed_time, 'sec')
    accuracy_list.append(acc)

In [83]:
svm(st_df.iloc[:,:-1], n_df.iloc[:,-1])

Accuracy: 79.1044776119403 %
Confusion matrix:
 [[134  23]
 [ 33  78]]

Elapsed time: 0.010993003845214844 sec


In [84]:
svm(n_df.iloc[:,:-1], n_df.iloc[:,-1])

Accuracy: 79.1044776119403 %
Confusion matrix:
 [[134  23]
 [ 33  78]]

Elapsed time: 0.006995439529418945 sec


## Random Forest

In [85]:
from sklearn.ensemble import RandomForestClassifier

In [86]:
def randforest(X,y):
    xtrain, xvalid, ytrain, yvalid = train_test_split(X, y, test_size=0.3)
    
    model = RandomForestClassifier(n_estimators=100, random_state = 0)

    start_time = time.time()
    model.fit(xtrain, ytrain)
    elapsed_time = time.time() - start_time
    
    y_pred = model.predict(xvalid)
    acc = metrics.accuracy_score(yvalid, y_pred)
    print ('Accuracy:',acc)
    print ('Confusion matrix:\n',confusion_matrix(yvalid, y_pred)) 
    
    print ('\nElapsed time:', elapsed_time, 'sec')
    accuracy_list.append(acc)

In [87]:
randforest(st_df.iloc[:,:-1], n_df.iloc[:,-1])

Accuracy: 0.8171641791044776
Confusion matrix:
 [[145  20]
 [ 29  74]]

Elapsed time: 0.1269214153289795 sec


In [88]:
randforest(n_df.iloc[:,:-1], n_df.iloc[:,-1])

Accuracy: 0.8208955223880597
Confusion matrix:
 [[140  16]
 [ 32  80]]

Elapsed time: 0.12592387199401855 sec


## KNN

In [89]:
from sklearn.neighbors import KNeighborsClassifier

In [90]:
def KNN(X,y):
    xtrain, xvalid, ytrain, yvalid = train_test_split(X, y, test_size=0.3)
    
    model = KNeighborsClassifier(n_neighbors = 6)
    
    start_time = time.perf_counter()
    model.fit(xtrain, ytrain)
    elapsed_time = time.perf_counter() - start_time
    
    y_pred = model.predict(xvalid)
    acc = metrics.accuracy_score(yvalid, y_pred)
    print ('Accuracy:',acc*100,'%')
    print ('Confusion matrix:\n',confusion_matrix(yvalid, y_pred)) 
    
    print ('\nElapsed time:', elapsed_time, 'sec')
    accuracy_list.append(acc)

In [91]:
KNN(st_df.iloc[:,:-1], n_df.iloc[:,-1])

Accuracy: 80.22388059701493 %
Confusion matrix:
 [[146  24]
 [ 29  69]]

Elapsed time: 0.002195 sec


In [92]:
KNN(n_df.iloc[:,:-1], n_df.iloc[:,-1])

Accuracy: 80.97014925373134 %
Confusion matrix:
 [[150  13]
 [ 38  67]]

Elapsed time: 0.0021100000000000008 sec


## Gaussian Naive Bayess

In [93]:
from sklearn.naive_bayes import GaussianNB

In [94]:
def nb(X,y):
    xtrain, xvalid, ytrain, yvalid = train_test_split(X, y, test_size=0.3)
    
    model = GaussianNB()
    
    start_time = time.perf_counter()
    model.fit(xtrain, ytrain)
    elapsed_time = time.perf_counter() - start_time
    
    y_pred = model.predict(xvalid)
    acc = metrics.accuracy_score(yvalid, y_pred)
    print ('Accuracy:',acc*100, '%')
    print ('Confusion matrix:\n',confusion_matrix(yvalid, y_pred)) 
    
    print ('\nElapsed time:', elapsed_time, 'sec')
    accuracy_list.append(acc)

In [95]:
nb(st_df.iloc[:,:-1], n_df.iloc[:,-1])

Accuracy: 77.23880597014924 %
Confusion matrix:
 [[138  27]
 [ 34  69]]

Elapsed time: 0.0017870999999999998 sec


In [96]:
nb(n_df.iloc[:,:-1], n_df.iloc[:,-1])

Accuracy: 79.1044776119403 %
Confusion matrix:
 [[140  24]
 [ 32  72]]

Elapsed time: 0.0025096000000000007 sec


## Gradient Boosting

In [97]:
from sklearn.ensemble import GradientBoostingClassifier

In [98]:
def gb(X,y):
    xtrain, xvalid, ytrain, yvalid = train_test_split(X, y, test_size=0.1)
    
    model = GradientBoostingClassifier(learning_rate=0.1, n_estimators=100, max_depth = 4)
    
    start_time = time.time()
    model.fit(xtrain, ytrain)
    elapsed_time = time.time() - start_time
    
    y_pred = model.predict(xvalid)
    acc = metrics.accuracy_score(yvalid, y_pred)
    print ('Accuracy:',acc*100, '%')
    print ('Confusion matrix:\n',confusion_matrix(yvalid, y_pred)) 
    
    print ('\nElapsed time:', elapsed_time, 'sec')
    accuracy_list.append(acc)

In [99]:
gb(st_df.iloc[:,:-1], n_df.iloc[:,-1])

Accuracy: 80.0 %
Confusion matrix:
 [[44 10]
 [ 8 28]]

Elapsed time: 0.14191293716430664 sec


In [100]:
gb(n_df.iloc[:,:-1], n_df.iloc[:,-1])

Accuracy: 82.22222222222221 %
Confusion matrix:
 [[48  8]
 [ 8 26]]

Elapsed time: 0.1469099521636963 sec


# Neural Networks

In [101]:
from keras.models import Sequential
from keras.layers import Dense

Using TensorFlow backend.


In [102]:
accuracy_list_NN = []

In [103]:
# fix random seed for reproducibility
seed = 7
np.random.seed(seed)

In [186]:
# split train/test set
X = st_df.iloc[:,:-1]
y = st_df.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 42)

## Keras Classifier

In [205]:
from keras.wrappers.scikit_learn import KerasClassifier

In [212]:
# create model
model = Sequential()
# First Hidden Layer
model.add(Dense(32, activation='relu', kernel_initializer='random_normal', input_dim=X.shape[1]))
# Second  Hidden Layer
model.add(Dense(16, activation='relu', kernel_initializer='random_normal'))
# Third  Hidden Layer
model.add(Dense(8, activation='relu', kernel_initializer='random_normal'))
# Output layer
model.add(Dense(1, activation='sigmoid', kernel_initializer='random_normal'))

In [213]:
# Compile model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [214]:
print (model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_16 (Dense)             (None, 32)                288       
_________________________________________________________________
dense_17 (Dense)             (None, 16)                528       
_________________________________________________________________
dense_18 (Dense)             (None, 8)                 136       
_________________________________________________________________
dense_19 (Dense)             (None, 1)                 9         
Total params: 961
Trainable params: 961
Non-trainable params: 0
_________________________________________________________________
None


In [215]:
# run model
batch_size = 10
start = time.time()
model.fit(X_train, y_train, epochs = 100, batch_size = batch_size, verbose = 2)
elapsed_time = time.time() - start

Epoch 1/100
 - 1s - loss: 0.6890 - acc: 0.6174
Epoch 2/100
 - 0s - loss: 0.6470 - acc: 0.6829
Epoch 3/100
 - 0s - loss: 0.5116 - acc: 0.7953
Epoch 4/100
 - 0s - loss: 0.4607 - acc: 0.8054
Epoch 5/100
 - 0s - loss: 0.4488 - acc: 0.8104
Epoch 6/100
 - 0s - loss: 0.4409 - acc: 0.8121
Epoch 7/100
 - 0s - loss: 0.4362 - acc: 0.8104
Epoch 8/100
 - 0s - loss: 0.4304 - acc: 0.8205
Epoch 9/100
 - 0s - loss: 0.4275 - acc: 0.8188
Epoch 10/100
 - 0s - loss: 0.4251 - acc: 0.8238
Epoch 11/100
 - 0s - loss: 0.4231 - acc: 0.8238
Epoch 12/100
 - 0s - loss: 0.4180 - acc: 0.8322
Epoch 13/100
 - 0s - loss: 0.4154 - acc: 0.8339
Epoch 14/100
 - 0s - loss: 0.4134 - acc: 0.8339
Epoch 15/100
 - 0s - loss: 0.4115 - acc: 0.8372
Epoch 16/100
 - 0s - loss: 0.4115 - acc: 0.8372
Epoch 17/100
 - 0s - loss: 0.4093 - acc: 0.8372
Epoch 18/100
 - 0s - loss: 0.4076 - acc: 0.8456
Epoch 19/100
 - 0s - loss: 0.4093 - acc: 0.8339
Epoch 20/100
 - 0s - loss: 0.4041 - acc: 0.8456
Epoch 21/100
 - 0s - loss: 0.4066 - acc: 0.8406
E

In [216]:
loss, acc = model.evaluate(X_test, y_test, verbose = 2, batch_size = batch_size)

In [217]:
print("loss: %.2f" % (loss))
print("acc: %.2f" % (acc))

loss: 0.45
acc: 0.81


In [218]:
print ('elapsed time:', elapsed_time, 'sec')

elapsed time: 7.77970290184021 sec


In [116]:
# predict on test set
y_pred = model.predict(X_test)
y_pred = np.round(y_pred)

In [118]:
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[161  14]
 [ 42  78]]


# Submit results

In [119]:
# read test data
test = pd.read_csv("test.csv")

In [120]:
# drop useless columns
predictions_ID = test.PassengerId
test = test.drop(["PassengerId"], axis=1)
test = test.drop(["Name"], axis=1)
test = test.drop(["Ticket"], axis=1)
test = test.drop(["Cabin"], axis=1)

In [121]:
# search for missing values
pd.isnull(test).sum()

Pclass       0
Sex          0
Age         86
SibSp        0
Parch        0
Fare         1
Embarked     0
dtype: int64

In [122]:
# change missing values
test.Age.fillna(np.mean(test.Age), inplace = True)
test.Fare.fillna(np.mean(test.Fare), inplace = True)

In [123]:
test.dtypes

Pclass        int64
Sex          object
Age         float64
SibSp         int64
Parch         int64
Fare        float64
Embarked     object
dtype: object

In [124]:
# convert categorical to numeric
pd.to_numeric(test.Age).head()
test = test.astype({'Age': 'Float64'})
test.Age = np.ceil(test.Age)

test = pd.get_dummies(test, columns=['Sex'], prefix = ['Sex'], drop_first=True)
test = pd.get_dummies(test, columns=['Embarked'], prefix = ['Embarked'], drop_first=True)

In [125]:
test.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,3,35.0,0,0,7.8292,1,1,0
1,3,47.0,1,0,7.0,0,0,1
2,2,62.0,0,0,9.6875,1,1,0
3,3,27.0,0,0,8.6625,1,0,1
4,3,22.0,1,1,12.2875,0,0,1


In [126]:
X = test.values

In [127]:
# standardization scaler
st_X = st_scaler.fit_transform(X)

# standardize x
st_X = st_scaler.fit_transform(X)

In [128]:
# predict on test set
y_test_pred = model.predict(st_X, batch_size = None, verbose = 1)
y_test_pred = np.round(y_test_pred)



In [129]:
y_test_pred.shape

(418, 1)

In [130]:
predictions_ID.shape

(418,)

In [143]:
predictions = pd.concat([pd.DataFrame(predictions_ID), pd.DataFrame(y_test_pred, columns = ['Survived'])], axis = 1)

In [144]:
predictions.head(10)

Unnamed: 0,PassengerId,Survived
0,892,0.0
1,893,0.0
2,894,0.0
3,895,0.0
4,896,0.0
5,897,0.0
6,898,1.0
7,899,0.0
8,900,1.0
9,901,0.0


In [166]:
predictions.dtypes

PassengerId    int64
Survived       int32
dtype: object

In [165]:
predictions.Survived = predictions['Survived'].astype(int)

In [159]:
# convert all columns of DataFrame to numeric
#predictions = predictions.apply(pd.to_numeric) # convert all columns of DataFrame

In [167]:
#predictions = predictions.as_type(int)

In [168]:
predictions.to_csv(path_or_buf=r"C:\Users\pgrig\Desktop\ML_practice\titanic\glob_predictions.csv",
                   encoding='utf-8', index=False)