In [1]:
import tensorflow
from keras.models import Sequential
from keras.layers import Dense , Activation , Dropout
import pandas as pd
from sklearn.metrics import f1_score
import numpy as np
from keras.utils import plot_model
import pydot
import keras

Using TensorFlow backend.


In [2]:
## input
# NOTE(review): absolute, machine-specific location; adjust to wherever train.csv lives locally.
TRAIN_CSV = "C:\\Users\\unieuro\\Documents\\ML\\WondrousMachineLaundry-master\\titanic\\data\\train.csv"
df = pd.read_csv(TRAIN_CSV)

## mapping strings to numbers
# Series.map produces a numeric column directly, instead of depending on replace()
# downcasting an object column (recent pandas versions warn about that downcast).
# Values missing from the mapping (including NaN in "Embarked") become NaN;
# "Embarked" is imputed further down.
df["Sex"] = df["Sex"].map({"male": 0, "female": 1})
df["Embarked"] = df["Embarked"].map({'C': 0, 'S': 1, 'Q': 2})

## handling names
title_dict = {"Mr.":0, "Mrs.":1, "Mme.":1, "Miss.":2, "Mlle.":2, "Master.":3, "Dr.":4, "Rev.": 5}
DEFAULT_TITLE = 6  # code used when no known title appears in the name


def title_code(name):
    """Return the integer code of the first known title found in ``name``.

    The name is split on whitespace and each word is looked up in ``title_dict``;
    the first hit wins. Names without a known title get ``DEFAULT_TITLE``.
    Non-string values (e.g. names that were already replaced by ints) are
    returned unchanged.
    """
    if not isinstance(name, str):
        return name
    for word in name.split():
        if word in title_dict:
            return title_dict[word]
    return DEFAULT_TITLE


# One pass over the column instead of row-by-row df.loc assignments.
df["Name"] = df["Name"].map(title_code)

## handling nans
df = df.drop(columns=["Cabin"])
df["Age"] = df["Age"].fillna(df["Age"].median())
df["Embarked"] = df["Embarked"].fillna(df["Embarked"].mode().iloc[0])

## get dummies
# dtype is pinned so the indicator columns are 0/1 integers on every pandas version
# (newer versions default to bool, which would make the later to_numpy() an object array).
df = pd.get_dummies(df, columns=["Pclass", "Name"], dtype="uint8")

## creating input and output dataframes
yf = df["Survived"]
Xf = df.drop(columns=["PassengerId", "Survived", "Ticket"])

## normalizing
# NOTE(review): mean/std are taken over every row, i.e. before any train/test split.
features = ["Age", "Fare"]
Xf[features] = (Xf[features] - Xf[features].mean()) / Xf[features].std()
print(Xf.head())

   Sex       Age  SibSp  Parch      Fare  Embarked  Pclass_1  Pclass_2  \
0    0 -0.565419      1      0 -0.502163       1.0         0         0   
1    1  0.663488      1      0  0.786404       0.0         1         0   
2    1 -0.258192      0      0 -0.488580       1.0         0         0   
3    1  0.433068      1      0  0.420494       1.0         1         0   
4    0  0.433068      0      0 -0.486064       1.0         0         0   

   Pclass_3  Name_0  Name_1  Name_2  Name_3  Name_4  Name_5  Name_6  
0         1       1       0       0       0       0       0       0  
1         0       0       1       0       0       0       0       0  
2         1       0       0       1       0       0       0       0  
3         0       0       1       0       0       0       0       0  
4         1       1       0       0       0       0       0       0  


In [3]:
# NumPy versions of the feature table and the label column, as consumed by the Keras cells.
Xdata = Xf.to_numpy()
ydata = yf.to_numpy()

In [7]:
# Only the first 85% of the rows are touched in this cell.
frac_train = 0.85
train_set = int(Xdata.shape[0] * frac_train)

# Inside that subset: leading 80% for fitting, the remainder for evaluation.
split_at = int(train_set * 0.8)
x_train, y_train = Xdata[:split_at, :], ydata[:split_at]
x_test, y_test = Xdata[split_at:train_set, :], ydata[split_at:train_set]

# Two 32-unit ReLU hidden layers and one sigmoid output unit.
# (Dropout(0.5) after each hidden layer is left disabled.)
model = Sequential([
    Dense(32, input_dim=Xdata.shape[1], activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# validation_split makes Keras hold out part of x_train/y_train during fitting.
fit_options = dict(epochs=10, batch_size=10, verbose=1, shuffle=True, validation_split=0.2)
model.fit(x_train, y_train, **fit_options)

# score[0] is the loss, score[1] the single compiled metric ('accuracy').
score = model.evaluate(x_test, y_test, batch_size=10)
print("\nModel loss is:" , score[0])
print("\nModel accuracy is:" , score[1])

Train on 484 samples, validate on 121 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

Model loss is: 0.42701735916106326

Model accuracy is: 0.8092105388641357


In [None]:
from sklearn.model_selection import train_test_split

# NOTE: fitNN is defined in a later cell of this notebook; that cell must be run first.
lst_train, lst_test = [], []

# One 10-unit ReLU hidden layer feeding a single sigmoid output unit.
model = Sequential([
    Dense(10, input_dim=Xdata.shape[1], activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Collect per-subset-size train and test scores.
lst_train, lst_test = fitNN(Xd=Xdata, yd=ydata, modelNN=model, frac_tr=0.75, shuffle=True)

###################################### 39%

In [None]:
from matplotlib import pyplot as plt

# fitNN records one score per subset size, starting at 5. The x axis is rebuilt from
# the result length because `length` exists only as a local variable inside fitNN.
subset_sizes = range(5, 5 + len(lst_train))

plt.plot(subset_sizes, lst_train, color='blue', label="Train")
plt.plot(subset_sizes, lst_test, color='red', label="Test")
plt.xlabel("Number of samples used")
plt.ylabel("Accuracy")
plt.legend()
plt.ylim([0.6, 1.2])
plt.show()

In [50]:
def fitNN(Xd, yd, modelNN, frac_tr, shuffle = False):
    """Fit ``modelNN`` on growing subsets of the data and record its scores.

    For every subset size ``m1`` in ``range(5, len(yd))`` the model is fitted on
    ``int(m1 * frac_tr)`` samples and then evaluated on the training part and on
    a test part.

    Parameters
    ----------
    Xd, yd : sliceable sequences of equal length (features and labels).
    modelNN : object exposing Keras-style ``fit`` and ``evaluate`` methods.
        ``evaluate`` must return a sequence whose second entry is the score to
        record (the accuracy when the model is compiled with
        ``metrics=['accuracy']``).
    frac_tr : fraction of each subset used for fitting.
    shuffle : if True, draw the train/test parts at random with
        ``sklearn.model_selection.train_test_split``; otherwise slice in order.

    Returns
    -------
    (ltrain, ltest) : two lists with one score per subset size.

    NOTE(review): the same model instance is fitted again on every iteration, so
    weights carry over from one subset size to the next.
    """
    ltrain = []
    ltest = []
    length = len(yd)
    if shuffle:
        # Imported locally so the function does not rely on another cell's import.
        from sklearn.model_selection import train_test_split
    for m1 in range(5, length):
        # Text progress bar, redrawn in place on the same line.
        print("#"*int(100 * m1 / length), "{0:.0f}%".format(100 * m1 / length), end='\r')
        m1_train = int(m1 * frac_tr)
        if shuffle:
            Xnp_train, Xnp_test, ynp_train, ynp_test = train_test_split(
                Xd, yd, test_size=m1 - m1_train, train_size=m1_train)
        else:
            # NOTE(review): the test part is everything after m1_train, not just the
            # m1 - m1_train samples used by the shuffled branch; confirm this is intended.
            Xnp_train, Xnp_test = Xd[:m1_train], Xd[m1_train:]
            ynp_train, ynp_test = yd[:m1_train], yd[m1_train:]
        # Use the arguments passed in, not the notebook-level Xdata / ydata / model.
        modelNN.fit(Xnp_train , ynp_train , epochs=10 , verbose = 0 , shuffle = True , validation_split = 0.2)
        ltrain.append(modelNN.evaluate(Xnp_train , ynp_train , verbose = 0)[1])
        ltest.append(modelNN.evaluate(Xnp_test , ynp_test , verbose = 0)[1])
    return ltrain, ltest