# Titanic - Just another approach

### Importing libraries

In [0]:
# Titanic Example
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### Reading files

In [0]:
# Load the Titanic passenger dataset from its CSV file into a DataFrame
titanic = pd.read_csv("titanic(1).csv")
titanic


Unnamed: 0,Passenger Class,Name,Sex,Age,No of Siblings or Spouses on Board,No of Parents or Children on Board,Ticket Number,Passenger Fare,Cabin,Port of Embarkation,Life Boat,Survived
0,First,"Allen, Miss. Elisabeth Walton",Female,29.0,0.0,0.0,24160,211.3,B5,Southampton,2,Yes
1,First,"Allison, Master. Hudson Trevor",Male,0.9,1.0,2.0,113781,151.6,C22 C26,Southampton,11,Yes
2,First,"Allison, Miss. Helen Loraine",Female,2.0,1.0,2.0,113781,151.6,C22 C26,Southampton,,No
3,First,"Allison, Mr. Hudson Joshua Creighton",Male,30.0,1.0,2.0,113781,151.6,C22 C26,Southampton,,No
4,First,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",Female,25.0,1.0,2.0,113781,151.6,C22 C26,Southampton,,No
...,...,...,...,...,...,...,...,...,...,...,...,...
1304,Third,"Zabour, Miss. Hileni",Female,14.5,1.0,0.0,2665,14.5,,Cherbourg,,No
1305,Third,"Zabour, Miss. Thamine",Female,,1.0,0.0,2665,14.5,,Cherbourg,,No
1306,Third,"Zakarian, Mr. Mapriededer",Male,26.5,0.0,0.0,2656,7.2,,Cherbourg,,No
1307,Third,"Zakarian, Mr. Ortin",Male,27.0,0.0,0.0,2670,7.2,,Cherbourg,,No


### Make ETL cleaning
Please think carefully about the impact of row deletion on your dataframe as there may be other ways of fixing missing data

In [0]:
# Let's begin by checking how complete our dataset is: count() reports the number of non-null values per column. Are there missing values?
titanic.count()

# As the results below show, some columns do have missing values.

Passenger Class                       1309
Name                                  1309
Sex                                   1309
Age                                   1046
No of Siblings or Spouses on Board    1309
No of Parents or Children on Board    1309
Ticket Number                         1309
Passenger Fare                        1308
Cabin                                  295
Port of Embarkation                   1307
Life Boat                              486
Survived                              1309
dtype: int64

Let's fix the age issue using a mix of pandas and pure Python code!
Don't expect libraries to solve all your needs. They are tools that make your life easier just when you need them.


In [0]:
# Defining a function that fills missing (null) ages with the mean age of the passenger's class
# Option 1
def add_age(cols):
    """Return a passenger's age, imputing a missing value from the class mean.

    Parameters
    ----------
    cols : pandas.Series or sequence
        Two values, in this order: the passenger's age and the passenger
        class (one row of ``titanic[["Age", "Passenger Class"]]``).

    Returns
    -------
    The age unchanged when it is present; otherwise the mean age of the
    passengers of the same class, read from the global ``titanic`` frame
    and truncated to an int.
    """
    # Read the two fields by position. On a label-indexed row Series, plain
    # integer keys (cols[0]) are a deprecated positional fallback in pandas,
    # so use .iloc when available and ordinary indexing for lists/tuples.
    if hasattr(cols, "iloc"):
        Age, Pclass = cols.iloc[0], cols.iloc[1]
    else:
        Age, Pclass = cols[0], cols[1]

    if pd.isnull(Age):
        same_class = titanic["Passenger Class"] == Pclass
        # mean() skips the missing ages; int() drops the fractional part
        return int(titanic.loc[same_class, "Age"].mean())
    return Age

# Apply add_age row by row (axis=1): each call receives one passenger's age and class
titanic["Age"] = titanic[["Age", "Passenger Class"]].apply(add_age,axis=1)

##########################################
# Option 2
# def add_age2 ( Age, Pclass):
#    if pd.isnull(Age):
#        return int(titanic[titanic["Passenger Class"] == Pclass]["Age"].mean())
#    else:
#        return Age
    
# Calling the function
# titanic["Age"] = add_age2 ( titanic["Age"],titanic["Passenger Class"])
# NOTE(review): as written, this call passes whole columns, so `if pd.isnull(Age)`
# would raise a ValueError; the function would have to be applied per row instead.

# Re-check the non-null counts per column
titanic.count()

Passenger Class                       1309
Name                                  1309
Sex                                   1309
Age                                   1309
No of Siblings or Spouses on Board    1309
No of Parents or Children on Board    1309
Ticket Number                         1309
Passenger Fare                        1308
Cabin                                  295
Port of Embarkation                   1307
Life Boat                              486
Survived                              1309
dtype: int64

We have too many null values in the "Cabin" column (and in "Life Boat"), so we just remove both columns. Notice that we remove the columns, not the rows with nulls in them.


In [0]:
# Remove the two sparsely populated columns and rebind the resulting frame
titanic = titanic.drop(columns=["Cabin", "Life Boat"])

titanic.count()

Passenger Class                       1309
Name                                  1309
Sex                                   1309
Age                                   1309
No of Siblings or Spouses on Board    1309
No of Parents or Children on Board    1309
Ticket Number                         1309
Passenger Fare                        1308
Port of Embarkation                   1307
Survived                              1309
dtype: int64

Finally, we remove the few remaining rows that still contain null values (in "Passenger Fare" or "Port of Embarkation"), since we can't fix them by any means.

In [0]:
# Discard every row that still contains at least one null value
titanic = titanic.dropna()
titanic.count()

Passenger Class                       1306
Name                                  1306
Sex                                   1306
Age                                   1306
No of Siblings or Spouses on Board    1306
No of Parents or Children on Board    1306
Ticket Number                         1306
Passenger Fare                        1306
Port of Embarkation                   1306
Survived                              1306
dtype: int64

### Converting non-numerical data (when possible)

Ok, now that we are done with cleaning the data, let's convert some categorical data into numeric.

In [0]:
# "Sex" column: use pandas' get_dummies to create one indicator column per category
# (one for female, one for male).
# dtype=int keeps the indicators as 0/1; recent pandas versions return booleans by default.
sex = pd.get_dummies(titanic["Sex"], dtype=int)
sex.head()

Unnamed: 0,Female,Male
0,1,0
1,0,1
2,1,0
3,0,1
4,1,0


Let's do the same for "Port of Embarkation" and "Passenger Class"

In [0]:
# One indicator column per port of embarkation.
# dtype=int keeps the indicators as 0/1; recent pandas versions return booleans by default.
embarked = pd.get_dummies(titanic["Port of Embarkation"], dtype=int)
embarked

Unnamed: 0,Cherbourg,Queenstown,Southampton
0,0,0,1
1,0,0,1
2,0,0,1
3,0,0,1
4,0,0,1
...,...,...,...
1304,1,0,0
1305,1,0,0
1306,1,0,0
1307,1,0,0


In [0]:
# One indicator column per passenger class.
# dtype=int keeps the indicators as 0/1; recent pandas versions return booleans by default.
pc_class = pd.get_dummies(titanic["Passenger Class"], dtype=int)
pc_class

Unnamed: 0,First,Second,Third
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0
...,...,...,...
1304,0,0,1
1305,0,0,1
1306,0,0,1
1307,0,0,1


Now we add these new numeric columns to the dataframe

In [0]:
# Append the indicator columns side by side (rows are aligned on the index)
titanic = pd.concat([titanic, pc_class, sex, embarked], axis="columns")
titanic.count()

Passenger Class                       1306
Name                                  1306
Sex                                   1306
Age                                   1306
No of Siblings or Spouses on Board    1306
No of Parents or Children on Board    1306
Ticket Number                         1306
Passenger Fare                        1306
Port of Embarkation                   1306
Survived                              1306
First                                 1306
Second                                1306
Third                                 1306
Female                                1306
Male                                  1306
Cherbourg                             1306
Queenstown                            1306
Southampton                           1306
dtype: int64

In [0]:
# Keep an independent (deep) copy of the DataFrame for the Seaborn plots
sea_titanic = titanic.copy(deep=True)
sea_titanic

Unnamed: 0,Passenger Class,Name,Sex,Age,No of Siblings or Spouses on Board,No of Parents or Children on Board,Ticket Number,Passenger Fare,Port of Embarkation,Survived,First,Second,Third,Female,Male,Cherbourg,Queenstown,Southampton
0,First,"Allen, Miss. Elisabeth Walton",Female,29.0,0.0,0.0,24160,211.3,Southampton,Yes,1,0,0,1,0,0,0,1
1,First,"Allison, Master. Hudson Trevor",Male,0.9,1.0,2.0,113781,151.6,Southampton,Yes,1,0,0,0,1,0,0,1
2,First,"Allison, Miss. Helen Loraine",Female,2.0,1.0,2.0,113781,151.6,Southampton,No,1,0,0,1,0,0,0,1
3,First,"Allison, Mr. Hudson Joshua Creighton",Male,30.0,1.0,2.0,113781,151.6,Southampton,No,1,0,0,0,1,0,0,1
4,First,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",Female,25.0,1.0,2.0,113781,151.6,Southampton,No,1,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,Third,"Zabour, Miss. Hileni",Female,14.5,1.0,0.0,2665,14.5,Cherbourg,No,0,0,1,1,0,1,0,0
1305,Third,"Zabour, Miss. Thamine",Female,24.0,1.0,0.0,2665,14.5,Cherbourg,No,0,0,1,1,0,1,0,0
1306,Third,"Zakarian, Mr. Mapriededer",Male,26.5,0.0,0.0,2656,7.2,Cherbourg,No,0,0,1,0,1,1,0,0
1307,Third,"Zakarian, Mr. Ortin",Male,27.0,0.0,0.0,2670,7.2,Cherbourg,No,0,0,1,0,1,1,0,0


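Finally, we can get rid of the non-numerical columns, together with the numeric "Passenger Fare", which is not used as a feature here. ("Passenger Class" is dropped too; it remains available in the `sea_titanic` copy, which will be used later.)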

In [0]:
# Keep only the columns used by the model (plus the "Survived" target)
titanic = titanic.drop(
    columns=[
        "Passenger Class",
        "Passenger Fare",
        "Name",
        "Sex",
        "Ticket Number",
        "Port of Embarkation",
    ]
)
titanic.head()

Unnamed: 0,Age,No of Siblings or Spouses on Board,No of Parents or Children on Board,Survived,First,Second,Third,Female,Male,Cherbourg,Queenstown,Southampton
0,29.0,0.0,0.0,Yes,1,0,0,1,0,0,0,1
1,0.9,1.0,2.0,Yes,1,0,0,0,1,0,0,1
2,2.0,1.0,2.0,No,1,0,0,1,0,0,0,1
3,30.0,1.0,2.0,No,1,0,0,0,1,0,0,1
4,25.0,1.0,2.0,No,1,0,0,1,0,0,0,1


### Training Process

Now we need train and test subsets to work with. X will contain all the features and y will contain the target variable.

In [0]:
# Features: every column except the target; target: the "Survived" column
y = titanic["Survived"]
X = titanic.drop(columns="Survived")

We will use `train_test_split` from scikit-learn's `model_selection` module to split our data. 70% of the data will be training data and 30% will be testing data.


random_state = 101: The random state ensures that the splits you generate are reproducible. The number itself doesn't matter; the important thing is that every time you use the same number, you get the same split. Scikit-learn uses random permutations to generate the splits. The random state that you provide is used as the seed of the random number generator, which ensures that the random numbers are generated in the same order.

In [0]:
# Convert the feature table to a float NumPy array for Keras.
# np.asarray with an explicit dtype also turns boolean indicator columns into 0.0/1.0,
# and it leaves X unchanged if this cell is re-run on the already-converted array
# (X.values would raise AttributeError on an ndarray).
X = np.asarray(X, dtype=float)
X

array([[29. ,  0. ,  0. , ...,  0. ,  0. ,  1. ],
       [ 0.9,  1. ,  2. , ...,  0. ,  0. ,  1. ],
       [ 2. ,  1. ,  2. , ...,  0. ,  0. ,  1. ],
       ...,
       [26.5,  0. ,  0. , ...,  1. ,  0. ,  0. ],
       [27. ,  0. ,  0. , ...,  1. ,  0. ,  0. ],
       [29. ,  0. ,  0. , ...,  0. ,  0. ,  1. ]])

In [0]:
# All values fed to a deep-learning model (neural network) must be numeric
from sklearn.preprocessing import LabelEncoder
labelEncoder_y = LabelEncoder()

# Encode the "Survived" labels as integers (in the output below, "Yes" rows become 1 and "No" rows 0)
y = labelEncoder_y.fit_transform(titanic["Survived"])
y

array([1, 1, 0, ..., 0, 0, 0])

In [0]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

In [0]:
%tensorflow_version 2.x
import tensorflow.keras as keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [0]:
# Create the model architecture (an empty Sequential model; layers are added in the next cell)
model = Sequential()

In [0]:
# Dense = fully connected to the previous layer; the first argument is the number of neurons.
# The input width is derived from the training matrix so it always follows the feature count.
model.add(Dense(3, input_shape=(X_train.shape[1],), activation="relu"))
model.add(Dense(2, activation="relu"))
# Single sigmoid unit: the output is a value between 0 and 1
model.add(Dense(1, activation="sigmoid"))

In [0]:
# Compile the model: Adam optimizer, binary cross-entropy loss, accuracy as the reported metric
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

In [0]:
# Training: 100 epochs over the training split, in mini-batches of 10 samples
model.fit(x=X_train, y=y_train, epochs=100, batch_size=10)

Train on 914 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100


<tensorflow.python.keras.callbacks.History at 0x7fef2915b588>

In [0]:
# Run the trained model on the held-out test features (the final layer is a single sigmoid unit)
predictions = model.predict(X_test)