# Titanic - Just another approach

### Importing libraries

In [19]:
# Titanic Example
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### Reading files

In [20]:
# Load the Titanic passenger dataset from its CSV file into a DataFrame
titanic = pd.read_csv(filepath_or_buffer="../Datasources/Titanic/titanic.csv")
titanic


Unnamed: 0,Passenger Class,Name,Sex,Age,No of Siblings or Spouses on Board,No of Parents or Children on Board,Ticket Number,Passenger Fare,Cabin,Port of Embarkation,Life Boat,Survived
0,First,"Allen, Miss. Elisabeth Walton",Female,29.0,0.0,0.0,24160,211.3,B5,Southampton,2,Yes
1,First,"Allison, Master. Hudson Trevor",Male,0.9,1.0,2.0,113781,151.6,C22 C26,Southampton,11,Yes
2,First,"Allison, Miss. Helen Loraine",Female,2.0,1.0,2.0,113781,151.6,C22 C26,Southampton,,No
3,First,"Allison, Mr. Hudson Joshua Creighton",Male,30.0,1.0,2.0,113781,151.6,C22 C26,Southampton,,No
4,First,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",Female,25.0,1.0,2.0,113781,151.6,C22 C26,Southampton,,No
...,...,...,...,...,...,...,...,...,...,...,...,...
1304,Third,"Zabour, Miss. Hileni",Female,14.5,1.0,0.0,2665,14.5,,Cherbourg,,No
1305,Third,"Zabour, Miss. Thamine",Female,,1.0,0.0,2665,14.5,,Cherbourg,,No
1306,Third,"Zakarian, Mr. Mapriededer",Male,26.5,0.0,0.0,2656,7.2,,Cherbourg,,No
1307,Third,"Zakarian, Mr. Ortin",Male,27.0,0.0,0.0,2670,7.2,,Cherbourg,,No


### Make ETL cleaning
Please think carefully about the impact of row deletion on your dataframe as there may be other ways of fixing missing data

In [21]:
# Start by checking how "good" the dataset is: count() reports the number of
# non-null values in each column. Are there missing values?
titanic.count(axis="index")

# As the results below show, some columns do have missing values.

Passenger Class                       1309
Name                                  1309
Sex                                   1309
Age                                   1046
No of Siblings or Spouses on Board    1309
No of Parents or Children on Board    1309
Ticket Number                         1309
Passenger Fare                        1308
Cabin                                  295
Port of Embarkation                   1307
Life Boat                              486
Survived                              1309
dtype: int64

Let's fix the missing ages using a mix of pandas and pure Python code!
Don't expect libraries to solve all your needs. They are tools to make your life easier when you need them.


In [22]:
# Define a function that fills missing (null) ages with the mean age of the
# passenger's class.
# Option 1
def add_age(cols, passengers=None):
    """Return the row's age, imputing it from the class mean when it is null.

    Parameters
    ----------
    cols : sequence or pandas.Series
        Holds the age first and the passenger class second, e.g. one row of
        ``titanic[["Age", "Passenger Class"]]``.
    passengers : pandas.DataFrame, optional
        Frame with "Age" and "Passenger Class" columns from which the class
        mean is computed. Defaults to the notebook-level ``titanic`` frame.

    Returns
    -------
    The original age when it is present. Otherwise the mean age of the row's
    passenger class, truncated to an ``int``. If that class has no known age,
    the null age is returned unchanged.
    """
    # Read the two values by position through iteration: integer keys on a
    # string-labelled Series (cols[0]) are deprecated positional indexing.
    values = list(cols)
    age, pclass = values[0], values[1]

    if not pd.isnull(age):
        return age

    if passengers is None:
        passengers = titanic

    same_class = passengers["Passenger Class"] == pclass
    class_mean = passengers.loc[same_class, "Age"].mean()

    # An empty or all-null class yields NaN, which int() cannot convert.
    if pd.isnull(class_mean):
        return age
    return int(class_mean)

# Run add_age on every (Age, Passenger Class) row to fill in the missing ages
titanic["Age"] = titanic.loc[:, ["Age", "Passenger Class"]].apply(add_age, axis="columns")

##########################################
# Option 2 (kept commented out)
# NOTE(review): not runnable as written - the def line has no trailing colon,
# and the call passes whole columns where the body tests a single value.
# def add_age2 ( Age, Pclass)
#    if pd.isnull(Age):
#        return int(titanic[titanic["Passenger Class"] == Pclass]["Age"].mean())
#    else:
#        return Age

# Calling the function
# titanic["Age"] = add_age2 ( titanic["Age"],titanic["Passenger Class"])

# Non-null values per column after the fill
titanic.count(axis="index")

Passenger Class                       1309
Name                                  1309
Sex                                   1309
Age                                   1309
No of Siblings or Spouses on Board    1309
No of Parents or Children on Board    1309
Ticket Number                         1309
Passenger Fare                        1308
Cabin                                  295
Port of Embarkation                   1307
Life Boat                              486
Survived                              1309
dtype: int64

We have too many null values in the Cabin and Life Boat columns, so we just remove them. Notice that we remove the columns, not the rows with nulls in those columns.


In [23]:
# Remove the "Cabin" and "Life Boat" columns (whole columns, not rows)
titanic.drop(columns=["Cabin", "Life Boat"], inplace=True)

titanic.count(axis="index")

Passenger Class                       1309
Name                                  1309
Sex                                   1309
Age                                   1309
No of Siblings or Spouses on Board    1309
No of Parents or Children on Board    1309
Ticket Number                         1309
Passenger Fare                        1308
Port of Embarkation                   1307
Survived                              1309
dtype: int64

Again, we remove some rows with null values that we can't fix by any means

In [24]:
# Drop every row that still holds at least one null value
titanic.dropna(axis="index", how="any", inplace=True)
titanic.count(axis="index")

Passenger Class                       1306
Name                                  1306
Sex                                   1306
Age                                   1306
No of Siblings or Spouses on Board    1306
No of Parents or Children on Board    1306
Ticket Number                         1306
Passenger Fare                        1306
Port of Embarkation                   1306
Survived                              1306
dtype: int64

### Converting non-numerical data (when possible)

Ok, now that we are done with cleaning the data, let's convert some categorical data into numeric.

In [25]:
# "Sex" column: pandas' get_dummies builds one indicator column per category,
# i.e. one for female and one for male.
sex = pd.get_dummies(data=titanic["Sex"])
sex.head(5)

Unnamed: 0,Female,Male
0,1,0
1,0,1
2,1,0
3,0,1
4,1,0


Let's do the same for "Port of Embarkation" and "Passenger Class"

In [26]:
# One indicator column per port of embarkation
embarked = pd.get_dummies(data=titanic["Port of Embarkation"])
embarked

Unnamed: 0,Cherbourg,Queenstown,Southampton
0,0,0,1
1,0,0,1
2,0,0,1
3,0,0,1
4,0,0,1
...,...,...,...
1304,1,0,0
1305,1,0,0
1306,1,0,0
1307,1,0,0


In [27]:
# One indicator column per passenger class
pc_class = pd.get_dummies(data=titanic["Passenger Class"])
pc_class

Unnamed: 0,First,Second,Third
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0
...,...,...,...
1304,0,0,1
1305,0,0,1
1306,0,0,1
1307,0,0,1


Now we add these new numeric columns to the DataFrame.

In [28]:
# Append the indicator columns side by side to the original columns
titanic = pd.concat(objs=[titanic, pc_class, sex, embarked], axis="columns")
titanic.count(axis="index")

Passenger Class                       1306
Name                                  1306
Sex                                   1306
Age                                   1306
No of Siblings or Spouses on Board    1306
No of Parents or Children on Board    1306
Ticket Number                         1306
Passenger Fare                        1306
Port of Embarkation                   1306
Survived                              1306
First                                 1306
Second                                1306
Third                                 1306
Female                                1306
Male                                  1306
Cherbourg                             1306
Queenstown                            1306
Southampton                           1306
dtype: int64

In [29]:
# Keep an independent copy of the DataFrame for the Seaborn plots
sea_titanic = titanic.copy(deep=True)
sea_titanic

Unnamed: 0,Passenger Class,Name,Sex,Age,No of Siblings or Spouses on Board,No of Parents or Children on Board,Ticket Number,Passenger Fare,Port of Embarkation,Survived,First,Second,Third,Female,Male,Cherbourg,Queenstown,Southampton
0,First,"Allen, Miss. Elisabeth Walton",Female,29.0,0.0,0.0,24160,211.3,Southampton,Yes,1,0,0,1,0,0,0,1
1,First,"Allison, Master. Hudson Trevor",Male,0.9,1.0,2.0,113781,151.6,Southampton,Yes,1,0,0,0,1,0,0,1
2,First,"Allison, Miss. Helen Loraine",Female,2.0,1.0,2.0,113781,151.6,Southampton,No,1,0,0,1,0,0,0,1
3,First,"Allison, Mr. Hudson Joshua Creighton",Male,30.0,1.0,2.0,113781,151.6,Southampton,No,1,0,0,0,1,0,0,1
4,First,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",Female,25.0,1.0,2.0,113781,151.6,Southampton,No,1,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,Third,"Zabour, Miss. Hileni",Female,14.5,1.0,0.0,2665,14.5,Cherbourg,No,0,0,1,1,0,1,0,0
1305,Third,"Zabour, Miss. Thamine",Female,24.0,1.0,0.0,2665,14.5,Cherbourg,No,0,0,1,1,0,1,0,0
1306,Third,"Zakarian, Mr. Mapriededer",Male,26.5,0.0,0.0,2656,7.2,Cherbourg,No,0,0,1,0,1,1,0,0
1307,Third,"Zakarian, Mr. Ortin",Male,27.0,0.0,0.0,2670,7.2,Cherbourg,No,0,0,1,0,1,1,0,0


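Finally, we can get rid of the non-numerical columns, along with "Passenger Fare". "Passenger Class" is dropped here as well; it remains available in the sea_titanic copy, which will be used later for the plots.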

In [31]:
# Remove the columns that will not be used as model features
titanic.drop(
    columns=[
        "Passenger Class",
        "Passenger Fare",
        "Name",
        "Sex",
        "Ticket Number",
        "Port of Embarkation",
    ],
    inplace=True,
)
titanic.head(5)

Unnamed: 0,Age,No of Siblings or Spouses on Board,No of Parents or Children on Board,Survived,First,Second,Third,Female,Male,Cherbourg,Queenstown,Southampton
0,29.0,0.0,0.0,Yes,1,0,0,1,0,0,0,1
1,0.9,1.0,2.0,Yes,1,0,0,0,1,0,0,1
2,2.0,1.0,2.0,No,1,0,0,1,0,0,0,1
3,30.0,1.0,2.0,No,1,0,0,0,1,0,0,1
4,25.0,1.0,2.0,No,1,0,0,1,0,0,0,1


### Training Process

Now we need train and test subsets to work with. X will contain all the features and y will contain the target variable.

In [32]:
# Features: every column except the target. Target: the "Survived" column.
X = titanic.drop(columns="Survived")
y = titanic.loc[:, "Survived"]

We will use train_test_split from scikit-learn's model_selection module to split our data. 70% of the data will be training data and 30% will be testing data.


random_state = 101: Random state ensures that the splits that you generate are reproducible. The number doesn't matter, the important thing is that everytime you use the same number, you will always get the same output the first time you make the split. Scikit-learn uses random permutations to generate the splits. The random state that you provide is used as a seed to the random number generator. This ensures that the random numbers are generated in the same order.

In [33]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

Let's use logistic regression to train the model

In [34]:
# With the default settings the solver does not converge: it stops at its
# iteration limit and emits the warning shown below.
# Look at the warning message.
from sklearn.linear_model import LogisticRegression

logmodel = LogisticRegression()
logmodel.fit(X=X_train, y=y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [35]:
# Allowing more iterations (max_iter=1000) lets the fit finish without the warning
from sklearn.linear_model import LogisticRegression

logmodel = LogisticRegression(max_iter=1000)
logmodel.fit(X=X_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

Let’s see how accurate is our model for predictions:

In [36]:
from sklearn.metrics import classification_report

# Predict the held-out rows and summarise precision/recall/F1 per class
predictions = logmodel.predict(X=X_test)
print(classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

          No       0.85      0.85      0.85       247
         Yes       0.74      0.74      0.74       145

    accuracy                           0.81       392
   macro avg       0.80      0.80      0.80       392
weighted avg       0.81      0.81      0.81       392



We got 81% accuracy with 0.74 recall for the survivors (the "Yes" class), not bad. Let’s see the confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Rows are the true labels, columns the predicted labels (scikit-learn convention)
confusion_matrix(y_true=y_test, y_pred=predictions)

In the above array, taking "Yes" (survived) as the positive class, we can see: <br/>
<b>True positive:</b> 107 (We predicted a positive result and it was positive) <br/>
<b>True negative:</b> 210 (We predicted a negative result and it was negative) <br/>
<b>False positive:</b> 37 (We predicted a positive result and it was negative) <br/>
<b>False negative:</b> 38 (We predicted a negative result and it was positive)  <br/>  

### Let's see some data in graphics using seaborn

How many people survived depending on the class they were from

In [None]:
# Count of survivors / non-survivors within each passenger class
ax = sns.countplot(data=sea_titanic, x='Passenger Class', hue='Survived', palette='Set1')
ax.set_title('Passenger status (Survived/Died) against Passenger Class')
ax.set_xlabel('Passenger Class')
ax.set_ylabel('Total')
plt.show()

How many people lived or died according to their sex

In [None]:
# Sex-by-outcome contingency table, followed by the same counts as a bar chart
print(pd.crosstab(index=sea_titanic["Sex"], columns=sea_titanic["Survived"]))

ax = sns.countplot(data=sea_titanic, x='Sex', hue='Survived', palette='Set1')
ax.set_title('Total Survivors According to Sex')
ax.set_xlabel('Sex')
ax.set_ylabel('Total')
plt.show()

Survivors depending on age

In [None]:

# Age-by-outcome table summed over all ages: the total per survival outcome
print(pd.crosstab(index=sea_titanic["Age"], columns=sea_titanic["Survived"]).sum())

# Split the Age column into intervals and map each interval to its category
# (Children, Teens, Adult, Old)
interval = (0, 18, 35, 60, 120)
categories = ['Children', 'Teens', 'Adult', 'Old']
sea_titanic['Age_cats'] = pd.cut(x=sea_titanic["Age"], bins=interval, labels=categories)

ax = sns.countplot(data=sea_titanic, x='Age_cats', hue='Survived', palette='Set1')

ax.set_title("Age Categorical Survival Distribution")
ax.set_xlabel('Age Categorical')
ax.set_ylabel('Total')

plt.show()

In [None]:
# ........