The goal is to predict whether or not a passenger survived based on attributes such as their age, sex, passenger class, where they embarked and so on.

In [1]:
# Load the dataset 
import os
import urllib.request

TITANIC_PATH = os.path.join("datasets", "titanic")
DOWNLOAD_URL = "https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/titanic/"

def fetch_titanic_data(url=DOWNLOAD_URL, path=TITANIC_PATH):
    if not os.path.isdir(path):
        os.makedirs(path)
    for filename in ("train.csv", "test.csv"):
        filepath = os.path.join(path, filename)
        if not os.path.isfile(filepath):
            print("Downloading", filename)
            urllib.request.urlretrieve(url + filename, filepath)

fetch_titanic_data()    

Downloading train.csv
Downloading test.csv


In [2]:
import pandas as pd

def load_titanic_data(filename, titanic_path=TITANIC_PATH):
    csv_path = os.path.join(titanic_path, filename)
    return pd.read_csv(csv_path)

In [4]:
train_set = load_titanic_data('train.csv')
test_set = load_titanic_data('test.csv')

In [5]:
train_set.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
test_set.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [8]:
# Let's set passengerId a index
test_set = test_set.set_index('PassengerId')
train_set = train_set.set_index('PassengerId')

In [9]:
train_set.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


In [11]:
train_set.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699113,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526507,1.102743,0.806057,49.693429
min,0.0,1.0,0.4167,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [13]:
# check for missing value
train_set.isnull().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [18]:
train_set.Survived.value_counts()

0    549
1    342
Name: Survived, dtype: int64

Most of people did not survived.

In [20]:
train_set.Sex.value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [21]:
train_set.Embarked.value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [14]:
# Treate missing value 
# age remplce missing value by mean age
# Cabin remove the column
# Embarker  remove the missing value

In [24]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [29]:
# Now let's build preprocessing for numerical attributes
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scale', StandardScaler()),
])


In [30]:
# Now let's build preprocessing for categorical attributes
from sklearn.preprocessing import OneHotEncoder

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('scale', OneHotEncoder(sparse=False)),
])

In [33]:
# Let's join the numerical and categorical pipelines
from sklearn.compose import ColumnTransformer

num_attributes = ["Age", "SibSp", "Parch", "Fare"]
cat_attributes = ["Pclass", "Sex", "Embarked"]

full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attributes),
    ('cat', cat_pipeline, cat_attributes),
])

In [34]:
# Let's get the label and feature
y_train = train_set.Survived
y_train

PassengerId
1      0
2      1
3      1
4      1
5      0
      ..
887    0
888    1
889    0
890    1
891    0
Name: Survived, Length: 891, dtype: int64

In [35]:
X_train = full_pipeline.fit_transform(train_set[num_attributes + cat_attributes])
X_train

array([[-0.56573582,  0.43279337, -0.47367361, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.6638609 ,  0.43279337, -0.47367361, ...,  1.        ,
         0.        ,  0.        ],
       [-0.25833664, -0.4745452 , -0.47367361, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [-0.10463705,  0.43279337,  2.00893337, ...,  0.        ,
         0.        ,  1.        ],
       [-0.25833664, -0.4745452 , -0.47367361, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.20276213, -0.4745452 , -0.47367361, ...,  0.        ,
         1.        ,  0.        ]])

In [56]:
# Let's use randomforest classifier
from sklearn.ensemble import RandomForestClassifier

forest_clf = RandomForestClassifier()

In [57]:
forest_clf.fit(X_train, y_train)

RandomForestClassifier()

In [73]:
y_predict = forest_clf.predict(X_train)

In [74]:
from sklearn.metrics import accuracy_score

acc = accuracy_score(y_predict, y_train)
acc

0.9797979797979798

In [75]:
# Check the confusion matrix
from sklearn.metrics import confusion_matrix

matrix = confusion_matrix(y_train, y_predict)
matrix

array([[542,   7],
       [ 11, 331]])

In [76]:
# Get the precision score
from sklearn.metrics import precision_score, recall_score, f1_score

precision_score(y_train, y_predict)

0.9792899408284024

In [77]:
# Get the recall score
recall_score(y_train, y_predict)

0.9678362573099415

In [78]:
# Get the f1 score
f1_score(y_train, y_predict)

0.973529411764706

In [80]:
X_test = full_pipeline.fit_transform(test_set)
X_test
test_prediction = forest_clf.predict(X_test)

In [88]:
#test_with_prediction = test_set + test_prediction
test_set.shape 
test_prediction.shape

(418, 10)

In [94]:
test_set['Prediction'] = test_prediction
test_set.to_csv('datasets/titanic/Prediction.csv')

In [87]:
# let's use cross validation
from sklearn.model_selection import cross_val_score

forest_scores = cross_val_score(forest_clf, X_train, y_train, cv=10)
forest_scores

array([0.74444444, 0.80898876, 0.74157303, 0.83146067, 0.88764045,
       0.83146067, 0.79775281, 0.76404494, 0.84269663, 0.85393258])

In [50]:
forest_scores.mean()

0.8159800249687889