# Titanic: Machine Learning from Disaster

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.base import clone
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score

## Importing the dataset

In [2]:
# Load the training and test CSV files from the working directory.
train_dataset, test_dataset = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

## Concatenate both datasets and perform data cleansing

In [3]:
# Stack train and test rows (train first) under a fresh 0..n-1 index, then
# drop the identifier and the label so only feature columns remain.
all_dataset = (
    pd.concat([train_dataset, test_dataset], sort=False)
    .reset_index(drop=True)
    .drop(columns=["PassengerId", "Survived"])
)

In [4]:
# Preview the first rows of the combined feature frame.
all_dataset.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# Count the missing values in each column of the combined frame.
all_dataset.isnull().sum()

Pclass         0
Name           0
Sex            0
Age          263
SibSp          0
Parch          0
Ticket         0
Fare           1
Cabin       1014
Embarked       2
dtype: int64

## Taking care of missing data & Encoding data

In [6]:
def process_name(x):
    """Replace each passenger name with an aggregated title category.

    The raw title is the text between the first comma and the following
    period of the name (e.g. ``"Braund, Mr. Owen Harris"`` -> ``"Mr"``).
    It is then mapped to one of ``Officer``, ``Royalty``, ``Mr``, ``Mrs``,
    ``Miss`` or ``Master``; a title with no mapping becomes NaN.

    The ``Name`` column of ``x`` is overwritten in place and ``x`` is returned.
    """
    # Raw titles listed under the aggregated category they collapse into.
    titles_by_category = {
        "Officer": ("Capt", "Col", "Major", "Dr", "Rev"),
        "Royalty": ("Jonkheer", "Don", "Dona", "Sir", "the Countess", "Lady"),
        "Mrs": ("Mme", "Ms", "Mrs"),
        "Miss": ("Mlle", "Miss"),
        "Mr": ("Mr",),
        "Master": ("Master",),
    }
    category_of = {
        title: category
        for category, titles in titles_by_category.items()
        for title in titles
    }

    def extract_title(full_name):
        # Everything after the first comma starts with the title, which ends at a period.
        after_surname = full_name.split(',')[1]
        return after_surname.partition('.')[0].strip()

    x['Name'] = x['Name'].map(extract_title).map(category_of)

    return x

In [7]:
def process_sex(x):
    """Replace the ``Sex`` column with integer codes from a freshly fitted ``LabelEncoder``.

    The encoder is fitted on the column it transforms. ``x`` is modified in
    place and returned.
    """
    x['Sex'] = LabelEncoder().fit_transform(x['Sex'])

    return x

In [8]:
def process_age(x, n_train=891):
    """Fill missing ``Age`` values with the median age of similar training passengers.

    Passengers are grouped by ``Sex``, ``Pclass`` and ``Name`` (the aggregated
    title). Medians are learnt from the first ``n_train`` rows only, so rows
    beyond that point never influence the imputed values.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame with ``Sex``, ``Pclass``, ``Name`` and a float ``Age`` column.
        Modified in place.
    n_train : int, default 891
        Number of leading rows that form the training set.

    Returns
    -------
    pandas.DataFrame
        The same frame ``x`` with every missing ``Age`` filled. If a row's
        group has no known median in the training rows, the overall training
        median age is used instead.
    """
    group_keys = ['Sex', 'Pclass', 'Name']
    train_rows = x.iloc[:n_train]

    # Compute the medians once instead of once per missing row, and select
    # 'Age' before aggregating: taking the median of the text columns
    # (Ticket, Cabin, ...) raises TypeError on pandas >= 2.0.
    median_by_group = train_rows.groupby(group_keys)['Age'].median().to_dict()
    overall_median = train_rows['Age'].median()

    missing = x['Age'].isna().to_numpy()
    if not missing.any():
        return x

    fills = []
    for key in x.loc[missing, group_keys].itertuples(index=False, name=None):
        group_median = median_by_group.get(key, np.nan)
        # Unseen group, or a group whose training ages are all missing.
        fills.append(overall_median if pd.isna(group_median) else group_median)

    x.loc[missing, 'Age'] = fills

    return x

In [9]:
def process_ticket(x):
    """Reduce each ticket to its textual prefix.

    Dots and slashes are removed from the ticket, which is then split on
    whitespace; the first token that is not purely digits is kept. Tickets
    with no such token (i.e. all-numeric tickets) become ``'XXX'``.

    The ``Ticket`` column of ``x`` is overwritten in place and ``x`` is returned.
    """
    def cleanTicket(ticket):
        tokens = ticket.replace('.', '').replace('/', '').split()
        for token in tokens:
            candidate = token.strip()
            if not candidate.isdigit():
                return candidate
        return 'XXX'

    x['Ticket'] = [cleanTicket(raw_ticket) for raw_ticket in x['Ticket']]

    return x

In [10]:
def process_fare(x, n_train=891):
    """Fill missing ``Fare`` values with the mean fare of the training rows.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame with a numeric ``Fare`` column. Modified in place.
    n_train : int, default 891
        Number of leading rows that form the training set; only these rows
        contribute to the mean.

    Returns
    -------
    pandas.DataFrame
        The same frame ``x`` with missing fares filled.
    """
    train_mean_fare = x.iloc[:n_train]['Fare'].mean()

    # Assign the filled column back explicitly: `x.Fare.fillna(..., inplace=True)`
    # works on an intermediate Series and does not update the frame when
    # pandas Copy-on-Write is enabled.
    x['Fare'] = x['Fare'].fillna(train_mean_fare)

    return x

In [11]:
def process_cabin(x):
    """Reduce each cabin to its leading letter, using ``'U'`` for unknown.

    Missing cabins, and cabins that are empty after stripping whitespace, are
    mapped to ``'U'`` (for Unknown). Every other value is replaced by its
    first non-blank character (e.g. ``'C85'`` -> ``'C'``).

    The ``Cabin`` column of ``x`` is overwritten in place and ``x`` is returned.
    """
    # Assign the result back explicitly: `x.Cabin.fillna('U', inplace=True)`
    # fills an intermediate Series and leaves the frame untouched when pandas
    # Copy-on-Write is enabled, after which indexing a NaN would fail.
    first_letters = x['Cabin'].fillna('U').astype(str).str.strip().str[:1]

    # A blank cabin string has no first letter; treat it as unknown as well.
    x['Cabin'] = first_letters.where(first_letters != '', 'U')

    return x

In [12]:
def process_embarked(x, n_train=891):
    """Fill missing ``Embarked`` values with the most frequent training value.

    The most frequent value is learnt from the first ``n_train`` rows only,
    the same convention ``process_age`` and ``process_fare`` follow, so rows
    beyond the training set do not influence the imputed value. On a tie the
    smallest value in sort order is used.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame with an ``Embarked`` column. Modified in place.
    n_train : int, default 891
        Number of leading rows that form the training set.

    Returns
    -------
    pandas.DataFrame
        The same frame ``x``. If the training rows contain no known value at
        all, the column is left unchanged.
    """
    # Series.mode() ignores NaN and returns its results sorted.
    train_modes = x.iloc[:n_train]['Embarked'].mode()
    if train_modes.empty:
        return x

    x['Embarked'] = x['Embarked'].fillna(train_modes.iloc[0])

    return x

In [13]:
def process_family(x):
    """Add family-size features derived from ``Parch`` and ``SibSp``.

    New columns (added to ``x`` in place, which is also returned):

    - ``FamilySize``: ``Parch + SibSp + 1`` (the passenger is counted too).
    - ``Singleton``: 1 when the family size is exactly 1, else 0.
    - ``SmallFamily``: 1 when the family size is between 2 and 4 inclusive, else 0.
    - ``LargeFamily``: 1 when the family size is 5 or more, else 0.
    """
    family_size = x['Parch'] + x['SibSp'] + 1
    x['FamilySize'] = family_size

    # Boolean masks cast to integers give the 0/1 indicator columns.
    x['Singleton'] = family_size.eq(1).astype('int64')
    x['SmallFamily'] = family_size.between(2, 4).astype('int64')
    x['LargeFamily'] = family_size.ge(5).astype('int64')

    return x

In [14]:
# Cleaning steps in execution order. The order matters: `process_age` groups
# on the `Sex` and `Name` columns, so those two steps must run before it.
cleaning_steps = [
    ('name', process_name),
    ('sex', process_sex),
    ('age', process_age),
    ('ticket', process_ticket),
    ('fare', process_fare),
    ('cabin', process_cabin),
    ('embarked', process_embarked),
    ('family', process_family),
]
process_pipe = Pipeline(steps=[
    (step_name, FunctionTransformer(step_func))
    for step_name, step_func in cleaning_steps
])
all_dataset = process_pipe.fit_transform(all_dataset)

In [15]:
# Preview the first rows after the cleaning pipeline has run.
all_dataset.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FamilySize,Singleton,SmallFamily,LargeFamily
0,3,Mr,1,22.0,1,0,A5,7.25,U,S,2,0,1,0
1,1,Mrs,0,38.0,1,0,PC,71.2833,C,C,2,0,1,0
2,3,Miss,0,26.0,0,0,STONO2,7.925,U,S,1,1,0,0
3,1,Mrs,0,35.0,1,0,XXX,53.1,C,S,2,0,1,0
4,3,Mr,1,35.0,0,0,XXX,8.05,U,S,1,1,0,0


In [16]:
# Re-count missing values per column to check the cleaning steps filled them.
all_dataset.isnull().sum()

Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
FamilySize     0
Singleton      0
SmallFamily    0
LargeFamily    0
dtype: int64

In [17]:
# One-hot encode the categorical columns and pass every other column through
# unchanged. Dense output is requested on the ColumnTransformer
# (`sparse_threshold=0` always returns a dense array) rather than through
# `OneHotEncoder(sparse=False)`, because the `sparse` argument was removed in
# scikit-learn 1.4 while `sparse_threshold` works on old and new releases alike.
ct = ColumnTransformer(
    transformers=[('ohe', OneHotEncoder(), ['Pclass', 'Name', 'Ticket', 'Cabin', 'Embarked'])],
    remainder='passthrough',
    sparse_threshold=0,
)

all_data = ct.fit_transform(all_dataset)

In [18]:
# Print the encoded feature matrix (NumPy abbreviates large arrays with '...').
print(all_data)

[[0. 0. 1. ... 0. 1. 0.]
 [1. 0. 0. ... 0. 1. 0.]
 [0. 0. 1. ... 1. 0. 0.]
 ...
 [0. 0. 1. ... 1. 0. 0.]
 [0. 0. 1. ... 1. 0. 0.]
 [0. 0. 1. ... 0. 1. 0.]]


## Splitting the dataset into the Training set and Test set

In [19]:
def recover_train_test_target_data(x, train_frame=None):
    """Split a combined feature matrix back into train/test parts and fetch the labels.

    Parameters
    ----------
    x : array-like
        Combined feature matrix whose leading rows are the training rows,
        followed by the test rows.
    train_frame : pandas.DataFrame, optional
        Original training frame containing the ``Survived`` column. Defaults
        to the notebook-level ``train_dataset``.

    Returns
    -------
    tuple
        ``(train_data, test_data, targets)`` where ``train_data`` holds the
        first ``len(train_frame)`` rows of ``x``, ``test_data`` the remaining
        rows, and ``targets`` the ``Survived`` values as a NumPy array.
    """
    if train_frame is None:
        train_frame = train_dataset

    # Split the matrix that was passed in (previously the argument was ignored
    # and the global `all_data` was split), at the real size of the training
    # set rather than a hard-coded row count.
    train_data, test_data = np.split(x, [len(train_frame)])
    targets = train_frame['Survived'].to_numpy()

    return train_data, test_data, targets

In [20]:
# Recover the train/test feature matrices and the training labels.
train_data, test_data, targets = recover_train_test_target_data(all_data)

## Training the model

In [21]:
# Shallow random forest; every tree is grown on the full training set
# (bootstrap disabled) and the seed is fixed for reproducibility.
classifier = RandomForestClassifier(
    n_estimators=100,
    max_depth=3,
    max_features='sqrt',
    min_samples_split=5,
    min_samples_leaf=1,
    bootstrap=False,
    random_state=2,
    n_jobs=-1,
)

# Standardise the features, then classify.
pipe = Pipeline(steps=[('scaler', StandardScaler()), ('classifier', classifier)])

pipe.fit(train_data, targets)

Pipeline(steps=[('scaler', StandardScaler()),
                ('classifier',
                 RandomForestClassifier(bootstrap=False, max_depth=3,
                                        max_features='sqrt',
                                        min_samples_split=5, n_jobs=-1,
                                        random_state=2))])

## Predicting the Test set results

In [22]:
# Predict labels for the test rows with the fitted pipeline.
pred = pipe.predict(test_data)

## Save results to submission.csv

In [23]:
# Pair each test passenger id with its predicted label, write the result to
# submission.csv (without the index column) and preview it.
submission = test_dataset[['PassengerId']].assign(Survived=pred)
submission.to_csv('submission.csv', index=False, encoding='utf-8')
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


## Accuracy Score

In [24]:
# Hold out 20% of the labelled rows to estimate accuracy.
X_train, X_test, y_train, y_test = train_test_split(train_data, targets, test_size = 0.2, random_state = 0)

# Evaluate an unfitted copy of the whole pipeline. Calling `classifier.fit`
# directly would refit the very estimator object stored inside the fitted
# `pipe` (on unscaled data and only 80% of the rows), silently changing the
# model behind the submission, and would score a model without the scaler.
eval_pipe = clone(pipe)
eval_pipe.fit(X_train, y_train)

y_pred = eval_pipe.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
print(cm)

accuracy_score(y_test, y_pred)

[[96 14]
 [18 51]]


0.8212290502793296