In [1]:
import os
import re
import timeit

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from xgboost import XGBClassifier

In [2]:
# Load the processed training data and drop the leftover 'Unnamed: 0' column.
train_csv_path = os.path.join('titanic/processed', 'train.csv')
df = pd.read_csv(train_csv_path).drop(columns='Unnamed: 0')
# test = pd.read_csv(os.path.join('processed', 'test.csv'))
# del test['Unnamed: 0']

# XGBoost doesn't allow certain characters in column names, so every
# '[', ']' and '<' in a column label is replaced by '_'.
regex = re.compile(r"\[|\]|<", re.IGNORECASE)
sanitized_columns = []
for col in df.columns.values:
    # Only labels whose text contains a forbidden character go through the regex.
    has_forbidden_char = any(ch in str(col) for ch in ('[', ']', '<'))
    sanitized_columns.append(regex.sub("_", col) if has_forbidden_char else col)
df.columns = sanitized_columns

In [3]:
# df.columns

In [4]:
# Column 0 is used as the label vector; every remaining column is a feature.
y = df.iloc[:, 0]
X = df.iloc[:, 1:]

In [5]:
# Hold out 30% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in both splits, and the fixed random_state makes the
# split repeatable across runs.
# train_test_split is imported with the other dependencies in the top import cell.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.3,
    random_state=5,
    stratify=y,
)

## Score function

In [16]:
def accuracy_score(y_pred, y_test):
    """Print and return the fraction of predictions that equal the true labels.

    Args:
        y_pred: Iterable of predicted labels.
        y_test: Iterable of true labels, in the same order as ``y_pred``.

    Returns:
        float: Number of positions where the two labels are equal, divided by
        the number of predictions. The same value is also printed.

    Raises:
        ValueError: If the two inputs differ in length, or if they are empty
            (accuracy is undefined without any predictions).
    """
    # Materialise both inputs as lists so they are compared strictly by position.
    predicted = list(y_pred)
    actual = list(y_test)

    if len(predicted) != len(actual):
        raise ValueError(
            "y_pred and y_test must have the same length, got "
            f"{len(predicted)} and {len(actual)}"
        )
    if not predicted:
        raise ValueError("accuracy is undefined for empty input")

    # Count with plain Python ints so the result is always a built-in float.
    matches = sum(1 for p, t in zip(predicted, actual) if p == t)
    accuracy = matches / len(predicted)
    print(accuracy)
    return accuracy

## DecisionTreeClassifier

In [6]:
# Time a single decision tree: fit on the training split, score on the held-out split.
start = timeit.default_timer()

# Seed the tree: scikit-learn permutes the candidate features at each split, so
# an unseeded tree can score differently from run to run. 0 matches the seed
# used for the ensemble models in this notebook.
dt = DecisionTreeClassifier(random_state=0)
dt.fit(X_train, y_train)
print("Accuracy: "+ str(round(dt.score(X_test, y_test), 2)))

stop = timeit.default_timer()
print('Time: ', round(stop - start, 2))

Accuracy: 0.78
Time:  0.01


## BaggingClassifier

In [7]:
# Time fitting and scoring a bagging ensemble of 100 bootstrapped decision trees.
start = timeit.default_timer()

bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=100,
    bootstrap=True,
    random_state=0,
)
bag_clf.fit(X_train, y_train)
bag_accuracy = round(bag_clf.score(X_test, y_test), 3)
print(f"Accuracy: {bag_accuracy}")

stop = timeit.default_timer()
print('Time: ', round(stop - start, 3))

Accuracy: 0.825
Time:  0.252


In [17]:
# Recompute the bagging accuracy from raw predictions with the helper defined
# earlier in the notebook.
y_pred = bag_clf.predict(X_test)
accuracy_score(y_pred=y_pred, y_test=y_test)

0.8246268656716418


## Random forest

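In principle a random forest trains a bit faster than plain bagging because each split considers only a random subset of the features rather than all of them. In the run recorded in this notebook it was nevertheless slower than the bagging classifier, so treat these single-run timings as rough.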

In [20]:
# Time fitting and scoring a 100-tree random forest with a fixed seed.
start = timeit.default_timer()

rf = RandomForestClassifier(random_state=0, n_estimators=100)
rf.fit(X_train, y_train)
rf_accuracy = rf.score(X_test, y_test)
print("Accuracy: {}".format(round(rf_accuracy, 3)))

stop = timeit.default_timer()
print('Time: ', round(stop - start, 3))

Accuracy: 0.836
Time:  0.697


## XGBoost

Although boosting builds its trees sequentially rather than in parallel, XGBoost is quite a bit quicker than the random forest.

In [21]:
# Time fitting and scoring an XGBoost classifier with 100 boosting rounds.
# NOTE(review): 'mlogloss' is XGBoost's multi-class log-loss metric; if the
# target is binary, 'logloss' is presumably what was intended — confirm.
# No eval_set is passed to fit() in this cell.
start = timeit.default_timer()

xgb = XGBClassifier(
    n_estimators=100,
    eval_metric='mlogloss',
    use_label_encoder=False,
)
xgb.fit(X_train, y_train)
xgb_accuracy = round(xgb.score(X_test, y_test), 3)
print(f"Accuracy: {xgb_accuracy}")

stop = timeit.default_timer()
print('Time: ', round(stop - start, 3))

Accuracy: 0.832
Time:  0.25


Note that XGBoost always produces the same score, whereas the random forest's score varies from run to run unless `random_state` is fixed (as it is in the cell above).

Inspiration notebooks:
- https://www.kaggle.com/dmilla/introduction-to-decision-trees-titanic-dataset
- https://www.kaggle.com/simulacra/titanic-with-xgboost
- https://www.kaggle.com/zlatankr/titanic-random-forest-82-78
- This one seems to get better results; use it to improve the model: https://towardsdatascience.com/predicting-the-survival-of-titanic-passengers-30870ccc7e8
