In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline

titanic = pd.read_csv('data/titanic.csv')
titanic.head()

test_data = pd.read_csv('data/test.csv')

## Cleaning continuous variables

In [2]:
# Checking for null values
titanic.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [3]:
titanic['Age'].fillna(titanic['Age'].mean(), inplace=True)
test_data['Age'].fillna(test_data['Age'].mean(), inplace=True)
titanic.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,29.699118,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [4]:
# Creating a family count column
titanic['Family_cnt'] = titanic['SibSp'] + titanic['Parch']
titanic['Family_cnt']

test_data['Family_cnt'] = test_data['SibSp'] + test_data['Parch']

In [5]:
# Dropping unnecessary variables
titanic.drop(['PassengerId', 'SibSp', 'Parch'], axis=1, inplace=True)
test_data.drop(['SibSp', 'Parch'], axis=1, inplace=True)

## Cleaning categorical variables

In [6]:
# Seeing null vaues
titanic.isnull().sum()

Survived        0
Pclass          0
Name            0
Sex             0
Age             0
Ticket          0
Fare            0
Cabin         687
Embarked        2
Family_cnt      0
dtype: int64

In [7]:
# Seeing the survival rate if someone has a cabin or not
titanic.groupby(titanic['Cabin'].isnull())['Survived'].mean()


Cabin
False    0.666667
True     0.299854
Name: Survived, dtype: float64

In [8]:
# Seeing null vaues
test_data.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
Ticket           0
Fare             1
Cabin          327
Embarked         0
Family_cnt       0
dtype: int64

In [9]:
# Where method to change who has a cabin to 1 and who doesn't as 2
titanic['Cabin_ind'] = np.where(titanic['Cabin'].isnull(), 0, 1)
titanic.head()

test_data['Cabin_ind'] = np.where(test_data['Cabin'].isnull(), 0, 1)
test_data['Fare'].fillna(test_data['Fare'].mean(), inplace=True)

In [10]:
# Seeing null vaues
test_data.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
Ticket           0
Fare             0
Cabin          327
Embarked         0
Family_cnt       0
Cabin_ind        0
dtype: int64

In [11]:
# Convert gender to numbers
gender_num = {'male': 0, 'female': 1}

titanic['Sex'] = titanic['Sex'].map(gender_num)
titanic.head()

test_data['Sex'] = test_data['Sex'].map(gender_num)

In [12]:
# Dropping unnecessary variables
titanic.drop(['Cabin', 'Embarked', 'Name', 'Ticket'], axis=1, inplace=True)
titanic.head()

test_data.drop(['Cabin', 'Embarked', 'Name', 'Ticket'], axis=1, inplace=True)

In [13]:
# Saving to csv file
titanic.to_csv('data/titanic_cleaned.csv', index=False)

test_data.to_csv('data/test_data_cleaned.csv', index=False)

# Split data into train, validation, and test set

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split

titanic = pd.read_csv('data/titanic_cleaned.csv')
titanic.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Family_cnt,Cabin_ind
0,0,3,0,22.0,7.25,1,0
1,1,1,1,38.0,71.2833,1,1
2,1,3,1,26.0,7.925,0,0
3,1,1,1,35.0,53.1,1,1
4,0,3,0,35.0,8.05,0,0


In [15]:
features = titanic.drop('Survived', axis=1)
labels = titanic['Survived']

# Splitting for test set
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.4, random_state=42)
# Splitting for a validation set
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [16]:
# Test to see that the split happened corrently

for dataset in [y_train, y_val, y_test]:
    print(round(len(dataset) / len(labels), 2))

0.6
0.2
0.2


In [17]:
X_train.to_csv('train_features.csv', index=False)
X_val.to_csv('val_features.csv', index=False)
X_test.to_csv('test_features.csv', index=False)

y_train.to_csv('train_labels.csv', index=False)
y_val.to_csv('val_labels.csv', index=False)
y_test.to_csv('test_labels.csv', index=False)

  """
  
  import sys


# Random Forest


In [18]:
import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)

tr_features = pd.read_csv('data/train_features.csv')
tr_labels = pd.read_csv('data/train_labels.csv', header=None)

## Hyperparameter tuning

In [19]:
# Function to print out parameters
def print_results(results):
    print('BEST PARAMS: {}\n'.format(results.best_params_))

    means = results.cv_results_['mean_test_score']
    stds = results.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, results.cv_results_['params']):
        print('{} (+/-{}) for {}'.format(round(mean, 3), round(std * 2, 3), params))

In [20]:
rf = RandomForestClassifier()
parameters = {
    'n_estimators': [5, 50, 250],
    'max_depth': [2, 4, 8, 16, 32, None]
}

cv = GridSearchCV(rf, parameters, cv=5)
cv.fit(tr_features, tr_labels.values.ravel())

print_results(cv)

BEST PARAMS: {'max_depth': 16, 'n_estimators': 50}

0.779 (+/-0.112) for {'max_depth': 2, 'n_estimators': 5}
0.787 (+/-0.124) for {'max_depth': 2, 'n_estimators': 50}
0.801 (+/-0.127) for {'max_depth': 2, 'n_estimators': 250}
0.807 (+/-0.098) for {'max_depth': 4, 'n_estimators': 5}
0.82 (+/-0.123) for {'max_depth': 4, 'n_estimators': 50}
0.826 (+/-0.101) for {'max_depth': 4, 'n_estimators': 250}
0.807 (+/-0.027) for {'max_depth': 8, 'n_estimators': 5}
0.818 (+/-0.072) for {'max_depth': 8, 'n_estimators': 50}
0.813 (+/-0.068) for {'max_depth': 8, 'n_estimators': 250}
0.798 (+/-0.037) for {'max_depth': 16, 'n_estimators': 5}
0.828 (+/-0.036) for {'max_depth': 16, 'n_estimators': 50}
0.809 (+/-0.035) for {'max_depth': 16, 'n_estimators': 250}
0.787 (+/-0.054) for {'max_depth': 32, 'n_estimators': 5}
0.811 (+/-0.05) for {'max_depth': 32, 'n_estimators': 50}
0.811 (+/-0.04) for {'max_depth': 32, 'n_estimators': 250}
0.807 (+/-0.022) for {'max_depth': None, 'n_estimators': 5}
0.813 (+/-0.027

In [21]:
# Find the best parameters
cv.best_estimator_


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=16, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [22]:
# Save model

joblib.dump(cv.best_estimator_, 'models/RF_model.pkl')

['models/RF_model.pkl']

# Evaluate model

In [23]:
import joblib
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score
from time import time

val_features = pd.read_csv('data/val_features.csv')
val_labels = pd.read_csv('data/val_labels.csv', header=None)

te_features = pd.read_csv('data/test_features.csv')
te_labels = pd.read_csv('data/test_labels.csv', header=None)

In [24]:
models = {}

for mdl in ['RF']:
    models[mdl] = joblib.load('models/{}_model.pkl'.format(mdl))

In [25]:
models

{'RF': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                        max_depth=16, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, n_estimators=50,
                        n_jobs=None, oob_score=False, random_state=None,
                        verbose=0, warm_start=False)}

In [26]:
def evaluate_model(name, model, features, labels):
    start = time()
    pred = model.predict(features)
    end = time()
    accuracy = round(accuracy_score(labels, pred), 3)
    precision = round(precision_score(labels, pred), 3)
    recall = round(recall_score(labels, pred), 3)
    print('{} -- Accuracy: {} / Precision: {} / Recall: {} / Latency: {}ms'.format(name,
                                                                                   accuracy,
                                                                                   precision,
                                                                                   recall,
                                                                                   round((end - start)*1000, 1)))

In [27]:
for name, mdl in models.items():
    evaluate_model(name, mdl, val_features, val_labels)

RF -- Accuracy: 0.787 / Precision: 0.729 / Recall: 0.662 / Latency: 71.9ms


In [28]:
evaluate_model('Random Forest', models['RF'], te_features, te_labels)

Random Forest -- Accuracy: 0.821 / Precision: 0.833 / Recall: 0.724 / Latency: 70.9ms


# Prediction

In [29]:
test_data = pd.read_csv('data/test_data_cleaned.csv')
test_data

Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare,Family_cnt,Cabin_ind
0,892,3,0,34.50000,7.8292,0,0
1,893,3,1,47.00000,7.0000,1,0
2,894,2,0,62.00000,9.6875,0,0
3,895,3,0,27.00000,8.6625,0,0
4,896,3,1,22.00000,12.2875,2,0
...,...,...,...,...,...,...,...
413,1305,3,0,30.27259,8.0500,0,0
414,1306,1,1,39.00000,108.9000,0,1
415,1307,3,0,38.50000,7.2500,0,0
416,1308,3,0,30.27259,8.0500,0,0


In [30]:
# Make predictions with the hypertuned model

features = ['Pclass','Sex','Age','Fare','Family_cnt','Cabin_ind']
X_test = pd.get_dummies(test_data[features])


predictions = models['RF'].predict(X_test)

In [31]:

output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [32]:
output.to_csv('submission.csv', index=False)