### Objective

Use machine learning to create a model that predicts which passengers survived the Titanic shipwreck by using passenger data (ie name, age, gender, socio-economic class, etc).

Competition Link: https://www.kaggle.com/c/titanic

In [1]:
import pandas as pd

import numpy as np

import warnings
warnings.simplefilter('ignore', FutureWarning)

# Data Pre-Processing

In [2]:
# Load training data
train_data = pd.read_csv("data/train.csv")
train_data 

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [3]:
# Load testing data
test_data = pd.read_csv("data/test.csv")
test_data

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [4]:
df_titanic = pd.DataFrame(train_data)
df_titanic.describe().columns

Index(['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object')

In [5]:
from sklearn.ensemble import RandomForestClassifier

# Assign X (data) and y (target)
y = train_data["Survived"]
features = ["Pclass", "Sex", "SibSp", "Parch"]


In [6]:
# Split into Training and Testing data
X = pd.get_dummies(train_data[features])
X_test = pd.get_dummies(test_data[features])

# Create and Train(Fit) Model

### Classifier

In [9]:
# Create a Logistic Regression Model
from sklearn.linear_model import LogisticRegression
classifier1 = LogisticRegression()

# Fit (train) or model - First Classifier
classifier1.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [10]:
print(f"Training Data Score: {classifier1.score(X, y)}")


Training Data Score: 0.8002244668911336


# Hyperparameter Tuning - GridSearchCV 

In [11]:
# Source Link: https://towardsdatascience.com/logistic-regression-model-tuning-with-scikit-learn-part-1-425142e01af5

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

param_grid = {
     'penalty' : ['l1', 'l2'],
    'C': [1, 5, 10, 50]}

grid = GridSearchCV(classifier1, param_grid = param_grid, cv = 5, verbose=3, n_jobs=-1)


In [12]:
# Fit the model

grid.fit(X, y)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    3.1s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'C': [1, 5, 10, 50], 'penalty': ['l1', 'l2']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

## Quantify our Trained Model

In [13]:
# List the best parameters for this dataset
# List the best score

print(grid.best_params_)
print('Best Grid score: %.3f' % grid.best_score_)

{'C': 5, 'penalty': 'l1'}
Best Grid score: 0.800


In [14]:
# Make predictions with the hypertuned model
predictions = grid.predict(X_test)


# Make predictions



In [15]:
# # Logistic Regression

# pd.DataFrame({"Prediction": predictions, "Actual": y_test})
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


#  Save the Predictions

In [16]:
output.to_csv('submission.csv', index=False)
