# What is this notebook?

This notebook is my submission for the classic *Titanic* competition on Kaggle:

https://www.kaggle.com/c/titanic

Not doing this to change the world. Doing this to learn how Kaggle submissions work.

Each cell should be totally self-contained.

## Columns

- **Survival** - the category we want to predict (0 = died, 1 = survived)

- **Pclass** - ticket class (1 = 1st, 2 = 2nd, 3 = 3rd). Fare will be dropped, as class and fare are very closely related.

- **Name** - the name of the passenger

- **Sex** - sex (male/female)

- **Age** - age (in years)

- **SibSp** - # of siblings/spouses aboard the titanic

- **Parch** - number of parents/children about the titanic.

- **Ticket** - Ticket number

- **Fare** - Passenger Fare. Dropped, as Pclass allows us to discretize this information.

- **Cabin** - Cabin Number

- **Embarked** - Port of Embarkation (C - Cherbourg, Q - Queenstown, S - Southampton)

Before we even get started, let's look at the baseline––what proportion survived? If our mode

Let's go ahead and drop the Name column and the ticket number, since those are just labels.

In [312]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

def age_group(age):
    if 0 <= age < 18:
        return 'minor' # minor
    elif 18 <= age < 30:
        return 'young' # young adult
    elif 30 <= age < 65:
        return 'adult' # adult
    elif age >= 65:
        return 'senior' # senior
    else:
        return 'unkown'
    
def fare_group(fare):
    try:
        return int(fare/100)
    except ValueError:
        return 0

def clean_inputs(df):
    df['Male'] = df.Sex.apply(lambda x: int(x == 'male'))
    df['Deck'] = df.Cabin.apply(lambda cabin: cabin[0] if isinstance(cabin, str) else 'U')
    df['Embarked'] = df['Embarked'].fillna('U')
    df['AgeGroup'] = df['Age'].apply(age_group)
    df['FareGroup'] = df['Fare'].apply(fare_group)
    data = pd.get_dummies(data=df, columns=['Pclass', 'Embarked', 'SibSp', 'Parch', 'Deck', 'AgeGroup', 'FareGroup'], drop_first=True)
    data = data.drop(axis=1, labels=['Sex', 'Age', 'Cabin', 'Fare'])
    return data

def load_data(split=True):
    df = pd.read_csv('train.csv')
    y = df.Survived
    X = clean_inputs(df.drop(axis=1, labels=['Name', 'Ticket', 'Survived']))
    if split:
        return train_test_split(X, y, random_state=0)
    else:
        return X, y

load_data()[0]

Unnamed: 0,PassengerId,Male,Pclass_2,Pclass_3,Embarked_Q,Embarked_S,Embarked_U,SibSp_1,SibSp_2,SibSp_3,...,Deck_G,Deck_T,Deck_U,AgeGroup_minor,AgeGroup_senior,AgeGroup_unkown,AgeGroup_young,FareGroup_1,FareGroup_2,FareGroup_5
105,106,1,0,1,0,1,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
68,69,0,0,1,0,1,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0
253,254,1,0,1,0,1,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
320,321,1,0,1,0,1,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
706,707,0,1,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
835,836,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
192,193,0,0,1,0,1,0,1,0,0,...,0,0,1,0,0,0,1,0,0,0
629,630,1,0,1,1,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
559,560,0,0,1,0,1,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0


In [315]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.naive_bayes import BernoulliNB

X_train, X_test, y_train, y_test = load_data()

nb = BernoulliNB()
nb.fit(X_train.drop(axis=1, labels=['PassengerId']), y=y_train)
y_pred_nb = nb.predict(X_test.drop(axis=1, labels=['PassengerId']))

print(confusion_matrix(y_true=y_test, y_pred=y_pred_nb))
print(classification_report(y_test, y_pred_nb, labels=[0,1], target_names=['Died', 'Survived']))

[[121  18]
 [ 23  61]]
              precision    recall  f1-score   support

        Died       0.84      0.87      0.86       139
    Survived       0.77      0.73      0.75        84

    accuracy                           0.82       223
   macro avg       0.81      0.80      0.80       223
weighted avg       0.81      0.82      0.81       223



In [328]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression

X_train, X_test, y_train, y_test = load_data()

lr = LogisticRegression()
lr.fit(X_train.drop(axis=1, labels=['PassengerId']), y=y_train)
y_pred_lr = nb.predict(X_test.drop(axis=1, labels=['PassengerId']))

print(confusion_matrix(y_true=y_test, y_pred=y_pred_lr))
print(classification_report(y_test, y_pred_lr, labels=[0,1], target_names=['Died', 'Survived']))
submission.to_csv('ddh_submission2.csv', index=False)

[[121  18]
 [ 23  61]]
              precision    recall  f1-score   support

        Died       0.84      0.87      0.86       139
    Survived       0.77      0.73      0.75        84

    accuracy                           0.82       223
   macro avg       0.81      0.80      0.80       223
weighted avg       0.81      0.82      0.81       223



In [None]:
X_train, y = load_data(split=False)
X_test = clean_inputs(pd.read_csv('test.csv').drop(axis=1, labels=['Name', 'Ticket']))
X_test['Embarked_U'] = 0
X_test['Survived'] = lr.predict(X_test.drop(axis=1, labels=['PassengerId']))
submission = X_test[['PassengerId', 'Survived']]
submission.to_csv('ddh_submission2.csv', index=False)

In [317]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.neighbors import KNeighborsClassifier

X_train, X_test, y_train, y_test = load_data()

knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(X_train.drop(axis=1, labels=['PassengerId']), y=y_train)
y_pred_knn = knn.predict(X_test.drop(axis=1, labels=['PassengerId']))

print(confusion_matrix(y_true=y_test, y_pred=y_pred_kn))
print(classification_report(y_test, y_pred_kn, labels=[0,1], target_names=['Died', 'Survived']))

[[133   6]
 [ 31  53]]
              precision    recall  f1-score   support

        Died       0.81      0.96      0.88       139
    Survived       0.90      0.63      0.74        84

    accuracy                           0.83       223
   macro avg       0.85      0.79      0.81       223
weighted avg       0.84      0.83      0.83       223



In [322]:
X_train, y = load_data(split=False)
X_test = clean_inputs(pd.read_csv('test.csv').drop(axis=1, labels=['Name', 'Ticket']))
X_test['Embarked_U'] = 0
X_test['Survived'] = knn.predict(X_test.drop(axis=1, labels=['PassengerId']))
submission = X_test[['PassengerId', 'Survived']]

In [326]:
submission.to_csv('ddh_submission.csv', index=False)

In [264]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier

X_train, X_test, y_train, y_test = load_data()

dt = DecisionTreeClassifier(random_state=0)
dt.fit(X=X_train, y=y_train)
y_pred_dt = dt.predict(X_test)

print(confusion_matrix(y_true=y_test, y_pred=y_pred_dt))
print(classification_report(y_test, y_pred_dt, labels=[0,1], target_names=['Died', 'Survived']))

[[122  17]
 [ 31  53]]
              precision    recall  f1-score   support

        Died       0.80      0.88      0.84       139
    Survived       0.76      0.63      0.69        84

    accuracy                           0.78       223
   macro avg       0.78      0.75      0.76       223
weighted avg       0.78      0.78      0.78       223



In [265]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier

X_train, X_test, y_train, y_test = load_data()

dt = RandomForestClassifier(random_state=0)
dt.fit(X=X_train, y=y_train)
y_pred_dt = dt.predict(X_test)

print(confusion_matrix(y_true=y_test, y_pred=y_pred_dt))
print(classification_report(y_test, y_pred_dt, labels=[0,1], target_names=['Died', 'Survived']))

[[120  19]
 [ 29  55]]
              precision    recall  f1-score   support

        Died       0.81      0.86      0.83       139
    Survived       0.74      0.65      0.70        84

    accuracy                           0.78       223
   macro avg       0.77      0.76      0.76       223
weighted avg       0.78      0.78      0.78       223



In [266]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.neural_network import MLPClassifier

X_train, X_test, y_train, y_test = load_data()

mlp = MLPClassifier(random_state=0, max_iter=1000)
mlp.fit(X=X_train, y=y_train)
y_pred_mlp = mlp.predict(X_test)

print(confusion_matrix(y_true=y_test, y_pred=y_pred_mlp))
print(classification_report(y_test, y_pred_mlp, labels=[0,1], target_names=['Died', 'Survived']))

[[121  18]
 [ 31  53]]
              precision    recall  f1-score   support

        Died       0.80      0.87      0.83       139
    Survived       0.75      0.63      0.68        84

    accuracy                           0.78       223
   macro avg       0.77      0.75      0.76       223
weighted avg       0.78      0.78      0.78       223

