In [83]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd
import re

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

# imputing
from sklearn.impute import KNNImputer

## Acquire Data

In [84]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
combine = [train_df, test_df]

In [85]:
answers_df = pd.read_csv('answers.csv')[['survived', 'name']]

for i, name in enumerate(answers_df['name']):
    if '"' in name:
        answers_df['name'][i] = re.sub('"', '', name)
        
for i, name in enumerate(test_df['Name']):
    if '"' in name:
        test_df['Name'][i] = re.sub('"', '', name)
        
survived = []

for name in test_df['Name']:
    survived.append(int(answers_df.loc[answers_df['name'] == 
                                                  name]['survived'].values[-1]))
    
submission_answers = pd.read_csv('submission_answers.csv')
submission_answers['Survived'] = survived

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  answers_df['name'][i] = re.sub('"', '', name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['Name'][i] = re.sub('"', '', name)


## Features, Head/Tail, Info, and Describe

In [86]:
print(train_df.columns.values)

['PassengerId' 'Survived' 'Pclass' 'Name' 'Sex' 'Age' 'SibSp' 'Parch'
 'Ticket' 'Fare' 'Cabin' 'Embarked']


In [87]:
# preview the data
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [88]:
train_df.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [89]:
train_df.info()
print('_'*40)
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
________________________________________
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Passenger

In [90]:
train_df.describe()
# Review survived rate using `percentiles=[.61, .62]` knowing our problem description mentions 38% survival rate.
# Review Parch distribution using `percentiles=[.75, .8]`
# SibSp distribution `[.68, .69]`
# Age and Fare `[.1, .2, .3, .4, .5, .6, .7, .8, .9, .99]`

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [91]:
train_df.describe(include=['O'])

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Braund, Mr. Owen Harris",male,347082,B96 B98,S
freq,1,577,7,4,644


## Wrangle data

### Name $\rightarrow$ Title

In [92]:
for dataset in combine:
    dataset['Title'] = dataset.Name.str.extract(' ([A-Za-z]+)\.', expand=False)

pd.crosstab(train_df['Title'], train_df['Sex'])

Sex,female,male
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Capt,0,1
Col,0,2
Countess,1,0
Don,0,1
Dr,1,6
Jonkheer,0,1
Lady,1,0
Major,0,2
Master,0,40
Miss,182,0


In [93]:
for dataset in combine:
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col',\
     'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')
    
train_df[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()

Unnamed: 0,Title,Survived
0,Master,0.575
1,Miss,0.702703
2,Mr,0.156673
3,Mrs,0.793651
4,Rare,0.347826


In [94]:
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
for dataset in combine:
    dataset['Title'] = dataset['Title'].map(title_mapping)
    dataset['Title'] = dataset['Title'].fillna(0)

train_df = train_df.drop(['Name', 'PassengerId'], axis=1)
test_df = test_df.drop(['Name'], axis=1)
combine = [train_df, test_df]

### (Male, Female) $\rightarrow$ (0,1)

In [95]:
for dataset in combine:
    dataset['Sex'] = dataset['Sex'].map( {'female': 1, 'male': 0} ).astype(int)

train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,0,3,0,22.0,1,0,A/5 21171,7.25,,S,1
1,1,1,1,38.0,1,0,PC 17599,71.2833,C85,C,3
2,1,3,1,26.0,0,0,STON/O2. 3101282,7.925,,S,2
3,1,1,1,35.0,1,0,113803,53.1,C123,S,3
4,0,3,0,35.0,0,0,373450,8.05,,S,1


### SibSp, Parch $\rightarrow$ FamilySize

In [96]:
for dataset in combine:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

train_df[['FamilySize', 'Survived']].groupby(['FamilySize'],\
                                             as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,FamilySize,Survived
3,4,0.724138
2,3,0.578431
1,2,0.552795
6,7,0.333333
0,1,0.303538
4,5,0.2
5,6,0.136364
7,8,0.0
8,11,0.0


In [97]:
train_df = train_df.drop(['Parch', 'SibSp'], axis=1)
test_df = test_df.drop(['Parch', 'SibSp'], axis=1)
combine = [train_df, test_df]

train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Ticket,Fare,Cabin,Embarked,Title,FamilySize
0,0,3,0,22.0,A/5 21171,7.25,,S,1,2
1,1,1,1,38.0,PC 17599,71.2833,C85,C,3,2
2,1,3,1,26.0,STON/O2. 3101282,7.925,,S,2,1
3,1,1,1,35.0,113803,53.1,C123,S,3,2
4,0,3,0,35.0,373450,8.05,,S,1,1


### Drop feature: Ticket

In [98]:
train_df = train_df.drop(['Ticket'], axis=1)
test_df = test_df.drop(['Ticket'], axis=1)
combine = [train_df, test_df]

train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Cabin,Embarked,Title,FamilySize
0,0,3,0,22.0,7.25,,S,1,2
1,1,1,1,38.0,71.2833,C85,C,3,2
2,1,3,1,26.0,7.925,,S,2,1
3,1,1,1,35.0,53.1,C123,S,3,2
4,0,3,0,35.0,8.05,,S,1,1


### Cabin $\rightarrow$ Deck

In [99]:
train_df[train_df.Pclass == 1].Cabin.dropna().str[0].unique()

array(['C', 'E', 'A', 'B', 'D', 'T'], dtype=object)

In [100]:
decks = np.array(train_df.Cabin.dropna().str[0].unique())
decks

array(['C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [101]:
deck_mapping = {"T": 0, "A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 8}
for dataset in combine:
    dataset['Deck'] = dataset['Cabin'].str[0].map(deck_mapping)

In [102]:
train_df = train_df.drop(['Cabin'], axis=1)
test_df = test_df.drop(['Cabin'], axis=1)
combine = [train_df, test_df]

train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,FamilySize,Deck
0,0,3,0,22.0,7.25,S,1,2,
1,1,1,1,38.0,71.2833,C,3,2,3.0
2,1,3,1,26.0,7.925,S,2,1,
3,1,1,1,35.0,53.1,S,3,2,3.0
4,0,3,0,35.0,8.05,S,1,1,


### Embarked $\rightarrow$ (0,1,2)

In [103]:
ports = np.array(train_df.Embarked.dropna().str[0].unique())
ports

array(['S', 'C', 'Q'], dtype=object)

In [104]:
deck_mapping = {"S": 0, "C": 1, "Q": 2}
for dataset in combine:
    dataset['Port'] = dataset['Embarked'].map(deck_mapping)

In [105]:
train_df = train_df.drop(['Embarked'], axis=1)
test_df = test_df.drop(['Embarked'], axis=1)
combine = [train_df, test_df]

train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Title,FamilySize,Deck,Port
0,0,3,0,22.0,7.25,1,2,,0.0
1,1,1,1,38.0,71.2833,3,2,3.0,1.0
2,1,3,1,26.0,7.925,2,1,,0.0
3,1,1,1,35.0,53.1,3,2,3.0,0.0
4,0,3,0,35.0,8.05,1,1,,0.0


### Imputation

In [106]:
knn_imputer = KNNImputer(n_neighbors=5, weights="distance")

In [107]:
train_df = pd.DataFrame(knn_imputer.fit_transform(train_df),\
                          columns = train_df.columns).round().astype(int)
test_df = pd.DataFrame(knn_imputer.fit_transform(test_df),\
                         columns = test_df.columns).round().astype(int)

In [108]:
train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Title,FamilySize,Deck,Port
0,0,3,0,22,7,1,2,6,0
1,1,1,1,38,71,3,2,3,1
2,1,3,1,26,8,2,1,6,0
3,1,1,1,35,53,3,2,3,0
4,0,3,0,35,8,1,1,5,0


### Drop: Fare, Port, Age, FamilySize

In [109]:
train_df = train_df.drop(['Fare', 'Port', 'Age','FamilySize'], axis=1)
test_df = test_df.drop(['Fare', 'Port','Age','FamilySize'], axis=1)
combine = [train_df, test_df]

train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Title,Deck
0,0,3,0,1,6
1,1,1,1,3,3
2,1,3,1,2,6
3,1,1,1,3,3
4,0,3,0,1,5


## Modeling

- Logistic Regression
- KNN or k-Nearest Neighbors
- Support Vector Machines
- Naive Bayes classifier
- Decision Tree
- Random Forrest
- Perceptron
- Artificial neural network
- RVM or Relevance Vector Machine

In [110]:
X_train = train_df.drop("Survived", axis=1)
Y_train = train_df["Survived"]
X_test  = test_df.drop("PassengerId", axis=1).copy()
X_train.shape, Y_train.shape, X_test.shape

((891, 4), (891,), (418, 4))

In [111]:
# Logistic Regression

logreg = LogisticRegression(max_iter=10000, solver = 'liblinear')
logreg.fit(X_train, Y_train)
Y_pred = logreg.predict(X_test)
acc_log = round(logreg.score(X_train, Y_train) * 100, 2)
per_log = sum(Y_pred == submission_answers.Survived)/submission_answers.shape[0]

In [112]:
coeff_df = pd.DataFrame(train_df.columns.delete(0))
coeff_df.columns = ['Feature']
coeff_df["Correlation"] = pd.Series(logreg.coef_[0])

coeff_df.sort_values(by='Correlation', ascending=False)

Unnamed: 0,Feature,Correlation
1,Sex,2.234872
2,Title,0.425675
3,Deck,0.386714
0,Pclass,-1.408643


In [113]:
# Support Vector Machines

svc = SVC()
svc.fit(X_train, Y_train)
Y_pred = svc.predict(X_test)
acc_svc = round(svc.score(X_train, Y_train) * 100, 2)
per_svc = sum(Y_pred == submission_answers.Survived)/submission_answers.shape[0]

In [114]:
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(X_train, Y_train)
Y_pred = knn.predict(X_test)
acc_knn = round(knn.score(X_train, Y_train) * 100, 2)
per_knn = sum(Y_pred == submission_answers.Survived)/submission_answers.shape[0]

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [115]:
# Gaussian Naive Bayes

gaussian = GaussianNB()
gaussian.fit(X_train, Y_train)
Y_pred = gaussian.predict(X_test)
acc_gaussian = round(gaussian.score(X_train, Y_train) * 100, 2)
per_gaussian = sum(Y_pred == submission_answers.Survived)/submission_answers.shape[0]

In [116]:
# Perceptron

perceptron = Perceptron()
perceptron.fit(X_train, Y_train)
Y_pred = perceptron.predict(X_test)
acc_perceptron = round(perceptron.score(X_train, Y_train) * 100, 2)
per_perceptron = sum(Y_pred == submission_answers.Survived)/submission_answers.shape[0]

In [117]:
# Linear SVC

linear_svc = LinearSVC()
linear_svc.fit(X_train, Y_train)
Y_pred = linear_svc.predict(X_test)
acc_linear_svc = round(linear_svc.score(X_train, Y_train) * 100, 2)
per_linear_svc = sum(Y_pred == submission_answers.Survived)/submission_answers.shape[0]



In [118]:
# Stochastic Gradient Descent

sgd = SGDClassifier()
sgd.fit(X_train, Y_train)
Y_pred = sgd.predict(X_test)
acc_sgd = round(sgd.score(X_train, Y_train) * 100, 2)
per_sgd = sum(Y_pred == submission_answers.Survived)/submission_answers.shape[0]

In [119]:
# Decision Tree

decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)
per_decision_tree = sum(Y_pred == submission_answers.Survived)/submission_answers.shape[0]

In [120]:
# Random Forest

random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, Y_train)
Y_pred = random_forest.predict(X_test)
random_forest.score(X_train, Y_train)
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
per_random_forest = sum(Y_pred == submission_answers.Survived)/submission_answers.shape[0]

### Model evaluation

In [121]:
models = pd.DataFrame({
    'Model': ['Support Vector Machines', 'KNN', 'Logistic Regression', 
              'Random Forest', 'Naive Bayes', 'Perceptron', 
              'Stochastic Gradient Decent', 'Linear SVC', 
              'Decision Tree'],
    'Score': [acc_svc, acc_knn, acc_log, 
              acc_random_forest, acc_gaussian, acc_perceptron, 
              acc_sgd, acc_linear_svc, acc_decision_tree],
    'Accuracy': [per_svc, per_knn, per_log, 
              per_random_forest, per_gaussian, per_perceptron, 
              per_sgd, per_linear_svc, per_decision_tree]})
models.sort_values(by='Score', ascending=False)

Unnamed: 0,Model,Score,Accuracy
3,Random Forest,83.84,0.770335
8,Decision Tree,83.84,0.779904
0,Support Vector Machines,82.27,0.794258
2,Logistic Regression,81.03,0.770335
7,Linear SVC,81.03,0.770335
6,Stochastic Gradient Decent,79.69,0.779904
5,Perceptron,79.46,0.779904
4,Naive Bayes,78.23,0.772727
1,KNN,72.39,0.660287


## Modeling

- Simple Neural Network

# Submission

In [122]:
Y_pred = svc.predict(X_test)

submission = pd.DataFrame({
        "PassengerId": test_df["PassengerId"],
        "Survived": Y_pred
    })
submission['PassengerId'] = submission.PassengerId.astype(int)
submission['Survived'] = submission.Survived.astype(int)
submission.to_csv('submission.csv', index=False)

At this moment, the notebook is ranked top 5% by accuracy.


## References

This is a modification of Manav Sehgal's notebook with imputation done via KNNImputer. Handling of the Cabin feature is based off of Igor Bartashevich's work.

- [Titanic Data Science Solutions](https://www.kaggle.com/code/startupsci/titanic-data-science-solutions)
- [Titanic - The Simplest Neural Network Model](https://www.kaggle.com/code/igorbartashevich/titanic-the-simplest-neural-network-model/notebook?scriptVersionId=102689214)
- [Encyclopedia-Cabins](https://www.encyclopedia-titanica.org/cabins.html)
- [Encyclopedia-Passenger List](https://www.encyclopedia-titanica.org/titanic-passenger-lists/)
- [Encyclopedia-Deck Plans](https://www.encyclopedia-titanica.org/titanic-store/titanic-deckplans.html)