# Kaggle Titanic Competition

In [590]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import mean_absolute_error
import torch
from torch import tensor
import torch.nn.functional as F

## Data cleaning


In [591]:
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

In [592]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [593]:
train_data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [594]:
i = 0
while (i< len(train_data["Cabin"])):
    if (pd.isna(train_data.loc[i, "Cabin"])):
        train_data.loc[i, "Cabin"] = 'Missing'
    i += 1

train_data["Cabin_Letter"] = train_data["Cabin"]
i = 0
while (i< len(train_data["Cabin"])):
    train_data.loc[i, "Cabin_Letter"] = train_data.loc[i, "Cabin"][0]
    i += 1

In [595]:
i = 0
while (i< len(train_data["Name"])):
    train_data.loc[i, "Name_Mr"] = ("Mr." in train_data.loc[i, "Name"])
    i += 1
    
i = 0
while (i< len(train_data["Name"])):
    train_data.loc[i, "Name_Mrs"] = ("Mrs." in train_data.loc[i, "Name"])
    i += 1
    
i = 0
while (i< len(train_data["Name"])):
    train_data.loc[i, "Name_Miss"] = ("Miss." in train_data.loc[i, "Name"])
    i += 1

In [596]:
train_data["Cabin_A"] = train_data["Cabin_Letter"] == "A"
train_data["Cabin_B"] = train_data["Cabin_Letter"] == "B"
train_data["Cabin_C"] = train_data["Cabin_Letter"] == "C"
train_data["Cabin_D"] = train_data["Cabin_Letter"] == "D"
train_data["Cabin_E"] = train_data["Cabin_Letter"] == "E"
train_data["Cabin_F"] = train_data["Cabin_Letter"] == "F"
train_data["Cabin_G"] = train_data["Cabin_Letter"] == "G"
train_data["Cabin_T"] = train_data["Cabin_Letter"] == "T"
train_data["Cabin_M"] = train_data["Cabin_Letter"] == "M"

In [597]:
train_data["Missing_Age"] = train_data["Age"].isna()
train_data = pd.get_dummies(train_data, columns=["Sex","Pclass","Embarked"])
train_data["LogFare"] = np.log(train_data["Fare"].astype(float)+1)
train_data.fillna(train_data.mode().iloc[0], inplace=True)

In [598]:
train_data["Age"] = train_data["Age"] / 50
train_data["LogFare"] = train_data["LogFare"] / 5

In [599]:
indep = ['Age', 'SibSp', 'Parch', 'Name_Mr', 'Name_Mrs', 'Name_Miss', 
       'Missing_Age', 'Sex_female', 'Sex_male', 'Pclass_1', 'Pclass_2', 
       'Pclass_3', 'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Cabin_A', 
       'Cabin_B', 'Cabin_C', 'Cabin_D', 'Cabin_E', 'Cabin_F', 'Cabin_G',
       'Cabin_T', 'Cabin_M', 'LogFare']

In [600]:
x = train_data[indep].astype(float)
y = train_data["Survived"]
x.head()

Unnamed: 0,Age,SibSp,Parch,Name_Mr,Name_Mrs,Name_Miss,Missing_Age,Sex_female,Sex_male,Pclass_1,...,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Cabin_M,LogFare
0,0.44,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.422043
1,0.76,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.856119
2,0.52,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.437771
3,0.7,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.798167
4,0.7,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.440553


In [601]:
x.describe()

Unnamed: 0,Age,SibSp,Parch,Name_Mr,Name_Mrs,Name_Miss,Missing_Age,Sex_female,Sex_male,Pclass_1,...,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Cabin_M,LogFare
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,...,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.571339,0.523008,0.381594,0.580247,0.140292,0.204265,0.198653,0.352413,0.647587,0.242424,...,0.016835,0.05275,0.066218,0.037037,0.035915,0.01459,0.004489,0.001122,0.771044,0.592449
std,0.263991,1.102743,0.806057,0.493796,0.347485,0.40339,0.39921,0.47799,0.47799,0.42879,...,0.128725,0.223659,0.248802,0.188959,0.186182,0.119973,0.06689,0.033501,0.420397,0.19381
min,0.0084,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.44,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.437444
50%,0.48,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.547576
75%,0.7,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.693147
max,1.6,8.0,6.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.248183


# Modeling

In [602]:
train_x, val_x, train_y, val_y = train_test_split(x, y, random_state = 5, test_size = 0.15)

In [603]:
SVM_Model = SVC()
SVM_Model.fit(train_x, train_y)
mean_absolute_error(SVM_Model.predict(val_x), val_y)

0.16417910447761194

In [604]:
RFC_Model = RandomForestClassifier()
RFC_Model.fit(train_x, train_y)
mean_absolute_error(RFC_Model.predict(val_x), val_y)

0.17164179104477612

In [605]:
DTC_Model = DecisionTreeClassifier(random_state = 1)
DTC_Model.fit(train_x, train_y)
mean_absolute_error(DTC_Model.predict(val_x), val_y)

0.208955223880597

In [620]:
KNN_Model = KNeighborsClassifier(n_neighbors=5)
KNN_Model.fit(train_x, train_y)
mean_absolute_error(KNN_Model.predict(val_x), val_y)

0.20149253731343283

In [607]:
GNB_Model = GaussianNB()
GNB_Model.fit(train_x, train_y)
mean_absolute_error(GNB_Model.predict(val_x), val_y)

0.23880597014925373

## Submitting

### Cleaning Data

In [608]:
i = 0
while (i< len(test_data["Cabin"])):
    if (pd.isna(test_data.loc[i, "Cabin"])):
        test_data.loc[i, "Cabin"] = 'Missing'
    i += 1

test_data["Cabin_Letter"] = test_data["Cabin"]
i = 0
while (i< len(test_data["Cabin"])):
    test_data.loc[i, "Cabin_Letter"] = test_data.loc[i, "Cabin"][0]
    i += 1

i = 0
while (i< len(test_data["Name"])):
    test_data.loc[i, "Name_Mr"] = ("Mr." in test_data.loc[i, "Name"])
    i += 1
    
i = 0
while (i< len(test_data["Name"])):
    test_data.loc[i, "Name_Mrs"] = ("Mrs." in test_data.loc[i, "Name"])
    i += 1
    
i = 0
while (i< len(test_data["Name"])):
    test_data.loc[i, "Name_Miss"] = ("Miss." in test_data.loc[i, "Name"])
    i += 1

test_data["Cabin_A"] = test_data["Cabin_Letter"] == "A"
test_data["Cabin_B"] = test_data["Cabin_Letter"] == "B"
test_data["Cabin_C"] = test_data["Cabin_Letter"] == "C"
test_data["Cabin_D"] = test_data["Cabin_Letter"] == "D"
test_data["Cabin_E"] = test_data["Cabin_Letter"] == "E"
test_data["Cabin_F"] = test_data["Cabin_Letter"] == "F"
test_data["Cabin_G"] = test_data["Cabin_Letter"] == "G"
test_data["Cabin_T"] = test_data["Cabin_Letter"] == "T"
test_data["Cabin_M"] = test_data["Cabin_Letter"] == "M"

test_data["Missing_Age"] = test_data["Age"].isna()
test_data = pd.get_dummies(test_data, columns=["Sex","Pclass","Embarked"])
test_data["LogFare"] = np.log(test_data["Fare"]+1)
test_data.fillna(test_data.mode().iloc[0], inplace=True)

test_data["Age"] = test_data["Age"] / 50
test_data["LogFare"] = test_data["LogFare"] / 5

In [609]:
test_data.columns

Index(['PassengerId', 'Name', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare',
       'Cabin', 'Cabin_Letter', 'Name_Mr', 'Name_Mrs', 'Name_Miss', 'Cabin_A',
       'Cabin_B', 'Cabin_C', 'Cabin_D', 'Cabin_E', 'Cabin_F', 'Cabin_G',
       'Cabin_T', 'Cabin_M', 'Missing_Age', 'Sex_female', 'Sex_male',
       'Pclass_1', 'Pclass_2', 'Pclass_3', 'Embarked_C', 'Embarked_Q',
       'Embarked_S', 'LogFare'],
      dtype='object')

In [610]:
test_x = test_data[indep]
test_x.head()

Unnamed: 0,Age,SibSp,Parch,Name_Mr,Name_Mrs,Name_Miss,Missing_Age,Sex_female,Sex_male,Pclass_1,...,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Cabin_M,LogFare
0,0.69,0,0,True,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,True,0.435613
1,0.94,1,0,False,True,False,False,True,False,False,...,False,False,False,False,False,False,False,False,True,0.415888
2,1.24,0,0,True,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,True,0.473815
3,0.54,0,0,True,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,True,0.45365
4,0.44,1,1,False,True,False,False,True,False,False,...,False,False,False,False,False,False,False,False,True,0.517365


## Making Predictions

In [611]:
prediction = SVM_Model.predict(test_x).astype(int)

In [617]:
submission = pd.DataFrame({
    "PassengerId" : test_data["PassengerId"],
    "Survived" : prediction
})

submission.to_csv('submission.csv', index = False)

## Ensemble

In [615]:
prediction_RFC = RFC_Model.predict(test_x)
prediction_SVM = SVM_Model.predict(test_x)
prediction_DTC = DTC_Model.predict(test_x)
prediction_KNN = KNN_Model.predict(test_x)
prediction_GNB = GNB_Model.predict(test_x)

In [616]:
predictions_mean = (prediction_RFC + prediction_SVM + prediction_KNN)/3
predictions_mean = predictions_mean > 0.5
prediction = predictions_mean.astype(int)