# TITANIC SURVIVOR PREDICTOR

# Importing the Libraries

In [1]:
import pandas as pd
import numpy as np
import math
import statistics

# Importing the Datasets

In [2]:
# Read the training and test splits from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

# Data Preprocessing

# Inspecting the dataset

In [3]:
# Preview the first rows of the raw training frame.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Summary statistics for the numeric columns of the raw training frame.
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


Checking for Missing Values

In [5]:
# Count the missing entries in every column of the raw training frame.
train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

# **Handling the Missing Values**

The Cabin column has a lot of missing data so the entire column is dropped

In [6]:
# New frame without 'Cabin'; the raw `train` frame is left untouched.
train_1 = train.drop(columns=['Cabin'])

For the Age column, the missing values can be replaced by the mean or the mode. Here they are replaced by the average of the two.

In [7]:
# Pull out the Age column on its own for inspection and imputation.
age = train_1.loc[:, 'Age']

In [8]:
# Show the Age column; NaN entries mark the passengers with no recorded age.
print(age)

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888     NaN
889    26.0
890    32.0
Name: Age, Length: 891, dtype: float64


In [9]:
# Replacement value for missing ages: the midpoint between the average age and
# the most frequent age. mode() yields a Series, so age_rep is a Series too.
age_mean = age.mean()
age_mode = age.mode()
age_rep = 0.5 * (age_mode + age_mean)

In [10]:
# Show the computed replacement value for missing ages.
print(age_rep)

0    26.849559
dtype: float64


The age rows with the missing values will be filled with "age_rep"

In [11]:
# Fill missing ages with the computed replacement rather than a hand-copied,
# rounded literal. age_rep is a Series, so its first element is used as a
# scalar; passing the Series itself to fillna would align on the index and
# fill only the matching row label.
train_1['Age'] = train_1['Age'].fillna(age_rep.iloc[0])

Consider the "Embarked" column

In [12]:
# List the distinct values in the Embarked column (missing values included).
train_1['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

It is categorical with three distinct values, so the missing rows will be filled with the most frequently occurring value

In [13]:
# Frequency of each embarkation port, used to pick the fill value for the gaps.
train_1['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

The missing values are replaced by S since it is the most frequently occurring value

In [14]:
# Fill missing ports with the most frequent port, derived from the data instead
# of a hard-coded letter ('S' for this dataset, per the value counts shown earlier).
embarked_mode = train_1['Embarked'].mode().iloc[0]
train_1['Embarked'] = train_1['Embarked'].fillna(embarked_mode)

Check

In [15]:
# Verify that no column of train_1 still contains missing values.
train_1.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

# Checking for Irrelevant Columns

In [16]:
# Preview the imputed frame to decide which columns carry no predictive value.
train_1.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


The "PassengerId", "Name", and "Ticket" columns will be dropped as no strong analytical predictions can be made with them

In [17]:
# Drop the identifier-like columns in one step; train_1 itself is not modified.
identifier_columns = ['PassengerId', 'Name', 'Ticket']
train_2 = train_1.drop(columns=identifier_columns)

In [18]:
# Preview the frame after removing the identifier-like columns.
train_2.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


# Encoding Categorical Data

In [19]:
# Work on an independent copy: the encoded columns are later inserted in place,
# and a plain alias would make those inserts modify train_2 as well.
dataset = train_2.copy()

Sex Column

In [20]:
from sklearn.preprocessing import LabelBinarizer

# Binarize the two-valued Sex column (female -> 0, male -> 1 in the output shown below).
lb = LabelBinarizer()
lb.fit(dataset['Sex'])
enc_sex = lb.transform(dataset['Sex'])

Embarked Column

In [21]:
from sklearn.preprocessing import LabelEncoder

# Map each embarkation port to an integer code (C -> 0, S -> 2 in the output shown below).
# NOTE(review): integer codes imply an ordering between ports; confirm this is
# acceptable for the models used, or switch to one-hot encoding.
le = LabelEncoder()
le.fit(dataset['Embarked'])
enc_embarked = le.transform(dataset['Embarked'])

Update the Dataset with the encoded columns

*Add encoded*

In [22]:
# Add the encoded columns in place: enc_sex right after 'Sex' (position 3) and
# enc_embarked as the last column. The membership checks make the cell safe to
# re-run, and allow_duplicates stays at its default so a duplicate column can
# never be added silently.
if 'enc_sex' not in dataset.columns:
    dataset.insert(3, 'enc_sex', enc_sex)
if 'enc_embarked' not in dataset.columns:
    dataset.insert(len(dataset.columns), 'enc_embarked', enc_embarked)

*Remove Old columns*

In [23]:
# The raw text columns are redundant now that their encoded versions exist.
dataset = dataset.drop(columns=['Sex', 'Embarked'])

Check

In [24]:
# Confirm that only numeric columns remain after encoding.
dataset.head()

Unnamed: 0,Survived,Pclass,enc_sex,Age,SibSp,Parch,Fare,enc_embarked
0,0,3,1,22.0,1,0,7.25,2
1,1,1,0,38.0,1,0,71.2833,0
2,1,3,0,26.0,0,0,7.925,2
3,1,1,0,35.0,1,0,53.1,2
4,0,3,1,35.0,0,0,8.05,2


In [25]:
# Summary statistics of the fully numeric modelling frame.
dataset.describe()

Unnamed: 0,Survived,Pclass,enc_sex,Age,SibSp,Parch,Fare,enc_embarked
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,0.647587,29.133044,0.523008,0.381594,32.204208,1.536476
std,0.486592,0.836071,0.47799,13.051685,1.102743,0.806057,49.693429,0.791503
min,0.0,1.0,0.0,0.42,0.0,0.0,0.0,0.0
25%,0.0,2.0,0.0,22.0,0.0,0.0,7.9104,1.0
50%,0.0,3.0,1.0,26.849559,0.0,0.0,14.4542,2.0
75%,1.0,3.0,1.0,35.0,1.0,0.0,31.0,2.0
max,1.0,3.0,1.0,80.0,8.0,6.0,512.3292,2.0


# Defining the dependent variable y and the independent variable X

In [26]:
# Column 0 ('Survived') is the target; every remaining column is a feature.
X = dataset.iloc[:, 1:].to_numpy()
y = dataset.iloc[:, 0].to_numpy()

# Splitting into Train and Test Sets

In [27]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_test, y_train, y_test = split_parts

# Feature Scaling

In [28]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training rows only, then apply it to both splits so the
# held-out rows do not influence the scaling parameters.
# Caution: X_train and X_test are overwritten, so re-running this cell alone
# would rescale data that is already scaled.
sc_X = StandardScaler().fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)

# MODEL TRAINING

In [29]:
from sklearn.metrics import accuracy_score

# Random Forest Classification

In [30]:
from sklearn.ensemble import RandomForestClassifier

# Ten entropy-based trees with a fixed seed for repeatable results.
rf_params = {'n_estimators': 10, 'criterion': 'entropy', 'random_state': 0}
classifier = RandomForestClassifier(**rf_params)
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [31]:
# Accuracy of the random forest on the held-out rows.
y_pred = classifier.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.776536312849162

# Decision Tree Classifier

In [32]:
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=0, splitter='best')

In [33]:
# Accuracy of the decision tree on the held-out rows.
y_pred = classifier.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.7653631284916201