<a href="https://colab.research.google.com/github/wollieliza/titanic-prediction/blob/main/titanic_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing packages

In [47]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Reading the database

In [48]:
data = pd.read_csv("/content/titanic_data.csv")

In [49]:
data.head()

Unnamed: 0,Passengerid,Age,Fare,Sex,sibsp,zero,zero.1,zero.2,zero.3,zero.4,zero.5,zero.6,Parch,zero.7,zero.8,zero.9,zero.10,zero.11,zero.12,zero.13,zero.14,Pclass,zero.15,zero.16,Embarked,zero.17,zero.18,2urvived
0,1,22.0,7.25,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,2.0,0,0,0
1,2,38.0,71.2833,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0.0,0,0,1
2,3,26.0,7.925,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,2.0,0,0,1
3,4,35.0,53.1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,2.0,0,0,1
4,5,35.0,8.05,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,2.0,0,0,0


In [50]:
data = data.drop(["zero", "zero.1", "zero.2", "zero.3", "zero.4", "zero.5", "zero.6", "zero.7", "zero.8",
                  "zero.9", "zero.10", "zero.11", "zero.12", "zero.13", "zero.14", "zero.15", "zero.16",
                  "zero.17", "zero.18", "Embarked"], axis = 1)

In [51]:
data.head()

Unnamed: 0,Passengerid,Age,Fare,Sex,sibsp,Parch,Pclass,2urvived
0,1,22.0,7.25,0,1,0,3,0
1,2,38.0,71.2833,1,1,0,1,1
2,3,26.0,7.925,1,0,0,3,1
3,4,35.0,53.1,1,1,0,1,1
4,5,35.0,8.05,0,0,0,3,0


# Editing keys and changing variables

In [52]:
data = data.set_index(['Passengerid'])
data = data.rename(columns = {'2urvived' : 'target'}, inplace = False)

In [53]:
data.head()

Unnamed: 0_level_0,Age,Fare,Sex,sibsp,Parch,Pclass,target
Passengerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,22.0,7.25,0,1,0,3,0
2,38.0,71.2833,1,1,0,1,1
3,26.0,7.925,1,0,0,3,1
4,35.0,53.1,1,1,0,1,1
5,35.0,8.05,0,0,0,3,0


# Describe

In [54]:
data.describe()

Unnamed: 0,Age,Fare,Sex,sibsp,Parch,Pclass,target
count,1309.0,1309.0,1309.0,1309.0,1309.0,1309.0,1309.0
mean,29.503186,33.281086,0.355997,0.498854,0.385027,2.294882,0.261268
std,12.905241,51.7415,0.478997,1.041658,0.86556,0.837836,0.439494
min,0.17,0.0,0.0,0.0,0.0,1.0,0.0
25%,22.0,7.8958,0.0,0.0,0.0,2.0,0.0
50%,28.0,14.4542,0.0,0.0,0.0,3.0,0.0
75%,35.0,31.275,1.0,1.0,0.0,3.0,1.0
max,80.0,512.3292,1.0,8.0,9.0,3.0,1.0


# Data transformation

In [55]:
data.isnull().sum()

Age       0
Fare      0
Sex       0
sibsp     0
Parch     0
Pclass    0
target    0
dtype: int64

# Sampling

In [56]:
x_train, x_test, y_train, y_test = train_test_split(data.drop(['target'], axis = 1),
                                                    data['target'],
                                                    test_size = 0.3,
                                                    random_state = 1234)

In [57]:
[{'train' : x_train.shape}, {'test' : x_test.shape}]

#for training, it's interesting to reserve 70% of the database, and for testing the other 30%

[{'train': (916, 6)}, {'test': (393, 6)}]

# Random Forest Model

In [58]:
# Mapping the model
rndforest = RandomForestClassifier(n_estimators = 1000, criterion = 'gini', max_depth = 5)

# Calculating the model
rndforest.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=5, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [59]:
probability    = rndforest.predict_proba(data.drop('target', axis = 1))[:,1]
classification = rndforest.predict(data.drop('target', axis = 1))

In [60]:
data['Probability']    = probability
data['Classification'] = classification

In [61]:
data

Unnamed: 0_level_0,Age,Fare,Sex,sibsp,Parch,Pclass,target,Probability,Classification
Passengerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,22.0,7.2500,0,1,0,3,0,0.089924,0
2,38.0,71.2833,1,1,0,1,1,0.704905,1
3,26.0,7.9250,1,0,0,3,1,0.325647,0
4,35.0,53.1000,1,1,0,1,1,0.693067,1
5,35.0,8.0500,0,0,0,3,0,0.087328,0
...,...,...,...,...,...,...,...,...,...
1305,28.0,8.0500,0,0,0,3,0,0.089017,0
1306,39.0,108.9000,1,0,0,1,0,0.704035,1
1307,38.5,7.2500,0,0,0,3,0,0.076718,0
1308,28.0,8.0500,0,0,0,3,0,0.089017,0
