# Titanic Machine Learning Challenge

Link: https://www.kaggle.com/competitions/titanic

## Importing necessary libraries

In [87]:
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np 

## Loading Data

In [88]:
# Read the training and test splits from CSV files in the working directory.
train_df = pd.read_csv(filepath_or_buffer='train.csv')
test_df = pd.read_csv(filepath_or_buffer='test.csv')


Training Dataset

In [89]:
# Preview the first five training rows (n=5 is head()'s default).
train_df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Testing Dataset

In [90]:
# Preview the first five test rows (n=5 is head()'s default).
test_df.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


## Exploring the Dataset

In [91]:
# Summary statistics for the numeric columns; the quartiles listed here are
# describe()'s defaults, spelled out for the reader.
train_df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


Not all columns are useful for making predictions, so unnecessary columns are removed before modelling. Since our goal is to predict survivors, our target feature (i.e. the column to predict) is 'Survived'; it is removed from the list of candidate input features below.

In [92]:
# Start from every column of the training frame, then take out the label
# column so that only candidate input columns remain.
features = train_df.columns.tolist()
features.remove('Survived')
features

['PassengerId',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

Binary values 0 and 1 show whether a passenger survived (1) or not (0)

In [93]:
# Pull the 'Survived' column out of the training frame as the prediction target.
target_feature = train_df.loc[:, 'Survived']
target_feature

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

Before one-hot encoding the text features for analysis, we first look at the survival rate by sex.

The fraction of women who survived (a value between 0 and 1)

In [94]:
# Select the 'Survived' values of female passengers with one .loc call
# (boolean row mask + column label) rather than chained indexing.
women = train_df.loc[train_df['Sex'] == 'female', 'Survived']
# The mean of a 0/1 column is the share of ones, i.e. the survival rate.
# Series.mean() is vectorized (the builtin sum() iterates element by element)
# and yields NaN instead of raising ZeroDivisionError on an empty selection.
rate_women = women.mean()
rate_women

0.7420382165605095

The fraction of men who survived (a value between 0 and 1)

In [95]:
# Select the 'Survived' values of male passengers with one .loc call
# (boolean row mask + column label) rather than chained indexing.
men = train_df.loc[train_df['Sex'] == 'male', 'Survived']
# The mean of a 0/1 column is the share of ones, i.e. the survival rate.
# Series.mean() is vectorized (the builtin sum() iterates element by element)
# and yields NaN instead of raising ZeroDivisionError on an empty selection.
rate_men = men.mean()
rate_men

0.18890814558058924

## Random Forest Decision Tree Classifier

In [96]:
# Labels for the classifier: the 'Survived' column of the training frame.
y = train_df.loc[:, 'Survived']
y

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

Now we will one-hot encode the categorical features (`Sex` and `Embarked`); the numeric features (`Pclass`, `SibSp`, `Parch`) are left unchanged.

In [97]:
# Columns fed to the model. get_dummies expands the string-valued columns
# into one indicator column per category and leaves numeric columns as-is.
features = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']
X = train_df[features].pipe(pd.get_dummies)
X

Unnamed: 0,Pclass,SibSp,Parch,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,3,1,0,0,1,0,0,1
1,1,1,0,1,0,1,0,0
2,3,0,0,1,0,0,0,1
3,1,1,0,1,0,0,0,1
4,3,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...
886,2,0,0,0,1,0,0,1
887,1,0,0,1,0,0,0,1
888,3,1,2,1,0,0,0,1
889,1,0,0,0,1,1,0,0


In [98]:
# Encode the test split the same way as the training split, then align its
# columns to the training matrix X. The two frames are encoded independently,
# so a category that appears in only one split would otherwise produce a
# different column set (or order) than the model was fitted on. Indicator
# columns missing from the test split are filled with 0; indicator columns
# unseen during training are dropped.
X_test = pd.get_dummies(test_df[features]).reindex(columns=X.columns, fill_value=0)
X_test

Unnamed: 0,Pclass,SibSp,Parch,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,3,0,0,0,1,0,1,0
1,3,1,0,1,0,0,0,1
2,2,0,0,0,1,0,1,0
3,3,0,0,0,1,0,0,1
4,3,1,1,1,0,0,0,1
...,...,...,...,...,...,...,...,...
413,3,0,0,0,1,0,0,1
414,1,0,0,1,0,1,0,0
415,3,0,0,0,1,0,0,1
416,3,0,0,0,1,0,0,1


## Creating Model

In [99]:
# Build the forest with a fixed seed so repeated runs give the same model,
# then fit it on the encoded training features and labels.
model = RandomForestClassifier(
    random_state=1,
    max_depth=5,
    n_estimators=100,
)
model.fit(X, y)

RandomForestClassifier(max_depth=5, random_state=1)

In [100]:
# Predict a 'Survived' label for every row of the encoded test features.
predictions = model.predict(X=X_test)

In [101]:
# Pair each test passenger's id with its predicted label; the two-column
# frame is what gets written out for submission.
output = test_df[['PassengerId']].assign(Survived=predictions)
output

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


Writing output to CSV file for submission

In [102]:
# Write the predictions without the index column, so the file contains only
# the PassengerId and Survived columns.
output.to_csv(path_or_buf='gender_submission.csv', index=False)

## All Done