In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

## Loading the dataset and performing data preprocessing

In [16]:
# Read the passenger records from the relative path Data/Titanic.csv and
# preview the first five rows to confirm the columns loaded as expected.
titanic_csv_path = "Data/Titanic.csv"
titanic_data = pd.read_csv(titanic_csv_path)
titanic_data.head(n=5)

Unnamed: 0,sex,age,sibsp,parch,fare,embarked,class,who,alone,survived
0,male,22.0,1,0,7.25,S,Third,man,False,0
1,female,38.0,1,0,71.2833,C,First,woman,False,1
2,female,26.0,0,0,7.925,S,Third,woman,True,1
3,female,35.0,1,0,53.1,S,First,woman,False,1
4,male,35.0,0,0,8.05,S,Third,man,True,0


#### Checking for missing values

In [3]:
# Count the missing entries in every column (isna is the canonical spelling
# of isnull; both return the same boolean mask).
titanic_data.isna().sum()

sex           0
age         177
sibsp         0
parch         0
fare          0
embarked      2
class         0
who           0
alone         0
survived      0
dtype: int64

## **Since 177 passengers have missing age values, we will fill them with the median age**

In [4]:
# Impute missing values: each numeric column is filled with its own median,
# and the categorical `embarked` column is filled with its most frequent value.
# NOTE(review): these statistics are computed on the full frame, i.e. before
# the train/test split performed later in the notebook.
for numeric_column in ("age", "fare"):
    column_median = titanic_data[numeric_column].median()
    titanic_data[numeric_column] = titanic_data[numeric_column].fillna(column_median)

most_frequent_embarked = titanic_data["embarked"].mode()[0]
titanic_data["embarked"] = titanic_data["embarked"].fillna(most_frequent_embarked)

### Encoding categorical variables

In [5]:
# One-hot encode the non-numeric columns; each listed column is replaced by
# one indicator column per distinct value (e.g. sex -> sex_female, sex_male).
categorical_columns = ['sex', 'embarked', 'class', 'who', 'alone']
titanic_data = pd.get_dummies(data=titanic_data, columns=categorical_columns)

## Creating features and target variable

In [14]:
# Separate the label from the predictors: `y` is the `survived` column and
# `x` is every remaining column of the encoded frame.
target_column = 'survived'
y = titanic_data[target_column]
x = titanic_data.drop(columns=[target_column])

Unnamed: 0,age,sibsp,parch,fare,sex_female,sex_male,embarked_C,embarked_Q,embarked_S,class_First,class_Second,class_Third,who_child,who_man,who_woman,alone_False,alone_True
0,22.0,1,0,7.2500,0,1,0,0,1,0,0,1,0,1,0,1,0
1,38.0,1,0,71.2833,1,0,1,0,0,1,0,0,0,0,1,1,0
2,26.0,0,0,7.9250,1,0,0,0,1,0,0,1,0,0,1,0,1
3,35.0,1,0,53.1000,1,0,0,0,1,1,0,0,0,0,1,1,0
4,35.0,0,0,8.0500,0,1,0,0,1,0,0,1,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,27.0,0,0,13.0000,0,1,0,0,1,0,1,0,0,1,0,0,1
887,19.0,0,0,30.0000,1,0,0,0,1,1,0,0,0,0,1,0,1
888,28.0,1,2,23.4500,1,0,0,0,1,0,0,1,0,0,1,1,0
889,26.0,0,0,30.0000,0,1,1,0,0,1,0,0,0,1,0,0,1


### Splitting the dataset into training and testing sets

In [7]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
holdout_fraction = 0.2
split_seed = 42
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=holdout_fraction,
    random_state=split_seed,
)

## Training a classification model using Random Forest and making predictions

In [8]:
# Build a random forest with default hyperparameters (only the seed is fixed,
# for reproducibility) and fit it on the training split.
forest_seed = 42
random_forest_model = RandomForestClassifier(random_state=forest_seed)
random_forest_model.fit(X=x_train, y=y_train)

## Making the predictions

In [13]:
# Predict the `survived` label for every row of the held-out test split.
y_predict = random_forest_model.predict(X=x_test)

## Model Evaluation

In [10]:
# Fraction of test rows whose predicted label matches the true label.
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_predict)
print(f"Accuracy: {test_accuracy}")

Accuracy: 0.8212290502793296


In [11]:
# Per-class precision, recall and F1 on the test split.
# The label is printed on its own line: classification_report() returns a
# pre-aligned multi-line table, and prefixing its first line with text shifts
# the header row out of alignment with the value columns below it.
report_text = classification_report(y_test, y_predict)
print(f"Classification Report:\n{report_text}")

Classification Report:               precision    recall  f1-score   support

           0       0.83      0.87      0.85       105
           1       0.80      0.76      0.78        74

    accuracy                           0.82       179
   macro avg       0.82      0.81      0.81       179
weighted avg       0.82      0.82      0.82       179

