In [1]:
import pandas as pd

# loading the training data
df = pd.read_csv('data/raw/train.csv')

# checking first few rows
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
#getting overview of columns and datatypes.
df.info()  

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [3]:
# counting missing null values
df.isna().sum() 

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [4]:
# summary stats for numerical columns
df.describe() 

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [5]:
# filling missing age,missing embarkation and dropping cabin as too many values are missing.
df['Age'].fillna(df['Age'].median(), inplace=True)
df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)
df.drop('Cabin', axis=1, inplace=True)

# Checking again to make sure no columns have missing data now
df.isna().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)


PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

In [6]:
# creating some features and removing some which we can humanly see that it helps the algorithm.

# Creating FamilySize: total family members aboard (including self)
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

# Creating IsAlone: 1 if traveling alone, 0 otherwise
df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

# Converting Sex text to numeric: male→0, female→1
df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})

# Turning Embarked into two binary columns (Q and S; C is base)
embarked_dummies = pd.get_dummies(df['Embarked'], prefix='Embarked', drop_first=True)
df = pd.concat([df, embarked_dummies], axis=1)
df.drop('Embarked', axis=1, inplace=True)

# Dropping columns that aren't predictive
df.drop(['PassengerId', 'Name', 'Ticket'], axis=1, inplace=True)

print(df.shape)  
df.head()      


(891, 11)


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,FamilySize,IsAlone,Embarked_Q,Embarked_S
0,0,3,0,22.0,1,0,7.25,2,0,False,True
1,1,1,1,38.0,1,0,71.2833,2,0,False,False
2,1,3,1,26.0,0,0,7.925,1,1,False,True
3,1,1,1,35.0,1,0,53.1,2,0,False,True
4,0,3,0,35.0,0,0,8.05,1,1,False,True


In [22]:
import pandas as pd

# 1. Reload the raw data so every column is present
df = pd.read_csv('data/raw/train.csv')

# ==============================================
# 2. Initial Cleaning (fill missing values, drop Cabin)
# ==============================================

# 2.1 Fill missing Age with the median
df['Age'] = df['Age'].fillna(df['Age'].median())

# 2.2 Fill missing Embarked with the mode
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

# 2.3 Drop Cabin because it has too many missing values
df = df.drop('Cabin', axis=1)

# ==============================================
# 3. Original Feature Engineering (keep Name/Embarked until after these steps)
# ==============================================

# 3.1 Create FamilySize and IsAlone
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
df['IsAlone']    = (df['FamilySize'] == 1).astype(int)

# 3.2 Encode Sex
df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})

# 3.3 One-hot encode Embarked (we still have df['Embarked'] here)
embarked_dummies = pd.get_dummies(df['Embarked'], prefix='Embarked', drop_first=True)
df = pd.concat([df, embarked_dummies], axis=1)

# ==============================================
# 4. New Feature Engineering: Title & AgeGroup
# ==============================================

# 4.1 Extract “Title” from Name
df['Title'] = df['Name'].str.extract(r',\s*([^\.]+)\.', expand=False)

# 4.2 Map rare titles to “Rare” and standardize common ones
title_map = {
    "Mr": "Mr", "Mrs": "Mrs", "Miss": "Miss", "Master": "Master",
    "Don": "Rare", "Rev": "Rare", "Dr": "Rare", "Mme": "Mrs", "Ms": "Miss",
    "Major": "Rare", "Lady": "Rare", "Sir": "Rare", "Mlle": "Miss", "Col": "Rare",
    "Capt": "Rare", "Countess": "Rare", "Jonkheer": "Rare", "Dona": "Rare"
}
df['Title'] = df['Title'].map(title_map)

# 4.3 One-hot encode Title
title_dummies = pd.get_dummies(df['Title'], prefix='Title', drop_first=True)
df = pd.concat([df, title_dummies], axis=1)

# 4.4 Bin Age into categories (“AgeGroup”)
bins = [0, 12, 18, 35, 60, 120]
labels = ['Child', 'Teen', 'Adult', 'Middle', 'Senior']
df['AgeGroup'] = pd.cut(df['Age'], bins=bins, labels=labels)

# 4.5 One-hot encode AgeGroup
age_dummies = pd.get_dummies(df['AgeGroup'], prefix='AgeGroup', drop_first=True)
df = pd.concat([df, age_dummies], axis=1)

# ==============================================
# 5. Drop columns no longer needed
# ==============================================

df = df.drop([
    'Name',       # we extracted Title already
    'Title',      # now represented by dummies
    'AgeGroup',   # now represented by dummies
    'PassengerId', 
    'Ticket', 
    'Embarked'    # we encoded it already with dummies
], axis=1)

# ==============================================
# 6. Quick sanity check of columns
# ==============================================

print("Final df shape and columns:")
print(df.shape)
print(df.columns.tolist())


Final df shape and columns:
(891, 19)
['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'FamilySize', 'IsAlone', 'Embarked_Q', 'Embarked_S', 'Title_Miss', 'Title_Mr', 'Title_Mrs', 'Title_Rare', 'AgeGroup_Teen', 'AgeGroup_Adult', 'AgeGroup_Middle', 'AgeGroup_Senior']


In [23]:
from sklearn.model_selection import train_test_split

# I'm doing it like splitting it into 70% training set, 20% CV set, 10% testing set.

X = df.drop('Survived', axis=1)
y = df['Survived']
# 1. First split off the 10% test set
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y,
    test_size=0.10,      # 10% goes to test
    stratify=y,          # keep same class balance
    random_state=42
)

# 2. Now split the remaining 90% into 70% train and 20% validation
#    Since X_temp is 90%, and we want 20% overall for validation,
#    we need val_size = 20/90 ≈ 0.2222 of X_temp.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp,
    test_size=0.2222,    # ~20% of original
    stratify=y_temp,     # keep class balance
    random_state=42
)

# Quick check of sizes
print("Train set:     ", X_train.shape, y_train.shape)   # ~70% of 891 → ~624 rows
print("Validation set:", X_val.shape,   y_val.shape)     # ~20% of 891 → ~179 rows

print("Test set:      ", X_test.shape,  y_test.shape)    # ~10% of 891 → ~89 rows

Train set:      (623, 18) (623,)
Validation set: (178, 18) (178,)
Test set:       (90, 18) (90,)


# NOW LETS TRY LOGISTIC REGRESSION MODEL.

In [8]:
# Training in Logistic Regression

from sklearn.linear_model import LogisticRegression

# Initialize the model
logreg = LogisticRegression(max_iter=500, random_state=42)

# Fit on the training set
logreg.fit(X_train, y_train)

In [9]:
# Validating the Model

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Predict on the validation set
y_val_pred = logreg.predict(X_val)

# Compute accuracy
val_acc = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {val_acc:.4f}")

# Show confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred))

# Show precision, recall, f1-score
print("Classification Report:\n", classification_report(y_val, y_val_pred))


Validation Accuracy: 0.8202
Confusion Matrix:
 [[97 13]
 [19 49]]
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.88      0.86       110
           1       0.79      0.72      0.75        68

    accuracy                           0.82       178
   macro avg       0.81      0.80      0.81       178
weighted avg       0.82      0.82      0.82       178



# NOW LETS DO ON DECISION TREE ALGORITHM

In [10]:
# Training set

from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(
    max_depth=5,         # limit depth to prevent overfitting
    min_samples_leaf=5,  # require at least 5 samples per leaf
    random_state=42
)

# Fitting on the training data
dt.fit(X_train, y_train) 

In [11]:
# Validation set.
y_val_pred_dt = dt.predict(X_val)

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
acc_dt = accuracy_score(y_val, y_val_pred_dt)
print(f"Decision Tree Validation Accuracy: {acc_dt:.4f}")

print("Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred_dt))

# Classification report
print("Classification Report:\n", classification_report(y_val, y_val_pred_dt))


Decision Tree Validation Accuracy: 0.8315
Confusion Matrix:
 [[104   6]
 [ 24  44]]
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.95      0.87       110
           1       0.88      0.65      0.75        68

    accuracy                           0.83       178
   macro avg       0.85      0.80      0.81       178
weighted avg       0.84      0.83      0.82       178



In [12]:
# checkinh geature importance
import pandas as pd

feat_imp = pd.Series(dt.feature_importances_, index=X_train.columns)

print(feat_imp.sort_values(ascending=False))

Sex           0.552305
Pclass        0.209087
Age           0.112136
Fare          0.053396
FamilySize    0.043650
SibSp         0.029426
Parch         0.000000
IsAlone       0.000000
Embarked_Q    0.000000
Embarked_S    0.000000
dtype: float64


# Lets try Random Forest Algoritm

In [24]:
# Training set

from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=100,      # number of trees
    max_depth=7,           # limit depth for each tree
    min_samples_leaf=5,    # require at least 5 samples per leaf
    random_state=42,
    n_jobs=-1              # use all CPU cores
)

rf.fit(X_train, y_train)

In [25]:
# Validation set

# Predict on validation set
y_val_pred_rf = rf.predict(X_val)

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

acc_rf = accuracy_score(y_val, y_val_pred_rf)
print(f"Random Forest Validation Accuracy: {acc_rf:.4f}")

print("Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred_rf))

print("Classification Report:\n", classification_report(y_val, y_val_pred_rf))


Random Forest Validation Accuracy: 0.8371
Confusion Matrix:
 [[103   7]
 [ 22  46]]
Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.94      0.88       110
           1       0.87      0.68      0.76        68

    accuracy                           0.84       178
   macro avg       0.85      0.81      0.82       178
weighted avg       0.84      0.84      0.83       178



In [26]:
#Feature importance

import pandas as pd

# Retrieve and sort feature importances
rf_imp = pd.Series(rf.feature_importances_, index=X_train.columns)
print("Random Forest Feature Importances:\n", rf_imp.sort_values(ascending=False))


Random Forest Feature Importances:
 Title_Mr           0.206756
Sex                0.196352
Fare               0.138644
Pclass             0.119734
Age                0.085842
Title_Mrs          0.066016
Title_Miss         0.049662
FamilySize         0.046443
SibSp              0.031735
Parch              0.012370
Embarked_S         0.011932
IsAlone            0.009940
AgeGroup_Adult     0.006465
AgeGroup_Middle    0.005930
Embarked_Q         0.004782
Title_Rare         0.003700
AgeGroup_Teen      0.002083
AgeGroup_Senior    0.001614
dtype: float64


## **RANDOM FOREST IS THE WINNER !!**

# We will now try to hypertune the parameters in Random Forest

In [27]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble    import RandomForestClassifier

# 1) Define a grid of hyperparameters to try
param_grid = {
    'n_estimators': [100, 200, 300],       # number of trees
    'max_depth': [5, 7, 10, None],         # tree depth (None = unlimited)
    'min_samples_split': [2, 5, 10],       # min samples to split a node
    'min_samples_leaf': [1, 3, 5],         # min samples at a leaf
    'max_features': ['sqrt', 'log2', None] # how many features to consider at each split
}

# 2) Initialize a basic RandomForestClassifier (just as a placeholder)
base_rf = RandomForestClassifier(random_state=42)

# 3) Wrap in GridSearchCV
#    - cv=3 means 3-fold cross-validation on X_train/y_train
#    - scoring='accuracy' means we’re optimizing for accuracy
#    - n_jobs=-1 means use all CPU cores
grid_search = GridSearchCV(
    estimator=base_rf,
    param_grid=param_grid,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

# 4) Fit grid search on the TRAINING data (not validation/test)
grid_search.fit(X_train, y_train)

# 5) After it finishes, retrieve the best estimator
tuned_rf = grid_search.best_estimator_

# 6) Show the best hyperparameters and the best cross-validation accuracy
print("Best hyperparameters:", grid_search.best_params_)
print("Best CV accuracy:", grid_search.best_score_)

# 7) Evaluate this tuned model on the VALIDATION set
y_val_pred_tuned = tuned_rf.predict(X_val)

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

print("Tuned RF Validation Accuracy:", accuracy_score(y_val, y_val_pred_tuned))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred_tuned))
print("Classification Report:\n", classification_report(y_val, y_val_pred_tuned))

Fitting 3 folds for each of 324 candidates, totalling 972 fits
Best hyperparameters: {'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
Best CV accuracy: 0.8202341137123746
Tuned RF Validation Accuracy: 0.8539325842696629
Confusion Matrix:
 [[104   6]
 [ 20  48]]
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.95      0.89       110
           1       0.89      0.71      0.79        68

    accuracy                           0.85       178
   macro avg       0.86      0.83      0.84       178
weighted avg       0.86      0.85      0.85       178



We can see that tuned RF accuracy is 83.7078% and our base RF accuracy was 83.71%. 
So we can conclude that hypertuning didn't help us much because our base line model already captured enough.

# So let's try to enrich the features.

after enriching and hypertuning them we can see that accuracy increased to 85.393%.

In [28]:
# For its generalisation we are checking the test data that we did not feed the model.

y_test_pred = rf.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

Test Accuracy: 0.8111111111111111
              precision    recall  f1-score   support

           0       0.80      0.93      0.86        55
           1       0.85      0.63      0.72        35

    accuracy                           0.81        90
   macro avg       0.82      0.78      0.79        90
weighted avg       0.82      0.81      0.80        90



## **THE END**