#1. DATA EXPLORATION AND UNDERSTANDING
1.   loaded the datasets
2.   printed the no. of rows and columns, the data type of each feature and no. of missing values in each feature



In [None]:
#LOAD AND EXPLORE DATA
import pandas as pd
# Read both CSV files up front so the two frames can be inspected side by side.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

# --- structure and missing values ---
# For each dataset: shape, per-column dtype and per-column NaN count.
for banner, frame in (
    ("=== Train Dataset Info ===", train),
    ("\n=== Test Dataset Info ===", test),
):
    print(banner)
    print("Shape:", frame.shape)
    print("\nData Types:\n", frame.dtypes)
    print("\nMissing Values:\n", frame.isnull().sum())

# --- first rows and statistical summary ---
# include='all' makes describe() cover non-numeric columns as well.
for head_banner, describe_banner, frame in (
    ("\n=== Train Head ===", "\n=== Train Describe ===", train),
    ("\n=== Test Head ===", "\n=== Test Describe ===", test),
):
    print(head_banner)
    print(frame.head(3))

    print(describe_banner)
    print(frame.describe(include='all'))



1. **categorical data**(*data that can be grouped into categories instead of measuring numerically*) is visualised using bar plots --> [Survival vs Pclass,Gender vs Survival].
2.   **numeric data** is visualised using histograms/KDE plots --> [Age distribution of survivors vs non-survivors, Fare distribution]
3.   **missing values** are quickly visualised using heatmaps



In [None]:
#VISUALIZE KEY PATTERNS
import matplotlib.pyplot as plt
import seaborn as sns

# Each entry pairs a seaborn plotting function with the keyword arguments for
# one figure; the figures are drawn in the order listed:
#   1. survival rate by passenger class
#   2. age distribution of survivors vs non-survivors (with KDE overlay)
#   3. gender-based survival analysis
#   4. fare distribution analysis
survival_plots = [
    (sns.barplot, {'x': 'Pclass', 'y': 'Survived'}),
    (sns.histplot, {'x': 'Age', 'hue': 'Survived', 'kde': True, 'bins': 30}),
    (sns.barplot, {'x': 'Sex', 'y': 'Survived'}),
    (sns.histplot, {'x': 'Fare', 'hue': 'Survived', 'bins': 30}),
]

for draw, options in survival_plots:
    draw(data=train, **options)
    plt.show()


In [None]:
# Heatmap for Missing Data
# Boolean frame with the same shape as train: True wherever a value is missing.
missing_mask = train.isnull()
sns.heatmap(missing_mask, cbar=False)
plt.show()

#2. DATA PRE-PROCESSING

###2.1 MISSING DATA ANALYSIS
1. to identify missing values we check which columns have 'NaN' and how many.\
   missing --> age,cabin,embarked,fare
2. visualisation --> heatmap used
3. **Age** - around 20% missing. Missing values are replaced by *median* of the column
4. **Embarked** - 2 values missing. Replaced by *mode* of column.
5. **Cabin** - 70% missing. Either *drop* the column or *create a feature* 'HasCabin'(1=yes,0=no)
6. **Fare** - 1 missing in test data. The missing value can be replaced with *median* of the column

In [None]:
#HANDLING MISSING VALUES
#filling missing data for 'Age'
# Each dataset is imputed with its own median. fillna() returns a new Series,
# so the result is assigned back to the column.
train['Age'] = train['Age'].fillna(train['Age'].median())
test['Age'] = test['Age'].fillna(test['Age'].median())

#filling missing data for 'Embarked'
# mode() returns a Series of the most frequent value(s); [0] takes the first.
# Only train is filled here: the notebook's analysis found no missing
# 'Embarked' values in the test data.
train['Embarked'] = train['Embarked'].fillna(train['Embarked'].mode()[0])

#Creating New Binary feature for known cabins
# notnull() yields True/False depending on whether 'Cabin' has a value;
# astype(int) converts that to 1/0, so Cabin_known is a 0/1 column.
train['Cabin_known'] = train['Cabin'].notnull().astype(int)
test['Cabin_known'] = test['Cabin'].notnull().astype(int)

# Drop the original Cabin column after creating Cabin_known.
# inplace=True on the DataFrame itself modifies train/test directly.
train.drop('Cabin', axis=1, inplace=True)
test.drop('Cabin', axis=1, inplace=True)


#filling missing data for 'Fare' in test dataset
# Assign the filled Series back instead of calling fillna(inplace=True) on
# test['Fare']: the in-place call acts on an intermediate Series, which emits a
# FutureWarning on pandas 2.x and leaves the DataFrame unchanged once
# Copy-on-Write is active.
test['Fare'] = test['Fare'].fillna(test['Fare'].median()) # only one missing value in test set


###2.2 FEATURE ENGINEERING
1. We extract titles from the names using python `split` operation
2. New feature named `FamilySize` is created.
3. Continuous variables like 'Age' are grouped

In [None]:
#FEATURE ENGINEERING

def _title_from_name(full_name):
    """Return the honorific embedded in a 'Surname, Title. Given names' string.

    Example: "Braund, Mr. Owen Harris"
      split(',')[1]  -> " Mr. Owen Harris"
      split('.')[0]  -> " Mr"
      strip()        -> "Mr"
    """
    after_surname = full_name.split(',')[1]
    return after_surname.split('.')[0].strip()


# Bin edges and the label assigned to each interval between consecutive edges.
# pd.cut uses right-closed intervals by default: (0, 12], (12, 18], (18, 35],
# (35, 60], (60, 100].
age_edges = [0, 12, 18, 35, 60, 100]
age_labels = ['Child','Teen','Young','Adult','Senior']

# The same three engineered columns are added to both datasets.
for frame in (train, test):
    # Title: honorific extracted from the passenger name.
    frame['Title'] = frame['Name'].apply(_title_from_name)

    # FamilySize: siblings/spouses + parents/children + the passenger themselves.
    frame['FamilySize'] = frame['SibSp'] + frame['Parch'] + 1

    # AgeBin: the numeric age replaced by its named interval, e.g. Age=4 -> 'Child'.
    frame['AgeBin'] = pd.cut(frame['Age'], bins=age_edges, labels=age_labels)

###2.3 ENCODING CATEGORICAL VARIABLES
1. 'Sex' is converted to numerical(0/1)
2. 'Embarked', 'Title', 'AgeBin' are One Hot Encoded

In [None]:
#ENCODING CATEGORICAL VARIABLES

#converting sex to numerical(0/1)
# map() replaces 'male' with 0 and 'female' with 1, making 'Sex' numeric.
train['Sex'] = train['Sex'].map({'male':0, 'female':1})
test['Sex'] = test['Sex'].map({'male':0, 'female':1})

#One Hot Encoding 'Embarked', 'Title', 'AgeBin'

def _aligned_dummies(train_col, test_col, prefix):
    """Return (train_dummies, test_dummies) sharing exactly the same columns.

    The train dummies are built with drop_first=True, which omits the first
    category level; a row with all dummy columns equal to 0 therefore denotes
    that omitted level. The test dummies are reindexed to the train columns, so
    test categories never seen in train (and the omitted level) become all-zero
    rows, and train categories absent from test become all-zero columns.
    """
    train_part = pd.get_dummies(train_col, prefix=prefix, drop_first=True).astype(int)
    test_part = (
        pd.get_dummies(test_col, prefix=prefix)
        .reindex(columns=train_part.columns, fill_value=0)
        .astype(int)
    )
    return train_part, test_part


# Adding dummy columns for 'Embarked','Title' and 'AgeBin', without removing the original.
# Both datasets receive the same dummy columns, in the same order, so that a
# model fitted on train can later be applied to test.
Embarked_dummies, Embarked_dummies_test = _aligned_dummies(train['Embarked'], test['Embarked'], 'Embarked')
train = pd.concat([train, Embarked_dummies], axis=1)
test = pd.concat([test, Embarked_dummies_test], axis=1)

Title_dummies, Title_dummies_test = _aligned_dummies(train['Title'], test['Title'], 'Title')
train = pd.concat([train, Title_dummies], axis=1)
test = pd.concat([test, Title_dummies_test], axis=1)

AgeBin_dummies, AgeBin_dummies_test = _aligned_dummies(train['AgeBin'], test['AgeBin'], 'AgeBin')
train = pd.concat([train, AgeBin_dummies], axis=1)
test = pd.concat([test, AgeBin_dummies_test], axis=1)





###2.4 FEATURE SELECTION

In [None]:
#FEATURE SELECTION

#dropping irrelevant columns
# Why these columns are dropped:
#   PassengerId - just an identifier, no relation with survival
#   Name        - free text; 'Title' has already been extracted from it
#   Ticket      - messy alphanumeric values, not used as a feature
# axis=1 removes columns (not rows); inplace=True modifies the frame directly.
train.drop(['PassengerId','Name','Ticket'], axis=1, inplace=True)
test.drop(['PassengerId','Name','Ticket'], axis=1, inplace=True)

#Analyze Feature Correlations

# Correlation matrix of the numeric features: a square matrix where each cell
# is the correlation between two columns. numeric_only=True skips
# object/categorical columns.
corr_matrix = train.corr(numeric_only=True)

# Correlation of every feature with the target 'Survived', sorted from the
# most positive to the most negative.
surv_corr = corr_matrix['Survived'].sort_values(ascending=False)
print("\n=== Correlation of Features with 'Survived' ===")
print(surv_corr)


# Visualize the full correlation matrix as a heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", square=True)
plt.title("Feature Correlation Matrix")
plt.show()

# Bar plot: Top 10 features most correlated with survival (excluding 'Survived' itself)
# 'Survived' is dropped because its correlation with itself is 1.
feature_corr = surv_corr.drop('Survived')
# Rank by absolute value so that strong negative correlations are not pushed
# out of the top 10; the plotted bars keep their sign.
strongest_first = feature_corr.abs().sort_values(ascending=False).index
top_corr = feature_corr.reindex(strongest_first).head(10)
plt.figure(figsize=(8, 5))
top_corr.plot(kind='barh')
plt.title("Top 10 Feature Correlations with Survival")
plt.xlabel("Correlation with 'Survived'")
plt.ylabel("Feature")
plt.gca().invert_yaxis()  # strongest correlation at the top
plt.show()

#3. RANDOM FOREST CLASSIFIER

###3.1 DECISION TREE IMPLEMENTATION
We first implemented a custom `DecisionTree` class from scratch. Each tree learns by recursively splitting the dataset on features that maximize information gain, using the Gini impurity measure as a criterion for split quality.

Key features of our decision tree implementation:
1. **Recursive tree building:** The algorithm continues splitting into left/right subtrees until all leaves are pure or until the stopping criteria (maximum depth, minimum samples) are met.
2. **Gini impurity calculation:** We use Gini impurity to measure how well each potential split separates the classes.
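3. **Best split selection:** At every node, the tree takes its candidate features (all of them, or a random subset of `max_features` of them), tries every observed value of each candidate as a threshold, and chooses the split that gives the largest reduction in Gini impurity.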
4. **Predictions:** For a given sample, the tree traverses down its branches based on feature values until it reaches a leaf, at which point the predicted class is returned.

In [None]:
#DECISION TREE IMPLEMENTATION
import numpy as np
from collections import Counter

class DecisionTree:
    """Decision-tree classifier grown greedily on Gini impurity reduction.

    Every internal node tests ``x[feature] < thresh``: samples that satisfy the
    test go to the left child, all others (including NaN values) go right.

    Parameters:
      - max_depth: maximum depth of the tree; None places no limit.
      - min_samples_split: a node holding fewer samples than this becomes a leaf.
      - criterion: stored for reference only; splits are always scored with
        Gini impurity.
      - max_features: number of features drawn (without replacement, via
        np.random) as split candidates at each node. None is resolved at fit
        time to max(1, int(sqrt(n_features))).

    Attributes:
      - tree: nested dict built by fit(), None before fitting. Leaves look like
        {'leaf': True, 'class': label}; internal nodes look like
        {'leaf': False, 'feature', 'thresh', 'left', 'right', 'gain'}.
    """

    def __init__(self, max_depth=None, min_samples_split=2, criterion = 'gini', max_features=None):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.criterion = criterion
        self.max_features = max_features
        self.tree = None
        # Per-split feature budget resolved by fit() when max_features is None.
        # Kept separate so the user-supplied hyperparameter is never overwritten
        # and a refit on data of a different width resolves it afresh.
        self._n_split_features = None

    def gini_impurity(self, y):
        """
        Computes the Gini impurity for a vector y of class labels.

        Returns 0 for an empty vector. Labels may be of any type np.unique can
        handle (negative integers, strings, ...).
        """
        labels = np.asarray(y)
        m = labels.size
        if m == 0:
            return 0
        if labels.dtype.kind == 'i' and labels.min() >= 0:
            # Fast path: np.bincount only accepts non-negative integers.
            class_counts = np.bincount(labels)
        else:
            _, class_counts = np.unique(labels, return_counts=True)
        probs = class_counts / m
        return 1 - np.sum(probs ** 2)

    def best_split(self, X, y):
        """
        Finds the best feature and threshold to split at this node.

        Returns (feature_index, threshold, gain). feature_index and threshold
        are None when no candidate split has a strictly positive gain.
        """
        n_features = X.shape[1]
        features = np.arange(n_features)

        # An explicit max_features always wins; otherwise use the value
        # resolved by fit() (None before fitting means "all features").
        limit = self.max_features if self.max_features is not None else self._n_split_features
        if limit is not None and limit < n_features:
            features = np.random.choice(n_features, limit, replace=False)

        best_gain, split_idx, split_thresh = 0, None, None
        # The parent impurity does not depend on the candidate split.
        parent_impurity = self.gini_impurity(y)

        for feature in features:
            column = X[:, feature]
            for thresh in np.unique(column):
                goes_left = column < thresh
                left = y[goes_left]
                # Everything that is not strictly below the threshold goes
                # right, exactly as predict_sample routes samples.
                right = y[~goes_left]
                if len(left) == 0 or len(right) == 0:
                    continue
                curr_gain = self.information_gain(y, left, right, parent_impurity)
                if curr_gain > best_gain:
                    best_gain = curr_gain
                    split_idx = feature
                    split_thresh = thresh
        return split_idx, split_thresh, best_gain

    def information_gain(self, parent, left, right, parent_impurity=None):
        """
        Returns the Gini impurity of parent minus the size-weighted impurity
        of the left and right children. Parent and children y's are given.

        parent_impurity may be passed when it has already been computed for
        parent; it is calculated here when omitted.
        """
        if parent_impurity is None:
            parent_impurity = self.gini_impurity(parent)
        w_left = len(left) / len(parent)
        w_right = len(right) / len(parent)
        gain = parent_impurity - (w_left * self.gini_impurity(left) + w_right * self.gini_impurity(right))
        return gain

    def split_data(self, X, y, feature, threshold):
        """
        Splits X, y into left/right groups based on feature and threshold.

        Rows with X[:, feature] < threshold go left, all remaining rows go
        right. Returns (X_left, y_left, X_right, y_right).
        """
        left_idxs = X[:, feature] < threshold
        right_idxs = ~left_idxs
        return X[left_idxs], y[left_idxs], X[right_idxs], y[right_idxs]

    def _make_leaf(self, y):
        """Returns a leaf node predicting the most frequent label in y."""
        return {'leaf': True, 'class': Counter(y).most_common(1)[0][0]}

    def build_tree(self, X, y, depth):
        """
        Recursively builds the tree and returns its root node (a dict).
        """
        num_samples = X.shape[0]
        num_labels = len(np.unique(y))
        # Stopping conditions: depth limit reached, node is pure, or node is
        # too small to split.
        if (self.max_depth is not None and depth >= self.max_depth) or num_labels == 1 or num_samples < self.min_samples_split:
            return self._make_leaf(y)

        # Find best split; without a positive-gain split the node is a leaf.
        feature, thresh, best_gain = self.best_split(X, y)
        if feature is None or thresh is None:
            return self._make_leaf(y)

        # Split and recurse
        X_left, y_left, X_right, y_right = self.split_data(X, y, feature, thresh)
        left = self.build_tree(X_left, y_left, depth + 1)
        right = self.build_tree(X_right, y_right, depth + 1)
        return {'leaf': False, 'feature': feature, 'thresh': thresh, 'left': left, 'right': right, 'gain':best_gain}

    def fit(self, X, y):
        """
        Fit the tree to data.

        X: 2-D array-like of shape (n_samples, n_features).
        y: 1-D array-like holding one class label per sample.
        Raises ValueError when X is not 2-D, when X and y disagree on the
        number of samples, or when there are no samples.
        """
        # Accept any array-like (lists, DataFrames, ...) by converting once.
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        if X.shape[0] != y.shape[0]:
            raise ValueError("X and y must contain the same number of samples")
        if X.shape[0] == 0:
            raise ValueError("cannot fit a tree on zero samples")
        if self.max_features is None:
            self._n_split_features = max(1, int(np.sqrt(X.shape[1])))
        else:
            self._n_split_features = self.max_features
        self.tree = self.build_tree(X, y, 0)

    def predict_sample(self, x, node):
        """
        Predict a single sample by walking from node down to a leaf.
        """
        while not node['leaf']:
            if x[node['feature']] < node['thresh']:
                node = node['left']
            else:
                node = node['right']
        return node['class']

    def predict(self, X):
        """
        Predict all samples; returns an array with one label per row of X.

        Raises RuntimeError when called before fit().
        """
        if self.tree is None:
            raise RuntimeError("DecisionTree.predict() called before fit()")
        X = np.asarray(X)
        return np.array([self.predict_sample(x, self.tree) for x in X])

def collect_tree_importances(node, importances):
    """Add the gain stored on every split node under ``node`` to ``importances``.

    ``importances`` is indexed by feature and updated in place; nothing is
    returned. Leaf nodes and ``None`` children contribute nothing, and a split
    node without a 'gain' entry contributes 0.
    """
    pending = [node]
    while pending:
        current = pending.pop()
        if current is None or current.get('leaf', False):
            continue
        importances[current['feature']] += current.get('gain', 0)
        left_child, right_child = current['left'], current['right']
        # Push right before left so the left subtree is popped (visited) first,
        # i.e. nodes are processed in node -> left -> right order.
        pending.append(right_child)
        pending.append(left_child)



###3.2 RANDOM FOREST CLASSIFIER IMPLEMENTATION
We implement a Random Forest classifier from scratch. This ensemble approach trains multiple decision trees on different bootstrap samples of the training data, and aggregates their predictions by majority voting. Random Forests help reduce overfitting compared to a single decision tree and provide a robust estimate of feature importance.

In this section:
1. Each tree is built using random subsets of the data and features.
2. The final prediction is based on the most common outcome predicted by all trees.
3. Hyperparameters such as the number of trees (`n_estimators`), maximum tree depth (`max_depth`), and minimum samples required for a split (`min_samples_split`) are tuned to optimize performance.

In [None]:
#RANDOM FOREST CLASSIFIER IMPLEMENTATION
class RandomForest:
    """
    Bagged ensemble of DecisionTree classifiers combined by majority vote.

    Parameters:
      - n_estimators: Number of trees in the forest
      - max_depth: Maximum depth of each tree
      - min_samples_split: Minimum samples for a split
      - max_features: Number of features examined at each split. An integer is
        used as given (clamped to the range 1..n_features), 'sqrt' means
        max(1, int(sqrt(n_features))), and any other value (e.g. None) means
        all features.

    Randomness (bootstrap rows, per-split feature subsets) comes from the
    global np.random state.
    """

    def __init__(self,n_estimators = 100,max_depth=10,min_samples_split=2,max_features=2):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.max_features = max_features
        self.trees = []

    #bootstrap sampling
    def _bootstrap_samples(self, X, y):
        """Draw n_samples rows of (X, y) uniformly with replacement."""
        n_samples = X.shape[0]
        idxs = np.random.choice(n_samples, n_samples, replace=True)
        return X[idxs], y[idxs]

    def _most_common_label(self, y):
        """Return the most frequent label in y (first seen wins a tie)."""
        counter = Counter(y)
        most_common = counter.most_common(1)[0][0]
        return most_common

    def random_feature_selection(self, n_features):
        """Return how many features each tree examines per split."""
        if self.max_features == 'sqrt':
            return max(1, int(np.sqrt(n_features)))
        is_integer = isinstance(self.max_features, (int, np.integer))
        if is_integer and not isinstance(self.max_features, bool):
            # Honour an explicit count instead of silently using every feature.
            return max(1, min(int(self.max_features), n_features))
        return n_features

    def fit(self, X, y):
        """
        Train n_estimators trees, each on its own bootstrap sample of (X, y).

        X: 2-D array-like (n_samples, n_features); y: 1-D array-like of labels.
        Any previously fitted trees are discarded.
        """
        # Bootstrap sampling indexes with integer arrays, which needs ndarrays.
        X = np.asarray(X)
        y = np.asarray(y)
        self.trees = []
        n_features = X.shape[1]
        n_feats_per_split = self.random_feature_selection(n_features)
        for _ in range(self.n_estimators):
            tree = DecisionTree(max_depth=self.max_depth,
                                min_samples_split=self.min_samples_split,
                                max_features=n_feats_per_split)
            X_sample, y_sample = self._bootstrap_samples(X, y)
            tree.fit(X_sample, y_sample)
            self.trees.append(tree)

    def predict(self, X,):
        """
        Return the majority-vote label for every row of X.

        Raises ValueError when the forest holds no trees (fit() not called).
        """
        if not self.trees:
            raise ValueError("RandomForest.predict() called before fit(): the forest has no trees")
        X = np.asarray(X)
        # One row per tree: [[s1t1, s2t1, s3t1], [s1t2, s2t2, s3t2], ...]
        predictions = np.array([tree.predict(X) for tree in self.trees])
        # One row per sample: [[s1t1, s1t2, s1t3], [s2t1, s2t2, s2t3], ...]
        tree_preds = np.swapaxes(predictions, 0, 1)
        predictions = np.array([self._most_common_label(pred) for pred in tree_preds])
        return predictions

#4. Model Evaluation and Hyperparameter Tuning

###4.1 Model Testing


In [None]:
from sklearn.model_selection import train_test_split

# Only numeric columns are fed to the model. The raw 'Embarked', 'Title' and
# 'AgeBin' columns are still present in train next to their one-hot dummies;
# keeping them would turn X into an object-dtype array whose string values get
# split by alphabetical order.
feature_frame = train.drop('Survived', axis=1).select_dtypes(include='number')
X = feature_frame.values
y = train['Survived'].values
# 80/20 split; random_state fixes which rows land in each part.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

def accuracy(y_true,y_pred):
    """Return the fraction of positions at which y_true and y_pred agree."""
    # Convert first so plain lists are compared element by element.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    return np.sum(y_true == y_pred)/len(y_true)

# The forest draws from the global np.random state; seeding it makes the
# reported accuracy and importances repeatable, like the seeded split above.
np.random.seed(42)
clf=RandomForest()
clf.fit(X_train,y_train)
predictions = clf.predict(X_test)
acc = accuracy(y_test,predictions)
print(f"Accuracy: {acc:.4f}")

n_features = X_train.shape[1]    # Set to the number of input features (not including target)

# Sum the per-tree gain totals feature by feature.
forest_importances = np.zeros(n_features)
for tree in clf.trees:
    importances = np.zeros(n_features)
    collect_tree_importances(tree.tree, importances)
    forest_importances += importances

# Normalise so the importances sum to 1; skipped when no tree made a split
# (total of 0), which would otherwise divide by zero.
total_gain = forest_importances.sum()
if total_gain > 0:
    forest_importances /= total_gain

# Feature names in the same column order as X.
feature_names = feature_frame.columns.tolist()

#making a list of important features, most important first
feature_importance_pairs = sorted(zip(feature_names, forest_importances),
                                  key=lambda pair: pair[1], reverse=True)

#setting N=5 for top 5 features to display
N=5
print(f"\nTop {N} Most Important Features:")
for feat, score in feature_importance_pairs[:N]:
    print(f"{feat}: {score:.3f}")

###4.1 Hyperparameter Tuning

In [None]:
import itertools

# Defining the hyperparameter grid
n_estimators_list = [10, 50, 100, 200]
max_depth_list = [3, 5, 10, None]
min_samples_split_list = [2, 5, 10]

# Hold out part of the training data for model selection. Choosing
# hyperparameters by their score on X_test would leak the test split into the
# model choice and make the final test metrics optimistic.
X_fit, X_val, y_fit, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Preparing to track results
results = []

# Looping through all combinations
for n_estimators, max_depth, min_samples_split in itertools.product(n_estimators_list, max_depth_list, min_samples_split_list):

    rf = RandomForest(n_estimators = n_estimators,
                      max_depth = max_depth,
                      min_samples_split = min_samples_split)
    rf.fit(X_fit, y_fit)
    preds = rf.predict(X_val)

    #calculating accuracy on the validation split
    # (named val_accuracy so the accuracy() function defined earlier in the
    # notebook is not overwritten by a float)
    val_accuracy = np.mean(preds == y_val)

    #storing in results
    results.append({'n_estimators': n_estimators,
                    'max_depth': max_depth,
                    'min_samples_split': min_samples_split,
                    'accuracy': val_accuracy})

#converting to dataframe
df_results = pd.DataFrame(results)

#sorting by accuracy, best combination first
df_results = df_results.sort_values(by='accuracy', ascending=False)

print(df_results.head(5))

###4.2 Performance Metrics Implementation

In [None]:
# Predictions of the forest `clf` (fitted in the model-testing cell) for the
# X_test split; reused by all metric calls below.
y_test_pred = clf.predict(X_test)

#Accuracy
def accuracy_score(y_test, y_test_pred):
    """
    Returns the accuracy of the predictions: the fraction of positions where
    y_test and y_test_pred hold the same label.

    Both arguments may be any array-like (lists included). Returns 0.0 for
    empty input and raises ValueError when the two shapes differ.
    """
    # Convert first: comparing two plain lists with == yields a single bool,
    # not an element-wise result.
    actual = np.asarray(y_test)
    predicted = np.asarray(y_test_pred)
    if actual.shape != predicted.shape:
        raise ValueError("y_test and y_test_pred must have the same shape")
    if actual.size == 0:
        return 0.0
    return np.sum(actual == predicted) / len(actual)

#Confusion Matrix
def confusion_matrix(y_test, y_test_pred):
    """
    Returns the confusion matrix in the form:
        [[TN, FP],
         [FN, TP]]
    for binary classification (0 = negative, 1 = positive).

    Both arguments may be any array-like (lists included). Labels other than
    0 and 1 are not counted in any cell. Raises ValueError when the two
    shapes differ.
    """
    # Convert first: `some_list == 0` is a single False, which would make
    # every count 0 for list inputs.
    actual = np.asarray(y_test)
    predicted = np.asarray(y_test_pred)
    if actual.shape != predicted.shape:
        raise ValueError("y_test and y_test_pred must have the same shape")
    tn = np.sum((actual == 0) & (predicted == 0))
    fp = np.sum((actual == 0) & (predicted == 1))
    fn = np.sum((actual == 1) & (predicted == 0))
    tp = np.sum((actual == 1) & (predicted == 1))
    return np.array([[tn, fp],
                     [fn, tp]])


#Precision,Recall,F1 score
def precision_recall_f1(y_test, y_test_pred):
    """
    Returns (precision, recall, f1) for the positive class (label 1), derived
    from the cells of confusion_matrix(y_test, y_test_pred).

    Any ratio whose denominator is zero is reported as 0.
    """
    _, fp, fn, tp = confusion_matrix(y_test, y_test_pred).ravel()

    # Precision: share of predicted positives that are truly positive.
    predicted_positive = tp + fp
    if predicted_positive:
        precision = tp / predicted_positive
    else:
        precision = 0

    # Recall: share of actual positives that were predicted positive.
    actual_positive = tp + fn
    if actual_positive:
        recall = tp / actual_positive
    else:
        recall = 0

    # F1: harmonic mean of precision and recall.
    pr_sum = precision + recall
    if pr_sum:
        f1 = (2 * precision * recall) / pr_sum
    else:
        f1 = 0
    return precision, recall, f1


# Compute every metric for the test-split predictions first ...
acc = accuracy_score(y_test, y_test_pred)
cm = confusion_matrix(y_test, y_test_pred)
precision, recall, f1 = precision_recall_f1(y_test, y_test_pred)

# ... then report them: accuracy, confusion matrix, precision/recall/F1.
for report_line in (
    f"Accuracy: {acc:.4f}",
    f"Confusion Matrix:\n{cm}",
    f"Precision: {precision:.4f}  Recall: {recall:.4f}  F1 Score: {f1:.4f}",
):
    print(report_line)