In [1]:
import numpy as np
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

import seaborn as sns

# Load Titanic dataset
titanic = sns.load_dataset('titanic')

# Drop columns that are not useful for prediction
columns_to_drop = ['deck', 'embarked', 'who', 'sex', 'age']  # Example columns to drop
titanic_dropped = titanic.drop(columns=columns_to_drop)

# Target variable (y) - 'survived' column
y = titanic_dropped['survived']

# Features (X) - All columns except 'survived'
X = titanic_dropped.drop(columns='survived')

# View the first few rows of X and y
print("Features (X):")
print(X.head())

print("\nTarget (y):")
print(y.head())




Features (X):
   pclass  sibsp  parch     fare  class  adult_male  embark_town alive  alone
0       3      1      0   7.2500  Third        True  Southampton    no  False
1       1      1      0  71.2833  First       False    Cherbourg   yes  False
2       3      0      0   7.9250  Third       False  Southampton   yes   True
3       1      1      0  53.1000  First       False  Southampton   yes  False
4       3      0      0   8.0500  Third        True  Southampton    no   True

Target (y):
0    0
1    1
2    1
3    1
4    0
Name: survived, dtype: int64


In [2]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

# View the shape of the splits
print(f"Training features (X_train): {X_train.shape}")
print(f"Test features (X_test): {X_test.shape}")
print(f"Training target (y_train): {y_train.shape}")
print(f"Test target (y_test): {y_test.shape}")


Training features (X_train): (757, 9)
Test features (X_test): (134, 9)
Training target (y_train): (757,)
Test target (y_test): (134,)


In [3]:
def convert_to_numeric(df):
    for column in df.columns:
        # Convert column to numeric, invalid parsing will be set to NaN
        df[column] = pd.to_numeric(df[column], errors='coerce')
    return df

X_train = convert_to_numeric(X_train)
X_train_np = X_train.to_numpy()

X_test = convert_to_numeric(X_test)
X_test_np = X_test.to_numpy()

y_train_np = y_train.to_numpy()

y_test_np = y_test.to_numpy()

In [4]:
class DecisionTreeClassifier:
    def __init__(self, X, y):
        self.X = X
        self.y = y
        self.tree = dict()
        
    def entropy(self, splits):
        total_size = sum(len(split) for split in splits)
        if(total_size == 0):
            return 0
        total_entropy = 0
        
        for split in splits:
            split = np.array(split)
            classes = np.unique(split)
            split_entropy = 0
            for classs in classes:
                p = len(split[split ==classs])/len(split)
                split_entropy += - p * np.log2(p) if p > 0 else 0
                
            total_entropy += (len(split)/total_size)*split_entropy
            
        return total_entropy
        
    def best_feature(self):
        
        information_gains = []
        for idx in range(self.X.shape[1]):
            feature = self.X[:, idx]
            feature_vals = np.unique(feature)
            
            #split the data on basis of this feature
            splits = []
            for feature_val in feature_vals:
                # print(feature_val)
                splits.append(self.y[self.X[:, idx] == feature_val])
                
            entropyy = self.entropy(splits)
            parent_entropy = 0 if idx==0 else entropyy
            information_gain = parent_entropy - entropyy
            information_gains.append(information_gain)
            
        best_feature_idx = information_gains.index(max(information_gains))
        return best_feature_idx
           
    def build_tree(self, X, y, depth=0, max_depth=10, min_samples_split=2):
        if len(np.unique(y)) == 1:  
            return np.unique(y)[0]
        
        if len(X) < min_samples_split: 
            return np.bincount(y).argmax()  
        
        if depth >= max_depth: 
            return np.bincount(y).argmax()
        
        best_feature_idx = self.best_feature()
        best_feature_vals = np.unique(X[:, best_feature_idx])
        
        tree = {}
        for value in best_feature_vals:
            X_split = X[X[:, best_feature_idx] == value]
            y_split = y[X[:, best_feature_idx] == value]
            tree[value] = self.build_tree(X_split, y_split, depth + 1, max_depth, min_samples_split)
            
        return {best_feature_idx: tree}

    
    def fit(self, X, y):
        self.tree = self.build_tree(X, y)
        return self.tree


    def predict(self, X):
        predictions = [self.predict_single(x, self.tree) for x in X]
        return predictions
    
    def predict_single(self, x, tree):
        if isinstance(tree, dict):
            feature_idx = list(tree.keys())[0]
            feature_val = x[feature_idx]
        
            if feature_val in tree[feature_idx]:
                return self.predict_single(x, tree[feature_idx][feature_val])
        else:
            return tree
        
    # Metrics Calculation
    def calculate_metrics(self, y_pred, y):
        tp = tn = fp = fn = 0

        for idx, y_i in enumerate(y_pred):
            if y_i == 1 and y[idx] == 1:
                tp += 1  # True Positive
            elif y_i == 0 and y[idx] == 0:
                tn += 1  # True Negative
            elif y_i == 1 and y[idx] == 0:
                fp += 1  # False Positive
            elif y_i == 0 and y[idx] == 1:
                fn += 1  # False Negative
        
        accuracy = ((tp + tn) / len(y)) * 100
        precision = tp / (tp + fp) if tp + fp != 0 else 0
        recall = tp / (tp + fn) if tp + fn != 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if precision + recall != 0 else 0
        confusion_matrix = {'TP': tp, 'TN': tn, 'FP': fp, 'FN': fn}
        
        return accuracy, precision, recall, f1_score, confusion_matrix
    
    # Function to print the metrics
    def print_metrics(self, X_test, y_test):
        y_pred = self.predict(X_test)
        accuracy, precision, recall, f1_score, confusion_matrix = self.calculate_metrics(y_pred, y_test)

        print(f"Accuracy: {accuracy:.4f}%")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1-Score: {f1_score:.4f}")
        print("Confusion Matrix:")
        print(confusion_matrix)

In [5]:
dt = DecisionTreeClassifier(X_train_np, y_train_np)

tree = dt.fit(X_train_np, y_train_np)

In [6]:
# dt.predict(X_test_np)

In [7]:
print("TEST")
dt.print_metrics(X_test_np, y_test_np)

TEST
Accuracy: 64.1791%
Precision: 0.6333
Recall: 0.3393
F1-Score: 0.4419
Confusion Matrix:
{'TP': 19, 'TN': 67, 'FP': 11, 'FN': 37}


In [8]:
print("TRAIN")
dt.print_metrics(X_train_np, y_train_np)

TRAIN
Accuracy: 63.1440%
Precision: 0.5196
Recall: 0.3252
F1-Score: 0.4000
Confusion Matrix:
{'TP': 93, 'TN': 385, 'FP': 86, 'FN': 193}


## RANDOM FOREST

In [9]:
class RandomForestClassifier:
    def __init__(self, n_trees=100, max_depth=10, min_samples_split=2, max_features=None):
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.max_features = max_features
        self.trees = []

    def bootstrap_sample(self, X, y):
        n_samples = X.shape[0]
        idxs = np.random.choice(n_samples, size=n_samples, replace=True)
        return X[idxs], y[idxs]

    def fit(self, X, y):
        self.trees = []
        for _ in range(self.n_trees):
            X_sample, y_sample = self.bootstrap_sample(X, y)

            tree = DecisionTreeClassifier(X_sample, y_sample)
            tree.fit(X_sample, y_sample)
            self.trees.append(tree)

    def predict_single(self, x):
        tree_predictions = [tree.predict_single(x, tree.tree) for tree in self.trees]
        return max(set(tree_predictions), key=tree_predictions.count)

    def predict(self, X):
        return [self.predict_single(x) for x in X]

    def print_metrics(self, X_test, y_test):
        y_pred = self.predict(X_test)
        accuracy, precision, recall, f1_score, confusion_matrix = self.calculate_metrics(y_pred, y_test)

        print(f"Accuracy: {accuracy:.4f}%")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1-Score: {f1_score:.4f}")
        print("Confusion Matrix:")
        print(confusion_matrix)
    
    # Metrics Calculation
    def calculate_metrics(self, y_pred, y):
        tp = tn = fp = fn = 0
        for idx, y_i in enumerate(y_pred):
            if y_i == 1 and y[idx] == 1:
                tp += 1  # True Positive
            elif y_i == 0 and y[idx] == 0:
                tn += 1  # True Negative
            elif y_i == 1 and y[idx] == 0:
                fp += 1  # False Positive
            elif y_i == 0 and y[idx] == 1:
                fn += 1  # False Negative
        
        accuracy = ((tp + tn) / len(y)) * 100
        precision = tp / (tp + fp) if tp + fp != 0 else 0
        recall = tp / (tp + fn) if tp + fn != 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if precision + recall != 0 else 0
        confusion_matrix = {'TP': tp, 'TN': tn, 'FP': fp, 'FN': fn}
        
        return accuracy, precision, recall, f1_score, confusion_matrix


In [18]:
rf = RandomForestClassifier(15, 5, 5)
rf.fit(X_train_np, y_train_np)

In [19]:
print("TEST")
rf.print_metrics(X_test_np, y_test_np)

TEST
Accuracy: 64.1791%
Precision: 0.6333
Recall: 0.3393
F1-Score: 0.4419
Confusion Matrix:
{'TP': 19, 'TN': 67, 'FP': 11, 'FN': 37}


In [20]:
print("TRAIN")
rf.print_metrics(X_train_np, y_train_np)

TRAIN
Accuracy: 63.1440%
Precision: 0.5196
Recall: 0.3252
F1-Score: 0.4000
Confusion Matrix:
{'TP': 93, 'TN': 385, 'FP': 86, 'FN': 193}
