Import the libraries

In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

Load the dataset

In [27]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


Preprocess the data

In [28]:
def extract_titles(name):
    # Extracts titles from names.
    tsearch = re.search('([A-Za-z]+)\.', name)
    if tsearch:
        return tsearch.group(1)
    return ""

def process_data(data):
    # Feature engineering
    data['FamilySize'] = data['SibSp'] + data['Parch'] + 1
    data['Has_Cabin'] = data["Cabin"].apply(lambda x: 0 if pd.isna(x) else 1)
    data['Fare'] = data['Fare'].fillna(train['Fare'].median())
    data.loc[data['FamilySize'] == 1, 'IsAlone'] = 1
    data['IsAlone'].fillna(0, inplace=True)  # Filling missing values
    data['Embarked'] = data['Embarked'].fillna('S')

    # Filling missing values in 'Age' column with random values within one standard deviation from the mean
    standard = data['Age'].std()
    average = data['Age'].mean()
    nullage = data['Age'].isnull().sum()
    nullagelist = np.random.randint(average - standard, average + standard, size=nullage)
    data.loc[np.isnan(data['Age']), 'Age'] = nullagelist
    data['Age'] = data['Age'].astype(int)

    # Extracting titles from names
    data['Title'] = data['Name'].apply(extract_titles)
    data['Title'] = data['Title'].replace({'Mme': 'Mrs', 'Ms': 'Miss', 'Mlle': 'Miss'})

    # Mapping categorical variables
    data['Sex'] = data['Sex'].map({'female': 0, 'male': 1}).astype(int)
    tmap = {"Mr": 1, "Master": 2, "Mrs": 3, "Miss": 4}
    data['Title'] = data['Title'].map(tmap)
    data['Title'] = data['Title'].fillna(0)
    data['Embarked'] = data['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)

    # Binning numerical features
    data['Fare'] = pd.cut(data['Fare'], bins=[0, 10, 20, 30, np.inf], labels=[0, 1, 2, 3])
    data['Fare'] = data['Fare'].cat.codes
    data['Age'] = pd.cut(data['Age'], bins=[0, 16, 32, 48, 64, np.inf], labels=[0, 1, 2, 3, 4])
    data['Age'] = data['Age'].cat.codes

    # Dropping unnecessary columns
    drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp', 'Sex']
    data.drop(drop_elements, axis=1, inplace=True)


  tsearch = re.search('([A-Za-z]+)\.', name)


Process the train and test datasets

In [29]:
process_data(train)
process_data(test)
train

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['IsAlone'].fillna(0, inplace=True)  # Filling missing values
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['IsAlone'].fillna(0, inplace=True)  # Filling missing values


Unnamed: 0,Survived,Pclass,Age,Parch,Fare,Embarked,FamilySize,Has_Cabin,IsAlone,Title
0,0,3,1,0,0,0,2,0,0.0,1.0
1,1,1,2,0,3,1,2,1,0.0,3.0
2,1,3,1,0,0,0,1,0,1.0,4.0
3,1,1,2,0,3,0,2,1,0.0,3.0
4,0,3,2,0,0,0,1,0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...
886,0,2,1,0,1,0,1,0,1.0,0.0
887,1,1,1,0,2,0,1,1,1.0,4.0
888,0,3,1,2,2,0,4,0,0.0,4.0
889,1,1,1,0,2,1,1,1,1.0,1.0


Decision Trees

In [30]:
class DecisionTree:
    def __init__(self, impurity='gini', max_depth=10, min_samples_split=4, min_samples_leaf=2):
        """
        Initialize the DecisionTree model.

        Parameters:
            impurity (str): The impurity criterion to be used for splitting.
            max_depth (int): The maximum depth of the decision tree.
            min_samples_split (int): The minimum number of samples required to split an internal node.
            min_samples_leaf (int): The minimum number of samples required to be at a leaf node.
        """
        self.impurity = impurity
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        
        # Define impurity criterion functions
        self.criterion_functions = {
            'gini': lambda y: 1 - np.sum((np.unique(y, return_counts=True)[1] / len(y)) ** 2),
            'entropy': lambda y: -np.sum((np.unique(y, return_counts=True)[1] / len(y)) *
                                          np.log2(np.unique(y, return_counts=True)[1] / len(y))),
            'misclassification_rate': lambda y: 1 - np.max(np.unique(y, return_counts=True)[1] / len(y))
        }
        
        # Select impurity measure based on user input or default to 'gini'
        self.homogeneity_measure = self.criterion_functions.get(impurity, self.criterion_functions['gini'])

    def fit(self, X, y):
        # Fit the decision tree model to the training data.
        # Convert input data to numpy arrays if they are DataFrame or Series
        if isinstance(X, pd.DataFrame):
            X = X.values
        if isinstance(y, pd.Series):
            y = y.values
        # Recursively split the data to construct the decision tree
        self.tree = self.split(X, y, 0)

    def split(self, X, y, depth):
        # Recursively splits the data to create the decision tree.
        n_samples, n_features = X.shape
        
        # Check stopping conditions: minimum samples or maximum depth reached
        if n_samples < self.min_samples_split or depth > self.max_depth:
            # Create leaf node and assign the mean value of target variable
            return {'leaf_value': np.round(np.mean(y))}
        
        # Initialize variables for best split
        best_left_X, best_left_y, best_right_X, best_right_y = None, None, None, None
        best_feature, best_threshold, best_impurity = None, None, np.inf
        
        # Iterate over features and their unique values to find best split
        for feature in range(n_features):
            unique_values = np.unique(X[:, feature])
            for threshold in unique_values:
                # Split data into left and right based on threshold
                left_index = X[:, feature] <= threshold
                right_index = ~left_index
                left_y, right_y = y[left_index], y[right_index]
                
                # Calculate impurity for left and right splits
                impurity = (len(left_y) / n_samples) * self.homogeneity_measure(left_y) + \
                           (len(right_y) / n_samples) * self.homogeneity_measure(right_y)
                
                # Update best split if impurity is minimized
                if impurity < best_impurity:
                    best_feature, best_threshold, best_impurity = feature, threshold, impurity
                    best_left_X, best_left_y = X[left_index], y[left_index]
                    best_right_X, best_right_y = X[right_index], y[right_index]
        
        # Check if a valid split was found
        if best_feature is not None:
            # Recursively split the left and right nodes
            left_subtree = self.split(best_left_X, best_left_y, depth + 1)
            right_subtree = self.split(best_right_X, best_right_y, depth + 1)
            
            # Construct tree node with split information and subtrees
            return {'feature': best_feature,
                    'threshold': best_threshold,
                    'left': left_subtree,
                    'right': right_subtree}
        
        # If no valid split was found, create leaf node with mean target value
        return {'leaf_value': np.round(np.mean(y))}

    def predict(self, X):
        # Predicts the target values for the input samples.
        X = pd.DataFrame(X)
        
        # Reset index to ensure correct iteration over rows
        X.reset_index(drop=True, inplace=True)
        
        # Initialize array to store predictions
        y_pred = np.zeros(X.shape[0])
        
        # Iterate over input samples and make predictions using the decision tree
        for i in range(X.shape[0]):
            y_pred[i] = self.predict_single(X.loc[i], self.tree)
        
        return y_pred

    def predict_single(self, x, node):
        # Traverse the tree until a leaf node is reached
        while 'leaf_value' not in node:
            # Determine which subtree to follow based on the feature and threshold
            if x[node['feature']] <= node['threshold']:
                node = node['left']
            else:
                node = node['right']
        
        # Return the predicted target value at the leaf node
        return node['leaf_value']


In [37]:
# Splitting data into features (X) and target variable (y)
y = train['Survived']
X = train.drop('Survived', axis=1)

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initializing and fitting the Decision Tree model
decisiontree = DecisionTree(max_depth=5)
decisiontree.fit(X_train, y_train)

# Making predictions on the test set
ypred = decisiontree.predict(X_test)

# Calculating accuracy of the Decision Tree model
dacc = accuracy_score(y_test, ypred)
print("Accuracy of Decision Tree Model : {:.2f}%".format(dacc * 100))

print(y)


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Accuracy of Decision Tree Model : 79.10%
0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64


  if x[node['feature']] <= node['threshold']:


Random Forests

In [35]:
class RandomForest:
    def __init__(self, classifier, num_trees, min_features):
        """
        Initialize the Random Forest classifier.

        Parameters:
            classifier (class): The base decision tree classifier class.
            num_trees (int): Number of decision trees in the forest.
            min_features (int): Minimum number of features to consider for each tree.
        """
        self.classifier = classifier
        self.num_trees = num_trees
        self.min_features = min_features
        self.trees = []

    def fit(self, X, y):
        # Fit the Random Forest model to the training data.
        # Reset index for both features and target variable
        X = X.reset_index(drop=True)
        y = y.reset_index(drop=True)

        # Build each decision tree in the forest
        for i in range(self.num_trees):
            # Randomly select subset of data with replacement
            cnt = np.random.randint(1, X.shape[0] + 1)
            sid = np.random.choice(X.index, cnt, replace=True)
            X_b, y_b = X.loc[sid], y.loc[sid]

            # Randomly select subset of features without replacement
            feats = np.random.randint(self.min_features, X.shape[1] + 1)
            fid = np.random.choice(X.columns, feats, replace=False)
            X_b = X_b[fid]

            # Initialize and fit the decision tree with the subset of data and features
            tree = self.classifier(max_depth=10, min_samples_split=2, min_samples_leaf=2)
            tree.fit(X_b, y_b)
            
            # Store the fitted tree along with the selected features
            self.trees.append((tree, fid))

    def predict(self, X):
        # Predict the target variable for the input data.
        # Make predictions using each decision tree in the forest
        aa = []
        for tree, fid in self.trees:
            xx = X[fid]
            aa.append(tree.predict(xx))
        
        # Combine predictions from all trees and return the most frequent prediction for each sample
        aa = np.array(aa).astype(int)
        return np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=0, arr=aa)


In [38]:
# Initializing and fitting the Random Forest model
randomforest = RandomForest(classifier=DecisionTreeClassifier, num_trees=100, min_features=4)
randomforest.fit(X_train, y_train)

# Making predictions on the test set
ypredr = randomforest.predict(X_test)

# Calculating accuracy of the Random Forest model
racc = accuracy_score(y_test, ypredr)
print("Accuracy of Random Forest Model : {:.2f}%".format(racc * 100))

print(y)


Accuracy of Random Forest Model : 78.36%
0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64


Boosting

In [39]:
class AdaBoost:
    def __init__(self, weak_learner, num_learners, learning_rate):
        """
        Initialize the AdaBoost classifier.

        Parameters:
            weak_learner (class): The base weak learner class.
            num_learners (int): Number of weak learners (decision stumps) to train.
            learning_rate (float): Learning rate to adjust the contribution of each weak learner.
        """
        self.weak_learner = weak_learner
        self.num_learners = num_learners
        self.learning_rate = learning_rate
    
    def fit(self, X, y):
        # Fit the AdaBoost model to the training data.
        self.mm = []  # List to store trained weak learners
        self.nn = []  # List to store weights of weak learners
        cnt = X.shape[0]
        yy = np.ones(cnt) / cnt  # Initialize sample weights

        # Train each weak learner
        for _ in range(self.num_learners):
            ml = self.weak_learner()  # Initialize weak learner
            ml.fit(X, y)  # Train weak learner
            ypred = ml.predict(X)  # Make predictions using weak learner
            grad = np.mean(np.abs(ypred - y) / 2 * yy) / np.mean(yy)  # Calculate error rate
            if grad > 0.5:
                break
            dd = self.learning_rate * np.log((1 - grad) / grad)  # Calculate learner weight
            self.mm.append(ml)  # Store weak learner
            self.nn.append(dd)  # Store learner weight
            yy *= np.exp(-dd * y * ypred)  # Update sample weights
            yy /= np.sum(yy)  # Normalize sample weights
            
    def predict(self, X):
        # Predict the target variable for the input data.
        n = X.shape[0]
        ypred = np.zeros(n)
        
        # Make predictions using each weak learner and weight
        for i in range(len(self.mm)):
            m1 = self.mm[i]
            m2 = self.nn[i]
            ypred += m2 * m1.predict(X)  # Weighted sum of predictions
        
        # Apply sign function to get final predictions
        return np.sign(ypred)


In [41]:
# Initializing and fitting the AdaBoost model
ab = AdaBoost(weak_learner=DecisionTreeClassifier, num_learners=200, learning_rate=0.1)
ab.fit(X_train, y_train)

# Making predictions on the test set
ypredb = ab.predict(X_test)

# Calculating accuracy of the AdaBoost model
bacc = accuracy_score(y_test, ypredb)
print("Accuracy of Boosting Model : {:.2f}%".format(bacc * 100))

print(y)


Accuracy of Boosting Model : 76.49%
0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64
