Importing all the required libraries

In [40]:
import numpy as np
import requests
from io import BytesIO

Filtering the dataset according to required classes

In [41]:
url = 'https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz'
response = requests.get(url)
data = np.load(BytesIO(response.content))
 
x_train = data['x_train']
y_train = data['y_train']
x_test = data['x_test']
y_test = data['y_test']

train_filter = np.where((y_train == 0) | (y_train == 1) | (y_train == 2))
test_filter = np.where((y_test == 0) | (y_test == 1) | (y_test == 2))

X_train = x_train[train_filter]
Y_train = y_train[train_filter]
X_test = x_test[test_filter]
Y_test = y_test[test_filter]

X_train = X_train.reshape((X_train.shape[0], 784))
X_test = X_test.reshape((X_test.shape[0], 784))

print("Dimensions of Test and Train filtered data: ")
print(X_train.shape)
print(X_test.shape)

Dimensions of Test and Train filtered data: 
(18623, 784)
(3147, 784)


Applying PCA on train dataset to obtain U matrix

In [42]:
X = X_train.T
X_mean = np.mean(X, axis = 0)
X_centre = X - X_mean
S = np.dot(X_centre, X_centre.T)/(X_centre.shape[1]-1)
eigenvalues, eigenvectors = np.linalg.eigh(S)
idx = eigenvalues.argsort()[::-1]
eigenvalues = eigenvalues[idx]
print("Dimensions of U: ")
U = eigenvectors[:, idx]
Y = np.dot(U.T, X_centre)
print(Y.shape)

Dimensions of U: 
(784, 18623)


Reducing features from 784 to 10 of Train and Test Dataset

In [43]:
p = 10
Up = U[:, :p]
print("Dimension of Up")
print(Up.shape)
Yp = np.dot(Up.T, X_centre)
X = Yp.T
Xre_p = np.dot(Up, Yp) + X_mean
print("New Dimension of Train Dataset")
print(X.shape)
X_test = X_test.T
X_test_centre = X_test - np.mean(X_test)
Yp2 = np.dot(Up.T, X_test_centre)
X_2 = Yp2.T
print("New Dimension of Test Dataset")
print(X_2.shape)

Dimension of Up
(784, 10)
New Dimension of Train Dataset
(18623, 10)
New Dimension of Test Dataset
(3147, 10)


Scatter plot of Train dataset where different color representing different classes

In [44]:
class DecisionTree:
    def __init__(self, max_depth):
        self.max_depth = max_depth
        self.tree = {}

    def fit(self, features, classes):
        self.tree = self.growTree(features, classes, depth=0)

    def giniImpurity(self, Y):
        classes = [0, 1, 2]
        gini = 0
        n = len(Y)
        for element in classes:
            p = np.sum(Y == element)/n
            gini += p*(1-p)
        return gini

    def bestSplit(self, features, Y):
        bestGini = float('inf')
        bestSplitDim = None
        threshold = None
        
        for dim in range(features.shape[1]):
            values = np.unique(features[:, dim])
            mini = np.min(values)
            maxi = np.max(values)
            for test in range(20):  
                value = np.random.uniform(mini, maxi)  
                leftIndices = features[:, dim] <= value
                rightIndices = features[:, dim] > value
                leftGini = self.giniImpurity(Y[leftIndices])
                rightGini = self.giniImpurity(Y[rightIndices])

                totalGini = (leftGini * np.sum(leftIndices) + rightGini * np.sum(rightIndices)) / len(Y)

                if totalGini < bestGini:
                    bestGini = totalGini
                    bestSplitDim = dim
                    threshold = value

        return bestSplitDim, threshold

    def growTree(self, features, Y, depth):
        if depth == self.max_depth or len(np.unique(Y)) == 1:
            return np.argmax(np.bincount(Y))
        splitDim, threshold = self.bestSplit(features, Y)
        leftIndices = features[:, splitDim] <= threshold
        rightIndices = features[:, splitDim] > threshold
        leftTree = self.growTree(features[leftIndices], Y[leftIndices], depth+1)
        rightTree = self.growTree(features[rightIndices], Y[rightIndices], depth+1)
        return {'splitDim': splitDim, 'threshold':threshold, 'left': leftTree, 'right': rightTree}

    def predict(self, features):
        pred = []
        for x in features:
            prediction = self.traverseTree(x, self.tree)
            pred.append(prediction)
        pred = np.array(pred)
        return pred

    def traverseTree(self, feature, tree):
        if isinstance(tree, dict):
            if feature[tree['splitDim']] <= tree['threshold']:
                return self.traverseTree(feature, tree['left'])
            else:
                return self.traverseTree(feature, tree['right'])
        else:
            return tree

tree = DecisionTree(2)
tree.fit(X, Y_train)

Testing and Determining overall and classwise accuracy

In [45]:
def accuracy(y_true, y_pred):
    for i in range(3):
        class_indices = np.where(y_true == i)[0]  
        real = np.sum(y_pred[class_indices] == i) 
        n = len(class_indices) 
        class_accuracy = (real/n)*100  
        print(f"Class {i} Accuracy: {class_accuracy:.2f} %")
    total_accuracy = np.mean(y_true==y_pred) * 100  
    print(f"Total Accuracy: {total_accuracy:.2f} %")

pred = tree.predict(X_2)
print("prediction Vector", pred)
accuracy(Y_test, pred)

prediction Vector [2 1 0 ... 0 1 2]
Class 0 Accuracy: 90.82 %
Class 1 Accuracy: 76.83 %
Class 2 Accuracy: 93.31 %
Total Accuracy: 86.59 %


Now applying Bagging on my Tree Model, Number of Dataset = 5 (with Replacement) and Printing the Overall and Classwise Accuracy

In [46]:
def createBaggedDatasets(X_train, Y_train):
    baggedDatasets = []
    for i in range(5):
        indexes = np.random.choice(X_train.shape[0], size=X_train.shape[0]-100, replace=True)
        X_temp = X_train[indexes]
        Y_temp = Y_train[indexes]
        baggedDatasets.append((X_temp, Y_temp))
    return baggedDatasets

def trainTrees(baggedDatasets):
    trees = []
    for X_temp, Y_temp in baggedDatasets:
        tree = DecisionTree(2)
        tree.fit(X_temp, Y_temp)
        trees.append(tree)
    return trees

baggedDatasets = createBaggedDatasets(X, Y_train)
trees = trainTrees(baggedDatasets)

In [47]:
totalPredictions = []

for tree in trees:
    predictions = tree.predict(X_2)
    totalPredictions.append(predictions)
totalPredictions = np.array(totalPredictions).T    

pred = []
for predictions in totalPredictions:
    labelCounts = {}
    for prediction in predictions:
        if prediction in labelCounts:
            labelCounts[prediction] += 1
        else:
            labelCounts[prediction] = 1
    predictedClass = None
    maxi = 0
    for label, cnt in labelCounts.items():
        if cnt>maxi:
            predictedClass=label
            maxi = cnt
    if maxi>=3:
        pred.append(predictedClass)
    else:
        pred.append(None)
    

In [48]:
def calculateAccuracy(y_true, y_pred):
    class_acc = {}
    for class_label in np.unique(y_true):
        indexes = np.where(y_true==class_label)[0]
        correct_predictions = np.sum(np.array(y_pred)[indexes]==class_label)
        n = len(indexes)
        class_acc[class_label] = (correct_predictions/n) * 100
        print(f"Class {class_label} Accuracy: {class_acc[class_label]:.2f} %")
    total_acc = np.mean(y_true == y_pred) * 100
    print(f"Total Accuracy: {total_acc:.2f} %")
    
calculateAccuracy(Y_test, pred)

Class 0 Accuracy: 93.06 %
Class 1 Accuracy: 83.26 %
Class 2 Accuracy: 84.98 %
Total Accuracy: 86.88 %
