# COSC 74: Machine Learning

## Homework  4

### Needed Imports

In [146]:
import pandas as pd
import numpy as np
from numpy import typing as npt
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
# from sklearn import naive_bayes
%matplotlib inline

### 1. Load the Data and Partition it

80% of the data is used for training and 20% for testing.

In [147]:
df = pd.read_csv("hw4_naive.csv")
print(df.head())

dataset = np.array(df)

trainset, testset = train_test_split(dataset, test_size=0.2, random_state=0)


   Feature_1  Feature_2  Feature_3  Feature_4  Feature_5  Feature_6  Label
0    7430.14    9529.78   -2453.33         19        123        621      0
1   11256.40   50455.10   -4220.00         18        216       2677      0
2   13093.00   51897.10   -2880.00         30        234       2464      0
3   14303.00  102632.00   -5702.20        144        281       4061      1
4   14688.00   83343.40   -2430.00         52        223       2822      1


### 2. Multinomial Naivë Bayes classifier with Smooting 

In [166]:
# naive_bayes.MultinomialNB()

class NaiveBayes():
    """
    Naive Bayes Predictor.

    """

    def __init__(self, trainset=None, smoothing_coeff=1):
        """
            Initializes the Naive Bayes predictor.

            Inputs:
            -------
                `trainset`: 2D numpy array of training data.
                    >>> NOTE: The last column of the data is the class label.
                `smoothing_coeff`: float, smoothing coefficient
                        for multinomial classifier.
                        Default value = 1.

        """

        self.possible_classes = list(np.unique(trainset[:, -1]))
        self.trainset: npt.NDArray = trainset
        self.smoothing_coeff: float = smoothing_coeff
        self.feature_count: int = self.trainset.shape[1] - 1

        # these variables are used by the gaussian classifier
        self.gaussian_means = None
        self.gaussian_variance = None

    def multinomial(self, feature_vector):
        """
            Calculates the probability of each class given the data.

            Inputs:
            -------

            `feature_vector`: 1D numpy array for aa single row of data.

            Returns:
            --------
            A classification for the feature vector,
            based on the priorly saved X_train and y_train.

        """

        class_count = len(self.possible_classes)
        classes = np.array([0 for _ in range(class_count)])
        class_totals = np.array([0 for _ in self.possible_classes])
        for record in self.trainset:
            label = record[-1]
            class_index = self.possible_classes.index(label)
            for feature in range(self.feature_count):
                if record[feature] == feature_vector[feature]:
                    classes[class_index] += 1

                class_totals[class_index] += 1

        probabilities = (classes + self.smoothing_coeff) / (class_totals + class_count)
        return self.possible_classes[np.argmax(probabilities)]

    def multinomial_predictions(self, feature_vectors):
        """
            Calculates the multinomial predictions for each record in a matrix.

            Inputs:
            -------
                `feature_vectors`: 2D numpy array of feature vectors.
            
            Outputs:
            --------
                `predictions`: 1D list of predictions.

        """
        return np.apply_along_axis(self.multinomial, 1, feature_vectors)
    
    def gaussian(self, feature_vector):
        """
            Calculates the probability of each class given the data.

            Inputs:
            -------

            `feature_vector`: 1D numpy array for aa single row of data.

            Returns:
            --------
            A classification for the feature vector,
            based on the priorly saved X_train and y_train.

        """
        if feature_vector.shape[0] == self.feature_count + 1:
            feature_vector = feature_vector[:-1]


        # if means and variance are not initialized, initialize them
        if self.gaussian_means is None:
            self.gaussian_means = []
            self.gaussian_variance = []

            for possible_class in self.possible_classes:
                class_data = self.trainset[self.trainset[:, -1] == possible_class]
                class_mean = np.mean(class_data[:, :-1], axis=0)
                class_variance = np.var(class_data[:, :-1], axis=0)
                self.gaussian_means.append(class_mean)
                self.gaussian_variance.append(class_variance)

        probabilities = []
        for possible_class in range(len(self.possible_classes)):
            class_mean = self.gaussian_means[possible_class]
            class_variance = self.gaussian_variance[possible_class]
            prob = (1 / (np.sqrt(2 * np.pi * class_variance)))
            prob *= np.exp(-( (feature_vector - class_mean) ** 2) / (2 * class_variance))
            probabilities.append(np.product(prob))
            
        return self.possible_classes[np.argmax(probabilities)]

    def gaussian_predictions(self, feature_vectors):
        """
            Calculates the multinomial predictions for each record in a matrix.

            Inputs:
            -------
                `feature_vectors`: 2D numpy array of feature vectors.
            
            Outputs:
            --------
                `predictions`: 1D list of predictions.

        """
        return np.apply_along_axis(self.gaussian, 1, feature_vectors)


### Test Multinomial Naive Bayes classifier with Smoothing.

In [164]:
nb = NaiveBayes(trainset)
multinomial_predictions = nb.multinomial_predictions(testset)

# print(f"{multinomial_predictions = }")

# check the accuracy of the predictions
count = len(multinomial_predictions)
true_positive = 0
true_negative = 0
false_positive = 0
false_negative = 0
for i in range(count):
    prediction = multinomial_predictions[i]
    if prediction == testset[i][-1]:
        if prediction == 1:
            true_positive += 1
        else:
            true_negative += 1
    else:
        if prediction == 1:
            false_positive += 1
        else:
            false_negative += 1

correct = true_positive + true_negative
precision = true_positive / (true_positive + false_positive)
recall = true_positive / (true_positive + false_negative)
f1 = 2 * precision * recall / (precision + recall)
print(f"{correct = }, total = {count}")
print(f"{precision = }\n{recall = }\n{f1 = }")
print(f"accuracy = {correct / count}")

self.trainset.shape = (4480, 7)
self.possible_classes = [0.0, 1.0]
multinomial_predictions = array([1., 0., 0., ..., 0., 0., 0.])
correct = 865, total = 1120
precision = 0.8523076923076923
recall = 0.5723140495867769
f1 = 0.684796044499382
accuracy = 0.7723214285714286


### Test Gaussian Naive Bayes classifier.

In [165]:
# nb = NaiveBayes(trainset)  # if initialized above, no need to initialize here.
gaussian_predictions = nb.gaussian_predictions(testset)

# print(gaussian_predictions)

# check the accuracy of the predictions
count = len(gaussian_predictions)
true_positive = 0
true_negative = 0
false_positive = 0
false_negative = 0
for i in range(count):
    prediction = gaussian_predictions[i]
    if prediction == testset[i][-1]:
        if prediction == 1:
            true_positive += 1
        else:
            true_negative += 1
    else:
        if prediction == 1:
            false_positive += 1
        else:
            false_negative += 1

correct = true_positive + true_negative
precision = true_positive / (true_positive + false_positive)
recall = true_positive / (true_positive + false_negative)
f1 = 2 * precision * recall / (precision + recall)
print(f"{correct = }, total = {count}")
print(f"{precision = }\n{recall = }\n{f1 = }")
print(f"accuracy = {correct / count}")

[1. 0. 0. ... 0. 1. 1.]
correct = 666, total = 1120
precision = 0.5707547169811321
recall = 0.25
f1 = 0.34770114942528735
accuracy = 0.5946428571428571


---


##  Bonus Questions: Clustering

- Given a [training dataset](./hw4_cluster.csv) containing 40 rows, each with 2 columns.
- Column 1 & 2 are the features.
- There are no labels for this dataset.
- Implement different clustering algorithms and run them on this
dataset. 
- **Assume the distance function is _Euclidean Distance_.**

1) (35 points) Implement a generalized K-means/median algorithm.

  - You should have a single function that takes in as input the data points, K,
    and some other hyperparameters specified below.
  - The function should return K sets of data points, each set corresponding to one cluster.