# Lab04: Non-Parametric Models: Bayes, Naive Bayes, and KNN. Regularization and Cross-Validation

In [1]:
from IPython.display import clear_output
from torch.utils.data.dataset import T_co
!pip3 install ../Libs/plotting_funcs-0.0.1-py3-none-any.whl  ##--force-reinstall
clear_output()

In [295]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from typing import Optional, Dict, Any, Tuple, Union
from plotting_funcs.graphs import show, list_plot, line_plot, show_multi, confusion_graph

In [296]:
import torch
from torch import nn
from tqdm import tqdm, trange
from functools import partial
from torch.utils.data import Dataset, DataLoader

### Boston Dataset

In [297]:
# Load the Boston housing table, then discard every column whose name contains
# "unnamed" (the row index that was written into the CSV).
boston_data = pd.read_csv("data/Boston.csv")
boston_data = boston_data.drop(
    columns=[name for name in boston_data.columns if "unnamed" in name.lower()]
)
boston_data.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [298]:
# Check types: show the dtype of every column as a single-row table
boston_data.dtypes.to_frame().T

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,float64,float64,float64,int64,float64,float64,float64,float64,int64,int64,float64,float64,float64,float64


In [299]:
# Count missing values over the whole table: isna() flags each missing cell
# as True, and summing the flags gives the number of missing cells.
int(boston_data.isna().to_numpy().sum())

0.0

In [300]:
# Every column except the last one is a predictor; the last column is the response.
predictors_boston = boston_data.iloc[:, :-1]
response_boston = boston_data.iloc[:, -1]

### Iris Dataset

In [301]:
# Load the Iris table and discard its "Id" column.
iris_dataset = pd.read_csv("data/Iris.csv").drop(columns="Id")
iris_dataset.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [302]:
# One-hot encode the Species column into one Species_<name> indicator column per class.
# Kept for inspection only: the train/test split below is taken from `iris_dataset`, not from this frame.
iris_dataset_enc = pd.get_dummies(iris_dataset, columns=["Species"], prefix=["Species"])
iris_dataset_enc.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species_Iris-setosa,Species_Iris-versicolor,Species_Iris-virginica
0,5.1,3.5,1.4,0.2,1,0,0
1,4.9,3.0,1.4,0.2,1,0,0
2,4.7,3.2,1.3,0.2,1,0,0
3,4.6,3.1,1.5,0.2,1,0,0
4,5.0,3.6,1.4,0.2,1,0,0


In [303]:
# Check types: show the dtype of every column of the encoded frame as a single-row table
iris_dataset_enc.dtypes.to_frame().T

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species_Iris-setosa,Species_Iris-versicolor,Species_Iris-virginica
0,float64,float64,float64,float64,uint8,uint8,uint8


In [304]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column of the encoded frame
iris_dataset_enc.describe()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species_Iris-setosa,Species_Iris-versicolor,Species_Iris-virginica
count,150.0,150.0,150.0,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667,0.333333,0.333333,0.333333
std,0.828066,0.433594,1.76442,0.763161,0.472984,0.472984,0.472984
min,4.3,2.0,1.0,0.1,0.0,0.0,0.0
25%,5.1,2.8,1.6,0.3,0.0,0.0,0.0
50%,5.8,3.0,4.35,1.3,0.0,0.0,0.0
75%,6.4,3.3,5.1,1.8,1.0,1.0,1.0
max,7.9,4.4,6.9,2.5,1.0,1.0,1.0


In [306]:
# 80/20 train/test split. The fixed random_state makes the sampled rows, and
# therefore every result computed from the split, identical on each run.
RANDOM_SEED = 42
train_iris_dataset = iris_dataset.sample(frac=0.8, random_state=RANDOM_SEED)
# The test set is every row that was not sampled into the training set.
test_iris_dataset = iris_dataset.drop(train_iris_dataset.index)

In [307]:
# Split each partition into predictors (all columns but the last) and labels (the last column).
x_train_iris_dataset = train_iris_dataset.iloc[:, :-1]
y_train_iris_dataset = train_iris_dataset.iloc[:, -1]
x_test_iris_dataset = test_iris_dataset.iloc[:, :-1]
y_test_iris_dataset = test_iris_dataset.iloc[:, -1]

### Naive Bayes Classifier
Implementation mainly with pandas APIs.

In [305]:
class NaiveBayesClassifier:
    """Gaussian Naive Bayes classifier fitted from a ``(predictors, responses)`` pair.

    Within each class every predictor is modelled as an independent normal
    distribution whose mean and sample standard deviation are estimated from
    the training rows of that class. Predictions are integer positions into
    ``self.klasses``.

    Attributes:
        klasses: Distinct class labels, as returned by ``responses.unique()``.
        predictor_labels: Column labels of the training predictors.
        P_c: Log prior of every class (``pd.Series`` indexed by class label).
        P_c_x: ``{class: pd.Series}``; each Series maps a predictor label to the
            fitted Gaussian density function of that predictor within the class.
    """

    def __init__(self, dataset: Tuple[pd.DataFrame, pd.Series]):
        """Estimate the class priors and the per-class Gaussian parameters.

        Args:
            dataset: ``(predictors, responses)`` with one row of numeric
                predictors and one class label per training sample.
        """
        predictors, responses = dataset
        n_sample = len(responses)
        self.klasses = responses.unique()
        self.predictor_labels = predictors.columns
        self.P_c = np.log(responses.value_counts() / n_sample)

        # Factory so that every density closes over its own (mean, std) pair.
        def gaussian_pdf(mean, std):
            return lambda x: np.exp(-0.5 * ((x - mean) / std) ** 2) / np.sqrt(2 * np.pi * (std ** 2))

        self.P_c_x = {}
        means, stds = [], []
        for klass in self.klasses:
            rows = predictors[responses == klass]
            klass_mean = rows.mean().to_numpy(dtype=float)
            klass_std = rows.std().to_numpy(dtype=float)  # sample std (ddof=1)
            self.P_c_x[klass] = pd.Series(
                [gaussian_pdf(mean, std) for mean, std in zip(klass_mean, klass_std)],
                index=self.predictor_labels,
                dtype=object,
            )
            means.append(klass_mean)
            stds.append(klass_std)

        # Dense (n_classes, n_predictors) parameter tables, rows ordered like
        # self.klasses, so predict() can score all classes in one expression.
        table_shape = (len(self.klasses), len(self.predictor_labels))
        self._means = np.array(means, dtype=float).reshape(table_shape)
        self._stds = np.array(stds, dtype=float).reshape(table_shape)
        self._log_priors = np.asarray([self.P_c[klass] for klass in self.klasses], dtype=float)

    def __call__(self, input):
        """Classify one sample or a collection of samples.

        Args:
            input: A ``pd.DataFrame`` (one sample per row), a 1-D sequence
                (a single sample), or any other iterable of samples.

        Returns:
            A single class position for a 1-D input, otherwise a list with one
            class position per sample.
        """
        if isinstance(input, pd.DataFrame):
            return [self.predict(row) for row in input.to_numpy()]
        if np.ndim(input) == 1:
            return self.predict(input)
        return [self.predict(row) for row in input]

    def predict(self, input):
        """Return the position in ``self.klasses`` of the most probable class.

        Args:
            input: Predictor values of one sample, ordered like
                ``self.predictor_labels``.

        Returns:
            ``int`` index into ``self.klasses``.
        """
        x = np.asarray(input, dtype=float)[: len(self.predictor_labels)]
        # Evaluate the Gaussian log-density directly. Taking log(pdf(x)) instead
        # underflows to log(0) = -inf for samples far from every class mean,
        # which makes all classes tie and the first one win regardless of distance.
        z = (x - self._means) / self._stds
        log_likelihood = (-0.5 * z ** 2 - np.log(self._stds) - 0.5 * np.log(2 * np.pi)).sum(axis=1)
        return int(np.argmax(self._log_priors + log_likelihood))

In [308]:
# Fit Naive Bayes on the training split, then classify every test row.
# Each prediction is an integer position into nbc.klasses.
nbc = NaiveBayesClassifier((x_train_iris_dataset, y_train_iris_dataset))
nbc_predictions = nbc(x_test_iris_dataset)

In [309]:
def encode(species: Union[list, np.ndarray, pd.Series], klasses):
    """Map class labels to their integer position in ``klasses``.

    Args:
        species: Iterable of class labels to encode.
        klasses: Ordered collection of the known labels; the code of a label
            is its position in this collection.

    Returns:
        List of integer codes, one per entry of ``species``, in the same order.

    Raises:
        KeyError: If ``species`` contains a label that is not in ``klasses``.
    """
    code_of = {klass: code for code, klass in enumerate(klasses)}
    return [code_of[label] for label in species]

In [310]:
# Confusion graph of the Naive Bayes predictions on the test split.
# enc_labels is derived from the fitted classes instead of being hard-coded to
# three entries, so it stays in step with the positions-into-klasses codes that
# both nbc and encode() produce.
show(confusion_graph(
    labels=nbc.klasses,
    enc_labels=list(range(len(nbc.klasses))),
    actual=np.asarray(encode(y_test_iris_dataset, nbc.klasses)),
    prediction=np.asarray(nbc_predictions)
))

### K-Nearest Neighbours

In [331]:
class KNNClassifier:
    """k-nearest-neighbours classifier over an in-memory training set.

    A sample is assigned the majority label among the ``k`` training points
    closest to it under a caller-supplied distance function. Predictions are
    integer positions into ``self.klasses``.

    Attributes:
        k: Number of neighbours that vote.
        points: Training predictors as a NumPy array (one row per sample).
        classes: Training labels as a NumPy array, aligned with ``points``.
        klasses: Distinct class labels, as returned by ``unique()`` on the labels.
    """

    def __init__(self, dataset: Tuple[pd.DataFrame, pd.Series], k=3):
        """Store the training set.

        Args:
            dataset: ``(predictors, responses)`` training pair.
            k: Number of neighbours to consult; must be a positive odd integer.

        Raises:
            AssertionError: If ``k`` is not a positive odd integer.
        """
        # An odd k rules out tied votes only when there are two classes; with
        # more classes ties remain possible (see predict for how they resolve).
        # k >= 1 is required as well: a negative odd k passes the modulo check
        # and would turn the [:k] slice into "all but the last |k| points".
        assert k >= 1 and k % 2 == 1, "k must be a positive odd integer"
        self.k = k
        self.points, self.classes = dataset[0].to_numpy(), dataset[1].to_numpy()
        self.klasses = dataset[1].unique()

    def __call__(self, input: Union[list, np.ndarray, pd.DataFrame], dist_func):
        """Classify one sample or a collection of samples.

        Args:
            input: A ``pd.DataFrame`` (one sample per row), a 1-D sequence
                (a single sample), or any other iterable of samples.
            dist_func: Callable ``(sample, training_point) -> distance``.

        Returns:
            A single class position for a 1-D input, otherwise a list with one
            class position per sample.
        """
        if isinstance(input, pd.DataFrame):
            return [self.predict(row, dist_func) for row in input.to_numpy()]
        if np.ndim(input) == 1:
            return self.predict(input, dist_func)
        return [self.predict(row, dist_func) for row in input]

    def predict(self, focus, dist_func):
        """Return the position in ``self.klasses`` of the majority label near ``focus``.

        Args:
            focus: The sample to classify.
            dist_func: Callable ``(focus, training_point) -> distance``.

        Returns:
            ``int`` index into ``self.klasses``.
        """
        distances = [dist_func(focus, point) for point in self.points]
        # A stable sort keeps equally distant points in training order, so the
        # selected neighbours do not depend on the sort implementation.
        nearest = np.argsort(distances, kind="stable")[:self.k]
        nearest_labels = self.classes[nearest].tolist()
        # np.unique returns the labels sorted and argmax takes the first
        # maximum, so a tied vote goes to the label that sorts first.
        labels, counts = np.unique(nearest_labels, return_counts=True)
        return list(self.klasses).index(labels[np.argmax(counts)])

In [332]:
# Fit k-NN (default k) on the training split.
knn = KNNClassifier((x_train_iris_dataset, y_train_iris_dataset))


def distance_func(x, y):
    """Euclidean (L2) distance between two points."""
    return np.linalg.norm(x - y)


# Classify every test row; each prediction is an integer position into knn.klasses.
knn_predictions = knn(x_test_iris_dataset, distance_func)

In [334]:
# Confusion graph of the k-NN predictions on the test split.
# enc_labels is derived from the fitted classes instead of being hard-coded to
# three entries, so it stays in step with the positions-into-klasses codes that
# both knn and encode() produce.
show(confusion_graph(
    labels=knn.klasses,
    enc_labels=list(range(len(knn.klasses))),
    actual=np.asarray(encode(y_test_iris_dataset, knn.klasses)),
    prediction=np.asarray(knn_predictions)
))

### Cross Validation

In [388]:
class IrisDataset(Dataset):
    """Torch ``Dataset`` view over a ``(predictors, responses)`` pandas pair.

    Attributes:
        predictors: Feature table, one row per sample.
        responses: Class label of every sample, aligned with ``predictors``.
        klasses: Distinct labels found in ``responses``.
        converter: ``{label: position in klasses}`` lookup; ``__getitem__``
            does not apply it and returns the label unchanged.
    """

    def __init__(self, dataset: Tuple[pd.DataFrame, pd.Series]):
        features, labels = dataset
        self.predictors = features
        self.responses = labels
        self.klasses = self.responses.unique()
        self.converter = dict(zip(self.klasses, range(len(self.klasses))))

    def __len__(self):
        """Number of samples."""
        return len(self.predictors)

    def __getitem__(self, index):
        """Return ``(feature values as a NumPy array, label)`` for the sample at position ``index``."""
        feature_values = self.predictors.iloc[index].to_numpy()
        label = self.responses.iloc[index]
        return feature_values, label


# Mini-batch loader over the training split: shuffled batches of 15 samples.
train_iris_dataloader = DataLoader(
    dataset=IrisDataset((x_train_iris_dataset, y_train_iris_dataset)),
    batch_size=15,
    shuffle=True,
    num_workers=0,
)

In [None]:
# for (_, batch) in enumerate(train_iris_dataloader):
#     knn = KNNClassifier((x_train_iris_dataset, y_train_iris_dataset))
#     distance_func = lambda x, y: np.linalg.norm(x - y)
#     knn_predictions = knn(x_test_iris_dataset, distance_func)