## Gaussian Naive Bayes Implementation from Scratch

In [109]:
import numpy as np
import pandas as pd

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [110]:
# Load the breast-cancer dataset bundled with scikit-learn. as_frame=True is
# requested so the pandas frame is available via `.frame` further down.
cancer_sklearn = load_breast_cancer(as_frame=True)
# DESCR holds the dataset's textual description (features, instance count, ...).
print(cancer_sklearn.DESCR)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

:Number of Instances: 569

:Number of Attributes: 30 numeric, predictive attributes and the class

:Attribute Information:
    - radius (mean of distances from center to points on the perimeter)
    - texture (standard deviation of gray-scale values)
    - perimeter
    - area
    - smoothness (local variation in radius lengths)
    - compactness (perimeter^2 / area - 1.0)
    - concavity (severity of concave portions of the contour)
    - concave points (number of concave portions of the contour)
    - symmetry
    - fractal dimension ("coastline approximation" - 1)

    The mean, standard error, and "worst" or largest (mean of the three
    worst/largest values) of these features were computed for each image,
    resulting in 30 features.  For instance, field 0 is Mean Radius, field
    10 is Radius SE, field 20 is Worst Radius.

    - 

In [111]:
# One DataFrame holding every feature column together with the "target" column.
cancer: pd.DataFrame = cancer_sklearn.frame
# Transposed preview: one row per column, the first five samples as columns.
cancer.head().T

Unnamed: 0,0,1,2,3,4
mean radius,17.99,20.57,19.69,11.42,20.29
mean texture,10.38,17.77,21.25,20.38,14.34
mean perimeter,122.8,132.9,130.0,77.58,135.1
mean area,1001.0,1326.0,1203.0,386.1,1297.0
mean smoothness,0.1184,0.08474,0.1096,0.1425,0.1003
mean compactness,0.2776,0.07864,0.1599,0.2839,0.1328
mean concavity,0.3001,0.0869,0.1974,0.2414,0.198
mean concave points,0.1471,0.07017,0.1279,0.1052,0.1043
mean symmetry,0.2419,0.1812,0.2069,0.2597,0.1809
mean fractal dimension,0.07871,0.05667,0.05999,0.09744,0.05883


In [112]:
# Column dtypes and non-null counts: a quick check for missing values.
cancer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

In [113]:
# Summary statistics per column, transposed so each column is shown as a row.
cancer.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
mean radius,569.0,14.127292,3.524049,6.981,11.7,13.37,15.78,28.11
mean texture,569.0,19.289649,4.301036,9.71,16.17,18.84,21.8,39.28
mean perimeter,569.0,91.969033,24.298981,43.79,75.17,86.24,104.1,188.5
mean area,569.0,654.889104,351.914129,143.5,420.3,551.1,782.7,2501.0
mean smoothness,569.0,0.09636,0.014064,0.05263,0.08637,0.09587,0.1053,0.1634
mean compactness,569.0,0.104341,0.052813,0.01938,0.06492,0.09263,0.1304,0.3454
mean concavity,569.0,0.088799,0.07972,0.0,0.02956,0.06154,0.1307,0.4268
mean concave points,569.0,0.048919,0.038803,0.0,0.02031,0.0335,0.074,0.2012
mean symmetry,569.0,0.181162,0.027414,0.106,0.1619,0.1792,0.1957,0.304
mean fractal dimension,569.0,0.062798,0.00706,0.04996,0.0577,0.06154,0.06612,0.09744


In [114]:
# Separate the label column from the predictor columns.
y: pd.Series = cancer["target"]
X: pd.DataFrame = cancer.drop(columns="target")

In [115]:
# Convert to plain NumPy arrays for the from-scratch model below.
# np.asarray (rather than .to_numpy()) keeps this cell idempotent: re-running
# it when X / y are already ndarrays is a no-op instead of an AttributeError.
X = np.asarray(X)
y = np.asarray(y)

In [116]:
# Hold out 25% of the rows for evaluation. The seed is fixed so the split, and
# therefore every metric reported below, is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, shuffle=True, random_state=42
)

In [117]:
class GaussianNaiveBayes:
    """Binary Gaussian Naive Bayes classifier for the labels 0 and 1.

    Each feature is modelled as an independent normal distribution per class.
    Prediction picks the class with the larger log-posterior.

    Parameters
    ----------
    var_floor : float, default=1e-9
        Relative lower bound applied to the per-class variances: a class
        variance is never allowed to drop below ``var_floor`` times the
        variance of that feature over the whole training set (or below
        ``var_floor`` itself when the feature is constant everywhere).
        This keeps ``log(variance)`` and the division by the variance finite
        for features that are constant within a class. Features with a
        non-degenerate variance are left untouched. ``var_floor=0`` disables
        the safeguard.

    Attributes (set by ``fit``)
    ---------------------------
    mean_0, mean_1 : np.ndarray of shape (n_features,)
        Per-feature means of the class-0 / class-1 training samples.
    variance_0, variance_1 : np.ndarray of shape (n_features,)
        Per-feature (population) variances of each class, after flooring.
    prior : float
        Fraction of training samples labelled 1, i.e. the estimate of P(y=1).
    """

    def __init__(self, var_floor: float = 1e-9):
        if var_floor < 0:
            raise ValueError("var_floor must be non-negative")
        self.var_floor = var_floor

    def fit(self, X: np.ndarray, y: np.ndarray) -> None:
        """Estimate class means, variances and the class prior.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features.
        y : array-like of shape (n_samples,)
            Training labels; every entry must be 0 or 1 and both classes
            must occur at least once.

        Raises
        ------
        ValueError
            If the shapes are inconsistent, a label other than 0/1 is found,
            or one of the two classes has no samples.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be 2-dimensional (n_samples, n_features)")
        if y.shape != (X.shape[0],):
            raise ValueError("y must be 1-dimensional with one label per row of X")

        is_pos = y == 1
        is_neg = y == 0
        if not np.all(is_pos | is_neg):
            raise ValueError("y must contain only the labels 0 and 1")
        # An empty class would give NaN means and a log(0) prior.
        if not is_pos.any() or not is_neg.any():
            raise ValueError("both classes (0 and 1) must be present in y")

        # Per-feature variance floor, scaled to each feature's own spread so
        # that it only ever kicks in for (near-)degenerate class variances.
        floor = self.var_floor * X.var(axis=0)
        floor[floor == 0] = self.var_floor  # feature constant over all rows

        X_pos, X_neg = X[is_pos], X[is_neg]
        self.mean_1 = X_pos.mean(axis=0)
        self.mean_0 = X_neg.mean(axis=0)
        self.variance_1 = np.maximum(X_pos.var(axis=0), floor)
        self.variance_0 = np.maximum(X_neg.var(axis=0), floor)
        self.prior = is_pos.mean()

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return the predicted label (0 or 1) for every row of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        np.ndarray of shape (n_samples,)
            Integer labels; 1 where the class-1 log-posterior is strictly
            larger than the class-0 log-posterior, otherwise 0.
        """
        X = np.asarray(X, dtype=float)
        log_posterior_1 = (
            self._log_likelihood(X, self.mean_1, self.variance_1) + np.log(self.prior)
        )
        log_posterior_0 = (
            self._log_likelihood(X, self.mean_0, self.variance_0) + np.log(1 - self.prior)
        )
        return (log_posterior_1 > log_posterior_0).astype(int)

    @staticmethod
    def _log_likelihood(X: np.ndarray, mean: np.ndarray, variance: np.ndarray) -> np.ndarray:
        """Sum over features of the Gaussian log-density of each row of ``X``."""
        return -0.5 * np.sum(
            np.log(2 * np.pi * variance) + (X - mean) ** 2 / variance, axis=1
        )

In [118]:
# Estimate the per-class means and variances and the class prior from the
# training split only.
model = GaussianNaiveBayes()
model.fit(X_train, y_train)

In [119]:
# Predict a 0/1 label for every held-out sample.
y_pred = model.predict(X_test)

In [120]:
# Per-class precision, recall and F1 on the held-out split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.91      0.94        55
           1       0.95      0.99      0.97        88

    accuracy                           0.96       143
   macro avg       0.96      0.95      0.96       143
weighted avg       0.96      0.96      0.96       143

