# Gaussian Naive Bayes #

## Implementation Using Python 3 ##




In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.datasets import load_breast_cancer
from sklearn.datasets import load_iris
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split

### Loading Datasets ###

In [2]:
cancer_dataset, iris_dataset, wine_dataset = (
    load_breast_cancer(),  # each result exposes .data, .target and .feature_names,
    load_iris(),           # which the preprocessing cells below read
    load_wine(),
)

### Data Preprocessing ###

#### Cancer Dataset ####

In [3]:
cancer_df = pd.DataFrame(
    cancer_dataset.data, columns=cancer_dataset.feature_names
).assign(target=cancer_dataset.target)  # feature columns plus the label as the last column
print(cancer_df.head())
cancer_x_train, cancer_x_test, cancer_y_train, cancer_y_test = train_test_split(
    cancer_dataset.data,
    cancer_dataset.target,
    test_size=0.2,   # 20% of the rows are held out for evaluation
    random_state=0,  # fixed seed keeps the split reproducible
)


   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0        17.99         10.38          122.80     1001.0          0.11840   
1        20.57         17.77          132.90     1326.0          0.08474   
2        19.69         21.25          130.00     1203.0          0.10960   
3        11.42         20.38           77.58      386.1          0.14250   
4        20.29         14.34          135.10     1297.0          0.10030   

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0           0.27760          0.3001              0.14710         0.2419   
1           0.07864          0.0869              0.07017         0.1812   
2           0.15990          0.1974              0.12790         0.2069   
3           0.28390          0.2414              0.10520         0.2597   
4           0.13280          0.1980              0.10430         0.1809   

   mean fractal dimension  ...  worst texture  worst perimeter  worst area  \
0             

#### Wine Dataset ####

In [4]:
wine_df = pd.DataFrame(
    wine_dataset.data, columns=wine_dataset.feature_names
).assign(target=wine_dataset.target)  # feature columns plus the label as the last column
print(wine_df.head())
wine_x_train, wine_x_test, wine_y_train, wine_y_test = train_test_split(
    wine_dataset.data,
    wine_dataset.target,
    test_size=0.4,   # 40% of the rows are held out for evaluation
    random_state=0,  # fixed seed keeps the split reproducible
)

   alcohol  malic_acid   ash  alcalinity_of_ash  magnesium  total_phenols  \
0    14.23        1.71  2.43               15.6      127.0           2.80   
1    13.20        1.78  2.14               11.2      100.0           2.65   
2    13.16        2.36  2.67               18.6      101.0           2.80   
3    14.37        1.95  2.50               16.8      113.0           3.85   
4    13.24        2.59  2.87               21.0      118.0           2.80   

   flavanoids  nonflavanoid_phenols  proanthocyanins  color_intensity   hue  \
0        3.06                  0.28             2.29             5.64  1.04   
1        2.76                  0.26             1.28             4.38  1.05   
2        3.24                  0.30             2.81             5.68  1.03   
3        3.49                  0.24             2.18             7.80  0.86   
4        2.69                  0.39             1.82             4.32  1.04   

   od280/od315_of_diluted_wines  proline  target  
0          

#### Iris Flower Dataset ####

In [5]:
iris_df = pd.DataFrame(
    iris_dataset.data, columns=iris_dataset.feature_names
).assign(target=iris_dataset.target)  # feature columns plus the label as the last column
print(iris_df.head())
iris_x_train, iris_x_test, iris_y_train, iris_y_test = train_test_split(
    iris_dataset.data,
    iris_dataset.target,
    test_size=0.4,   # 40% of the rows are held out for evaluation
    random_state=0,  # fixed seed keeps the split reproducible
)

   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   target  
0       0  
1       0  
2       0  
3       0  
4       0  


### Implementation Using custom GNB ###

In [6]:
from Classifiers.NaiveBayes import GaussianNaiveBayes as GNBC 

In [13]:
gnbc = GNBC()  # custom classifier imported from Classifiers.NaiveBayes
cancer_y_pred = gnbc.predict(
    cancer_x_train,  # the training split is passed on every predict() call
    cancer_y_train,
    cancer_x_test,
)
print(
    "Number of mislabeled points out of a total %d points : %d in cancer Dataset"
    % (len(cancer_x_test), (cancer_y_test != cancer_y_pred).sum())
)

Custom GNB
{1: 7.103965321384188e-08, 0: 643.2656474078558}  for row  [1.340e+01 2.052e+01 8.864e+01 5.567e+02 1.106e-01 1.469e-01 1.445e-01
 8.172e-02 2.116e-01 7.325e-02 3.906e-01 9.306e-01 3.093e+00 3.367e+01
 5.414e-03 2.265e-02 3.452e-02 1.334e-02 1.705e-02 4.005e-03 1.641e+01
 2.966e+01 1.133e+02 8.444e+02 1.574e-01 3.856e-01 5.106e-01 2.051e-01
 3.585e-01 1.109e-01]
{1: 33033724.531556897, 0: 8.759951221253114e-07}  for row  [1.321e+01 2.525e+01 8.410e+01 5.379e+02 8.791e-02 5.205e-02 2.772e-02
 2.068e-02 1.619e-01 5.584e-02 2.084e-01 1.350e+00 1.314e+00 1.758e+01
 5.768e-03 8.082e-03 1.510e-02 6.451e-03 1.347e-02 1.828e-03 1.435e+01
 3.423e+01 9.129e+01 6.329e+02 1.289e-01 1.063e-01 1.390e-01 6.005e-02
 2.444e-01 6.788e-02]
{1: 13815199.606593328, 0: 3.03705841705274e-08}  for row  [1.402e+01 1.566e+01 8.959e+01 6.065e+02 7.966e-02 5.581e-02 2.087e-02
 2.652e-02 1.589e-01 5.586e-02 2.142e-01 6.549e-01 1.606e+00 1.925e+01
 4.837e-03 9.238e-03 9.213e-03 1.076e-02 1.171e-02 2.104e

In [None]:
wine_y_pred = gnbc.predict(
    wine_x_train,  # the training split is passed on every predict() call
    wine_y_train,
    wine_x_test,
)
print(
    "Number of mislabeled points out of a total %d points : %d in wine Dataset"
    % (len(wine_x_test), (wine_y_test != wine_y_pred).sum())
)

In [None]:
iris_y_pred = gnbc.predict(
    iris_x_train,  # the training split is passed on every predict() call
    iris_y_train,
    iris_x_test,
)
print(
    # The message names the dataset, matching the cancer and wine reports.
    "Number of mislabeled points out of a total %d points : %d in iris Dataset"
    % (iris_x_test.shape[0], (iris_y_test != iris_y_pred).sum())
)

### Implementation Using scikit-learn's Built-in GaussianNB ###

In [8]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()  # one estimator, no arguments; the cells below call fit() on it once per dataset


In [9]:
gnb.fit(cancer_x_train, cancer_y_train)  # fit on the cancer training split
cancer_y_pred = gnb.predict(cancer_x_test)
print(
    "Number of mislabeled points out of a total %d points : %d in cancer Dataset"
    % (len(cancer_x_test), np.count_nonzero(cancer_y_test != cancer_y_pred))
)

Number of mislabeled points out of a total 114 points : 8 in cancer Dataset


In [10]:
gnb.fit(wine_x_train, wine_y_train)  # fit on the wine training split
wine_y_pred = gnb.predict(wine_x_test)
print(
    "Number of mislabeled points out of a total %d points : %d in wine Dataset"
    % (len(wine_x_test), np.count_nonzero(wine_y_test != wine_y_pred))
)

Number of mislabeled points out of a total 72 points : 4 in wine Dataset


In [11]:
gnb.fit(iris_x_train, iris_y_train)  # fit on the iris training split
iris_y_pred = gnb.predict(iris_x_test)
print(
    # The message names the dataset, matching the cancer and wine reports.
    "Number of mislabeled points out of a total %d points : %d in iris Dataset"
    % (iris_x_test.shape[0], (iris_y_test != iris_y_pred).sum())
)

Number of mislabeled points out of a total 60 points : 4
