In [1]:
# Make Predictions with Naive Bayes On The Wine Dataset
from math import sqrt
from math import exp
from math import pi
import pandas as pd
from sklearn.model_selection import train_test_split

class GNB:
    """Gaussian Naive Bayes classifier for pandas feature tables.

    Each feature is modelled, per class, as an independent normal
    distribution described by the mean and standard deviation observed in
    the training data. A class score is the class prior multiplied by the
    Gaussian density of every feature value.

    Attributes are plain dictionaries keyed by class value:
      * ``summaries[c]`` is ``[means, stdevs]``, two pandas Series indexed
        by feature (column) label.
      * ``prior_probabilities[c]`` is the fraction of training rows whose
        label is ``c``.
    """

    def __init__(self):
        self.summaries = dict()
        self.prior_probabilities = dict()

    def separate_by_class(self, x, y):
        """Split the feature table by class value.

        Args:
            x: Feature table (pandas DataFrame) aligned with ``y``.
            y: Class labels (pandas Series), one per row of ``x``.

        Returns:
            A dictionary mapping each distinct class value in ``y`` to the
            rows of ``x`` that carry that label.
        """
        return {class_value: x[y == class_value] for class_value in pd.unique(y)}

    def train(self, x, y):
        """Fit the model: compute per-class priors and per-feature statistics.

        Any state left over from a previous call is discarded first, so
        classes that are absent from the new data cannot leak into later
        predictions.

        Args:
            x: Feature table (pandas DataFrame).
            y: Class labels (pandas Series), one per row of ``x``.
        """
        self.summaries = dict()
        self.prior_probabilities = dict()
        separated = self.separate_by_class(x, y)
        for class_value, rows in separated.items():
            self.prior_probabilities[class_value] = len(rows)/len(x)
            print(self.prior_probabilities[class_value])
            # Column-wise statistics: one mean and one stdev per feature.
            self.summaries[class_value] = [rows.mean(), rows.std()]

    def calculate_probability(self, x, mean, stdev):
        """Return the Gaussian probability density of ``x``.

        Args:
            x: Observed feature value.
            mean: Mean of the distribution.
            stdev: Standard deviation of the distribution. It is used as a
                divisor, so it must be non-zero for a meaningful result.

        Returns:
            The value of the normal density N(mean, stdev**2) at ``x``.
        """
        exponent = exp(-((x-mean)**2 / (2 * stdev**2 )))

        return (1 / (sqrt(2 * pi) * stdev)) * exponent

    def calculate_class_probabilities(self, row):
        """Score every known class for a single sample.

        Args:
            row: One sample, indexable by the same feature labels as the
                training columns (e.g. a row of the feature DataFrame).

        Returns:
            A new dictionary mapping each class value to its unnormalised
            score: prior * product of per-feature Gaussian densities.
        """
        # Work on a copy: multiplying into the stored priors would corrupt
        # the model after the first prediction.
        probabilities = dict(self.prior_probabilities)
        for class_value, (means, stdevs) in self.summaries.items():
            # Iterate over the actual feature labels rather than assuming
            # the columns are numbered 1..n.
            for feature, mean in means.items():
                probabilities[class_value] *= self.calculate_probability(
                    row[feature], mean, stdevs[feature]
                )

        return probabilities

    def predict(self, row):
        """Predict the class for a single sample.

        Args:
            row: One sample, indexable by feature label.

        Returns:
            The class value with the highest score (the first one on ties),
            or ``None`` if the model has not been trained.
        """
        probabilities = self.calculate_class_probabilities(row)
        return max(probabilities, key=probabilities.get, default=None)
 

In [3]:
# Make a prediction with Naive Bayes on the Wine dataset
filename = 'data/wine.data'
# The file is read without a header row, so columns are labelled 0, 1, 2, ...
dataset = pd.read_csv(filename, header=None)
# Column 0 is used as the class label; every other column is a feature.
y = dataset[0]
X = dataset.drop(columns=[0])
# NOTE: X and y are rebound to the training split here; 20% of the rows are
# held out in X_test / y_test, stratified by class label.
# NOTE(review): no random_state is passed, so the split presumably differs
# between runs - confirm whether reproducibility is wanted.
X, X_test, y, y_test = train_test_split(X, y, stratify=y, test_size=0.2)

# fit model (train() prints the prior probability of each class)
gnb_clf = GNB()
gnb_clf.train(X, y)

# predict the label of the first held-out sample and print the sample,
# the predicted label, and the true label for comparison
print(X_test.iloc[0])
label = gnb_clf.predict(X_test.iloc[0])
print(label)
print(y_test.iloc[0])

0.33098591549295775
0.4014084507042254
0.2676056338028169
1      14.22
2       1.70
3       2.30
4      16.30
5     118.00
6       3.20
7       3.00
8       0.26
9       2.03
10      6.38
11      0.94
12      3.31
13    970.00
Name: 56, dtype: float64
1
1
