In [14]:
# Make Predictions with Naive Bayes On The Wine Dataset
from math import sqrt
from math import exp
from math import pi
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import accuracy_score
import numpy as np

class GNB(BaseEstimator, ClassifierMixin):
    def __init__(self):
        super().__init__()
        self.summaries = dict()
        self.prior_probabilities = dict()

    # Split the dataset by class values, returns a dictionary
    def separate_by_class(self, x, y):
        separated = dict()
        for class_value in pd.unique(y):
            separated[class_value] = x[y == class_value]
            
        return separated
 
    # Split dataset by class then calculate statistics for each row
    def fit(self, x, y):
        separated = self.separate_by_class(x, y)
        for class_value, rows in separated.items():
            self.prior_probabilities[class_value] = len(rows)/len(x)
            print(self.prior_probabilities[class_value])
            self.summaries[class_value] = [rows.mean(), rows.std()]
 
    # Calculate the Gaussian probability distribution function for x
    def calculate_probability(self, x, mean, stdev):
        exponent = exp(-((x-mean)**2 / (2 * stdev**2 )))
        
        return (1 / (sqrt(2 * pi) * stdev)) * exponent
 
    # Calculate the probabilities of predicting each class for a given row
    def calculate_class_probabilities(self, row):
        probabilities = self.prior_probabilities
        for class_value, class_summaries in self.summaries.items():
            for i in range(1, len(class_summaries[0]) + 1):
                mean, stdev = class_summaries[0][i], class_summaries[1][i]
                probabilities[class_value] *= self.calculate_probability(row[i], mean, stdev)
                
        return probabilities
 
    # Predict the class for a given row
    def predict(self, X):
        predictions = np.array([])
        print(X.shape)
        print(type(X))
        for _, row in X.iterrows():
            print(row)
            probabilities = self.calculate_class_probabilities(row)
            best_label, best_prob = None, -1
            for class_value, probability in probabilities.items():
                if best_label is None or probability > best_prob:
                    best_prob = probability
                    best_label = class_value
            predictions = np.append(predictions, best_label)
        
        return predictions
    
    def score(self, X, y_true):
        return accuracy_score(y_true, self.predict(X))

 

In [15]:
# Make a prediction with Naive Bayes on Iris Dataset
filename = 'data/wine.data'
dataset = pd.read_csv(filename, header=None)
y = dataset[0]
X = dataset.drop(columns=[0])
X, X_test, y, y_test = train_test_split(X, y, stratify=y, test_size=0.2)

# fit model
gnb_clf = GNB()
gnb_clf.fit(X, y)

# predict the label
# print(X_test)
label = gnb_clf.predict(X_test)
print(label)
print(y_test.iloc[0])
# print('Data=%s, Predicted: %s' % (row, label))

0.4014084507042254
0.33098591549295775
0.2676056338028169
(36, 13)
<class 'pandas.core.frame.DataFrame'>
1      12.52
2       2.43
3       2.17
4      21.00
5      88.00
6       2.55
7       2.27
8       0.26
9       1.22
10      2.00
11      0.90
12      2.78
13    325.00
Name: 111, dtype: float64
1      13.87
2       1.90
3       2.80
4      19.40
5     107.00
6       2.95
7       2.97
8       0.37
9       1.76
10      4.50
11      1.25
12      3.40
13    915.00
Name: 28, dtype: float64
1      12.42
2       4.43
3       2.73
4      26.50
5     102.00
6       2.20
7       2.13
8       0.43
9       1.71
10      2.08
11      0.92
12      3.12
13    365.00
Name: 122, dtype: float64
1       13.94
2        1.73
3        2.27
4       17.40
5      108.00
6        2.88
7        3.54
8        0.32
9        2.08
10       8.90
11       1.12
12       3.10
13    1260.00
Name: 49, dtype: float64
1      12.29
2       1.41
3       1.98
4      16.00
5      85.00
6       2.55
7       2.50
8       0.29


In [16]:
from sklearn.model_selection import cross_val_score

cv_results = cross_val_score(gnb_clf, X, y, cv=3)
print(cv_results)

0.26595744680851063
0.32978723404255317
0.40425531914893614
(48, 13)
<class 'pandas.core.frame.DataFrame'>
1      11.62
2       1.99
3       2.28
4      18.00
5      98.00
6       3.02
7       2.26
8       0.17
9       1.35
10      3.25
11      1.16
12      2.96
13    345.00
Name: 94, dtype: float64
1      13.67
2       1.25
3       1.92
4      18.00
5      94.00
6       2.10
7       1.79
8       0.32
9       0.73
10      3.80
11      1.23
12      2.46
13    630.00
Name: 62, dtype: float64
1       13.76
2        1.53
3        2.70
4       19.50
5      132.00
6        2.95
7        2.74
8        0.50
9        1.35
10       5.40
11       1.25
12       3.00
13    1235.00
Name: 33, dtype: float64
1      12.85
2       3.27
3       2.58
4      22.00
5     106.00
6       1.65
7       0.60
8       0.60
9       0.96
10      5.58
11      0.87
12      2.11
13    570.00
Name: 162, dtype: float64
1       14.30
2        1.92
3        2.72
4       20.00
5      120.00
6        2.80
7        3.14
8    