In [34]:
from sklearn import datasets
import pandas as pd
import numpy as np
import sklearn
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

# Load the scikit-learn wine dataset and mirror it in a DataFrame
# (one column per feature name, plus a 'target' column with the class label).
wine_dataset = datasets.load_wine()
df = pd.DataFrame(wine_dataset.data, columns=wine_dataset.feature_names)
df['target'] = wine_dataset.target
# NOTE(review): this is not the last expression of the cell, so the preview is not displayed.
df.head()

# Feature matrix and label vector taken directly from the dataset object.
X, y = wine_dataset.data, wine_dataset.target

# Split into train and test sets: 30% held out, stratified on y, fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.3,stratify=y, random_state=42)
# Last expression of the cell: shows the shapes of the four resulting arrays.
X_train.shape, y_train.shape, X_test.shape, y_test.shape


((124, 13), (124,), (54, 13), (54,))

In [35]:
def get_params(X_train, y_train):
    """
    Summarise the dimensions of a labelled dataset.

    Returns a tuple (number of rows in X_train, number of columns in X_train,
    number of distinct values in y_train).
    """
    n_rows, n_cols = X_train.shape
    distinct_labels = np.unique(y_train)
    return n_rows, n_cols, len(distinct_labels)

In [36]:
# Smoke-test get_params on the training split.
# NOTE: these three module-level names are also used as default argument values
# by functions defined in later cells, so this cell must run before them.
num_examples, num_features, num_classes = get_params(X_train, y_train)
print(num_examples, num_features, num_classes)

124 13 3


In [37]:
def get_stats_by_class(X_train, y_train, num_examples=None, num_classes=None):
    """
    Get per-class statistics of a labelled dataset.

    Parameters
    ----------
    X_train : ndarray
        Feature array whose first axis indexes examples.
    y_train : ndarray
        Label array aligned with the first axis of X_train. Classes are
        addressed as the integers 0 .. num_classes - 1.
    num_examples : int, optional
        Denominator used for the class priors. Defaults to the number of rows
        in X_train, so the priors always match the data actually passed in
        (the previous default was frozen from a notebook global at definition
        time).
    num_classes : int, optional
        Number of classes to iterate over. Defaults to the number of distinct
        labels in y_train.

    Returns
    -------
    (class_mean, class_var, class_prior) : tuple of dict
        Each dict is keyed by str(class index). Means and variances are taken
        per feature (axis 0); the prior is the class count over num_examples.
    """
    if num_examples is None:
        num_examples = X_train.shape[0]
    if num_classes is None:
        num_classes = len(np.unique(y_train))

    # dictionaries to store stats
    class_mean = {}
    class_var = {}
    class_prior = {}

    # loop through each class and get mean, variance and prior by class
    for cls in range(num_classes):
        key = str(cls)
        X_cls = X_train[y_train == cls]
        class_mean[key] = np.mean(X_cls, axis=0)
        class_var[key] = np.var(X_cls, axis=0)
        class_prior[key] = X_cls.shape[0] / num_examples
    return class_mean, class_var, class_prior

In [38]:
def gaussian_density_function(X, mean, std, num_examples=None, num_features=None, eps=1e-6):
    """
    Log-density of every row of X under a Gaussian with independent features.

    Parameters
    ----------
    X : ndarray, 2-D
        One example per row, one feature per column.
    mean : ndarray
        Per-feature mean, broadcast against the rows of X.
    std : ndarray
        Per-feature VARIANCE despite the name: it is used directly as
        sigma^2 in both the normalisation term and the quadratic term.
        The name is kept so existing keyword callers do not break.
    num_examples, num_features : optional
        Kept for backward compatibility and ignored; the feature count is
        always read from X itself.
    eps : float
        Added to the variance to avoid log(0) and division by zero.

    Returns
    -------
    ndarray with one log-density per row of X.
    """
    # Take the dimensionality from the data being scored, not from a notebook
    # global (the original read X_train.shape here).
    num_features = X.shape[1]
    variance = std + eps
    const = -num_features / 2 * np.log(2 * np.pi) - 0.5 * np.sum(np.log(variance))
    probs = 0.5 * np.sum(np.power(X - mean, 2) / variance, 1)
    return const - probs

In [39]:
def class_probabilities(X, class_mean, class_var, class_prior, num_classes=num_classes):
    """
    Score every row of X against every class.

    Returns an array with one row per row of X and one column per class;
    column c holds gaussian_density_function(X, mean_c, var_c) plus the
    natural log of the prior of class c. The dictionaries are looked up
    with str(c) keys.
    """
    scores = np.zeros((X.shape[0], num_classes))

    for idx in range(num_classes):
        key = str(idx)
        prior = class_prior[key]
        log_likelihood = gaussian_density_function(X, class_mean[key], class_var[key])
        scores[:, idx] = log_likelihood + np.log(prior)
    return scores

In [40]:
def predict(X_test, X_train, y_train):
    """
    Fit the Gaussian Naive Bayes statistics on the training data and classify X_test.

    Parameters
    ----------
    X_test : ndarray
        Examples to classify, one per row.
    X_train, y_train : ndarray
        Training features and labels, passed to get_stats_by_class.

    Returns
    -------
    ndarray with, for each row of X_test, the index of the class whose score
    from class_probabilities is highest.
    """
    # The second value returned by get_stats_by_class is a variance, so name it that.
    # The earlier get_params(X_test, y_train) call was dropped: its results were unused.
    class_mean, class_var, class_prior = get_stats_by_class(X_train, y_train)
    scores = class_probabilities(X_test, class_mean, class_var, class_prior)
    return np.argmax(scores, 1)

In [41]:
#separate by class
def seperate_by_class(dataset):
  """Split dataset by its 'target' column; returns the subsets for targets 0, 1 and 2, in that order."""
  by_label = [dataset[dataset['target'] == label] for label in (0, 1, 2)]
  return tuple(by_label)

In [42]:
def accuracy(test_set, my_preds):
    """
    Fraction of positions at which the true label equals the prediction.

    Pairs are formed with zip (so extra items in the longer sequence are
    ignored); the denominator is always len(test_set).
    """
    hits = sum(1 for truth, guess in zip(test_set, my_preds) if truth == guess)
    return hits / float(len(test_set))

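Que 2A) Gaussian Naive Bayes with class priors estimated from the training data, compared against scikit-learn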


In [43]:
# Reference implementation: fit scikit-learn's GaussianNB on the training split
# and predict the test split.
nb = GaussianNB()
nb.fit(X_train, y_train)
sklearn_preds = nb.predict(X_test)

print(f"sklearn accuracy:{accuracy_score(y_test, sklearn_preds)}")
print(f"predictions: {sklearn_preds}")

# Per-class mean, variance and prior from the hand-written implementation.
cm, var, cp = get_stats_by_class(X_train, y_train)
print(f"mean: {cm}\n\nvariance: {var}\n\npriors: {cp}")

# Hand-written classifier; predict() recomputes the class statistics internally.
my_preds = predict(X_test, X_train, y_train)
# NOTE(review): this element-wise comparison is not the last expression of the cell,
# so its result is neither displayed nor stored.
sklearn_preds == my_preds

# NOTE(review): scores for the TRAINING rows; `probs` is not used again in this cell.
probs = class_probabilities(X_train, cm, var, cp)

print('Accuracy')
print(accuracy(y_test, my_preds))

# NOTE(review): `cm` is rebound here from the class-mean dict to the confusion matrix.
cm = confusion_matrix(y_test, my_preds)
print(cm)

sklearn accuracy:1.0
predictions: [0 1 0 0 0 0 2 1 1 2 1 1 2 1 0 2 1 0 2 2 1 2 2 2 1 2 0 1 0 1 0 1 2 1 1 2 1
 1 1 0 2 0 0 0 0 1 1 0 2 0 1 1 2 0]
mean: {'0': array([1.37304878e+01, 1.94707317e+00, 2.44975610e+00, 1.71024390e+01,
       1.06634146e+02, 2.82853659e+00, 2.94024390e+00, 3.01707317e-01,
       1.85121951e+00, 5.56780488e+00, 1.05097561e+00, 3.08853659e+00,
       1.11280488e+03]), '1': array([1.22424e+01, 1.96260e+00, 2.23280e+00, 2.05240e+01, 9.51400e+01,
       2.25360e+00, 2.04680e+00, 3.50800e-01, 1.71220e+00, 2.96080e+00,
       1.05892e+00, 2.80220e+00, 5.31260e+02]), '2': array([1.30745455e+01, 3.20090909e+00, 2.45424242e+00, 2.15606061e+01,
       9.92727273e+01, 1.68757576e+00, 7.87575758e-01, 4.46363636e-01,
       1.13878788e+00, 7.36272724e+00, 6.73030303e-01, 1.69060606e+00,
       6.24393939e+02])}

variance: {'0': array([2.02950982e-01, 3.95762165e-01, 6.05975015e-02, 7.29535990e+00,
       1.16280785e+02, 1.14880785e-01, 1.37626770e-01, 5.39464604e-03,
      

Que 2B) i) Class priors fixed at 40-40-20 (class 0 = 0.40, class 1 = 0.40, class 2 = 0.20)

In [44]:
def get_stats_by_class_ratio(X_train, y_train, num_examples=None, num_classes=None, priors=None):
    """
    Per-class mean and variance combined with FIXED class priors (default 40-40-20).

    Parameters
    ----------
    X_train : ndarray
        Feature array whose first axis indexes examples.
    y_train : ndarray
        Label array aligned with the first axis of X_train. Classes are
        addressed as the integers 0 .. num_classes - 1.
    num_examples : int, optional
        Kept for backward compatibility and unused: the priors are fixed,
        not estimated from class counts.
    num_classes : int, optional
        Number of classes to iterate over. Defaults to the number of distinct
        labels in y_train.
    priors : dict, optional
        Mapping str(class index) -> prior probability. Defaults to
        {'0': 0.40, '1': 0.40, '2': 0.20}.

    Returns
    -------
    (mean, variance, prior) : tuple of dict, each keyed by str(class index).
    """
    if num_classes is None:
        num_classes = len(np.unique(y_train))
    if priors is None:
        priors = {'0': 0.40, '1': 0.40, '2': 0.20}

    # dictionaries to store stats
    mean = {}
    variance = {}
    prior = dict(priors)  # copy so the caller's dict is never shared

    # loop through each class and get mean and variance by class.
    # The prior is deliberately NOT recomputed here: the original loop overwrote
    # the fixed priors with the empirical class frequencies.
    for cls in range(num_classes):
        X_cls = X_train[y_train == cls]
        mean[str(cls)] = np.mean(X_cls, axis=0)
        variance[str(cls)] = np.var(X_cls, axis=0)
    return mean, variance, prior

# Mean, Variance, class prior
cm, var, cp = get_stats_by_class_ratio(X_train, y_train)
print(f"mean: {cm}\n\nvariance: {var}\n\npriors: {cp}")

# Score the test set with the fixed priors. predict() cannot be used here because
# it re-estimates the statistics (including empirical priors) via get_stats_by_class.
my_preds = np.argmax(class_probabilities(X_test, cm, var, cp), 1)

# Scores for the training rows under the fixed priors (kept from the original cell).
probs = class_probabilities(X_train, cm, var, cp)

print('Accuracy')
print(accuracy(y_test, my_preds))

# NOTE: `cm` is rebound here from the class-mean dict to the confusion matrix.
cm = confusion_matrix(y_test, my_preds)
print(cm)

mean: {'0': array([1.37304878e+01, 1.94707317e+00, 2.44975610e+00, 1.71024390e+01,
       1.06634146e+02, 2.82853659e+00, 2.94024390e+00, 3.01707317e-01,
       1.85121951e+00, 5.56780488e+00, 1.05097561e+00, 3.08853659e+00,
       1.11280488e+03]), '1': array([1.22424e+01, 1.96260e+00, 2.23280e+00, 2.05240e+01, 9.51400e+01,
       2.25360e+00, 2.04680e+00, 3.50800e-01, 1.71220e+00, 2.96080e+00,
       1.05892e+00, 2.80220e+00, 5.31260e+02]), '2': array([1.30745455e+01, 3.20090909e+00, 2.45424242e+00, 2.15606061e+01,
       9.92727273e+01, 1.68757576e+00, 7.87575758e-01, 4.46363636e-01,
       1.13878788e+00, 7.36272724e+00, 6.73030303e-01, 1.69060606e+00,
       6.24393939e+02])}

variance: {'0': array([2.02950982e-01, 3.95762165e-01, 6.05975015e-02, 7.29535990e+00,
       1.16280785e+02, 1.14880785e-01, 1.37626770e-01, 5.39464604e-03,
       1.46386318e-01, 1.48304152e+00, 1.26624628e-02, 1.08866151e-01,
       3.97256692e+04]), '1': array([2.70046240e-01, 1.10783524e+00, 7.88841600e

Que 2B) ii) Class priors fixed at 80-10-10 (class 0 = 0.80, class 1 = 0.10, class 2 = 0.10)


In [45]:
def get_stats_by_class_ratio(X_train, y_train, num_examples=None, num_classes=None, priors=None):
    """
    Per-class mean and variance combined with FIXED class priors (default 80-10-10).

    Parameters
    ----------
    X_train : ndarray
        Feature array whose first axis indexes examples.
    y_train : ndarray
        Label array aligned with the first axis of X_train. Classes are
        addressed as the integers 0 .. num_classes - 1.
    num_examples : int, optional
        Kept for backward compatibility and unused: the priors are fixed,
        not estimated from class counts.
    num_classes : int, optional
        Number of classes to iterate over. Defaults to the number of distinct
        labels in y_train.
    priors : dict, optional
        Mapping str(class index) -> prior probability. Defaults to
        {'0': 0.80, '1': 0.10, '2': 0.10}.

    Returns
    -------
    (mean, variance, prior) : tuple of dict, each keyed by str(class index).
    """
    if num_classes is None:
        num_classes = len(np.unique(y_train))
    if priors is None:
        priors = {'0': 0.80, '1': 0.10, '2': 0.10}

    # dictionaries to store stats
    mean = {}
    variance = {}
    prior = dict(priors)  # copy so the caller's dict is never shared

    # loop through each class and get mean and variance by class.
    # The prior is deliberately NOT recomputed here: the original loop overwrote
    # the fixed priors with the empirical class frequencies.
    for cls in range(num_classes):
        X_cls = X_train[y_train == cls]
        mean[str(cls)] = np.mean(X_cls, axis=0)
        variance[str(cls)] = np.var(X_cls, axis=0)
    return mean, variance, prior

# Mean, Variance, class prior
cm, var, cp = get_stats_by_class_ratio(X_train, y_train)
print(f"mean: {cm}\n\nvariance: {var}\n\npriors: {cp}")

# Score the test set with the fixed priors. predict() cannot be used here because
# it re-estimates the statistics (including empirical priors) via get_stats_by_class.
my_preds = np.argmax(class_probabilities(X_test, cm, var, cp), 1)

# Scores for the training rows under the fixed priors (kept from the original cell).
probs = class_probabilities(X_train, cm, var, cp)

print('Accuracy')
print(accuracy(y_test, my_preds))

# NOTE: `cm` is rebound here from the class-mean dict to the confusion matrix.
cm = confusion_matrix(y_test, my_preds)
print(cm)

mean: {'0': array([1.37304878e+01, 1.94707317e+00, 2.44975610e+00, 1.71024390e+01,
       1.06634146e+02, 2.82853659e+00, 2.94024390e+00, 3.01707317e-01,
       1.85121951e+00, 5.56780488e+00, 1.05097561e+00, 3.08853659e+00,
       1.11280488e+03]), '1': array([1.22424e+01, 1.96260e+00, 2.23280e+00, 2.05240e+01, 9.51400e+01,
       2.25360e+00, 2.04680e+00, 3.50800e-01, 1.71220e+00, 2.96080e+00,
       1.05892e+00, 2.80220e+00, 5.31260e+02]), '2': array([1.30745455e+01, 3.20090909e+00, 2.45424242e+00, 2.15606061e+01,
       9.92727273e+01, 1.68757576e+00, 7.87575758e-01, 4.46363636e-01,
       1.13878788e+00, 7.36272724e+00, 6.73030303e-01, 1.69060606e+00,
       6.24393939e+02])}

variance: {'0': array([2.02950982e-01, 3.95762165e-01, 6.05975015e-02, 7.29535990e+00,
       1.16280785e+02, 1.14880785e-01, 1.37626770e-01, 5.39464604e-03,
       1.46386318e-01, 1.48304152e+00, 1.26624628e-02, 1.08866151e-01,
       3.97256692e+04]), '1': array([2.70046240e-01, 1.10783524e+00, 7.88841600e