### Naive Bayes Training Script
#### Author: Austin Fernandez
#### Date Modified: April 8, 2020

In [2]:
# dependencies
import csv
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.naive_bayes import CategoricalNB, ComplementNB, MultinomialNB
# from sklearn.naive_bayes import ComplementNB, MultinomialNB
import numpy as np
from scipy.stats import uniform, norm

import warnings
# Silence every warning for the remainder of the kernel session: the 'ignore'
# action is installed with no category/module filter, so it matches all of them.
# NOTE(review): this also hides any warnings emitted during the searches below —
# consider narrowing the filter if those diagnostics are ever needed.
warnings.filterwarnings('ignore')

In [3]:
# Load the training data from "Toy Dataset.csv".
#   - the first row is skipped (presumably a header — confirm against the file)
#   - fields 2..6 of every remaining row are parsed as the five integer features
#   - field 8 is parsed as the integer class label
# Rows are collected in plain lists and converted to arrays once at the end;
# calling np.append per row re-allocates and copies the whole array each time.
feature_rows = []
labels = []

# newline="" is the mode the csv module documents for reader input, so that
# line breaks embedded in quoted fields are handled by the parser itself.
with open("Toy Dataset.csv", "r", newline="") as f:
    d_reader = csv.reader(f, delimiter=",", quotechar="\"")
    next(d_reader, None)  # drop the first row; the default avoids StopIteration on an empty file
    for line in d_reader:
        feature_rows.append([int(value) for value in line[2:7]])
        labels.append(int(line[8]))

# float64 matches the dtype the previous np.empty/np.append approach produced,
# and reshape((-1, 5)) keeps the (n, 5) shape even when no data rows were read.
x_train = np.array(feature_rows, dtype=np.float64).reshape((-1, 5))
y_train = np.array(labels, dtype=np.float64)
print(x_train.shape)
print(y_train.shape)
# print(y_train)

(419, 5)
(419,)


In [4]:
# MultinomialNB, Grid Search
# Exhaustive search over 100 alpha values (0.01 * 1 .. 0.01 * 100) crossed with
# both fit_prior settings, scored with cv=10.
parameters = {
    'alpha': [0.01 * i for i in range(1, 101)],
    'fit_prior': [True, False],
}
mnb = MultinomialNB()
gscv = GridSearchCV(estimator=mnb, param_grid=parameters, cv=10)
gscv.fit(x_train, y_train)

# Assemble the summary first, then emit it in one go.
report = ["Best Score: {:.4f}".format(gscv.best_score_)]
report.extend(
    "{} => {}".format(name, value) for name, value in gscv.best_params_.items()
)
report.append("Splits: {}".format(gscv.n_splits_))
print("\n".join(report))

# Note: this accuracy is measured on x_train, the same data the search was fit on.
y_out = gscv.predict(x_train)
match_rate = np.mean(y_out == y_train)
print("Accuracy: {:.4f}%".format(match_rate * 100.0))
# print(y_out)


Best Score: 0.3175
alpha => 0.08
fit_prior => True
Splits: 10
Accuracy: 31.7422%


In [11]:
# CategoricalNB, Grid Search
# Exhaustive search over 100 alpha values (0.01 * 1 .. 0.01 * 100) crossed with
# both fit_prior settings, scored with cv=10.
parameters = {
    'alpha': [0.01 * i for i in range(1, 101)],
    'fit_prior': [True, False],
}
canb = CategoricalNB()
gscv = GridSearchCV(estimator=canb, param_grid=parameters, cv=10)
gscv.fit(x_train, y_train)

# Assemble the summary first, then emit it in one go.
report = ["Best Score: {:.4f}".format(gscv.best_score_)]
report.extend(
    "{} => {}".format(name, value) for name, value in gscv.best_params_.items()
)
report.append("Splits: {}".format(gscv.n_splits_))
print("\n".join(report))

# Note: this accuracy is measured on x_train, the same data the search was fit on.
y_out = gscv.predict(x_train)
match_rate = np.mean(y_out == y_train)
print("Accuracy: {:.4f}%".format(match_rate * 100.0))
# print(y_out)

Best Score: 0.3271
alpha => 0.01
fit_prior => True
Splits: 10
Accuracy: 33.8902%


In [6]:
# ComplementNB, Grid Search
# Exhaustive search over 100 alpha values (0.01 * 1 .. 0.01 * 100) crossed with
# both fit_prior settings and both norm settings, scored with cv=10.
parameters = {
    'alpha': [0.01 * i for i in range(1, 101)],
    'fit_prior': [True, False],
    'norm': [True, False],
}
cnb = ComplementNB()
gscv = GridSearchCV(estimator=cnb, param_grid=parameters, cv=10)
gscv.fit(x_train, y_train)

# Assemble the summary first, then emit it in one go.
report = ["Best Score: {:.4f}".format(gscv.best_score_)]
report.extend(
    "{} => {}".format(name, value) for name, value in gscv.best_params_.items()
)
report.append("Splits: {}".format(gscv.n_splits_))
print("\n".join(report))

# Note: this accuracy is measured on x_train, the same data the search was fit on.
y_out = gscv.predict(x_train)
match_rate = np.mean(y_out == y_train)
print("Accuracy: {:.4f}%".format(match_rate * 100.0))
# print(y_out)

Best Score: 0.2937
alpha => 0.01
fit_prior => True
norm => False
Splits: 10
Accuracy: 31.7422%


In [7]:
# ComplementNB, RandomizedSearch
# Draws `random_search_iterations` parameter settings: alpha from the scipy
# uniform(loc=0, scale=1.0) distribution, fit_prior and norm from {True, False}.
random_search_iterations = 1000
k_folds = 10
# Fixed seed so the sampled settings — and therefore the reported best
# parameters — are the same on every Restart & Run All.
random_seed = 42

parameters = {
    'alpha': uniform(loc=0, scale=1.0),
    'fit_prior': [True, False],
    'norm': [True, False],
}
cnb = ComplementNB()
rscv = RandomizedSearchCV(
    cnb,
    parameters,
    cv=k_folds,
    n_iter=random_search_iterations,
    random_state=random_seed,
)
rscv.fit(x_train, y_train)
print("Best Score: {:.4f}".format(rscv.best_score_))
for k, v in rscv.best_params_.items():
    print("{} => {}".format(k, v))
print("Splits: {}".format(rscv.n_splits_))
# Note: this accuracy is measured on x_train, the same data the search was fit on.
y_out = rscv.predict(x_train)
print("Accuracy: {:.4f}%".format(np.mean(y_out == y_train) * 100.0))
# print(y_out)

Best Score: 0.2937
alpha => 0.2533169787835904
fit_prior => True
norm => False
Splits: 10
Accuracy: 31.7422%


In [8]:
# CategoricalNB, RandomizedSearch
# (The estimator tuned in this cell is CategoricalNB.)
# Draws `random_search_iterations` parameter settings: alpha from the scipy
# uniform(loc=0, scale=1.0) distribution, fit_prior from {True, False}.
random_search_iterations = 1000
k_folds = 10
# Fixed seed so the sampled settings — and therefore the reported best
# parameters — are the same on every Restart & Run All.
random_seed = 42

parameters = {
    'alpha': uniform(loc=0, scale=1.0),
    'fit_prior': [True, False],
}
canb = CategoricalNB()
rscv = RandomizedSearchCV(
    canb,
    parameters,
    cv=k_folds,
    n_iter=random_search_iterations,
    random_state=random_seed,
)
rscv.fit(x_train, y_train)
print("Best Score: {:.4f}".format(rscv.best_score_))
for k, v in rscv.best_params_.items():
    print("{} => {}".format(k, v))
print("Splits: {}".format(rscv.n_splits_))
# Note: this accuracy is measured on x_train, the same data the search was fit on.
y_out = rscv.predict(x_train)
print("Accuracy: {:.4f}%".format(np.mean(y_out == y_train) * 100.0))
# print(y_out)

Best Score: 0.3271
alpha => 0.5221199475573253
fit_prior => True
Splits: 10
Accuracy: 33.8902%


In [9]:
# MultinomialNB, RandomizedSearch
# Draws `random_search_iterations` parameter settings: alpha from the scipy
# uniform(loc=0, scale=1.0) distribution, fit_prior from {True, False}.
random_search_iterations = 1000
k_folds = 10
# Fixed seed so the sampled settings — and therefore the reported best
# parameters — are the same on every Restart & Run All.
random_seed = 42

parameters = {
    'alpha': uniform(loc=0, scale=1.0),
    'fit_prior': [True, False],
}
mnb = MultinomialNB()
rscv = RandomizedSearchCV(
    mnb,
    parameters,
    cv=k_folds,
    n_iter=random_search_iterations,
    random_state=random_seed,
)
rscv.fit(x_train, y_train)
print("Best Score: {:.4f}".format(rscv.best_score_))
for k, v in rscv.best_params_.items():
    print("{} => {}".format(k, v))
print("Splits: {}".format(rscv.n_splits_))
# Note: this accuracy is measured on x_train, the same data the search was fit on.
y_out = rscv.predict(x_train)
print("Accuracy: {:.4f}%".format(np.mean(y_out == y_train) * 100.0))
# print(y_out)

Best Score: 0.3175
alpha => 0.09731698161710889
fit_prior => True
Splits: 10
Accuracy: 31.7422%
