In [1]:
import numpy as np
import pandas as pd
import sklearn.datasets as datasets
from model import NaiveBayesClassifier

# Bayes Model From Scratch

# Test with weather dataset

In [2]:
# Load the space-delimited weather table and display it.
weather_path = 'data/weather.txt'
weather_dataset = pd.read_table(weather_path, sep=" ")
weather_dataset

Unnamed: 0,Outlook,Temp,Humidity,Windy,Play
0,Rainy,Hot,High,f,no
1,Rainy,Hot,High,t,no
2,Overcast,Hot,High,f,yes
3,Sunny,Mild,High,f,yes
4,Sunny,Cool,Normal,f,yes
5,Sunny,Cool,Normal,t,no
6,Overcast,Cool,Normal,t,yes
7,Rainy,Mild,High,f,no
8,Rainy,Cool,Normal,f,yes
9,Sunny,Mild,Normal,f,yes


In [3]:
def pre_processing(df):
    """Partition a DataFrame into features and target.

    The last column of ``df`` is treated as the target; every other
    column is kept as a feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose final column holds the labels.

    Returns
    -------
    X : pandas.DataFrame
        ``df`` without its last column.
    y : pandas.Series
        The last column of ``df``.
    """
    target_column = df.columns[-1]
    features = df.drop(columns=[target_column])
    target = df[target_column]
    return features, target

In [4]:
# Split the weather table: all columns but the last become X, the last column ('Play') becomes y.
X, y = pre_processing(weather_dataset)

In [5]:
# Build the classifier with the 'Bernoulli' option and fit it on the whole weather table.
BayesModel = NaiveBayesClassifier('Bernoulli')
BayesModel.fit(X, y)

# Score the model on the same rows it was fitted on, hence "train" accuracy.
train_predictions = BayesModel.predict(X)
train_accuracy = BayesModel.findAccuracyScore(y, train_predictions)
print("Train Accuracy: {}".format(train_accuracy))

Train Accuracy: 92.86


In [6]:
# One sample in feature order Outlook, Temp, Humidity, Windy; ndmin=2 makes it a 1 x 4 array.
query = np.array(['Rainy', 'Mild', 'Normal', 't'], ndmin=2)
BayesModel.printPredict(query)

Query:- [['Rainy' 'Mild' 'Normal' 't']] ---> ['yes']


In [7]:
# One sample in feature order Outlook, Temp, Humidity, Windy; ndmin=2 makes it a 1 x 4 array.
query = np.array(['Overcast', 'Cool', 'Normal', 't'], ndmin=2)
BayesModel.printPredict(query)

Query:- [['Overcast' 'Cool' 'Normal' 't']] ---> ['yes']


In [8]:
# One sample in feature order Outlook, Temp, Humidity, Windy; ndmin=2 makes it a 1 x 4 array.
query = np.array(['Sunny', 'Hot', 'High', 't'], ndmin=2)
BayesModel.printPredict(query)

Query:- [['Sunny' 'Hot' 'High' 't']] ---> ['no']


# Test with Iris dataset

In [9]:
# Load Iris and build one table: the four measurement columns plus an integer 'target' column.
iris = datasets.load_iris()
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(target=iris.target)
iris_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [10]:
# Re-bind X and y to the Iris features and the 'target' column (this replaces the weather X, y).
X, y = pre_processing(iris_df)

In [11]:
def train_test_split(x, y, test_size = 0.25, random_state = None):
    """Partition features and labels into train and test sets.

    A random fraction of the rows of ``x`` is sampled as the test set;
    the rows that were not sampled form the training set. Labels are
    matched to rows through the index.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature table.
    y : pandas.Series
        Labels; presumably shares its index with ``x`` (the notebook
        builds both from the same frame).
    test_size : float, default 0.25
        Fraction of rows placed in the test set (passed to ``sample`` as ``frac``).
    random_state : int or None, default None
        Seed forwarded to ``DataFrame.sample``.

    Returns
    -------
    x_train, x_test, y_train, y_test
        The four partitions, in that order.
    """
    # Draw the test rows first; everything else is derived from their index.
    x_test = x.sample(frac=test_size, random_state=random_state)
    y_test = y[x_test.index]

    # Training data is whatever was not sampled for testing.
    return (
        x.drop(index=x_test.index),
        x_test,
        y.drop(index=y_test.index),
        y_test,
    )

In [12]:
# Uses the train_test_split defined in this notebook: 20% of the rows are sampled as the test set, seeded with 42.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
# Build the classifier with the 'Gaussian' option and fit it on the training split only.
BayesModel = NaiveBayesClassifier('Gaussian')
BayesModel.fit(X_train, y_train)
# The score is computed on the held-out X_test / y_test, so it is reported as test accuracy
# (the label previously read "Train Accuracy", which misdescribed the number).
print("Test Accuracy: {}".format(BayesModel.findAccuracyScore(y_test, BayesModel.predict(X_test))))

Train Accuracy: 100.0


In [14]:
# Spot-check the first ten held-out samples: print each prediction next to its true label.
for row in range(10):
    # Double brackets keep the selection 2-D, giving a single-row array for the model.
    query = X_test.iloc[[row]].to_numpy()
    BayesModel.printPredict(query)
    true_label = y_test.iloc[row]
    print("Real Label: {}".format(true_label))

Query:- [[6.1 2.8 4.7 1.2]] ---> [1]
Real Label: 1
Query:- [[5.7 3.8 1.7 0.3]] ---> [0]
Real Label: 0
Query:- [[7.7 2.6 6.9 2.3]] ---> [2]
Real Label: 2
Query:- [[6.  2.9 4.5 1.5]] ---> [1]
Real Label: 1
Query:- [[6.8 2.8 4.8 1.4]] ---> [1]
Real Label: 1
Query:- [[5.4 3.4 1.5 0.4]] ---> [0]
Real Label: 0
Query:- [[5.6 2.9 3.6 1.3]] ---> [1]
Real Label: 1
Query:- [[6.9 3.1 5.1 2.3]] ---> [2]
Real Label: 2
Query:- [[6.2 2.2 4.5 1.5]] ---> [1]
Real Label: 1
Query:- [[5.8 2.7 3.9 1.2]] ---> [1]
Real Label: 1
