In [176]:
import numpy as np
import pandas as pd
import sklearn.datasets as datasets

# Bayes Model From Scratch

In [171]:
class NaiveBayesClassifier():
    """Naive Bayes classifier with two modes, selected by ``typeBayes``.

    * ``'Bernoulli'`` - despite the name, every feature is treated as a
      categorical variable: P(x|c) is the relative frequency of each observed
      feature value inside class ``c``.
    * ``'Gaussian'`` - every feature is modelled per class by a normal
      distribution described by its mean and (population) standard deviation.

    Attributes filled in by ``fit``:
        features   -- list of feature (column) names
        prior      -- {class: P(class)}
        likelihood -- Bernoulli: {feature: {"<value>|<class>": P(value|class)}}
                      Gaussian:  {feature: {class: {'mean': m, 'std': s}}}
        pred_prior -- Bernoulli only: {feature: {value: P(value)}}
    """

    _SUPPORTED_TYPES = ('Bernoulli', 'Gaussian')
    # Lower bound applied to a standard deviation before it is used as a
    # divisor, so a feature that is constant within a class cannot cause a
    # division by zero.
    _MIN_STD = 1e-9

    def __init__(self, typeBayes):
        """Create an unfitted model; ``typeBayes`` is 'Bernoulli' or 'Gaussian'."""
        self.features = list()
        self.prior = {}
        self.likelihood = {}
        self.pred_prior = {}

        self.train_size = 0
        self.num_feats = 0
        self.X_train = None
        self.y_train = None

        self.typeBayes = typeBayes

    # ------------------------------------------------------------------ helpers
    def _checkType(self):
        """Raise ValueError when ``typeBayes`` is not a supported mode."""
        if self.typeBayes not in self._SUPPORTED_TYPES:
            raise ValueError(
                "Unknown typeBayes {!r}; expected one of {}".format(
                    self.typeBayes, self._SUPPORTED_TYPES))

    @staticmethod
    def _likelihoodKey(feat_val, c):
        """Build the "<value>|<class>" key; works for non-string values too."""
        return "{}|{}".format(feat_val, c)

    def _logGaussian(self, query, feature, c):
        """Log of the normal density of ``query`` for ``feature`` in class ``c``."""
        stats = self.likelihood[feature][c]
        std = max(stats['std'], self._MIN_STD)
        z = (query - stats['mean']) / std
        return -np.log(std) - 0.5 * np.log(2 * np.pi) - 0.5 * z ** 2

    # ----------------------------------------------------------------- training
    def setClassPrior(self):
        """ P(c) - Prior class probability P(c) = count(c) / count(all) """
        for c in np.unique(self.y_train):
            self.prior[c] = np.sum(self.y_train == c) / self.train_size

    def setLikelihoodBernoulli(self):
        """ P(x|c) - Likelihood of feature value given class P(x|c) = count(x, c) / count(c) """
        for c in np.unique(self.y_train):
            # Rows of class c are selected by index label, so X_train and
            # y_train must share the same index.
            class_rows = self.y_train[self.y_train == c].index
            class_count = np.sum(self.y_train == c)
            for feature in self.features:
                counts = self.X_train[feature][class_rows].value_counts().to_dict()
                table = self.likelihood.setdefault(feature, {})
                for feat_val, count in counts.items():
                    table[self._likelihoodKey(feat_val, c)] = count / class_count

    def calculateGaussian(self, query, feature, c):
        """ Gaussian density of ``query`` under the fitted mean/std of ``feature`` in class ``c``.

        The standard deviation is floored at ``_MIN_STD`` so a zero-variance
        feature yields a finite (very peaked) density instead of NaN.
        """
        stats = self.likelihood[feature][c]
        std = max(stats['std'], self._MIN_STD)
        z = (query - stats['mean']) / std
        return (1 / (np.sqrt(2 * np.pi) * std)) * np.exp(-0.5 * z ** 2)

    def setLikelihoodGaussian(self):
        """ P(x|c) - per-class mean and population standard deviation of every feature """
        for c in np.unique(self.y_train):
            class_rows = self.y_train[self.y_train == c].index
            for feature in self.features:
                column = self.X_train[feature][class_rows]
                stats = self.likelihood.setdefault(feature, {}).setdefault(c, {})
                stats['mean'] = np.mean(column)
                stats['std'] = np.std(column)

    def setPredictPrior(self):
        """ P(x) - marginal probability of every observed feature value """
        for feature in self.features:
            feat_val = self.X_train[feature].value_counts().to_dict()
            table = self.pred_prior.setdefault(feature, {})
            for val, count in feat_val.items():
                table[val] = count / self.train_size

    def setBernoulliBayes(self):
        """Initialise every (value, class) table entry to 0, then estimate all probabilities."""
        classes = np.unique(self.y_train)
        for feature in self.features:
            self.likelihood[feature] = {}
            self.pred_prior[feature] = {}
            for feat_val in np.unique(self.X_train[feature]):
                self.pred_prior[feature][feat_val] = 0
                for c in classes:
                    # A value never seen together with class c keeps likelihood 0.
                    self.likelihood[feature][self._likelihoodKey(feat_val, c)] = 0
        for c in classes:
            self.prior[c] = 0
        self.setClassPrior()
        self.setLikelihoodBernoulli()
        self.setPredictPrior()

    def setGaussianBayes(self):
        """Initialise the per-class statistic tables, then estimate priors, means and stds."""
        classes = np.unique(self.y_train)
        for feature in self.features:
            self.likelihood[feature] = {c: {} for c in classes}
            self.pred_prior[feature] = {}
        for c in classes:
            self.prior[c] = 0

        self.setClassPrior()
        self.setLikelihoodGaussian()

    # --------------------------------------------------------------- prediction
    def predictBernoulliNaiveBayes(self, X):
        """Return the most probable class for every row of ``X`` (categorical mode).

        Feature values that never occurred in the training data carry no
        information about the class and are skipped instead of raising
        ``KeyError``.
        """
        classes = np.unique(self.y_train)
        result = []
        X = np.array(X)
        for query in X:
            probs_outcome = {}  # P(c|x) probability of outcome given query
            for c in classes:
                likelihood = 1
                evidence = 1
                for feature, feat_val in zip(self.features, query):
                    if feat_val not in self.pred_prior[feature]:
                        continue  # unseen category: uninformative for every class
                    # P(A, B | C) = P(A | C) * P(B | C)
                    likelihood *= self.likelihood[feature].get(self._likelihoodKey(feat_val, c), 0)
                    evidence *= self.pred_prior[feature][feat_val]
                probs_outcome[c] = (likelihood * self.prior[c]) / (evidence)
            result.append(max(probs_outcome, key=probs_outcome.get))
        return result

    def predictGaussianNaiveBayes(self, X):
        """Return the most probable class for every row of ``X`` (Gaussian mode).

        Scores are accumulated as log-probabilities: a product of many small
        densities underflows to 0 for every class, which would make the
        arg-max meaningless.
        """
        classes = np.unique(self.y_train)
        result = []
        X = np.array(X)
        for query in X:
            log_posterior = {}  # log P(c) + sum_i log P(x_i | c)
            for c in classes:
                score = np.log(self.prior[c])
                for feature, feat_val in zip(self.features, query):
                    score += self._logGaussian(feat_val, feature, c)
                log_posterior[c] = score
            result.append(max(log_posterior, key=log_posterior.get))
        return result

    # --------------------------------------------------------------- public API
    def fit(self, X, y):
        """Estimate the model from a feature DataFrame ``X`` and a label Series ``y``.

        ``X`` and ``y`` must share the same index. Any state left over from a
        previous ``fit`` is discarded first.

        Raises:
            ValueError: if ``typeBayes`` is not 'Bernoulli' or 'Gaussian'.
        """
        self._checkType()

        # Start from a clean slate so a refit cannot keep stale features/classes.
        self.prior = {}
        self.likelihood = {}
        self.pred_prior = {}

        self.features = list(X.columns)
        self.X_train = X
        self.y_train = y
        self.train_size = X.shape[0]
        self.num_feats = X.shape[1]

        if self.typeBayes == 'Bernoulli':
            self.setBernoulliBayes()
        else:
            self.setGaussianBayes()

    def predict(self, X):
        """Return a list with one predicted class per row of ``X``.

        Raises:
            ValueError: if ``typeBayes`` is unsupported or ``fit`` was not called.
        """
        self._checkType()
        if self.X_train is None or self.y_train is None:
            raise ValueError("fit must be called before predict")
        if self.typeBayes == 'Bernoulli':
            return self.predictBernoulliNaiveBayes(X)
        return self.predictGaussianNaiveBayes(X)

    def findAccuracyScore(self, y_true, y_pred):
        """ score = count(y_true == y_pred) / len(y_true) * 100, rounded to 2 decimals

        Accepts any pair of equally long sequences (lists, arrays, Series);
        elements are compared by position.

        Raises:
            ValueError: if the inputs are empty or differ in length.
        """
        y_true = np.asarray(y_true)
        y_pred = np.asarray(y_pred)
        if len(y_true) != len(y_pred):
            raise ValueError("y_true and y_pred must have the same length")
        if len(y_true) == 0:
            raise ValueError("cannot compute accuracy of an empty prediction set")
        return round(float(np.sum(y_pred == y_true)) / float(len(y_true)) * 100, 2)

    def printPredict(self, query):
        """Print ``query`` together with the classes predicted for it."""
        print("Query:- {} ---> {}".format(query, self.predict(query)))

# Test with weather dataset

In [177]:
# Read the space-separated weather table; the path is relative to the
# notebook's working directory. The bare name on the last line displays it.
weather_dataset = pd.read_table('data/weather.txt', sep=" ")
weather_dataset

Unnamed: 0,Outlook,Temp,Humidity,Windy,Play
0,Rainy,Hot,High,f,no
1,Rainy,Hot,High,t,no
2,Overcast,Hot,High,f,yes
3,Sunny,Mild,High,f,yes
4,Sunny,Cool,Normal,f,yes
5,Sunny,Cool,Normal,t,no
6,Overcast,Cool,Normal,t,yes
7,Rainy,Mild,High,f,no
8,Rainy,Cool,Normal,f,yes
9,Sunny,Mild,Normal,f,yes


In [70]:
def pre_processing(df):
    """Partition a table into features and target.

    The last column of ``df`` is taken as the target; all remaining columns
    are the features. ``df`` itself is not modified.

    Returns:
        (X, y): ``df`` without its last column, and the last column itself.
    """
    target_col = df.columns[-1]
    feature_frame = df.drop(columns=[target_col])
    target = df[target_col]
    return feature_frame, target

In [175]:
# Split the weather table: X holds every column but the last, y the last column.
X, y = pre_processing(weather_dataset)

In [129]:
# Fit the categorical ('Bernoulli') model on the whole weather table and score
# it on the very same rows, so the printed number is a training accuracy.
BayesModel = NaiveBayesClassifier('Bernoulli')
BayesModel.fit(X, y)
print("Train Accuracy: {}".format(BayesModel.findAccuracyScore(y, BayesModel.predict(X))))

Train Accuracy: 92.86


In [130]:
# One query row; values are given in the same order as the feature columns of X.
query = np.array([['Rainy','Mild', 'Normal', 't']])
BayesModel.printPredict(query)

Query:- [['Rainy' 'Mild' 'Normal' 't']] ---> ['yes']


In [131]:
# One query row; values are given in the same order as the feature columns of X.
query = np.array([['Overcast','Cool', 'Normal', 't']])
BayesModel.printPredict(query)

Query:- [['Overcast' 'Cool' 'Normal' 't']] ---> ['yes']


In [132]:
# One query row; values are given in the same order as the feature columns of X.
query = np.array([['Sunny','Hot', 'High', 't']])
BayesModel.printPredict(query)

Query:- [['Sunny' 'Hot' 'High' 't']] ---> ['no']


# Test with Iris dataset

In [135]:
# Load the Iris data from scikit-learn and assemble one table: the measurement
# columns named after the features, followed by a 'target' column of labels.
iris = datasets.load_iris()
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(target=iris.target)
iris_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [143]:
# Rebind X and y to the Iris features and target (the last column, 'target').
X, y = pre_processing(iris_df)

In [145]:
def train_test_split(x, y, test_size = 0.25, random_state = None):
    """Partition the data into train and test sets.

    A fraction ``test_size`` of the rows of ``x`` is sampled (reproducibly
    when ``random_state`` is given) to form the test set; all other rows
    form the train set. ``y`` is split by the same index labels.

    Returns:
        (x_train, x_test, y_train, y_test)
    """
    held_out_x = x.sample(frac=test_size, random_state=random_state)
    held_out_y = y[held_out_x.index]

    remaining_x = x.drop(held_out_x.index)
    remaining_y = y.drop(held_out_y.index)

    return remaining_x, held_out_x, remaining_y, held_out_y

In [148]:
# Hold out 20% of the Iris rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [172]:
# Fit the Gaussian model on the training split and score it on the held-out
# test split, so the reported number is a test accuracy (the label previously
# said "Train").
BayesModel = NaiveBayesClassifier('Gaussian')
BayesModel.fit(X_train, y_train)
print("Test Accuracy: {}".format(BayesModel.findAccuracyScore(y_test, BayesModel.predict(X_test))))

Train Accuracy: 100.0


In [174]:
# Show the prediction next to the true label for the first 10 rows of the test
# split; each row is wrapped in a list so predict receives a 2-D array.
for i in range(10):
    query = np.array([X_test.iloc[i]])
    BayesModel.printPredict(query)
    print("Real Label: {}".format(y_test.iloc[i]))

Query:- [[6.1 2.8 4.7 1.2]] ---> [1]
Real Label: 1
Query:- [[5.7 3.8 1.7 0.3]] ---> [0]
Real Label: 0
Query:- [[7.7 2.6 6.9 2.3]] ---> [2]
Real Label: 2
Query:- [[6.  2.9 4.5 1.5]] ---> [1]
Real Label: 1
Query:- [[6.8 2.8 4.8 1.4]] ---> [1]
Real Label: 1
Query:- [[5.4 3.4 1.5 0.4]] ---> [0]
Real Label: 0
Query:- [[5.6 2.9 3.6 1.3]] ---> [1]
Real Label: 1
Query:- [[6.9 3.1 5.1 2.3]] ---> [2]
Real Label: 2
Query:- [[6.2 2.2 4.5 1.5]] ---> [1]
Real Label: 1
Query:- [[5.8 2.7 3.9 1.2]] ---> [1]
Real Label: 1
