<a href="https://colab.research.google.com/github/ekaratnida/Applied-machine-learning/blob/master/Week04-workshop-1/Lab04_NaiveBayes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [170]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math

In [171]:
def accuracy_score(y_true, y_pred):

	"""	score = (y_true - y_pred) / len(y_true) """

	return round((sum(y_pred == y_true))/(len(y_true)) * 100 ,4)

In [172]:
y_pred = np.array([0,0,1,1,0])
y_true = np.array([0,0,0,0,0])
accuracy_score(y_true, y_pred)

np.float64(60.0)

In [173]:
def pre_processing(df):
  """ partioning data into features (X) and target (Y) """
  #df is dataframe
  df = df.drop([df.columns[0]], axis = 1) #Drop day
  #print(df.head())

  X = df.drop([df.columns[-1]], axis = 1) #Drop play_tennis
  y = df[df.columns[-1]]
  return X, y


In [174]:
def fit():
  pass

In [175]:
class  NaiveBayes:

	"""
		Bayes Theorem:
										                    Likelihood * Class prior probability
				Posterior Probability = -------------------------------------
											                      Predictor prior probability

							  			          P(x|c) * p(c)
							   P(c|x) = ------------------
											                P(x)
	"""

	def __init__(self):

		"""
			Attributes:
				likelihoods: Likelihood of each feature per class
				class_priors: Prior probabilities of classes
				pred_priors: Prior probabilities of features
				features: All features of dataset
		"""
		self.features = list
		self.likelihoods = {} # {"id":123}
		self.class_priors = {}
		self.pred_priors = {} #Evidence

		self.X_train = np.array
		self.y_train = np.array
		self.train_size = int
		self.num_feats = int

	def fit(self, X, y):

		self.features = list(X.columns)
		self.X_train = X
		self.y_train = y
		self.train_size = X.shape[0]
		self.num_feats = X.shape[1]

		for feature in self.features:
			self.likelihoods[feature] = {}
			self.pred_priors[feature] = {}

			for feat_val in np.unique(self.X_train[feature]):
				self.pred_priors[feature].update({feat_val: 0})

				for outcome in np.unique(self.y_train):
					self.likelihoods[feature].update({feat_val+'_'+outcome:0})
					self.class_priors.update({outcome: 0})

		self._calc_class_prior()
		self._calc_likelihoods()
		self._calc_predictor_prior()

	def _calc_class_prior(self):

		""" P(c) - Prior Class Probability """

		for outcome in np.unique(self.y_train):
			outcome_count = sum(self.y_train == outcome)
			self.class_priors[outcome] = outcome_count / self.train_size

	def _calc_likelihoods(self):

		""" P(x|c) - Likelihood """

		for feature in self.features:

			for outcome in np.unique(self.y_train):
				outcome_count = sum(self.y_train == outcome)
				feat_likelihood = self.X_train[feature][self.y_train[self.y_train == outcome].index.values.tolist()].value_counts().to_dict()

				for feat_val, count in feat_likelihood.items():
					self.likelihoods[feature][feat_val + '_' + outcome] = count/outcome_count

	def _calc_predictor_prior(self):

		""" P(x) - Evidence """

		for feature in self.features:
			feat_vals = self.X_train[feature].value_counts().to_dict()

			for feat_val, count in feat_vals.items():
				self.pred_priors[feature][feat_val] = count/self.train_size

	def predict(self, X):

		""" Calculates Posterior probability P(c|x) """

		results = []
		X = np.array(X)

		for query in X:
			probs_outcome = {}
			for outcome in np.unique(self.y_train):
				prior = self.class_priors[outcome]
				likelihood = 1
				evidence = 1

				for feat, feat_val in zip(self.features, query):
					likelihood *= self.likelihoods[feat][feat_val + '_' + outcome]
					evidence *= self.pred_priors[feat][feat_val]

				posterior = (likelihood * prior) / (evidence)
				#print(" posterior ",posterior)

				probs_outcome[outcome] = posterior

			result = max(probs_outcome, key = lambda x: probs_outcome[x])
			results.append(result)
			print(probs_outcome)
		#print(" results",results)
		return np.array(results)


In [176]:
if __name__ == "__main__":

	#Weather Dataset
	print("\nWeather Dataset:")

	df = pd.read_csv("https://raw.githubusercontent.com/ekaratnida/Applied-machine-learning/master/Week10-desicion-tree/PlayTennis.csv")
	#print(df.head())

	print("*********Begin Pre processing...****************")

	#Split fearures and target
	X,y  = pre_processing(df)
	print("X.head()")
	#print(X.head())
	print("y.head()")
	#print(y.head())

	print("*********End Pre processing...****************")

	nb_clf = NaiveBayes()
	nb_clf.fit(X, y)

	#print("\nTrain Accuracy: {}".format(accuracy_score(y, nb_clf.predict(X))))

	#Query 1:
	query1 = np.array([['Rain','Mild', 'Normal', 'Strong']])
	print("Query 1:- {} ---> {}".format(query1, nb_clf.predict(query1)))

	#Query 2:
	query2 = np.array([['Overcast','Cool', 'Normal', 'Strong']])
	print("Query 2:- {} ---> {}".format(query2, nb_clf.predict(query2)))

	#Query 3:
	query3 = np.array([['Sunny','Hot', 'High', 'Strong']])
	print("Query 3:- {} ---> {}".format(query3, nb_clf.predict(query3)))


Weather Dataset:
*********Begin Pre processing...****************
X.head()
y.head()
*********End Pre processing...****************
{'No': 0.20906666666666676, 'Yes': 0.6452674897119342}
Query 1:- [['Rain' 'Mild' 'Normal' 'Strong']] ---> ['Yes']
{'No': 0.0, 'Yes': 1.2098765432098766}
Query 2:- [['Overcast' 'Cool' 'Normal' 'Strong']] ---> ['Yes']
{'No': 1.8816, 'Yes': 0.1613168724279835}
Query 3:- [['Sunny' 'Hot' 'High' 'Strong']] ---> ['No']


In [177]:
#Use sklearn to train naive bayes

In [178]:
print(X.head())
print(y.head())

    Outlook Temperature Humidity    Wind
0     Sunny         Hot     High    Weak
1     Sunny         Hot     High  Strong
2  Overcast         Hot     High    Weak
3      Rain        Mild     High    Weak
4      Rain        Cool   Normal    Weak
0     No
1     No
2    Yes
3    Yes
4    Yes
Name: Play_Tennis, dtype: object


In [179]:
#X_new = pd.get_dummies(X,dtype='int')
#print(X_new.head())

In [180]:
#y_new = pd.get_dummies(y,dtype='int')
#print(y_new.head())

In [None]:
query1_new = pd.DataFrame(query1, columns=['Outlook', 'Temperature', 'Humidity', 'Wind'])
query2_new = pd.DataFrame(query2, columns=['Outlook', 'Temperature', 'Humidity', 'Wind'])
query3_new = pd.DataFrame(query3, columns=['Outlook', 'Temperature', 'Humidity', 'Wind'])
X_with_test = pd.concat([X, query1_new, query2_new, query3_new],axis=0)
#print(X_with_test)
#print(query1_new.head())
X_new = pd.get_dummies(X_with_test, dtype='int')
print(X_new)

In [184]:
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
clf = BernoulliNB()
clf.fit(X_new[0:14], y)

In [187]:
print(clf.predict(X_new[14:]))

print(clf.predict_proba(X_new[14:]))


['Yes' 'Yes' 'No']
[[0.20062098 0.79937902]
 [0.02888552 0.97111448]
 [0.96657583 0.03342417]]
