<a href="https://colab.research.google.com/github/elinabhasin/DIY-Deep-Learning/blob/main/categorical_naive_bayes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [387]:
#Categorical Naive Bayes

In [388]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn import naive_bayes

In [389]:
data = pd.read_csv('/content/tennis.csv')

In [390]:
data

Unnamed: 0,outlook,temp,humidity,windy,play
0,sunny,hot,high,False,no
1,sunny,hot,high,True,no
2,overcast,hot,high,False,yes
3,rainy,mild,high,False,yes
4,rainy,cool,normal,False,yes
5,rainy,cool,normal,True,no
6,overcast,cool,normal,True,yes
7,sunny,mild,high,False,no
8,sunny,cool,normal,False,yes
9,rainy,mild,normal,False,yes


In [391]:
data.isnull().sum()

Unnamed: 0,0
outlook,0
temp,0
humidity,0
windy,0
play,0


In [392]:
data.nunique()

Unnamed: 0,0
outlook,3
temp,3
humidity,2
windy,2
play,2


In [393]:
X = data.drop('play',axis=1)
X.head()

Unnamed: 0,outlook,temp,humidity,windy
0,sunny,hot,high,False
1,sunny,hot,high,True
2,overcast,hot,high,False
3,rainy,mild,high,False
4,rainy,cool,normal,False


In [394]:
y = data.play
y = pd.DataFrame(y)
y

Unnamed: 0,play
0,no
1,no
2,yes
3,yes
4,yes
5,no
6,yes
7,no
8,yes
9,yes


In [395]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [396]:
#X,y | y is a pandas dataframe

**From Scratch**

In [397]:
class CategoricalNB:
  def __init__(self,alpha=1):
    self.alpha = alpha
    self.target_counts = set()
    self.features = None
    self.prior={}
    self.likelihoods={}
    self.y_pred = []

  def calc_prior(self,y):
    total_vals = len(y)

    for _,val_count in enumerate(y.value_counts().items()):
      value,count = val_count
      self.target_counts.add((value[0],count))
      self.prior[value[0]] = count/total_vals

  def calc_likelihood(self,X,y):
    for feature in self.features:
      self.likelihoods[feature]={}
      for target in self.target_counts:
        self.likelihoods[feature][target[0]] = {}
        for _,val_count in enumerate(X[f'{feature}'].value_counts().items()):
          value,count = val_count
          self.likelihoods[feature][target[0]][value] = float((((X[f'{feature}'] == value)  & (y.play == target[0])).sum())/target[1])

  def fit(self,X,y):
    self.features = X.columns
    self.calc_prior(y)
    self.calc_likelihood(X,y)

  def predict(self,X):
    cols = X.columns
    for idx in range(len(X)):
      prob = {x:i for x,i in self.prior.items()}
      for target in prob.keys():
        for col in cols:
          prob[f'{target}']*=self.likelihoods[col][target][X.iloc[idx][col]]
      pred = [key for key, val in prob.items() if val == max(prob.values())]
      self.y_pred.append(pred[0])
    return self.y_pred

In [398]:
model = CategoricalNB()
model.fit(X_train,y_train)
y_pred=model.predict(X_test)

In [399]:
def accuracy(y_true, y_pred):
  #y_true is a dataframe with n rows and one column.
  y_true = y_true.values.flatten()
  accuracy = np.sum(y_true == y_pred) / len(y_true)
  return accuracy

In [400]:
print(accuracy(y_test,y_pred))

0.6666666666666666


**Using Sklearn**

In [401]:
sk_model = naive_bayes.CategoricalNB()

In [402]:
le = LabelEncoder()
for col in X_train.columns:
  X_train[col] = le.fit_transform(X_train[col])
  X_test[col] = le.fit_transform(X_test[col])
X_train

Unnamed: 0,outlook,temp,humidity,windy
12,0,1,1,0
5,1,0,1,1
8,2,0,1,0
2,0,1,0,0
1,2,1,0,1
13,1,2,0,1
4,1,0,1,0
7,2,2,0,0
10,2,2,1,1
3,1,2,0,0


In [403]:
for col in y_train.columns:
  y_train[col] = le.fit_transform(y_train[col])
  y_test[col] = le.fit_transform(y_test[col])
print(y_train)

    play
12     1
5      0
8      1
2      1
1      0
13     0
4      1
7      0
10     1
3      1
6      1


In [404]:
sk_model.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)


In [405]:
y_pred = sk_model.predict(X_test)

In [406]:
print(accuracy_score(y_test,y_pred))

0.6666666666666666
