<a href="https://colab.research.google.com/github/gourab-sinha/Machine_Learning/blob/master/Naive%20Bayes%20Classifier/Naive_Bayes_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

In [0]:
class NaiveBayesClassifier:
  # Train
  def train_classifier(self, train_data):

    # Different classes in Target
    classes = dict(train_data['target'].value_counts().items())

    # To store all information
    result = {}
    for class_type, count in classes.items():

      # Extract only data points which are equal to class_type
      features = train_data[train_data['target']==class_type]

      # Features present in train_data
      feature_names = list(features.columns)

      # Target is not require hence removed
      feature_names.remove('target')
      
      # To store features and their differnt labels with count
      feature_with_counts = {}

      for feature_name in feature_names:

        # Differnt labels with count for ith feature
        categories_in_feature = dict(features[feature_name].value_counts().items())

        # To get total count of the ith feature present 
        total_count = 0
        for category,val in categories_in_feature.items():
          total_count += val

        # Insert new key with name total_count to hold the total_count
        categories_in_feature['total_count'] = total_count

        # To hold details of ith feature
        feature_with_counts[feature_name] = categories_in_feature
      
      # Count of class that appeared in data points after filteration 
      feature_with_counts["total_count"] = count

      # To hold with class_type which is present in Target.
      result[class_type] = feature_with_counts
    
    # Number of data points in the train_data set
    result["overall"] = train_data.shape[0]
    return result


  def __probability(self, dictionary,X,current_class):

    # Summation of P(X=xj/Y=ai)
    output = np.log(dictionary[current_class]['total_count']) - np.log(dictionary['overall'])

    # Features present in dictionary
    features = dictionary[current_class].keys()
    for feature in features:
      if feature=='total_count':
        continue
      
      # Value present in testing data for selected feature
      feature_category = X[feature].iloc[0]

      # For selected value total count present, default 1 for laplace correction
      total_count_category = dictionary[current_class][feature].get(feature_category,1)

      # Calculation for the  P(X=x_jth/Y=a_ith)
      cal = np.log(total_count_category) - np.log(dictionary[current_class]['total_count']+len(dictionary[current_class][feature]))
      output = output + cal

    return output

  def __predictSinglePoint(self, dictionary,X):

    # Target classes 
    classes = dictionary.keys()

    # To pick best probability and best target class
    best_p = -1000
    best_class = -1
    first_run = True
    for current_class in classes:
      if current_class=='overall':
        continue
      
      # Get the probability based on the target class
      p_current_class = self.__probability(dictionary,X,current_class)

      # Update if p_current_class is greater than the best_p
      if(first_run or p_current_class>best_p):
        best_p = p_current_class
        best_class = current_class

      # Once best_p and best_class updated make it false
      first_run = False
    return best_class
  
  def predict(self,dictionary,X_test,columns):

    # Store results
    result = []

    # Iterate over all test data points
    for x in X_test.values:
      x = pd.DataFrame([x],columns=columns)
      class_name = self.__predictSinglePoint(dictionary,x)
      result.append(class_name)
    
    return result

  



      

In [309]:
data = [["Sunny","High","Yes"],
        ["Sunny","Low","Yes"],
        ["Overcast","Medium","No"],
        ["Rainy","High","No"],
        ["Overcast","High","Dont Know"],
        ["Overcast","Medium","Dont Know"],
        ["Overcast","Medium","Dont Know"],
        ["Rainy","Medium","No"],
        ["Overcast","Medium","No"],
        ["Sunny","High","Yes"],
        ["Sunny","Medium","Yes"],
        ["Rainy","High","No"],
        ["Overcast","Low","Yes"],
        ["Overcast","Medium","Yes"],
        ["Rainy","Low","Yes"],
        ["Overcast","Medium","Yes"]
        ]

data = pd.DataFrame(data, columns=["Outlook","Wind","target"])
naive_classifier = NaiveBayesClassifier()
result = naive_classifier.train_classifier(data)

print(result)

# Correctness check
print(result['Dont Know']['Outlook']['total_count'])

{'Yes': {'Outlook': {'Sunny': 4, 'Overcast': 3, 'Rainy': 1, 'total_count': 8}, 'Wind': {'Medium': 3, 'Low': 3, 'High': 2, 'total_count': 8}, 'total_count': 8}, 'No': {'Outlook': {'Rainy': 3, 'Overcast': 2, 'total_count': 5}, 'Wind': {'Medium': 3, 'High': 2, 'total_count': 5}, 'total_count': 5}, 'Dont Know': {'Outlook': {'Overcast': 3, 'total_count': 3}, 'Wind': {'Medium': 2, 'High': 1, 'total_count': 3}, 'total_count': 3}, 'overall': 16}
3


In [311]:
data = pd.DataFrame([["Sunny","High"],
        ["Sunny","Low"],
        ["Overcast","Medium"],
        ["Rainy","High"],
        ["Rainy","Low"],
        ["Overcast","Low"],
        ["Rainy","Medium"],
        ["Sunny","Medium"],
        ["Overcast","Low"],
        ["Overcast","High"]
        ],columns=["Outlook","Wind"])



predicted = naive_classifier.predict(result,data,["Outlook","Wind"])
print(predicted)

['Yes', 'Yes', 'Dont Know', 'No', 'No', 'Yes', 'No', 'Yes', 'Yes', 'Yes']


# Iris Dataset Prediction

In [351]:

def markLabel(column):
  second_mean = column.mean()
  first_mean = 0.5*second_mean
  third_mean = 1.5*second_mean
  for i in range(column.shape[0]):
    if column.loc[i]<=first_mean:
      column.loc[i] = int(0)
    elif column.iloc[i]<=second_mean:
      column.loc[i] = int(1)
    elif column.iloc[i]<=third_mean:
      column.loc[i] = int(2)
    else:
      column.loc[i] = int(3)
    
  return column


iris = datasets.load_iris()
data = iris.data
targets = iris.target
X = pd.DataFrame(data,columns=iris.feature_names)
X['target'] = iris.target
for i in X.columns:
  if i=='target':
    continue
  X[i] = markLabel(X[i].copy())

X_train,X_test,Y_train,Y_test = train_test_split(X,iris.target,test_size=0.40)
print(X_train.shape,X_test.shape)

(90, 5) (60, 5)


In [0]:
naive_classifier = NaiveBayesClassifier()
result = naive_classifier.train_classifier(X_train)

In [353]:
Y_actual = X_test['target'].values.copy()
X_test.drop(['target'],axis = 1,inplace=True)
predict = naive_classifier.predict(result,X_test,iris.feature_names)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [354]:
print(classification_report(Y_actual,predict))
print(confusion_matrix(Y_actual,predict))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        23
           1       1.00      1.00      1.00        21
           2       1.00      1.00      1.00        16

    accuracy                           1.00        60
   macro avg       1.00      1.00      1.00        60
weighted avg       1.00      1.00      1.00        60

[[23  0  0]
 [ 0 21  0]
 [ 0  0 16]]
