<a href="https://colab.research.google.com/github/gourab-sinha/Machine_Learning/blob/master/Naive%20Bayes%20Classifier/Naive_Bayes_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd

In [0]:
class NaiveBayesClassifier:
  # Train
  def train_classifier(self, train_data):

    # Different classes in Target
    classes = dict(train_data['target'].value_counts().items())

    # To store all information
    result = {}
    for class_type, count in classes.items():

      # Extract only data points which are equal to class_type
      features = train_data[train_data['target']==class_type]

      # Features present in train_data
      feature_names = list(features.columns)

      # Target is not require hence removed
      feature_names.remove('target')
      
      # To store features and their differnt labels with count
      feature_with_counts = {}

      for feature_name in feature_names:

        # Differnt labels with count for ith feature
        categories_in_feature = dict(features[feature_name].value_counts().items())

        # To get total count of the ith feature present 
        total_count = 0
        for category,val in categories_in_feature.items():
          total_count += val

        # Insert new key with name total_count to hold the total_count
        categories_in_feature['total_count'] = total_count

        # To hold details of ith feature
        feature_with_counts[feature_name] = categories_in_feature
      
      # Count of class that appeared in data points after filteration 
      feature_with_counts["total_count"] = count

      # To hold with class_type which is present in Target.
      result[class_type] = feature_with_counts
    
    # Number of data points in the train_data set
    result["overall"] = train_data.shape[0]
    return result


  def __probability(self, dictionary,X,current_class):

    # Summation of P(X=xj/Y=ai)
    output = dictionary[current_class]['total_count']/dictionary['overall']  # 4/7

    # Features present in dictionary
    features = dictionary[current_class].keys()
    for feature in features:
      if feature=='total_count':
        continue
      
      # Value present in testing data for selected feature
      feature_category = X[feature].iloc[0]

      # For selected value total count present, default 1 for laplace correction
      total_count_category = dictionary[current_class][feature].get(feature_category,1)

      # Calculation for the  P(X=x_jth/Y=a_ith)
      cal = total_count_category/(dictionary[current_class]['total_count']+len(dictionary[current_class][feature]))
      output = output*cal

    return output

  def __predictSinglePoint(self, dictionary,X):

    # Target classes 
    classes = dictionary.keys()

    # To pick best probability and best target class
    best_p = -1000
    best_class = -1
    first_run = True
    for current_class in classes:
      if current_class=='overall':
        continue
      
      # Get the probability based on the target class
      p_current_class = self.__probability(dictionary,X,current_class)

      # Update if p_current_class is greater than the best_p
      if(first_run or p_current_class>best_p):
        best_p = p_current_class
        best_class = current_class

      # Once best_p and best_class updated make it false
      first_run = False
    return best_class
  
  def predict(self,dictionary,X_test,columns):

    # Store results
    result = []

    # Iterate over all test data points
    for x in X_test:
      x = pd.DataFrame([x],columns=columns)
      class_name = self.__predictSinglePoint(dictionary,x)
      result.append(class_name)
    
    return result

  



      

In [241]:
data = [["Sunny","High","Yes"],
        ["Sunny","Low","Yes"],
        ["Overcast","Medium","No"],
        ["Rainy","High","No"],
        ["Overcast","High","Dont Know"],
        ["Rainy","Medium","No"],
        ["Overcast","Medium","No"],
        ["Sunny","High","Yes"],
        ["Sunny","Medium","Yes"],
        ["Rainy","High","No"],
        ["Overcast","Low","Yes"],
        ["Overcast","Medium","Yes"],
        ["Rainy","Low","Yes"]
        ]

data = pd.DataFrame(data, columns=["Outlook","Wind","target"])
naive_classifier = NaiveBayesClassifier()
result = naive_classifier.train_classifier(data)

print(result)

# Correctness check
print(result['Dont Know']['Outlook']['total_count'])

{'Yes': {'Outlook': {'Sunny': 4, 'Overcast': 2, 'Rainy': 1, 'total_count': 7}, 'Wind': {'Low': 3, 'Medium': 2, 'High': 2, 'total_count': 7}, 'total_count': 7}, 'No': {'Outlook': {'Rainy': 3, 'Overcast': 2, 'total_count': 5}, 'Wind': {'Medium': 3, 'High': 2, 'total_count': 5}, 'total_count': 5}, 'Dont Know': {'Outlook': {'Overcast': 1, 'total_count': 1}, 'Wind': {'High': 1, 'total_count': 1}, 'total_count': 1}, 'overall': 13}
1


In [0]:
data = [["Sunny","High"],
        ["Sunny","Low"],
        ["Overcast","Medium"],
        ["Rainy","High"],
        ["Rainy","Low"],
        ["Overcast","Low"],
        ["Rainy","Medium"],
        ["Sunny","Medium"],
        ]
# ['Yes', 'Yes', 'No', 'No', 'No', 'No', 'No']


predicted = naive_classifier.predict(result,data,["Outlook","Wind"])

In [243]:
print(predicted)

['Yes', 'Yes', 'No', 'No', 'No', 'Yes', 'No', 'Yes']
