In [1]:
import math
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from matplotlib import pyplot as plt
from google.colab import drive

# Mount Google Drive once and derive the data path from the same mount point,
# so file access does not depend on the kernel's current working directory.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

# Course folder relative to "My Drive". The trailing slash is intentional:
# code that extends this path by plain string concatenation relies on it.
GOOGLE_DRIVE_PATH_AFTER_MYDRIVE = 'academic/graduate/courses/1. fall 2023/cs760/hw4/'
GOOGLE_DRIVE_PATH = os.path.join(DRIVE_MOUNT_POINT, 'My Drive', GOOGLE_DRIVE_PATH_AFTER_MYDRIVE)

Mounted at /content/drive


In [94]:
class naive_bayes_classifier:
  """Character-level multinomial naive Bayes classifier for language ID.

  Three languages are modelled, keyed by the labels "e" (English),
  "j" (Japanese) and "s" (Spanish). Documents are read from
  ``<data_dir>/<stem>.txt``; priors, class-conditional probabilities and
  scores are all kept as natural logarithms.
  """

  # Vocabulary every class-conditional table is defined over: a-z, then space.
  ALPHABET = "abcdefghijklmnopqrstuvwxyz "
  # Class label -> prefix used for the attribute name and printed headings.
  LANGUAGE_NAMES = {"e": "english", "j": "japanese", "s": "spanish"}

  def __init__(self, total, n, alpha, K, data_dir=None):
    """Store the hyper-parameters and compute the smoothed log priors.

    Args:
      total: total number of training documents.
      n: number of training documents per class (the same for every class).
      alpha: additive smoothing parameter.
      K: number of classes.
      data_dir: directory holding the ``.txt`` documents. When None, the
        "languageID" folder under the module-level GOOGLE_DRIVE_PATH is used
        (resolved lazily, at read time).
    """
    # total number of datapoints
    self.total = total
    # datapoints per class
    self.n = n
    # smoothing parameter
    self.alpha = alpha
    # number of classes
    self.K = K
    # where the documents live (None -> GOOGLE_DRIVE_PATH/languageID)
    self.data_dir = data_dir
    # priors: every class has n documents, so they share one smoothed value
    log_prior = math.log((n + alpha) / (total + alpha * K))
    self.priors = {label: log_prior for label in self.LANGUAGE_NAMES}
    self.english_class_conditional_probabilities = dict()
    self.japanese_class_conditional_probabilities = dict()
    self.spanish_class_conditional_probabilities = dict()

  def _document_path(self, stem):
    """Return the path of the document called `stem` (without ".txt")."""
    directory = self.data_dir
    if directory is None:
      directory = os.path.join(GOOGLE_DRIVE_PATH, "languageID")
    return os.path.join(directory, stem + ".txt")

  def _table(self, label):
    """Return the log-probability dict that belongs to class `label`."""
    return getattr(self, self.LANGUAGE_NAMES[label] + "_class_conditional_probabilities")

  @staticmethod
  def _tally_file(path, counts):
    """Add every non-newline character of `path` to `counts`; return how many were added."""
    seen = 0
    # `with` guarantees the handle is closed even if reading fails.
    with open(path, 'r') as handle:
      for line in handle:
        for char in line:
          if char == "\n":
            continue
          counts[char] = counts.get(char, 0) + 1
          seen += 1
    return seen

  @staticmethod
  def _space_last(counts):
    """Return `counts` as (char, count) pairs sorted by char, with space moved to the end."""
    ordered = sorted(item for item in counts.items() if item[0] != " ")
    if " " in counts:
      ordered.append((" ", counts[" "]))
    return ordered

  def get_char_count(self, language_char, num_files=10):
    """Count characters over the training documents of one language.

    Reads ``<language_char>0.txt`` .. ``<language_char>{num_files-1}.txt``.
    Every character of ALPHABET is present in the result, with count 0 if it
    never occurs, so smoothing later assigns it a probability instead of the
    character being missing from the table.

    Args:
      language_char: class label used as the file-name prefix ("e", "j", "s").
      num_files: number of training documents to read, numbered from 0.

    Returns:
      (pairs, total): `pairs` is a list of (char, count) tuples sorted by
      character with space last; `total` is the number of characters counted
      (newlines excluded).
    """
    char_count = {char: 0 for char in self.ALPHABET}
    total = 0
    for i in range(num_files):
      total += self._tally_file(self._document_path(language_char + str(i)), char_count)
    return self._space_last(char_count), total

  def train(self, char_count):
    """Estimate and print the smoothed log class-conditional probabilities.

    For each language, log p(c | y) = log((count_c + alpha) /
    (total_chars + alpha * char_count)) is stored in the matching
    ``<language>_class_conditional_probabilities`` dict.

    Args:
      char_count: vocabulary size used in the smoothing denominator
        (27 for a-z plus space).

    Raises:
      OSError: if a training document cannot be opened.
      ValueError: if a probability is 0 (alpha == 0 with an unseen character).
    """
    # Read everything first so a missing file fails before any table changes.
    counts_by_label = {label: self.get_char_count(label) for label in self.LANGUAGE_NAMES}

    # class conditional probabilities
    # i.e. given y="English", how likely is "a"?
    for label, name in self.LANGUAGE_NAMES.items():
      pairs, language_total = counts_by_label[label]
      denominator = language_total + self.alpha * char_count
      table = self._table(label)
      table.clear()  # drop stale entries from an earlier training run
      for char, count in pairs:
        table[char] = math.log((count + self.alpha) / denominator)
      print(name + "_class_conditional_probabilities")
      print(table)

  def classify(self, file_name):
    """Score one document against every language and return the best label.

    Prints the log-likelihood for each language and then the three log
    posteriors (log prior + log-likelihood), in the order e, j, s.

    Args:
      file_name: document name without the ".txt" extension.

    Returns:
      The label ("e", "j" or "s") with the highest log posterior; ties go to
      the earliest label in that order.

    Raises:
      OSError: if the document cannot be opened.
      KeyError: if the document contains a character that has no trained
        probability (for example when train() has not been called).
    """
    # get bag of words representation: (char, count) for the characters present
    doc_counts = {}
    self._tally_file(self._document_path(file_name), doc_counts)
    # Same summation order as the training tables (sorted, space last).
    ordered = self._space_last(doc_counts)

    # now, calculate log p(x|y) for each y
    likelihoods = {}
    for label, name in self.LANGUAGE_NAMES.items():
      table = self._table(label)
      log_likelihood = 0
      for char, count in ordered:
        if char not in table:
          raise KeyError(
              "no trained probability for character {!r} under class {!r}; "
              "call train() first and keep documents within the trained vocabulary".format(char, label))
        log_likelihood += count * table[char]
      likelihoods[label] = log_likelihood
      print(name + "_likelihood")
      print(log_likelihood)
    print()

    # posteriors
    print("posteriors: e, j, s")
    posteriors = {}
    for label in self.LANGUAGE_NAMES:
      posteriors[label] = self.priors[label] + likelihoods[label]
      print(posteriors[label])

    return max(posteriors, key=posteriors.get)







In [95]:
# 30 training documents in total, 10 per language, additive smoothing of 1/2,
# and 3 candidate languages.
classifier = naive_bayes_classifier(total=30, n=10, alpha=1/2, K=3)
# 27 is the vocabulary size (a-z plus space) used in the smoothing denominator.
classifier.train(char_count=27)

english_class_conditional_probabilities
{'a': -2.8106061285981796, 'b': -4.497664277973426, 'c': -3.8392375661172182, 'd': -3.8179601676699337, 'e': -2.25028454131573, 'f': -3.966861491611048, 'g': -4.046758776467488, 'h': -3.05301703039592, 'i': -2.892985446502579, 'j': -6.556547092632226, 'k': -5.590359389613447, 'l': -3.5412402159536045, 'm': -3.8864161263923087, 'n': -2.848663323404553, 'o': -2.741649867702676, 'p': -4.089236204734605, 'q': -7.484533864269571, 'r': -2.9220255997237423, 's': -2.7153458726599693, 't': -2.524160404954204, 'u': -3.624423540055839, 'v': -4.679392538992042, 'w': -4.167144439879508, 'x': -6.7623991468363736, 'y': -4.27987628840365, 'z': -7.373308229159347, ' ': -1.7189740299171272}
japanese_class_conditional_probabilities
{'a': -2.0267306146406696, 'b': -4.522033199719792, 'c': -5.2055803065507185, 'd': -4.061316937856336, 'e': -2.810003875371097, 'f': -5.552295910586692, 'g': -4.267864684592458, 'h': -3.449481017721548, 'i': -2.332699624592227, 'j': -6.0

In [96]:
# Score the document "languageID/e10.txt" against the three language models.
classifier.classify(file_name="e10")

english_likelihood
-7841.865447060635
japanese_likelihood
-8771.433079075032
spanish_likelihood
-8467.282044010557

posteriors: e, j, s
-7842.964059349303
-8772.5316913637
-8468.380656299225
