In [16]:
import math
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from matplotlib import pyplot as plt
from google.colab import drive

drive.mount('/content/drive')

GOOGLE_DRIVE_PATH_AFTER_MYDRIVE = 'academic/graduate/courses/1. fall 2023/cs760/hw4/'
GOOGLE_DRIVE_PATH = os.path.join('drive', 'My Drive', GOOGLE_DRIVE_PATH_AFTER_MYDRIVE)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [17]:
# training

# get a dict of 27 chars and their counts based on language
def get_char_count(language_char):
  char_count = {}
  total = 0
  for i in range(10):

    # from https://www.geeksforgeeks.org/python-program-to-read-character-by-character-from-a-file/
    file = open(GOOGLE_DRIVE_PATH + "languageID/" + language_char + str(i) + ".txt", 'r')

    while 1:

        # read by character
        char = file.read(1)
        if not char:
            break
        if char == "\n":
          continue
        if char not in char_count:
          char_count[char] = 1
          total += 1
        else:
          char_count[char] += 1
          total += 1

    if language_char == "j":
        char_count["x"] = 0
    file.close()

  char_count_sorted = sorted(char_count.items())

  space = char_count_sorted[0]
  char_count_sorted = char_count_sorted[1:]
  char_count_sorted.append(space)
  return char_count_sorted, total


# for English
english_char_count, english_total = get_char_count("e")

# for Japanese
japanese_char_count, japanese_total = get_char_count("j")

# for Spanish
spanish_char_count, spanish_total = get_char_count("s")

### Part 1

In [18]:
total = 60
n = 27
alpha = 1/2

priors = {"e": math.log((n+alpha)/(total + alpha*total)),
          "j" : math.log((n+alpha)/(total + alpha*total)),
          "s" : math.log((n+alpha)/(total + alpha*total))}

priors

{'e': -1.1856236656577395, 'j': -1.1856236656577395, 's': -1.1856236656577395}

### Part 2

In [19]:
english_class_conditional_probabilities = {}
for idx in range(len(english_char_count)):
  english_class_conditional_probabilities[english_char_count[idx][0]] = (english_char_count[idx][1] + alpha) / (english_total + alpha * n)
english_class_conditional_probabilities

{'a': 0.0601685114819098,
 'b': 0.011134974392863043,
 'c': 0.021509995043779945,
 'd': 0.021972575582355856,
 'e': 0.1053692383941847,
 'f': 0.018932760614571286,
 'g': 0.017478936064761277,
 'h': 0.047216256401784236,
 'i': 0.055410540227986124,
 'j': 0.001420783082768875,
 'k': 0.0037336857756484387,
 'l': 0.028977366595076822,
 'm': 0.020518751032545846,
 'n': 0.057921691723112505,
 'o': 0.06446390219725756,
 'p': 0.01675202378985627,
 'q': 0.0005617049396993227,
 'r': 0.053824549810011564,
 's': 0.06618205848339666,
 't': 0.08012555757475633,
 'u': 0.026664463902197257,
 'v': 0.009284652238559392,
 'w': 0.015496448042293078,
 'x': 0.001156451346439782,
 'y': 0.013844374690236246,
 'z': 0.0006277878737815959,
 ' ': 0.1792499586981662}

### Part 3

In [20]:
japanese_class_conditional_probabilities = {}
for idx in range(len(japanese_char_count)):
  japanese_class_conditional_probabilities[japanese_char_count[idx][0]] = (japanese_char_count[idx][1] + alpha) / (japanese_total + alpha * n)
japanese_class_conditional_probabilities

{'a': 0.1317656102589189,
 'b': 0.010866906600510151,
 'c': 0.005485866033054963,
 'd': 0.01722631818022992,
 'e': 0.06020475907613823,
 'f': 0.003878542227191726,
 'g': 0.014011670568503443,
 'h': 0.03176211607673224,
 'i': 0.09703343932352633,
 'j': 0.0023411020650616725,
 'k': 0.05740941332681086,
 'l': 0.001432614696530277,
 'm': 0.03979873510604843,
 'n': 0.05671057688947902,
 'o': 0.09116321324993885,
 'p': 0.0008735455466648031,
 'q': 0.00010482546559977637,
 'r': 0.04280373178657535,
 's': 0.0421747789929767,
 't': 0.056990111464411755,
 'u': 0.07061742199238269,
 'v': 0.0002445927530661449,
 'w': 0.01974212935462455,
 'x': 3.4941821866592126e-05,
 'y': 0.01415143785596981,
 'z': 0.00772214263251686,
 ' ': 0.12344945665466997}

In [21]:
spanish_class_conditional_probabilities = {}
for idx in range(len(spanish_char_count)):
  spanish_class_conditional_probabilities[spanish_char_count[idx][0]] = (spanish_char_count[idx][1] + alpha) / (spanish_total + alpha * n)
spanish_class_conditional_probabilities

{'a': 0.10456045141993771,
 'b': 0.008232863618143134,
 'c': 0.03752582405722919,
 'd': 0.039745922111559924,
 'e': 0.1138108599796491,
 'f': 0.00860287996053159,
 'g': 0.0071844839813758445,
 'h': 0.0045327001942585795,
 'i': 0.049859702136844375,
 'j': 0.006629459467793161,
 'k': 0.0002775122567913416,
 'l': 0.052943171656748174,
 'm': 0.02580863988159477,
 'n': 0.054176559464709693,
 'o': 0.07249236841293824,
 'p': 0.02426690512164287,
 'q': 0.007677839104560451,
 'r': 0.05929511886774999,
 's': 0.06577040485954797,
 't': 0.03561407295488884,
 'u': 0.03370232185254849,
 'v': 0.00588942678301625,
 'w': 9.250408559711388e-05,
 'x': 0.0024976103111220747,
 'y': 0.007862847275754679,
 'z': 0.0026826184823163022,
 ' ': 0.16826493170115014}

### Part 4

In [22]:
char_count = {}
total = 0
file = open(GOOGLE_DRIVE_PATH + "languageID/" + "e10" + ".txt", 'r')

while 1:

    # read by character
    char = file.read(1)
    if not char:
        break
    if char == "\n":
      continue
    if char not in char_count:
      char_count[char] = 1
      total += 1
    else:
      char_count[char] += 1
      total += 1

file.close()

char_count_sorted = sorted(char_count.items())

space = char_count_sorted[0]
char_count_sorted = char_count_sorted[1:]
char_count_sorted.append(space)


char_count_sorted

[('a', 164),
 ('b', 32),
 ('c', 53),
 ('d', 57),
 ('e', 311),
 ('f', 55),
 ('g', 51),
 ('h', 140),
 ('i', 140),
 ('j', 3),
 ('k', 6),
 ('l', 85),
 ('m', 64),
 ('n', 139),
 ('o', 182),
 ('p', 53),
 ('q', 3),
 ('r', 141),
 ('s', 186),
 ('t', 225),
 ('u', 65),
 ('v', 31),
 ('w', 47),
 ('x', 4),
 ('y', 38),
 ('z', 2),
 (' ', 498)]

### Part 5

In [23]:
# english

english_likelihood = 1

for char in english_class_conditional_probabilities:
  english_likelihood *= english_class_conditional_probabilities[char]
english_likelihood

6.315537467105295e-48

In [24]:
# japanese

japanese_likelihood = 1

for char in japanese_class_conditional_probabilities:
  japanese_likelihood *= japanese_class_conditional_probabilities[char]
japanese_likelihood

8.185787648530995e-53

In [25]:
# spanish

spanish_likelihood = 1

for char in spanish_class_conditional_probabilities:
  spanish_likelihood *= spanish_class_conditional_probabilities[char]
spanish_likelihood

8.487609517797253e-50

### Part 6

In [26]:
priors["e"]*english_likelihood

-7.487850682348176e-48

In [27]:
priors["j"]*japanese_likelihood

-9.705263558147166e-53

In [28]:
priors["s"]*spanish_likelihood

-1.0063110709162297e-49