In [1]:
import os
import string
import re
import numpy as np
import math

In [2]:
s = set(string.ascii_lowercase + " ")
langs = {"English":"e", "Japanese":"j", "Spanish":"s"}

In [3]:
# returns disctionary of chars and their counts 
def read_file(file_name):    
    data_file = open(os.getcwd()+"/languageID/"+file_name, "r")
    text = data_file.read()
    data_file.close()
    
    counts = {}
    for char in s:
        counts[char] = text.count(char)
    
    return dict(sorted(counts.items(), key=lambda x:x[1], reverse=True))

In [4]:
# return dictionary of dictionaries
# outer key: file_name
# inner key: char
# value: count
def get_training_data():
    counts_dict_dict = {}
    for file_name in sorted(os.listdir(os.getcwd()+"/languageID/")):
        if re.search("^(e|j|s)[0-9](?!\d)", file_name) is not None:
            counts_dict_dict[file_name] = read_file(file_name)
            
    return counts_dict_dict

In [5]:
# returns the prior probability of a language
# default additive smoothing parameter 1/2
def get_prior_prob(training_data, lang, a=0.5):
    count = 0
    for file_name in training_data:
        if re.search("^"+lang, file_name) is not None:
            count += 1
            
    return (count + a)/(len(training_data) + a*len(langs))

In [6]:
# FIXME?? smoothing is messing up p(x)?

# returns dictionary of chars and their corresponding probabilities in training data given a language
# normalize sets the largest prob to 1 if set to true, default false
def get_class_cond_probs(training_data, lang="all", a=0.5, normalize=False):
    
    # initialize return variable
    class_cond_probs = {}
    for char in s:
        class_cond_probs[char] = 0
    
    # getting the counts conditioned on language
    if not lang == "all": 
        for file_name in training_data:
            if re.search("^"+lang, file_name) is not None:
                for char in s:
                    class_cond_probs[char] += training_data[file_name][char]
    # getting the counts not conditioned on language
    else:
        for file_name in training_data:
            for char in s:
                class_cond_probs[char] += training_data[file_name][char]
    
    # getting the probs with smoothing
    tot_chars = 0
    for char in class_cond_probs:
        tot_chars += class_cond_probs[char]
    
    for char in class_cond_probs:
        class_cond_probs[char] = (class_cond_probs[char] + a)/(tot_chars + a*len(s))
    
    # sets the largest prob to 1 if set to true, default false
    if normalize:
        max_prob = max(class_cond_probs.values())
        for char in class_cond_probs:
            class_cond_probs[char] *= (1/max_prob)
    
    return dict(sorted(class_cond_probs.items(), key=lambda x:x[1], reverse=True))

In [7]:
def get_probs_x(test_data, a=0.5):
    probs = {}
    for char in s:
        probs[char] = test_data[char]
        
    tot_chars = 0
    for char in probs:
        tot_chars += probs[char]
        
    for char in probs:
#         probs[char] = (probs[char] + a)/(tot_chars + a*len(s))
        probs[char] /= tot_chars 
        
    return dict(sorted(probs.items(), key=lambda x:x[1], reverse=True))

In [8]:
# returns the log probability of count-vextor x given conditional probability vector theta
def p_of_x_given_y(x, theta):
    prob = 0
    for char in s:
        prob += x[char] * np.log(theta[char])
    
    return prob

## Question 2.1

In [9]:
training_data = get_training_data()
for lang in langs:
    print(lang + ": " + str(get_prior_prob(training_data, langs[lang])))

English: 0.3333333333333333
Japanese: 0.3333333333333333
Spanish: 0.3333333333333333


## Question 2.2

In [10]:
training_data = get_training_data()
class_cond_probs = get_class_cond_probs(training_data, lang="e")

print("English Class Conditional Probabilities:")
for item in class_cond_probs.items():
    print(item)

English Class Conditional Probabilities:
(' ', 0.1792499586981662)
('e', 0.1053692383941847)
('t', 0.08012555757475633)
('s', 0.06618205848339666)
('o', 0.06446390219725756)
('a', 0.0601685114819098)
('n', 0.057921691723112505)
('i', 0.055410540227986124)
('r', 0.053824549810011564)
('h', 0.047216256401784236)
('l', 0.028977366595076822)
('u', 0.026664463902197257)
('d', 0.021972575582355856)
('c', 0.021509995043779945)
('m', 0.020518751032545846)
('f', 0.018932760614571286)
('g', 0.017478936064761277)
('p', 0.01675202378985627)
('w', 0.015496448042293078)
('y', 0.013844374690236246)
('b', 0.011134974392863043)
('v', 0.009284652238559392)
('k', 0.0037336857756484387)
('j', 0.001420783082768875)
('x', 0.001156451346439782)
('z', 0.0006277878737815959)
('q', 0.0005617049396993227)


## Question 2.3

In [11]:
training_data = get_training_data()
class_cond_probs = get_class_cond_probs(training_data, lang="j")

print("Japanese Class Conditional Probabilities:")
for item in class_cond_probs.items():
    print(item)

Japanese Class Conditional Probabilities:
('a', 0.1317656102589189)
(' ', 0.12344945665466997)
('i', 0.09703343932352633)
('o', 0.09116321324993885)
('u', 0.07061742199238269)
('e', 0.06020475907613823)
('k', 0.05740941332681086)
('t', 0.056990111464411755)
('n', 0.05671057688947902)
('r', 0.04280373178657535)
('s', 0.0421747789929767)
('m', 0.03979873510604843)
('h', 0.03176211607673224)
('w', 0.01974212935462455)
('d', 0.01722631818022992)
('y', 0.01415143785596981)
('g', 0.014011670568503443)
('b', 0.010866906600510151)
('z', 0.00772214263251686)
('c', 0.005485866033054963)
('f', 0.003878542227191726)
('j', 0.0023411020650616725)
('l', 0.001432614696530277)
('p', 0.0008735455466648031)
('v', 0.0002445927530661449)
('q', 0.00010482546559977637)
('x', 3.4941821866592126e-05)


In [12]:
training_data = get_training_data()
class_cond_probs = get_class_cond_probs(training_data, lang="s")

print("Spanish Class Conditional Probabilities:")
for item in class_cond_probs.items():
    print(item)

Spanish Class Conditional Probabilities:
(' ', 0.16826493170115014)
('e', 0.1138108599796491)
('a', 0.10456045141993771)
('o', 0.07249236841293824)
('s', 0.06577040485954797)
('r', 0.05929511886774999)
('n', 0.054176559464709693)
('l', 0.052943171656748174)
('i', 0.049859702136844375)
('d', 0.039745922111559924)
('c', 0.03752582405722919)
('t', 0.03561407295488884)
('u', 0.03370232185254849)
('m', 0.02580863988159477)
('p', 0.02426690512164287)
('f', 0.00860287996053159)
('b', 0.008232863618143134)
('y', 0.007862847275754679)
('q', 0.007677839104560451)
('g', 0.0071844839813758445)
('j', 0.006629459467793161)
('v', 0.00588942678301625)
('h', 0.0045327001942585795)
('z', 0.0026826184823163022)
('x', 0.0024976103111220747)
('k', 0.0002775122567913416)
('w', 9.250408559711388e-05)


## Question 2.4

In [13]:
x = read_file("e10.txt")
for item in x.items():
    print(item)

(' ', 498)
('e', 311)
('t', 225)
('s', 186)
('o', 182)
('a', 164)
('r', 141)
('h', 140)
('i', 140)
('n', 139)
('l', 85)
('u', 65)
('m', 64)
('d', 57)
('f', 55)
('c', 53)
('p', 53)
('g', 51)
('w', 47)
('y', 38)
('b', 32)
('v', 31)
('k', 6)
('x', 4)
('j', 3)
('q', 3)
('z', 2)


## Question 2.5

In [14]:
training_data = get_training_data()
theta_e = get_class_cond_probs(training_data, lang="e")
theta_j = get_class_cond_probs(training_data, lang="j")
theta_s = get_class_cond_probs(training_data, lang="s")
x = read_file("e10.txt")

print("Log Likelihood - English:  " + str(p_of_x_given_y(x, theta_e)))
print("Log Likelihood - Japanese: " + str(p_of_x_given_y(x, theta_j)))
print("Log Likelihood - Spanish:  " + str(p_of_x_given_y(x, theta_s)))

Log Likelihood - English:  -7841.865447060634
Log Likelihood - Japanese: -8771.433079075034
Log Likelihood - Spanish:  -8467.282044010559


## Question 2.6

In [15]:
# TODO: FIXME(P(x) very far from P(x|y))
def get_predicted_prob(training_data, test_data, lang):
    theta_lang = get_class_cond_probs(training_data, lang=lang)
    theta_all = get_class_cond_probs(training_data, lang="all")
#     theta_all = get_probs_x(test_data)     used to be called theta_x
    x = get_probs_x(test_data)
    
#     print(get_prior_prob(training_data, lang))
#     print(p_of_x_given_y(test_data, theta_lang))
#     print(p_of_x_given_y(test_data, theta_x))
    
#     return get_prior_prob(training_data, lang) 
#             * math.e**(p_of_x_given_y(get_probs_x(test_data), theta_lang) 
#                      - p_of_x_given_y(get_probs_x(test_data), theta_all))
    


    return math.e**(p_of_x_given_y(x, theta_lang)
                    + np.log(get_prior_prob(training_data, lang))
                    - p_of_x_given_y(x, theta_all))

In [16]:
training_data = get_training_data()
test_data = read_file("e10.txt")

for lang in langs:
    print(lang + ": " + str(get_predicted_prob(training_data, test_data, langs[lang])))

English: 0.3530348267904746
Japanese: 0.2525444782475066
Spanish: 0.2817982420844059


In [17]:
def make_prediction(training_data, test_data):
    
    probs = {}
    for lang in langs:
        theta = get_class_cond_probs(training_data, langs[lang])
#         probs[lang] = p_of_x_given_y(test_data, theta)
        probs[lang] = get_predicted_prob(training_data, test_data, langs[lang])
    
#     print("\n" + str(sum(probs.values())))
    return max(probs, key=probs.get)

In [18]:
training_data = get_training_data()
test_data = read_file("e10.txt")

print(make_prediction(training_data, test_data))

English


## Question 2.7

In [19]:
training_data = get_training_data()
for file_name in sorted(os.listdir(os.getcwd()+"/languageID/")):
    test_data = read_file(file_name)
    
    print(list(langs.keys()) [list(langs.values()).index(file_name[0])] + ": " +
         make_prediction(training_data, get_probs_x(test_data)))

English: English
English: English
English: English
English: English
English: English
English: English
English: English
English: English
English: English
English: English
English: English
English: English
English: English
English: English
English: English
English: English
English: English
English: English
English: English
English: English
Japanese: Japanese
Japanese: Japanese
Japanese: Japanese
Japanese: Japanese
Japanese: Japanese
Japanese: Japanese
Japanese: Japanese
Japanese: Japanese
Japanese: Japanese
Japanese: Japanese
Japanese: Japanese
Japanese: Japanese
Japanese: Japanese
Japanese: Japanese
Japanese: Japanese
Japanese: Japanese
Japanese: Japanese
Japanese: Japanese
Japanese: Japanese
Japanese: Japanese
Spanish: Spanish
Spanish: Spanish
Spanish: Spanish
Spanish: Spanish
Spanish: Spanish
Spanish: Spanish
Spanish: Spanish
Spanish: Spanish
Spanish: Spanish
Spanish: Spanish
Spanish: Spanish
Spanish: Spanish
Spanish: Spanish
Spanish: Spanish
Spanish: Spanish
Spanish: Spanish
Spanish:

## Question 2.8