In [1]:
import os
import string
import re
import numpy as np
import math
import random

In [2]:
# Alphabet shared by the helper functions below: the 26 lowercase ASCII
# letters plus the space character.
s = set(string.ascii_lowercase + " ")
# Language name -> one-letter code. The code is matched against the start of
# each data file name to decide which language the file belongs to.
langs = {"English":"e", "Japanese":"j", "Spanish":"s"}

In [3]:
def read_file(file_name):
    """Count the alphabet characters in one file of the languageID folder.

    Parameters
    ----------
    file_name : str
        Name of a file inside ``<cwd>/languageID``.

    Returns
    -------
    dict
        Maps every character of the global alphabet ``s`` to the number of
        times it occurs in the file, ordered by descending count.
    """
    path = os.path.join(os.getcwd(), "languageID", file_name)
    # The context manager closes the file even if read() raises.
    with open(path, "r") as data_file:
        text = data_file.read()

    counts = {char: text.count(char) for char in s}

    return dict(sorted(counts.items(), key=lambda x: x[1], reverse=True))

In [4]:
def get_training_data():
    """Load the character counts of every training file.

    A training file is any entry of ``<cwd>/languageID`` whose name starts
    with ``e``, ``j`` or ``s`` followed by a single digit that is not followed
    by another digit.

    Returns
    -------
    dict
        Dictionary of dictionaries: the outer key is the file name, the inner
        key is a character and the value is that character's count (as
        returned by ``read_file``).
    """
    # Raw string: "\d" in a plain string literal is an invalid escape sequence.
    training_name = r"^(e|j|s)[0-9](?!\d)"

    counts_dict_dict = {}
    for file_name in sorted(os.listdir(os.getcwd()+"/languageID/")):
        if re.search(training_name, file_name) is not None:
            counts_dict_dict[file_name] = read_file(file_name)

    return counts_dict_dict

In [5]:
def get_prior_prob(training_data, lang, a=0.5):
    """Return the smoothed prior probability of a language.

    The prior is ``(n + a) / (N + a * K)`` where ``n`` is the number of
    training file names starting with ``lang``, ``N`` is the number of
    training files, ``K`` is the number of entries in the global ``langs``
    and ``a`` is the additive smoothing parameter (default 1/2).
    """
    n_matching = sum(
        1
        for file_name in training_data
        if re.search("^"+lang, file_name) is not None
    )

    numerator = n_matching + a
    denominator = len(training_data) + a*len(langs)
    return numerator/denominator

In [6]:
def get_class_cond_probs(training_data, lang="all", a=0.5):
    """Return smoothed character probabilities estimated from training data.

    With ``lang="all"`` the counts of every training file are pooled;
    otherwise only files whose name starts with ``lang`` contribute.
    Each probability is ``(count + a) / (total + a * len(s))`` with additive
    smoothing parameter ``a`` (default 1/2). The result is a dict ordered by
    descending probability.
    """
    use_every_file = lang == "all"

    # accumulate the raw counts over the selected files
    totals = dict.fromkeys(s, 0)
    for file_name, file_counts in training_data.items():
        if use_every_file or re.search("^"+lang, file_name) is not None:
            for char in s:
                totals[char] += file_counts[char]

    # turn the counts into smoothed probabilities
    tot_chars = sum(totals.values())
    smoothed = {
        char: (count + a)/(tot_chars + a*len(s))
        for char, count in totals.items()
    }

    ranked = sorted(smoothed.items(), key=lambda pair: pair[1], reverse=True)
    return dict(ranked)

In [7]:
def get_probs_x(test_data, a=0.5):
    """Return the relative frequency of each alphabet character in test_data.

    ``test_data`` maps characters to counts. The result maps every character
    of the global ``s`` to ``count / total`` and is ordered by descending
    frequency. The parameter ``a`` is accepted but not used: no smoothing is
    applied here.
    """
    counts = {char: test_data[char] for char in s}
    tot_chars = sum(counts.values())

    freqs = {char: count / tot_chars for char, count in counts.items()}

    ranked = sorted(freqs.items(), key=lambda pair: pair[1], reverse=True)
    return dict(ranked)

In [8]:
def p_of_x_given_y(x, theta):
    """Return the log probability of count vector x given probabilities theta.

    Adds ``x[char] * log(theta[char])`` for every character of the global
    ``s``; no other term is included.
    """
    log_likelihood = 0
    for symbol in s:
        term = x[symbol] * np.log(theta[symbol])
        log_likelihood = log_likelihood + term

    return log_likelihood

## Question 2.1

In [9]:
# Smoothed prior of each language, estimated from the training files.
training_data = get_training_data()
for lang, code in langs.items():
    print(lang, get_prior_prob(training_data, code), sep=": ")

English: 0.3333333333333333
Japanese: 0.3333333333333333
Spanish: 0.3333333333333333


## Question 2.2

In [10]:
training_data = get_training_data()
class_cond_probs = get_class_cond_probs(training_data, lang="e")

# One (char, probability) tuple per line, in the order returned above.
print("English Class Conditional Probabilities:")
print(*class_cond_probs.items(), sep="\n")

English Class Conditional Probabilities:
(' ', 0.1792499586981662)
('e', 0.1053692383941847)
('t', 0.08012555757475633)
('s', 0.06618205848339666)
('o', 0.06446390219725756)
('a', 0.0601685114819098)
('n', 0.057921691723112505)
('i', 0.055410540227986124)
('r', 0.053824549810011564)
('h', 0.047216256401784236)
('l', 0.028977366595076822)
('u', 0.026664463902197257)
('d', 0.021972575582355856)
('c', 0.021509995043779945)
('m', 0.020518751032545846)
('f', 0.018932760614571286)
('g', 0.017478936064761277)
('p', 0.01675202378985627)
('w', 0.015496448042293078)
('y', 0.013844374690236246)
('b', 0.011134974392863043)
('v', 0.009284652238559392)
('k', 0.0037336857756484387)
('j', 0.001420783082768875)
('x', 0.001156451346439782)
('z', 0.0006277878737815959)
('q', 0.0005617049396993227)


## Question 2.3

In [11]:
training_data = get_training_data()
class_cond_probs = get_class_cond_probs(training_data, lang="j")

# One (char, probability) tuple per line, in the order returned above.
print("Japanese Class Conditional Probabilities:")
print("\n".join(str(pair) for pair in class_cond_probs.items()))

Japanese Class Conditional Probabilities:
('a', 0.1317656102589189)
(' ', 0.12344945665466997)
('i', 0.09703343932352633)
('o', 0.09116321324993885)
('u', 0.07061742199238269)
('e', 0.06020475907613823)
('k', 0.05740941332681086)
('t', 0.056990111464411755)
('n', 0.05671057688947902)
('r', 0.04280373178657535)
('s', 0.0421747789929767)
('m', 0.03979873510604843)
('h', 0.03176211607673224)
('w', 0.01974212935462455)
('d', 0.01722631818022992)
('y', 0.01415143785596981)
('g', 0.014011670568503443)
('b', 0.010866906600510151)
('z', 0.00772214263251686)
('c', 0.005485866033054963)
('f', 0.003878542227191726)
('j', 0.0023411020650616725)
('l', 0.001432614696530277)
('p', 0.0008735455466648031)
('v', 0.0002445927530661449)
('q', 0.00010482546559977637)
('x', 3.4941821866592126e-05)


In [12]:
training_data = get_training_data()
class_cond_probs = get_class_cond_probs(training_data, lang="s")

# One (char, probability) tuple per line, in the order returned above.
print("Spanish Class Conditional Probabilities:")
print(*class_cond_probs.items(), sep="\n")

Spanish Class Conditional Probabilities:
(' ', 0.16826493170115014)
('e', 0.1138108599796491)
('a', 0.10456045141993771)
('o', 0.07249236841293824)
('s', 0.06577040485954797)
('r', 0.05929511886774999)
('n', 0.054176559464709693)
('l', 0.052943171656748174)
('i', 0.049859702136844375)
('d', 0.039745922111559924)
('c', 0.03752582405722919)
('t', 0.03561407295488884)
('u', 0.03370232185254849)
('m', 0.02580863988159477)
('p', 0.02426690512164287)
('f', 0.00860287996053159)
('b', 0.008232863618143134)
('y', 0.007862847275754679)
('q', 0.007677839104560451)
('g', 0.0071844839813758445)
('j', 0.006629459467793161)
('v', 0.00588942678301625)
('h', 0.0045327001942585795)
('z', 0.0026826184823163022)
('x', 0.0024976103111220747)
('k', 0.0002775122567913416)
('w', 9.250408559711388e-05)


## Question 2.4

In [13]:
# Character counts of the test file e10.txt, one (char, count) tuple per line.
x = read_file("e10.txt")
print(*x.items(), sep="\n")

(' ', 498)
('e', 311)
('t', 225)
('s', 186)
('o', 182)
('a', 164)
('r', 141)
('h', 140)
('i', 140)
('n', 139)
('l', 85)
('u', 65)
('m', 64)
('d', 57)
('f', 55)
('p', 53)
('c', 53)
('g', 51)
('w', 47)
('y', 38)
('b', 32)
('v', 31)
('k', 6)
('x', 4)
('q', 3)
('j', 3)
('z', 2)


## Question 2.5

In [14]:
training_data = get_training_data()
x = read_file("e10.txt")

# Class-conditional character probabilities for each language.
theta_e, theta_j, theta_s = (
    get_class_cond_probs(training_data, lang=code) for code in ("e", "j", "s")
)

# Log-likelihood of e10.txt under each language model.
for label, theta in (
    ("Log Likelihood - English:  ", theta_e),
    ("Log Likelihood - Japanese: ", theta_j),
    ("Log Likelihood - Spanish:  ", theta_s),
):
    print(label + str(p_of_x_given_y(x, theta)))

Log Likelihood - English:  -7841.865447060635
Log Likelihood - Japanese: -8771.43307907503
Log Likelihood - Spanish:  -8467.282044010557


## Question 2.6

In [15]:
def get_predicted_prob(training_data, test_data, lang):
    """Return log(p(y|x) * p(x)) for language ``lang`` and counts ``test_data``.

    The value is computed as ``log p(x|y) + log p(y)``: the log-likelihood of
    the test counts under the language's class-conditional probabilities plus
    the log of the language's prior. It is not divided by p(x), so it is an
    unnormalized score rather than a probability.

    Parameters
    ----------
    training_data : dict
        File name -> character counts, as returned by ``get_training_data``.
    test_data : dict
        Character -> count for the document being scored.
    lang : str
        Language code used as the file-name prefix (e.g. ``"e"``).
    """
    theta_lang = get_class_cond_probs(training_data, lang=lang)
    log_prior = np.log(get_prior_prob(training_data, lang))

    return p_of_x_given_y(test_data, theta_lang) + log_prior

# Below returns actual probabilities, unnecessary for homework per Piazza

#     return (get_prior_prob(training_data, lang) 
#             * math.e**(p_of_x_given_y(test_data, theta_lang) 
#             - p_of_x_given_y(test_data, theta_all)))

#     return math.e**(p_of_x_given_y(x, theta_lang)
#                     + np.log(get_prior_prob(training_data, lang))
#                     - p_of_x_given_y(x, theta_all))

In [16]:
training_data = get_training_data()
test_data = read_file("e10.txt")

# Score of e10.txt under each language, as returned by get_predicted_prob.
for lang, code in langs.items():
    print(lang, get_predicted_prob(training_data, test_data, code), sep=": ")

English: -7842.964059349303
Japanese: -8772.531691363698
Spanish: -8468.380656299225


In [17]:
def make_prediction(training_data, test_data):
    """Return the name of the language with the highest predicted score.

    Scores every language in the global ``langs`` with ``get_predicted_prob``
    and returns the language name (e.g. ``"English"``) whose score is largest.
    On a tie, the language listed first in ``langs`` wins.
    """
    scores = {
        lang: get_predicted_prob(training_data, test_data, code)
        for lang, code in langs.items()
    }

    return max(scores, key=scores.get)

In [18]:
training_data = get_training_data()
test_data = read_file("e10.txt")

# Predicted language for e10.txt.
predicted_lang = make_prediction(training_data, test_data)
print(predicted_lang)

English


## Question 2.7

In [19]:
training_data = get_training_data()

# Reverse of `langs`: one-letter file prefix -> language name.
lang_by_code = {code: name for name, code in langs.items()}

# Test files: e/j/s followed by exactly two digits (no third digit).
# Raw string: "\d" in a plain string literal is an invalid escape sequence.
test_name = r"^(e|j|s)[0-9]{2}(?!\d)"

test_file_names = []
print("Actual: Predicted\n-----------------")
for file_name in sorted(os.listdir(os.getcwd()+"/languageID/")):
    if re.search(test_name, file_name) is not None:
        test_data = read_file(file_name)
        test_file_names.append(file_name)

        # The first character of the file name encodes the true language.
        print(lang_by_code[file_name[0]] + ": " +
              make_prediction(training_data, test_data))

print("\nFiles Tested: ")
for name in test_file_names:
    print(name)

Actual: Predicted
-----------------
English: English
English: English
English: English
English: English
English: English
English: English
English: English
English: English
English: English
English: English
Japanese: Japanese
Japanese: Japanese
Japanese: Japanese
Japanese: Japanese
Japanese: Japanese
Japanese: Japanese
Japanese: Japanese
Japanese: Japanese
Japanese: Japanese
Japanese: Japanese
Spanish: Spanish
Spanish: Spanish
Spanish: Spanish
Spanish: Spanish
Spanish: Spanish
Spanish: Spanish
Spanish: Spanish
Spanish: Spanish
Spanish: Spanish
Spanish: Spanish

Files Tested: 
e10.txt
e11.txt
e12.txt
e13.txt
e14.txt
e15.txt
e16.txt
e17.txt
e18.txt
e19.txt
j10.txt
j11.txt
j12.txt
j13.txt
j14.txt
j15.txt
j16.txt
j17.txt
j18.txt
j19.txt
s10.txt
s11.txt
s12.txt
s13.txt
s14.txt
s15.txt
s16.txt
s17.txt
s18.txt
s19.txt


## Question 2.8

### - Without Shuffling

In [20]:
file_name = "s11.txt"

# Read the raw text of the file. The context manager closes the file even if
# read() raises.
with open(os.getcwd()+"/languageID/"+file_name, "r") as data_file:
    text = data_file.read()

print(text)









cuales son los problemas

la depilacion de las cejas siempre debe seguir su linea natural hasta hacerlo impecable depilar demasiado las cejas es privar al rostro de personalidad mientras que no depilarlas supone lucir una mirada poco cuidada

son varias la razones por las que la piel de este area es mas susceptible a las arrugas la sequedad y la flacidez la piel del contorno de ojos tiene un espesor insignificante  mm mientras que la del resto del rostro posee mm y la del cuello mm presenta escasas glandulas sebaceas y las sudoriparas aunque numerosas resultan extremadamente pequenas

entre ceja y ceja







publicidad








In [21]:
training_data = get_training_data()

# Count how often each character of `s` occurs in `text`.
test_data = {char: text.count(char) for char in s}

print(make_prediction(training_data, test_data))

Spanish


### - With Shuffling

In [22]:
file_name = "s11.txt"
with open(os.getcwd()+"/languageID/"+file_name, "r") as data_file:
    text = data_file.read()

# Shuffle the characters of the text. The list gets its own name: `s` is the
# global alphabet read by the counting and probability helpers and must not
# be rebound here.
SHUFFLE_SEED = 0  # fixed seed so the shuffled text is reproducible on re-run
shuffled_chars = list(text)
random.Random(SHUFFLE_SEED).shuffle(shuffled_chars)
text = ''.join(shuffled_chars)

print(text)

lres eecraoosmpicu a
 os  taa aec le
p esdd  o ar   amylro amsaudisuleataayujesapsrs nscoleli lsri
e  
anu  
dpl eti
ncy sed
eaene e dpmzelimis lpqystgraell a
as de llp
zaupnlae sseildevonu
uietd eaaeepq
nseeor ca
rrmauu r nrsso seplisnsesundfpsesjsi eaeilda jrenneoesciessstst iarcssorsdirriastiuaoaasanan
smoeluax
oc  seie     rleiqq ilg
alnepmlam
adln ncassmaosc d
eaasponl 
aoaele  nae tenvir faed a cpeeba elaa s  
je eheltrcutlom dblias ss  s
atie  rn mnraejaoddr aeos 
ae el dc  aq  remneol irtc
 oroa eor
c a olsrs  a pdeaoeuemubhat  duiptu
 ranurcp  
padgc aem dodl
slstetml
ea e 
aaluabbl rnrnomlippriddrblue   oeanuqi uilga euca d


In [23]:
training_data = get_training_data()

# Count, for each character in `s`, its occurrences in the current `text`.
test_data = {char: text.count(char) for char in s}

predicted_lang = make_prediction(training_data, test_data)
print(predicted_lang)

Spanish
