Jaeseung Lee

## 0. Library

In [1]:
import os
import numpy as np
from math import log, exp
from sklearn.linear_model import LinearRegression
import sys
import pandas as pd

## 1. Import Data

### 1.1 Training Data

#### English

In [2]:
eng_input_path = "../Data/Input/LangId.train.English"

# load english data, preprocessed data and save it
with open(eng_input_path, "r") as eng_content:
    
    eng_preprocessed = []
    for i in eng_content.readlines(): # read text line by line 
        
        word = i.split() # split into letters
        # remove punctuation at the end of the sentence
        if word[-1] == '.' or word[-1]=='?' or word[-1]=='!':
            word = word[:-1]
        # add start and end of the letter for the bigram
        word = ['<s>']+ word + ['</s>']

        eng_preprocessed.append(word) # save it to the list


#### French

In [3]:
fre_input_path = "../Data/Input/LangId.train.French"
# load french data, preprocessed data and save it
with open(fre_input_path, "r") as fre_content:
    
    fre_preprocessed = []
    for i in fre_content.readlines(): # read text line by line 
        
        
        word = i.split() # split into letters
        
        # some empty list exists
        if len(word) > 0:
            
            # remove punctuation at the end of the sentence
            if word[-1] == '.' or word[-1]=='?' or word[-1]=='!':
                word = word[:-1]
                    
            # add start and end of the letter for the bigram
            word = ['<s>']+ word + ['</s>']

            fre_preprocessed.append(word) # save it to the list

#### Italian

In [4]:
it_input_path = "../Data/Input/LangId.train.Italian"
# load french data, preprocessed data and save it
with open(it_input_path, "r") as it_content:
    
    it_preprocessed = []
    for i in it_content.readlines(): # read text line by line 
        
        
        word = i.split() # split into letters
        if len(word) > 0:
                          
           # remove punctuation at the end of the sentence
            if word[-1] == '.' or word[-1]=='?' or word[-1]=='!':
                word = word[:-1]
                
            # add start and end of the letter for the bigram
            word = ['<s>']+ word + ['</s>']

            it_preprocessed.append(word) # save it to the list


### 1.2 Test Data

In [5]:
test_path = "../Data/Validation/LangId.test"
with open(test_path,"r") as test_content:
    
    test_preprocessed = []
    
    for i in test_content.readlines():
        word = i.split()
                
        # remove punctuation at the end of the sentence
        if word[-1] == '.' or word[-1]=='?' or word[-1]=='!':
            word = word[:-1]
            
        # add start and end of the letter for the bigram
        word = ['<s>']+ word + ['</s>']
        test_preprocessed.append(word)

## 2. Generate bigram model

In [6]:
def gen_bigram(data):
    
    bigram = {}
    
    for sent in data: # iterate sentence
        for word in range(0, len(sent) -1): # iterate word in sentence
            
            # if word already exists in a bigram dictionary
            if sent[word] in bigram.keys():
                # if the next word already exists in the previous words dict, add 1
                if sent[word+1] in bigram[sent[word]].keys():
                    bigram[sent[word]][sent[word+1]] += 1
                # if the next word not exists in the previous words dict, set as a key, and init the value to 1
                else:
                    bigram[sent[word]][sent[word+1]] = 1
            # make key - value in dictionary, and init the value to 1
            else: 
                bigram[sent[word]] = {}
                bigram[sent[word]][sent[word + 1]] = 1
#             break
#         break
    return bigram

In [7]:
eng_bigram = gen_bigram(eng_preprocessed)

In [8]:
fre_bigram = gen_bigram(fre_preprocessed)

In [9]:
it_bigram = gen_bigram(it_preprocessed)

In [10]:
def smooth_gt(model):
    
    # part 1 - setting up vars
    c_nc = {}
    
    # below two lists are for linear regression
    c = [] # contain c values, which is frequency value  
    nc = [] # frequency of frequency c, contain Nc values
    
    # part 2
    # model has nested-dictionary structure
    # to count all the Nc, we need to use for loop twice
    
    # Build "c_nc" dictionary for smoothing model
    for key in model.keys():

        # iterate sub-dictionary
        for sub_key in model[key].keys():

            count = model[key][sub_key]
            if count in c_nc.keys():
                c_nc[count] += 1
                
            else:
                c_nc[count] = 1

                
            if count in c:
                nc[c.index(count)] += 1
            else:
                c.append(count)
                nc.append(1)                

    # part 3 - Simple good-turing
    # In cases of Nc + 1 is zero or N_(c+1) does not exist, 
    # we have to estimate these value using linear regression
    # f(n) = a + b*log(n)
    log_nc = [log(y) for y in nc]
    c = np.array(c).reshape(-1,1)
    log_nc = np.array(log_nc)
    reg = LinearRegression().fit(c, log_nc)
    a = reg.intercept_
#     print("INTERCEPT",a)
    b = reg.coef_
#     print("COEF",b)
    
    # part 4 - Update model
    for key in model.keys():
        for sub_key in model[key].keys():
            count = model[key][sub_key]
            
            # if both Nc and Nc+1 are not 0, and 
            # if both (count+1) and (count) exist in c_nc dictionary,
            # we can update using the formula (c+1)*Nc+1/Nc
            if count+1 in c_nc.keys() and count in c_nc.keys():
                model[key][sub_key] = (count+1)*(c_nc[count+1])/c_nc[count]
                
            # when Nc+1 is zero, or either (count+1) or (count) not exist in c_nc dictionary,
            #  we can estime N_(c+1) using linear regression
            else:
#                 print("predict", exp(reg.predict([[count+1],])))
                model[key][sub_key] = (count+1)*(exp(reg.predict([[count+1],])))/c_nc[count]
                                                 
    # unseen case: N1/N                                               
    model["unseen"] = c_nc[1]/(len(model.keys())*len(model.keys())-sum(nc))
#     print("N", (len(model.keys())*len(model.keys())-sum(nc)))

    return model

In [11]:
eng_bigram_gt = smooth_gt(eng_bigram)
fre_bigram_gt = smooth_gt(fre_bigram)
it_bigram_gt = smooth_gt(it_bigram)

In [12]:
eng_bigram_gt["unseen"], fre_bigram_gt["unseen"], it_bigram_gt["unseen"]

(0.0005683761702188131, 0.0004082777949164141, 0.0003750029511272075)

## 3. Apply Model

### Calculate probability for each language 

In [13]:
# word_prev: word_(n-1), word_n: word_(n)
# We have to calculate this P(word_(n)| word_(n-1))

def calc_prob_gt_smooth(model,word_prev,word_n):
    
    # start with 1, not zero which prevent return probability 0
    count = 0
    # As we started with 1, we have to add the total number of word tokens
    total = 0
    
    # if there are word_(n-1) in the model,
    if word_prev in model.keys():
        
        # iterate model by keys and count the total frequency of word_prev
        for key in model[word_prev].keys():
            total += model[word_prev][key]
            
        # if there are word_n in the dictionary of word_(n-1)
        if word_n in model[word_prev].keys():
            count = model[word_prev][word_n]   
        else:
            return model["unseen"]
    else:
        return model["unseen"]
        
    # calcuate the probability 
    result = count/total
    return result

In [14]:
# word_prev: word_(n-1), word_n: word_(n)
# We have to calculate this P(word_(n)| word_(n-1))

def calc_prob(model,word_prev,word_n):
    
    # start with 1, not zero which prevent return probability 0
    count = 0
    # As we started with 1, we have to add the total number of word tokens
    total = 0
    
    # if there are word_(n-1) in the model,
    if word_prev in model.keys():
        
        # if there are word_n in the dictionary of word_(n-1)
        if word_n in model[word_prev].keys():
            count = model[word_prev][word_n]
        else:
            count = 0
        
        # iterate model by keys and count the total frequency of word_prev
        for key in model[word_prev].keys():
            total += model[word_prev][key]
    else:
        return 0
    
    result = count/total
    
    return result

## 4. Check Accuracy

### Experiment 1: with smoothing

In [15]:
output_path = "../Data/Output/wordLangId2.out"

with open(output_path, "w+") as result:
    # iterate test data by sentence
    for idx, sent in enumerate(test_preprocessed):

        prob_dict = {"English": 0, "French" : 0, "Italian" : 0}
        # iterate words in sentence
        for word in range(0, len(sent) -1):
            # apply bigram model for english and calcuate probability
            prob_dict["English"] += calc_prob_gt_smooth(eng_bigram_gt,sent[word], sent[word+1])
            # apply bigram model for french and calcuate probability
            prob_dict["French"] += calc_prob_gt_smooth(fre_bigram_gt,sent[word], sent[word+1])
            # apply bigram model for italian and calcuate probability
            prob_dict["Italian"] += calc_prob_gt_smooth(it_bigram_gt,sent[word], sent[word+1])
            
        # compare probability and extract language with the high probability
        lang = max(prob_dict, key=prob_dict.get)
        result.write(str(idx+1) + " " + lang + "\n")

In [16]:
import pandas as pd

result_path = "../Data/Output/wordLangId2.out"
with open(result_path, "r") as result:
    result_list = []
    for i in result.readlines():
        result_list.append(i.split()[1])
        

In [17]:
ans_path = "../Data/Validation/labels.sol"
with open(ans_path, "r") as ans:
    ans_list = []
    for i in ans.readlines():
        ans_list.append(i.split()[1])

In [18]:
data = {"model": result_list, "answer":ans_list}
df = pd.DataFrame(data)
df

Unnamed: 0,model,answer
0,Italian,Italian
1,English,English
2,Italian,Italian
3,French,French
4,French,French
...,...,...
295,French,French
296,Italian,Italian
297,Italian,Italian
298,French,French


In [19]:
def check_match(model,answer):
    if model == answer:
        result = 1
    else:
        result = 0
    return result

In [20]:
df["match"] = df[["model", "answer"]].apply(lambda x: check_match(x[0],x[1]), axis =1)

In [21]:
acc = sum(df.match)/len(df)
print(acc)

0.9866666666666667


### Experiment 2: without smoothing

In [22]:
output_path = "../Data/Output/wordLangId2_optional.out"

with open(output_path, "w+") as result:
    # iterate test data by sentence
    for idx, sent in enumerate(test_preprocessed):

        prob_dict = {"English": 0, "French" : 0, "Italian" : 0}
        # iterate words in sentence
        for word in range(0, len(sent) -1):
            # apply bigram model for english and calcuate probability
            prob_dict["English"] += calc_prob(eng_bigram,sent[word], sent[word+1])
            # apply bigram model for french and calcuate probability
            prob_dict["French"] += calc_prob(fre_bigram,sent[word], sent[word+1])
            # apply bigram model for italian and calcuate probability
            prob_dict["Italian"] += calc_prob(it_bigram,sent[word], sent[word+1])
            
        # compare probability and extract language with the high probability
        lang = max(prob_dict, key=prob_dict.get)
        result.write(str(idx+1) + " " + lang + "\n")

In [23]:

result_path = "../Data/Output/wordLangId2_optional.out"
with open(result_path, "r") as result:
    result_list = []
    for i in result.readlines():
        result_list.append(i.split()[1])
        

In [24]:
ans_path = "../Data/Validation/labels.sol"
with open(ans_path, "r") as ans:
    ans_list = []
    for i in ans.readlines():
        ans_list.append(i.split()[1])

In [25]:
data_opt = {"model": result_list, "answer":ans_list}
df_opt = pd.DataFrame(data_opt)
df_opt

Unnamed: 0,model,answer
0,Italian,Italian
1,English,English
2,Italian,Italian
3,French,French
4,French,French
...,...,...
295,French,French
296,Italian,Italian
297,Italian,Italian
298,French,French


In [26]:
def check_match(model,answer):
    if model == answer:
        result = 1
    else:
        result = 0
    return result

In [27]:
df_opt["match"] = df_opt[["model", "answer"]].apply(lambda x: check_match(x[0],x[1]), axis =1)

In [28]:
acc = sum(df_opt.match)/len(df_opt)
print(acc)

0.99


When we use word bigram model, as you see without applying GT smoothing is better than applying the smoothing.