Jaeseung Lee

## 0. Library

In [1]:
import os
import sys
import pandas as pd

## 1. Import Data

### 1.1 Import Training Data

#### English

In [2]:
eng_input_path = "../Data/Input/LangId.train.English"

# Load the English training text and turn every usable line into a list of
# single characters framed by <s> ... </s>, ready for bigram counting.
with open(eng_input_path, "r") as eng_content:

    eng_preprocessed = []
    for line in eng_content:  # read the text line by line

        letters = list(line)  # split the line into single characters

        # Skip blank / very short lines, exactly as the French and Italian
        # cells do. Without this guard the index checks below raise
        # IndexError on an empty list.
        if len(letters) <= 2:
            continue

        # drop the last two characters of the line (treated as trailing whitespace)
        letters = letters[:-2]

        # a leading "(" is dropped together with the character that follows it
        if letters[0] == "(":
            letters = letters[2:]
        # every later check tests `letters` first, because the stripping
        # steps may already have consumed the whole line
        if letters and letters[-1] == " ":
            letters = letters[:-1]

        # remove one punctuation mark at the end of the sentence
        if letters and letters[-1] in (".", "?", "!"):
            letters = letters[:-1]
            # sometimes a space sits between the last letter and the
            # punctuation mark, so remove it as well
            if letters and letters[-1] == " ":
                letters = letters[:-1]

        # add start-of-sentence and end-of-sentence markers for the bigram model
        letters = ["<s>"] + letters + ["</s>"]
        eng_preprocessed.append(letters)  # save it to the list


#### French

In [3]:
fre_input_path = "../Data/Input/LangId.train.French"
# Read the French training text and convert each usable line into a
# character sequence framed by <s> ... </s> markers.
with open(fre_input_path, "r") as fre_content:

    fre_preprocessed = []
    for raw_line in fre_content:  # one line of text per iteration

        chars = list(raw_line)  # individual characters of the line

        # the file contains some (near-)empty lines; ignore them
        if len(chars) <= 2:
            continue

        del chars[-2:]  # drop the two trailing characters (trailing whitespace)

        # a leading "(" goes away together with the character after it
        if chars[0] == "(":
            del chars[:2]
        if chars[-1] == " ":
            chars.pop()

        # strip one sentence-final punctuation mark ...
        if chars[-1] in (".", "?", "!"):
            chars.pop()
            # ... and the space that sometimes precedes it
            if chars[-1] == " ":
                chars.pop()

        # wrap the sentence in boundary markers for the bigram model and store it
        fre_preprocessed.append(["<s>", *chars, "</s>"])

#### Italian

In [4]:
it_input_path = "../Data/Input/LangId.train.Italian"
# Load the Italian training data, clean every line and keep it as a list of
# characters surrounded by sentence-boundary markers.
with open(it_input_path, "r") as it_content:

    it_preprocessed = []
    for text_line in it_content.readlines():  # walk through the file line by line

        symbols = list(text_line)  # characters of the current line
        if not len(symbols) > 2:
            # nothing usable on this line
            continue

        symbols = symbols[: len(symbols) - 2]  # cut the two trailing characters

        # drop a leading "(" plus the character right after it
        starts_with_paren = symbols[0] == "("
        if starts_with_paren:
            symbols = symbols[2:]

        if symbols[-1] == " ":
            symbols = symbols[:-1]

        # remove the punctuation mark that closes the sentence, and the space
        # that is sometimes left in front of it
        if symbols[-1] in {".", "?", "!"}:
            symbols = symbols[:-1]
            if symbols[-1] == " ":
                symbols = symbols[:-1]

        # add the start / end markers needed by the bigram model, then store
        framed = ["<s>"]
        framed.extend(symbols)
        framed.append("</s>")
        it_preprocessed.append(framed)


### 1.2 Test Data

In [5]:
test_path = "../Data/Validation/LangId.test"
# Load the test sentences and preprocess them like the training data
# (additionally stripping a trailing ";").
with open(test_path, "r") as test_content:

    test_preprocessed = []

    for line in test_content:
        letters = list(line)  # split the line into single characters

        # drop the last two characters of the line (treated as trailing whitespace)
        letters = letters[:-2]

        # Every index access is guarded with `letters and ...`: a blank or very
        # short line leaves an empty list here, and indexing it would raise
        # IndexError. Such a line is NOT skipped, so the position of each entry
        # in test_preprocessed keeps matching the line number in the input file.
        if letters and letters[0] == "(":
            letters = letters[2:]
        if letters and letters[-1] == " ":
            letters = letters[:-1]

        # remove one punctuation mark at the end of the sentence
        if letters and letters[-1] in (".", "?", "!", ";"):
            letters = letters[:-1]
            # sometimes a space sits between the last letter and the
            # punctuation mark, so remove it as well
            if letters and letters[-1] == " ":
                letters = letters[:-1]

        # add start-of-sentence and end-of-sentence markers for the bigram model
        letters = ["<s>"] + letters + ["</s>"]
        test_preprocessed.append(letters)

## 2. Generate bigram model

In [6]:
def gen_bigram(data):
    """Count how often each token is directly followed by each other token.

    Parameters
    ----------
    data : iterable of sequences
        Each element is one sentence given as a sliceable sequence of tokens
        (in this notebook: lists of characters plus boundary markers).

    Returns
    -------
    dict
        Nested mapping ``{token: {next_token: count}}``. A token that is never
        followed by anything (e.g. the last one of a sentence) only appears
        as an inner key.
    """
    bigram = {}

    for sent in data:
        # pair every token with its right-hand neighbour
        for current, following in zip(sent, sent[1:]):
            # fetch (or create on first sight) the successor table of `current`
            successors = bigram.setdefault(current, {})
            successors[following] = successors.get(following, 0) + 1

    return bigram

In [7]:
# bigram counts learned from the preprocessed English training sentences
eng_bigram = gen_bigram(eng_preprocessed)

In [8]:
# bigram counts learned from the preprocessed French training sentences
fre_bigram = gen_bigram(fre_preprocessed)

In [9]:
# bigram counts learned from the preprocessed Italian training sentences
it_bigram = gen_bigram(it_preprocessed)

## 3. Apply Model

### Calculate probability for each language 

In [10]:
# word_prev: word_(n-1), word_n: word_(n)
# The function estimates P(word_(n) | word_(n-1)).
def calc_prob(model, word_prev, word_n):
    """Return the relative frequency of ``word_n`` directly after ``word_prev``.

    Parameters
    ----------
    model : dict
        Nested bigram counts of the form ``{token: {next_token: count}}``.
    word_prev : hashable
        The preceding token, word_(n-1).
    word_n : hashable
        The token whose conditional probability is wanted, word_(n).

    Returns
    -------
    int or float
        ``count(word_prev, word_n) / count(word_prev, *)``. The result is 0
        when ``word_prev`` is not in the model, and also when its successor
        table is empty or sums to zero (previously a ZeroDivisionError).
    """
    successors = model.get(word_prev)

    # unknown previous token, or a token without any recorded successor
    if not successors:
        return 0

    # total number of times word_prev was followed by anything
    total = sum(successors.values())
    if total == 0:
        return 0

    # number of times word_n followed word_prev (0 if that pair was never seen)
    count = successors.get(word_n, 0)

    return count / total

In [11]:
output_path = "../Data/Output/letterLangId.out"

# One bigram count table per candidate language. The insertion order also
# decides ties, because max() keeps the first best key it meets.
language_models = {"English": eng_bigram, "French": fre_bigram, "Italian": it_bigram}

with open(output_path, "w+") as result:
    # score every test sentence against each language model
    for line_no, sent in enumerate(test_preprocessed, start=1):

        prob_dict = dict.fromkeys(language_models, 0)

        # walk over all adjacent token pairs of the sentence
        for pos in range(len(sent) - 1):
            prev_tok, next_tok = sent[pos], sent[pos + 1]
            # NOTE(review): the per-bigram probabilities are added up, not
            # multiplied (or summed as logs), so the score is not a sentence
            # likelihood. Kept as is to preserve the reported results.
            for language, model in language_models.items():
                prob_dict[language] += calc_prob(model, prev_tok, next_tok)

        # pick the language with the highest accumulated score and write
        # "<line number> <language>" for this sentence
        lang = max(prob_dict, key=prob_dict.get)
        result.write(f"{line_no} {lang}\n")

## 4. Check Accuracy

In [12]:
result_path = "../Data/Output/letterLangId.out"
with open(result_path, "r") as result:
    # each line reads "<number> <language>"; keep only the second field
    result_list = [row.split()[1] for row in result]
        

In [13]:
ans_path = "../Data/Validation/labels.sol"
with open(ans_path, "r") as ans:
    ans_list = []
    for solution_line in ans:
        # the second whitespace-separated field of a line is the label
        fields = solution_line.split()
        ans_list.append(fields[1])

In [14]:
# put predicted and reference labels side by side, one row per output line
data = dict(model=result_list, answer=ans_list)
df = pd.DataFrame(data)
df

Unnamed: 0,model,answer
0,Italian,Italian
1,English,English
2,Italian,Italian
3,French,French
4,French,French
...,...,...
295,French,French
296,Italian,Italian
297,Italian,Italian
298,French,French


In [15]:
def check_match(model, answer):
    """Return 1 when the predicted label equals the reference label, else 0."""
    return 1 if model == answer else 0

In [16]:
# Flag every row with 1 when prediction and reference agree, 0 otherwise.
# The row values are addressed by column label: integer keys such as x[0] on
# a label-indexed row rely on a positional fallback that pandas has deprecated.
df["match"] = df[["model", "answer"]].apply(lambda row: check_match(row["model"], row["answer"]), axis=1)

In [17]:
# accuracy = number of matching rows divided by the number of rows
acc = df["match"].sum() / len(df)
print(acc)

0.9833333333333333
