# IMPORT DATA AND CLEAN THE DATA

In [45]:
import nltk
import re
from nltk.corpus import udhr
from nltk import word_tokenize
from nltk.util import ngrams
from collections import Counter
from nltk.probability import ConditionalFreqDist
nltk.download('udhr')

# Language Data
english = udhr.raw('English-Latin1')
french = udhr.raw('French_Francais-Latin1')
italian = udhr.raw('Italian_Italiano-Latin1')
spanish = udhr.raw('Spanish_Espanol-Latin1') 

# Training and Development Data
english_train, english_dev = english[0:1000], english[1000:1100]
french_train, french_dev = french[0:1000], french[1000:1100]
italian_train, italian_dev = italian[0:1000], italian[1000:1100]
spanish_train, spanish_dev = spanish[0:1000], spanish[1000:1100] 

def CleanData(raw_tokens):
    return [token.lower() for token in raw_tokens if token.isalpha()]

#Testing Data
english_test = udhr.words('English-Latin1')[0:1000]
french_test = udhr.words('French_Francais-Latin1')[0:1000]
italian_test = udhr.words('Italian_Italiano-Latin1')[0:1000]
spanish_test = udhr.words('Spanish_Espanol-Latin1')[0:1000]

english_test_clean = CleanData(english_test)
french_test_clean = CleanData(french_test)
italian_test_clean = CleanData(italian_test)
spanish_test_clean = CleanData(spanish_test)

# Training Data 
english_train_clean = CleanData(nltk.word_tokenize(english_train))
french_train_clean = CleanData(nltk.word_tokenize(french_train))
spanish_train_clean = CleanData(nltk.word_tokenize(spanish_train))
italian_train_clean = CleanData(nltk.word_tokenize(italian_train))

english_train_clean_rawtext = ' '.join(english_train_clean)
french_train_clean_rawtext = ' '.join(french_train_clean)
spanish_train_clean_rawtext = ' '.join(spanish_train_clean)
italian_train_clean_rawtext = ' '.join(italian_train_clean)
# Dev Data
english_dev_clean = CleanData(nltk.word_tokenize(english_dev))
french_dev_clean = CleanData(nltk.word_tokenize(french_dev))
spanish_dev_clean = CleanData(nltk.word_tokenize(spanish_dev))
italian_dev_clean = CleanData(nltk.word_tokenize(italian_dev))

english_dev_clean_rawtext = ' '.join(english_dev_clean)
french_dev_clean_rawtext = ' '.join(french_dev_clean)
spanish_dev_clean_rawtext = ' '.join(spanish_dev_clean)
italian_dev_clean_rawtext = ' '.join(italian_dev_clean)


def generate_NGrams_Character(s, n):
    # Convert to lowercases
    s = s.lower()

    # Replace all none alphanumeric characters with spaces
    s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s)

    # Break sentence in the token, remove empty tokens
    tokens = [token for token in s if token != ""]

    # Use the zip function to help us generate n-grams
    # Concatentate the tokens into ngrams and return
    ngrams = zip(*[tokens[i:] for i in range(n)])
    return ["".join(ngram) for ngram in ngrams]



[nltk_data] Downloading package udhr to
[nltk_data]     C:\Users\chait\AppData\Roaming\nltk_data...
[nltk_data]   Package udhr is already up-to-date!


# UNIGRAM MODEL IMPLEMENTATION

In [46]:
print('Testing Unigram Models for English against English and French models ')
def Build_Unigram_Model(language):
    raw_text = ''
    if language == 'english':
        raw_text = generate_NGrams_Character(english_train_clean_rawtext + english_dev_clean_rawtext, 1)
    elif language == 'spanish':
        raw_text = generate_NGrams_Character(spanish_train_clean_rawtext + spanish_dev_clean_rawtext, 1)
    elif language == 'french':
        raw_text = generate_NGrams_Character(french_train_clean_rawtext + french_dev_clean_rawtext, 1)
    elif language == 'italian':
        raw_text = generate_NGrams_Character(italian_train_clean_rawtext + italian_dev_clean_rawtext, 1)
    unigrams = raw_text
    unigram_model = nltk.FreqDist(unigrams)
    return unigram_model
    
unigram_model_english = Build_Unigram_Model('english')    
unigram_model_spanish = Build_Unigram_Model('spanish')
unigram_model_italian = Build_Unigram_Model('italian')
unigram_model_french = Build_Unigram_Model('french')

def calculate_unigram_probability_model(word, language):
    model = None
    probability = 1
    
    if language == 'english':
        model = unigram_model_english
    elif language == 'spanish':
        model = unigram_model_spanish
    elif language == 'french':
        model = unigram_model_french
    elif language == 'italian':
        model = unigram_model_italian
        
    for character in word: 
        probability = (probability * model.freq(character))
    return probability
        
def Test_Unigram_Models(Test_data_set):
    Total_english_probability = -1
    Total_spanish_probability = -1
    Total_italian_probability = -1
    Total_french_probability = -1
    
    for each_word in Test_data_set:
        english_probability= calculate_unigram_probability_model(each_word, 'english')
        spanish_probability= calculate_unigram_probability_model(each_word, 'spanish')
        french_probability= calculate_unigram_probability_model(each_word, 'french')
        italian_probability= calculate_unigram_probability_model(each_word, 'italian')
        
        Total_english_probability  = Total_english_probability + english_probability
        Total_spanish_probability  = Total_spanish_probability + spanish_probability
        Total_italian_probability  = Total_italian_probability + italian_probability
        Total_french_probability  = Total_french_probability + french_probability
    
    if(Total_english_probability > Total_french_probability):
        print('English Unigram Probability is', round(Total_english_probability*100,2))
    else:
        print('French Unigram Probability is', round(Total_french_probability*100,2))


Test_Unigram_Models(english_test_clean)
    

Testing Unigram Models for English against English and French models 
English Unigram Probability is 63.48


# BIGRAM MODEL IMPLEMENTATION

In [47]:
def Build_BigramModel(language):
    Bi_gram_Model = ConditionalFreqDist()
    Bi_grams = ''
    if language == 'english':
        Bi_grams = generate_NGrams_Character(' '+english_train_clean_rawtext + english_dev_clean_rawtext+' ', 2)
    elif language == 'spanish':
        Bi_grams = generate_NGrams_Character(' '+spanish_train_clean_rawtext + spanish_dev_clean_rawtext+' ', 2)
    elif language == 'french':
        Bi_grams = generate_NGrams_Character(' '+french_train_clean_rawtext + french_dev_clean_rawtext+' ', 2)
    elif language == 'italian':
        Bi_grams = generate_NGrams_Character(' '+italian_train_clean_rawtext + italian_dev_clean_rawtext+' ', 2)
    for bigram in Bi_grams:
        Bi_gram_Model[bigram[0]][bigram[1]]+=1
    return Bi_gram_Model

Bi_gram_model_english = Build_BigramModel('english')
Bi_gram_model_spanish = Build_BigramModel('spanish')
Bi_gram_model_french = Build_BigramModel('french')
Bi_gram_model_italian = Build_BigramModel('italian')

def calculate_bigram_probability_model(word, language):
    bigram_model = None
    unigram_model = None
    probability = 1
    bigram_count = 0
    unigram_count = 0
    
    if language == 'english':
        bigram_model = Bi_gram_model_english
        unigram_model = unigram_model_english
        bigram_count =  len(generate_NGrams_Character(' '+english_train_clean_rawtext + english_dev_clean_rawtext+' ', 2))
        unigram_count =  len(generate_NGrams_Character(' '+english_train_clean_rawtext + english_dev_clean_rawtext+' ', 1))
    elif language == 'spanish':
        bigram_model = Bi_gram_model_spanish
        unigram_model = unigram_model_spanish
        bigram_count = len(generate_NGrams_Character(' '+spanish_train_clean_rawtext + spanish_dev_clean_rawtext+' ', 2))
        unigram_count =  len(generate_NGrams_Character(' '+spanish_train_clean_rawtext + spanish_dev_clean_rawtext+' ', 1))
    elif language == 'french':
        bigram_model = Bi_gram_model_french
        unigram_model = unigram_model_french
        bigram_count = len(generate_NGrams_Character(' '+french_train_clean_rawtext + french_dev_clean_rawtext+' ', 2))
        unigram_count =  len(generate_NGrams_Character(' '+french_train_clean_rawtext + french_dev_clean_rawtext+' ', 1))
    elif language == 'italian':
        bigram_model = Bi_gram_model_italian
        unigram_model = unigram_model_italian
        bigram_count  = len(generate_NGrams_Character(' '+italian_train_clean_rawtext + italian_dev_clean_rawtext+' ', 2))
        unigram_count =  len(generate_NGrams_Character(' '+italian_train_clean_rawtext + italian_dev_clean_rawtext+' ', 1))
            
    for bigrams_of_word in generate_NGrams_Character(word,2):
        numerator_probability = (bigram_model[bigrams_of_word[0]][bigrams_of_word[1]] + 1)/ (bigram_count + len(bigram_model))
        ## Add One Smoothing. 
        denominator_probability = (unigram_model[bigrams_of_word[0]] + 1)/( len(unigram_model) + unigram_count)
        probability = probability * (numerator_probability/denominator_probability)
    return probability

def Test_bigram_Models(Test_data_set):
    Total_english_probability = 0
    Total_spanish_probability = 0
    Total_italian_probability = 0
    Total_french_probability = 0
    
    for each_word in Test_data_set:
        english_probability= calculate_bigram_probability_model(each_word, 'english')
        spanish_probability= calculate_bigram_probability_model(each_word, 'spanish')
        french_probability= calculate_bigram_probability_model(each_word, 'french')
        italian_probability= calculate_bigram_probability_model(each_word, 'italian')
        
                
        Total_english_probability  = Total_english_probability + english_probability
        Total_spanish_probability  = Total_spanish_probability + spanish_probability
        Total_italian_probability  = Total_italian_probability + italian_probability
        Total_french_probability  = Total_french_probability + french_probability
    if (Total_english_probability > Total_french_probability):
        print('English Bigram Probability is', Total_english_probability)
    else:
        print('French Bigram Probability is', Total_french_probability)
    
Test_bigram_Models(english_test_clean)

English Bigram Probability is 63.170820576352774


# TRIGRAM MODEL IMPLEMENTATION

In [49]:
def Build_TrigramModel(language):
    tri_gram_Model = ConditionalFreqDist()
    trigrams = ''
    
    if language == 'english':
        trigrams = generate_NGrams_Character(' '+english_train_clean_rawtext + english_dev_clean_rawtext+' ', 3)
    elif language == 'spanish':
        trigrams = generate_NGrams_Character(' '+spanish_train_clean_rawtext + spanish_dev_clean_rawtext+' ', 3)
    elif language == 'french':
        trigrams = generate_NGrams_Character(' '+french_train_clean_rawtext + french_dev_clean_rawtext+' ', 3)
    elif language == 'italian':
        trigrams = generate_NGrams_Character(' '+italian_train_clean_rawtext + italian_dev_clean_rawtext+' ', 3)
        
    for trigram in trigrams:
        tri_gram_Model[trigram[:2]][trigram[2]]+=1
    return tri_gram_Model

trigram_model_english = Build_TrigramModel('english')
trigram_model_spanish = Build_TrigramModel('spanish')
trigram_model_french = Build_TrigramModel('french')
trigram_model_italian = Build_TrigramModel('italian')

def calculate_Trigram_probability_model(word, language):
    trigram_model  = None
    bigram_model = None
    probability = 1
    bigram_count = 0
    trigram_count = 0
    
    if language == 'english':
        trigram_model = trigram_model_english
        bigram_model = Bi_gram_model_english
        bigram_count =  len(generate_NGrams_Character(' '+english_train_clean_rawtext + english_dev_clean_rawtext+' ', 2))
        trigram_count = len(generate_NGrams_Character(' '+english_train_clean_rawtext + english_dev_clean_rawtext+' ', 3))
    elif language == 'spanish':
        trigram_model = trigram_model_spanish
        bigram_model = Bi_gram_model_spanish
        bigram_count =  len(generate_NGrams_Character(' '+spanish_train_clean_rawtext + spanish_dev_clean_rawtext+' ', 2))
        trigram_count = len(generate_NGrams_Character(' '+spanish_train_clean_rawtext + spanish_dev_clean_rawtext+' ', 3))
    elif language == 'french':
        trigram_model = trigram_model_french
        bigram_model = Bi_gram_model_french
        bigram_count =  len(generate_NGrams_Character(' '+french_train_clean_rawtext + french_dev_clean_rawtext+' ', 2))
        trigram_count = len(generate_NGrams_Character(' '+french_train_clean_rawtext + french_dev_clean_rawtext+' ', 3))
    elif language == 'italian':
        trigram_model = trigram_model_italian
        bigram_model = Bi_gram_model_italian
        bigram_count =  len(generate_NGrams_Character(' '+italian_train_clean_rawtext + italian_dev_clean_rawtext+' ', 2))
        trigram_count = len(generate_NGrams_Character(' '+italian_train_clean_rawtext + italian_dev_clean_rawtext+' ', 3))
        
    for trigrams_of_word in generate_NGrams_Character(word,3):
        ## Add one Smoothing
        numerator_probability = (trigram_model[trigrams_of_word[:2]][trigrams_of_word[2]] + 1) / (trigram_count + len(trigram_model))
        denominator_probability = (bigram_model[trigrams_of_word[0]][trigrams_of_word[1]] + 1) / (bigram_count + len(bigram_model))
        probability = probability * (numerator_probability/denominator_probability)
    return probability

def Test_Trigram_Models(Test_data_set):
    Total_english_probability = 0
    Total_spanish_probability = 0
    Total_italian_probability = 0
    Total_french_probability = 0
    
    for each_word in Test_data_set:
        english_probability= calculate_Trigram_probability_model(each_word, 'english')
        spanish_probability= calculate_Trigram_probability_model(each_word, 'spanish')
        french_probability= calculate_Trigram_probability_model(each_word, 'french')
        italian_probability= calculate_Trigram_probability_model(each_word, 'italian')
        
        Total_english_probability  = Total_english_probability + english_probability
        Total_spanish_probability  = Total_spanish_probability + spanish_probability
        Total_italian_probability  = Total_italian_probability + italian_probability
        Total_french_probability  = Total_french_probability + french_probability
        Normalized_english_probability = ((Total_english_probability*2)/10)
        Normalized_spanish_probability = ((Total_spanish_probability*2)/10)
        Normalized_italian_probability =  ((Total_italian_probability*2)/10)
        Normalized_french_probability =  ((Total_french_probability*2)/10)
    if (Normalized_english_probability > Normalized_french_probability):
        print('English Trigram Probability is', (Normalized_english_probability))
    else:
         print('French Trigram Probability is', (Normalized_french_probability))

Test_Trigram_Models(english_test_clean)


English Trigram Probability is 64.20014204506471


# ENGLISH VS FRENCH
   ### All The Above Data Set has been cleaned, pre-processed, converted to lower case words, Applied Add One Laplace Smoothing Technique and  processed. When English test data has been tested on English vs French Trigram, it has been found that english had 64 % accuracy and French had 60 %.  English trigram is more accurate than French.  When tested against English vs French Bi-gram , English outperformed french bi-gram, English bigram reported accuracy of 63 % and French reported accuracy of 29%. When performed against English vs French unigram. English Unigram was more accurate than French, showing 63 % accuracy.
   ### English Trigram - 64 % 
   ### English Bigram - 63% 
   ### English Unigram 63% 
   

# Spanish VS Italian
   ### All The Above Data Set has been cleaned, pre-processed, converted to lower case words, Applied Add One Laplace Smoothing Technique and  processed. When Spanish test data has been tested on Spanish vs Italian Trigram, it has been found that spanish had 72 % accuracy and italian had  69 %.  Spanish trigram is more accurate than italian.  When tested against Spanish vs Italian Bi-gram , Spanish outperformed Italian bi-gram, Spanish bigram reported accuracy of 64 % and Italian reported accuracy of 41%. When performed against Spanish vs Italian unigram. Spanish Unigram was more accurate than Italian, showing 51% accuracy whereas Italian unigram showed 42%. 
