# DAT410 - Assignment 4, NLP

## Most frequent words

In [1]:
import pandas as pandas
import numpy as np

In [2]:
import re
import string

# Method for removing noise in text
def remove_noise(string):
    
    # Characters to be removed
    chars = '.,!?@#%()'

    # Remove special chars
    string = string.translate(str.maketrans('','', chars))
    
    # Remove HTML for apostophe 
    string = re.sub(' &apos;', "'", string)

    # Remove tags for new rows 
    string = re.sub('\n', '', string)
    
    return string 

In [3]:
# Open files  
english_data = open('europarl-v7.fr-en.lc.en')
french_data = open('europarl-v7.fr-en.lc.fr')

# Read files as strings
english_str = english_data.read()
french_str = french_data.read()

In [4]:
from collections import Counter

# Clean strings from special characters
english_str = remove_noise(english_str)
french_str = remove_noise(french_str)

# Count word frequency
en_word_counter = Counter(english_str.split())
fr_word_coutner = Counter(french_str.split())

# 10 most common words 
print(f'Most common English words: {en_word_counter.most_common(10)}')
print(f'Most common French words: {fr_word_coutner.most_common(10)}')

Most common English words: [('the', 19583), ('of', 9487), ('to', 8981), ('and', 7207), ('in', 6155), ('is', 4453), ('that', 4415), ('a', 4378), ('we', 3340), ('this', 3329)]
Most common French words: [('de', 14520), ('la', 9736), ('et', 6617), ("l'", 6510), ('le', 6167), ('les', 5582), ('à', 5498), ('des', 5232), ('que', 4795), ("d'", 4553)]


In [5]:
# Method for computing probability of single word 
def word_probability(word, dict):
    if word not in dict:
        word_frequency = 0
    else:
        word_frequency = dict.get(word)

    return word_frequency / sum(dict.values())

In [6]:
# Probability of sinlge words
print(f"Probability of the word 'speaker': {word_probability('speaker', en_word_counter)}")
print(f"Probability of the word 'zebra': {word_probability('zebra', en_word_counter)}")

Probability of the word 'speaker': 4.5943565986446645e-05
Probability of the word 'zebra': 0.0


## Language modeling

In [7]:
# Read files as lists  
english_sentences = open('europarl-v7.fr-en.lc.en').readlines()
french_sentences = open('europarl-v7.fr-en.lc.fr').readlines()

# Remove noise
for index, sentence in enumerate(english_sentences):
    english_sentences[index] = remove_noise(sentence)

for index, sentence in enumerate(french_sentences):
    french_sentences[index] = remove_noise(sentence)

In [8]:
# Method for calculating probability of initial word starting sentence
def prob_word(word, text_lst, position):

    # Changing index depending on if first/last word asked for
    if position == 'first':
        index = 0
    else:
        index = -1
    
    # Count nr of sentences to start with target sentence's inital word
    count = 0 
    
    for i in range(len(text_lst)):
        if text_lst[i].split()[index] == word:
            count += 1

    # If 0, set to 1 to avoid total probability equal 0
    if count == 0:
        count = 1
    
    # MLE sentences to start with target sentence's inital word
    p_word = count/len(text_lst)

    return p_word

In [9]:
# Method for creating bigrams of sentence
def create_bigrams(sentence):

    # Empty list to store bigrams
    bigrams = list()

    # Loop through all words in target sentence but the last
    for i in range(0, len(sentence)-1):

        # Create list for bigram
        target_words = list()

        # Append current and next word 
        target_words.append(sentence[i])
        target_words.append(sentence[i+1])

        # Join the two words separate strings into one
        bigram = (' '.join(target_words))

        bigrams.append(bigram)

    return bigrams


In [10]:
 # Method for probability of all bigrams in a sentence
def prob_bigrams(words, bigrams, text_lst, text_str):
 
    # Counter of all possible bigram
    all_bigrams_count = Counter(get_bigrams(text_lst))

    # Counter of all possible words
    word_count = Counter(text_str)                  
    
    # Initiate probability of bigrams
    p_bigrams = 1.0

    # For each bigram ...
    for i in range(len(bigrams)):
        
        # Check how many times it occurs in the complete text file
        bigram_count = all_bigrams_count[bigrams[i]]

        # If 0, set to 1 to avoid total probability equal 0
        if bigram_count == 0:
            bigram_count = 1
        
        # Check total occurencies of starting word in bigram, no matter surrounds it 
        total = word_count[words[i]]

        # If 0, set to large number to avoid total probability equal 0
        if total == 0:
            total = 1000000
        else:
            total = total

        # MLE for bigram 
        p_bigram = bigram_count / total 

        # Multiply all bigram probabilities 
        p_bigrams *= p_bigram

    return p_bigrams

In [11]:
# Method for getting all possible bigrams from a text file 
def get_bigrams(text_lst):
    
    all_bigrams = list()

    # For each sentence, call create_bigrams 
    for sentence in text_lst:
        bigram = create_bigrams(sentence.split())

        # Append each bigram to all_bigrams
        for bigram in bigram:
            all_bigrams.append(bigram)

    return all_bigrams

In [12]:
# Method for anguage modeling
def language_model(sentence, text_file):

    # Read files as strings (words) and lists (sentences)
    text_str = open(text_file).read()
    text_lst = open(text_file).readlines()
  
    # Clean string
    text_str = remove_noise(text_str)

    # Clean sentences 
    for index, sentence in enumerate(text_lst):
        text_lst[index] = remove_noise(sentence)

    
    # Remove noise 
    s = remove_noise(sentence)

    # Split words in target sentence
    words = s.split()

    # Get probability of inital word starting a sentence
    p_first = prob_word(words[0], text_lst, 'first')

    # Get probability of final word ending a sentence
    p_last = prob_word(words[-1], text_lst, 'last')
    
    # Get bigrams of words in sentence
    bigrams = create_bigrams(words)

    # Get probability of all bigrams in target sentence
    p_bigrams = prob_bigrams(words, bigrams, text_lst, text_str)

    # Multiply probabilities to get final results
    p_sentence = p_first * p_bigrams * p_last

    return p_sentence


In [13]:
# Test sentence from text file 
s = english_sentences[0]
print(f"Test sentence: {s}")

# UNIT TEST: prob_word
print(f"Probability of a sentence starting with the word 'I':{prob_word('i', english_sentences, 'first')}")

print(f"Probability of a sentence ending with the word 'last': {prob_word('period', english_sentences, 'last')}")

# UNIT TEST: create_bigrams
s_bigrams = create_bigrams(s.split())

# UNIT TEST: prob_bigrams
print(f"Probability of bigrams: {prob_bigrams(s.split(), s_bigrams, english_sentences, english_str.split())}")

# UNIT TEST: get_bigram
get_bigrams(english_sentences)

# UNIT TEST: language_model
print(f"Test sentence: {s}")
print(f"Probability of sentence: {language_model(sentence=s, text_file='europarl-v7.fr-en.lc.en')}")

Test sentence: i declare resumed the session of the european parliament adjourned on friday 17 december 1999  and i would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period 
Probability of a sentence starting with the word 'I':0.1141
Probability of a sentence ending with the word 'last': 0.001
Probability of bigrams: 1.3596761261666992e-54
Test sentence: i declare resumed the session of the european parliament adjourned on friday 17 december 1999  and i would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period 
Probability of sentence: 8.72028988363961e-81


## Translation model

In [14]:
# Method for calculating word alignment probabilities for a sentence pair 
def alignment_prob(t_prob):

    # Empty dict to store sum of word alignment probabilities
    source_sums = dict()

    delta = t_prob.copy()

    # Calculating sum of word alignment probabilities (divider) for each source word
    for (source, target), prob in delta.items():
        if source in source_sums.keys():
            source_sums[source] += prob
        else:
            source_sums[source] = prob 
            
    # Calculating word alignment probabilities for each source word
    for (source, target), prob in delta.items():

        # Sum of word alignment probabilities 
        total = source_sums[source]

        # New word alignment probabilities
        delta[(source, target)] = prob / total 
    
    return delta 

In [15]:
## Initialize transition probabilities and soft counts for unit tests
import itertools
from random import random

# Read files as strings (words) and lists (sentences)
source_str = open('fr_test.txt').read()    
target_str = open('en_test.txt').read()

# Clean strings
source_str = remove_noise(source_str)
target_str = remove_noise(target_str)

# Split corpus into words
source_words = source_str.split()
target_words = target_str.split()

# Add NULL to English words
target_words.append('NULL')

# Initialize t(f|e), c(f, e), c(e) for complete corpus 
test_trans_prob = dict()
test_soft_counts = dict()

iterables = [source_words, target_words]
for t in itertools.product(*iterables):

    # Randomize inital transition probabilities 
    test_trans_prob[t] = random()    

    # Initialize all soft_counts to 0 
    test_soft_counts[t] = 0
    test_soft_counts[t[1]] = 0

In [16]:
# UNIT TEST: alignment_prob
print(test_trans_prob[('décembre', '1999')])
test_delta = alignment_prob(test_trans_prob)
print(test_delta[('décembre', '1999')])

0.6575734810977056
0.0031864720801970284


In [17]:
# Method for updating soft counts
def update_counts (soft_counts, delta):

    for (source, target), align_prob in delta.items():
        soft_counts[(source, target)] += align_prob
        soft_counts[target] += align_prob

    return soft_counts

In [18]:
# UNIT TEST: update_counts

print(test_soft_counts[('vous', 'to')])
print(test_soft_counts['to'])

x = update_counts(test_soft_counts, test_delta)

print(x[('vous', 'to')])
print(x['to'])

0
0
0.002309010501550066
1.119198031707437


In [19]:
# Method for updating transition probabilities 
def transition_prob (t_prob, soft_counts):
    
    for (source, target), prob in t_prob.items():
        t_prob[(source, target)] = soft_counts[(source, target)] / soft_counts[target]

    return t_prob 

In [20]:
# UNIT TEST: transition_prob

print(test_trans_prob[('décembre', '1999')])

res = transition_prob(test_trans_prob, test_soft_counts)

print(res[('décembre', '1999')])

0.6575734810977056
0.0029588486383488968


In [24]:
import operator

# Translation model 
def translation_model(source_file, target_file, n_iter):

    # Read files as strings (words) and lists (sentences)
    source_str = open(source_file).read()
    target_str = open(target_file).read()

    source_list = open(source_file).readlines()
    target_list = open(target_file).readlines()

    # Clean strings
    source_str = remove_noise(source_str)
    target_str = remove_noise(target_str)

    # Clean sentences 
    for index, sentence in enumerate(target_list):
        target_list[index] = remove_noise(sentence)

    for index, sentence in enumerate(source_list):
        source_list[index] = remove_noise(sentence)

    # Split corpus into words
    source_words = source_str.split()
    target_words = target_str.split()

     # Add NULL to English words
    target_words.append('NULL')

    # Initialize t(f|e), c(f, e), c(e) for complete corpus 
    trans_prob = dict()
    soft_counts = dict()

    # For each iteration
    for i in range(n_iter):
    # For each sentence pair
        for (source, target) in zip(source_list, target_list):
    
            # Split sentences into words
            source_sentence = source.split()
            target_sentence = target.split()

            # Add NULL to English sentence 
            target_sentence.append('NULL')

            # Initialize t(f|e), c(f, e), c(e) for specific sentences 
            # To perform operations on small dictionaries one at a time
            temp_trans_prob = dict()
            temp_soft_counts = dict()

            iterables = [source_sentence, target_sentence]
            for t in itertools.product(*iterables):

                # Get transition probabilities from general dictionary if present, otherwise initialize to random [0,1)
                if t in trans_prob.keys():
                    temp_trans_prob[t] =  trans_prob[t]
                else:
                    temp_trans_prob[t] = random()
                
                # Get soft counts from general dictionary, otherwise initialize to 0 
                if t in soft_counts.keys():
                    temp_soft_counts[t] = soft_counts[t]
                else:
                    temp_soft_counts[t] = 0

                if t[1] in soft_counts.keys():
                    temp_soft_counts[t[1]] = soft_counts[t[1]]
                else: 
                    temp_soft_counts[t[1]] = 0


            # Get word alignment probabilities
            delta = alignment_prob(temp_trans_prob) 

            # Update soft counts 
            temp_soft_counts = update_counts(temp_soft_counts, delta)

            # Get transitional probabilities 
            temp_trans_prob = transition_prob(temp_trans_prob, temp_soft_counts)

            # Update general dictionaries
            for key, value in temp_trans_prob.items():
                trans_prob[key] = value
            
            
            for key, value in temp_soft_counts.items():
                soft_counts[key] = value

    # Sort transition probabilities by descending order
    trans_prob = dict(sorted(trans_prob.items(), key=operator.itemgetter(1),reverse=True))
  
    return trans_prob

In [58]:
# Train model: Get transition probabilities dictionary 
trans_prob_1 = translation_model(source_file='europarl-v7.fr-en.lc.fr', target_file='europarl-v7.fr-en.lc.en', n_iter=1)
trans_prob_5 = translation_model(source_file='europarl-v7.fr-en.lc.fr', target_file='europarl-v7.fr-en.lc.en', n_iter=5)
trans_prob_10 = translation_model(source_file='europarl-v7.fr-en.lc.fr', target_file='europarl-v7.fr-en.lc.en', n_iter=10)
trans_prob_20 = translation_model(source_file='europarl-v7.fr-en.lc.fr', target_file='europarl-v7.fr-en.lc.en', n_iter=20)

In [28]:
# Method for getting the n most likely translations for a specific word 
def word_translation (word, trans_prob, n):

    translations = list()

    for (source, target), prob in trans_prob.items():               
        if source == word:
            translations.append(target)       
    
    return translations[:n]

### Most probable translation of the French word 'européenne' for different EM iteration

In [59]:
# Using transition probabilitites generated after 1 iteration
print(f"\nMost likely English translations the french word 'européenne' after 1 iteration:\n {word_translation('européenne', trans_prob_1, 10)}")

# Using transition probabilitites generated after 5 iteration
print(f"\nMost likely English translations the french word 'européenne' after 5 iterations:\n {word_translation('européenne', trans_prob_5, 10)}")

# Using transition probabilitites generated after 10 iteration
print(f"\nMost likely English translations the french word 'européenne' after 10 iterations:\n {word_translation('européenne', trans_prob_10, 10)}")

# Using transition probabilitites generated after 20 iteration
print(f"\nMost likely English translations the french word 'européenne' after 20 iterations:\n {word_translation('européenne', trans_prob_20, 10)}")


Most likely English translations the french word 'européenne' after 1 iteration:
 ['multi-governmental', 'enables', 'blockade', 'communitise', 'impossible', 'enrich', 'reinvent', 'acceding', "soul'", 'county']

Most likely English translations the french word 'européenne' after 5 iterations:
 ['european', 'aided', 'symposium', 'contend', 'turns', 'europe-wide', 'overburdened', 'chapeau', 'provoke', 'adds']

Most likely English translations the french word 'européenne' after 10 iterations:
 ['european', 'aided', 'turns', 'patchwork', 'embellish', 'adds', 'armaments', 'periodical', 'qualms', 'focuses']

Most likely English translations the french word 'européenne' after 20 iterations:
 ['european', 'turns', 'aided', 'symposium', 'embellish', 'conceded', "europe'", 'de-europeanise', 'europe-wide', 'agendas']


## Decoding 

In [60]:
from itertools import combinations

# Method for translating sentence
def translate (sentence, target_text):

    # Store all words in list
    words = list()

    for source_word in sentence.split():

        # Get most likely translated word using the best trans_prob dictionary 
        target_word = word_translation(source_word, trans_prob_20, 1)

        words.extend(target_word)

    # Create list with all possible sentences with translated words
    #print(words)
    permutations = list(itertools.permutations(words))
    

    # Calculate probability of each possible sentence
    results = dict()

    # Get probability of each permutation
    for sentence in permutations:
        sentence = ' '.join(sentence)
        res = language_model(sentence=sentence, text_file=target_text)
        results[sentence] = res

    # Get most probable sentence 
    translation = max(results, key=results.get)
    
    return translation

In [63]:
# Test sentence in French to be translated into English
print(f"French: je suis très heureux \n English translation: {translate('je suis très heureux', 'europarl-v7.fr-en.lc.en')}")

French: je suis très heureux 
 English translation: i am very pleased
