# IBM Model 1, Expectation Maximisation
#### Authors: Adriaan de Vries, Féliciën Veldema, Verna Dankers

This notebook implements the expectation maximisation (EM) training algorithm for IBM Model 1. Run the cells from top to bottom to execute the algorithm.

### 1. Requirements

In [1]:
# Python libraries to install
from __future__ import print_function, division
from collections import defaultdict, Counter
from tqdm import tqdm
from random import random
from scipy.special import digamma, loggamma, gammaln

import numpy as np
import pickle
import math
import os

# Custom requirements
from aer import read_naacl_alignments, AERSufficientStatistics, test
import data

### 2. Read in the data

Please set the paths to the data and run the code below. Functions for reading in the data have been placed outside of the notebook, as they are re-used by other notebooks.

In [2]:
# Paths to the parallel corpus: English and French files for the training
# and validation splits.
english_train = 'training/hansards.36.2.e'
french_train = 'training/hansards.36.2.f'
english_val = 'validation/dev.e'
french_val = 'validation/dev.f'
# NOTE(review): presumably a NAACL-format alignment output file; it is not
# used by the cells shown in this notebook section — confirm before removing.
fname = 'naacltest.txt'

# Read the training sentence pairs. The meaning of the third positional
# argument is defined in data.read_data (outside this notebook).
training_data = data.read_data(english_train, french_train, True)
# Transpose the list of (english, french) pairs into two parallel sequences:
# ext_data[0] holds the English sentences, ext_data[1] the French ones.
ext_data = list(zip(*training_data))
# The validation set is read with the training sentences passed along
# (presumably so that -UNK- replacement is based on the training
# vocabulary — TODO confirm against data.read_data).
validation_data = data.read_data(english_val, french_val, True,
    ttype='validation', eng_data=ext_data[0], fre_data=ext_data[1])

Adding the -UNK- token to the data.
English data complete.
French data complete
Adding the -UNK- token to the data.
English data complete.
French data complete


### 3. Implementation of IBM1 EM

First, we implement the training algorithm, and the functions to calculate alignments and the log likelihood.

In [3]:
def align_all(data, translate_dict, fname=None):
    """Align every sentence pair and optionally write the links to file.

    Each pair is aligned with ``align`` with NULL alignments excluded. When
    ``fname`` is given, every link is also written in NAACL format as one
    ``<sentence number> <english position> <french position>`` line, with
    sentences numbered from 1 in the order they appear in ``data``.

    Args:
        data: iterable of (english_words, french_words) sentence pairs
        translate_dict: dictionary with translation probabilities e to f
        fname: filename to save alignments in, in NAACL format; when None
            no file is written

    Returns:
        list of sets: one set of (english position, french position) links
        per sentence pair
    """
    alignments = []
    # Only open an output file when one was requested; the default
    # fname=None previously crashed inside open().
    out_file = open(fname, 'w') if fname is not None else None
    try:
        for sentence_nr, (english_words, french_words) in enumerate(data, start=1):
            alignment = align(english_words, french_words, translate_dict, False)
            if out_file is not None:
                # Write from the list (not the set) so the link order in the
                # file stays deterministic.
                for pos1, pos2 in alignment:
                    out_file.write("{} {} {}\n".format(sentence_nr, pos1, pos2))
            alignments.append(set(alignment))
    finally:
        # Always release the file handle so the contents are flushed before
        # anything else reads the file.
        if out_file is not None:
            out_file.close()
    return alignments
    
def align(english_words, french_words, translate_dict, add_null=True):
    """Pick the most probable English position for every French word.

    For each French word, the English position whose dictionary entry gives
    the highest translation probability is selected; on ties the earliest
    position wins. Position 0 is the fallback when no English word has a
    strictly positive probability for the French word, and links to
    position 0 are treated as NULL alignments.
    (Presumably english_words[0] is the NULL token — confirm with the data
    reader.)

    Args:
        english_words: list of english words
        french_words: list of french words
        translate_dict: dictionary with translation probabilities e to f
        add_null: boolean, keep links to position 0 (NULL) when True,
            drop them when False

    Return:
        list of tuples (english position, french position); English
        positions are 0-based indices into english_words, French positions
        are 1-based
    """
    links = []
    for f_pos, f_word in enumerate(french_words, start=1):
        best_pos, best_prob = 0, 0.0
        for e_pos, e_word in enumerate(english_words):
            # Word pairs missing from the dictionary never compete
            if e_word not in translate_dict:
                continue
            if f_word not in translate_dict[e_word]:
                continue
            candidate = translate_dict[e_word][f_word]
            # Strict comparison: the first position reaching the maximum wins
            if candidate > best_prob:
                best_pos, best_prob = e_pos, candidate
        # Links to position 0 are only reported when NULL links are wanted
        if add_null or best_pos != 0:
            links.append((best_pos, f_pos))
    return links

def log_likelihood(data, translate_dict, add_constant=False):
    """Score the data under the best alignment of every sentence pair.

    For each pair the alignment returned by ``align`` (NULL links included)
    is used, and the logs of the translation probabilities of its links are
    summed. Only the single best alignment contributes, so this is not a sum
    over all alignments.

    Args:
        data: zipped object with pairs of e and f sentences
        translate_dict: dictionary with translation probabilities e to f
        add_constant: whether to add the length normalisation constant
            -len(f) * log(len(e) + 1) for every sentence pair

    Returns:
        float: log likelihood
    """
    total = 0
    for english, french in data:
        sentence_score = 0
        for e_pos, f_pos in align(english, french, translate_dict, True):
            # align reports French positions 1-based, English ones 0-based
            link_prob = translate_dict[english[e_pos]][french[f_pos - 1]]
            sentence_score += math.log(link_prob)
        total += sentence_score

        # Optional length normalisation term for this sentence pair
        if add_constant:
            total += -len(french) * np.log(len(english) + 1)
    return total

def initialize_t(data, uniform=True):
    """Build the initial translation table from co-occurring word pairs.

    Every (English word, French word) pair that occurs together in a
    sentence pair receives a starting weight: 1 for uniform initialisation,
    otherwise a fresh draw from ``random()``. The weights of each English
    word are then normalised so that they sum to one.

    Args:
        data: list of tuples, english and french sentences
        uniform: boolean, True for uniform weights, False for random ones

    Returns:
        defaultdict(Counter): t[e_word][f_word] holds the initial probability
    """
    # Choose the weight source once instead of branching per word pair
    draw_weight = (lambda: 1) if uniform else random

    t = defaultdict(Counter)
    for english, french in tqdm(data):
        for e_word in english:
            for f_word in french:
                t[e_word][f_word] = draw_weight()

    # Turn the raw weights of every English word into a distribution
    for row in t.values():
        row_total = sum(row.values())
        for f_word in row:
            row[f_word] = row[f_word] / row_total
    return t

def test(own_path, gold_path='validation/dev.wa.nonullalign', personal_sets=None):
    """Compute the alignment error rate (AER) of predictions against gold links.

    NOTE: this definition replaces the ``test`` name imported from ``aer``
    in the requirements cell.

    Args:
        own_path: path to a NAACL-format file with predicted alignments; only
            read when personal_sets is None
        gold_path: path to the NAACL-format file with gold alignments
        personal_sets: optional list with one collection of predicted links
            per sentence; when given it is used as-is and own_path is ignored

    Returns:
        the value returned by AERSufficientStatistics.aer()
    """
    # 1. Read in gold alignments
    gold_sets = read_naacl_alignments(gold_path)

    # 2. Collect the predictions, either from file or from the caller
    if personal_sets is None:
        # Each entry read from file is a pair of link collections; only the
        # first one is used as the prediction for that sentence.
        predictions = [set(links) for links, _ in read_naacl_alignments(own_path)]
    else:
        predictions = personal_sets

    # 3. Compute AER by accumulating sufficient statistics over the corpus
    metric = AERSufficientStatistics()
    # zip stops at the shorter sequence, so surplus gold or predicted
    # sentences are ignored.
    for gold, pred in zip(gold_sets, predictions):
        metric.update(sure=gold[0], probable=gold[1], predicted=pred)
    return metric.aer()

def EM_IBM1(data, validation, max_steps=20, translate_dict=None, epochs_trained=0):
    """Train IBM1 using the EM algorithm.

    After every iteration the validation pairs are aligned and written to
    "iteration <n>.txt", the log likelihood of the training data and the AER
    on the validation data are printed, and the translation table is pickled
    to "translate_dicts/ibm1_em_epoch_<n + 1>.pickle".

    Args:
        data: list of tuples, english and french sentences
        validation: list of tuples, english and french sentences
        max_steps: maximum number of iterations
        translate_dict: dictionary with translation probabilities e to f;
            initialised from data when None, otherwise updated in place
        epochs_trained: epochs already trained before, used to continue the
            numbering of iterations and output files

    Returns:
        defaultdict(Counter): the trained translation probabilities
    """
    print("Initializing translation dictionary.")
    # If translate dict is not already given, initialise it
    if translate_dict is None:
        translate_dict = initialize_t(data)

    # Create the checkpoint directory up front, so a missing directory does
    # not abort the run after a full (slow) iteration has been computed.
    if not os.path.isdir("translate_dicts"):
        os.makedirs("translate_dicts")

    for iteration in range(epochs_trained, epochs_trained + max_steps):
        # Initialise counts
        fname = "iteration {}.txt".format(iteration)
        counts = Counter()
        co_counts = defaultdict(Counter)

        # Expectation
        print("Expectation step {}".format(iteration + 1))
        for e_s, f_s in tqdm(data):
            for f in f_s:
                # Look up each t(f|e) once and reuse it for both the
                # normaliser and the posterior of every English word.
                probs = [translate_dict[e][f] for e in e_s]
                sum_of_probs = sum(probs)
                for e, prob in zip(e_s, probs):
                    delta = prob / sum_of_probs
                    co_counts[e][f] += delta
                    counts[e] += delta

        # Maximisation
        print("Maximisation step {}".format(iteration + 1))
        for e in co_counts:
            for f in co_counts[e]:
                translate_dict[e][f] = co_counts[e][f] / counts[e]

        # Writing the iteration files in naacl for AER use
        alignments = align_all(validation, translate_dict, fname)
        ll = log_likelihood(data, translate_dict)
        aer = test("", personal_sets=alignments)
        print("Log likelihood: {}, AER: {}".format(ll, aer))

        # Save translate_dict for later use; the with-block guarantees the
        # checkpoint is flushed and the handle released every iteration.
        checkpoint_path = "translate_dicts/ibm1_em_epoch_{}.pickle".format(iteration + 1)
        with open(checkpoint_path, 'wb') as checkpoint_file:
            pickle.dump(translate_dict, checkpoint_file)
    return translate_dict

In [84]:
# Train on the first 50,000 training sentence pairs for 5 EM iterations;
# the validation pairs are aligned and scored after every iteration.
translate_dict_em = EM_IBM1(training_data[:50000], validation_data, 5)

Initializing translation dictionary.


100%|██████████████████████████████████| 50000/50000 [00:15<00:00, 3272.89it/s]


Expectation step 1


100%|███████████████████████████████████| 50000/50000 [01:16<00:00, 654.07it/s]


Maximisation step 1
Log likelihood: -3620590.13330799, AER: 0.40835707502374174
Expectation step 2


100%|███████████████████████████████████| 50000/50000 [01:18<00:00, 637.22it/s]


Maximisation step 2
Log likelihood: -2626810.406258484, AER: 0.34574976122254064
Expectation step 3


100%|███████████████████████████████████| 50000/50000 [01:24<00:00, 588.26it/s]


Maximisation step 3
Log likelihood: -2173920.021614505, AER: 0.34416826003824097
Expectation step 4


100%|███████████████████████████████████| 50000/50000 [01:18<00:00, 639.85it/s]


Maximisation step 4
Log likelihood: -1954758.4640658444, AER: 0.33747609942638623
Expectation step 5


100%|███████████████████████████████████| 50000/50000 [01:17<00:00, 643.67it/s]


Maximisation step 5
Log likelihood: -1826981.2982877223, AER: 0.3413001912045889
