In [5]:
#!/usr/bin/env python

"""
Python Version: 3.6

Implementations of IBM models 1 and 2.
"""

from collections import defaultdict
import re
import logging
import random
from math import floor
import numpy as np
from __future__ import division
logging.basicConfig(level=logging.DEBUG)


NULL_TOKEN = "<NULL>"
LIMIT = 100 # how sentences to train on

def preprocess(line):
    """
    Apply preprocessing to line in corpus.
    :param line:
    :return:
    """

    line = line.lower()  # to lower case
    line = re.sub(r"\d+", "", line)  # remove digits
    line = re.sub(r'[^\w\s]', "", line)  # remove all non-alphanumeric and non-space characters
    line = re.sub(r"\s+", " ", line).strip()  # remove excess white spaces
    return line


def get_vocab(file):
    """
    Extract all unique words from a corpus.
    :param file: text file containing corpus
    :return: set of unique words
    """

    vocab = set()
    count = 0
    with open(file, 'r') as f:
        for line in f:
            line = preprocess(line)
            for word in line.split():
                vocab.add(word)
            count += 1
            if count == LIMIT:
                break
    return vocab

def get_corpus(e_file, f_file):

    fe = open(e_file)
    ff = open(f_file)
    count = 0
    for e_sent, f_sent in zip(fe, ff):

        e_sent = preprocess(e_sent)
        e_sent = NULL_TOKEN + " " + e_sent
        f_sent = preprocess(f_sent)
        yield (e_sent.split(), f_sent.split())

        count += 1
        if count == LIMIT:
            break

class IBM(object):

    def __init__(self, model=2, initialization="uniform"):
        self.model = model
        self.t = None
        self.e_vocab = None
        self.f_vocab = None
        self.initialization = initialization
        self.max_jump = 100
        self.jump = None

    def train(self, e_file="training/hansards.36.2.e", f_file="training/hansards.36.2.f", iters=10):

        logging.info("Creating English vocabulary...")
        self.e_vocab = get_vocab(e_file)
        self.e_vocab.add(NULL_TOKEN)

        logging.info("Creating French vocabulary...")
        self.f_vocab = get_vocab(f_file)

        logging.info("Initialising model parameters...")
        self.initialise_params()

        logging.info("Training parameters with EM...")
        self.EM(e_file, f_file, iters)


    def EM(self, e_file, f_file, iters):

        # Train parameters with EM algorithm
        for iteration in range(iters):


            if self.model == 1:

                # All counts to zero for the new iteration
                pair_counts = defaultdict(float)
                word_counts = defaultdict(float)

                # Expectation step
                for e_sent, f_sent in get_corpus(e_file, f_file):

                    normalise = {}
                    for f_word in f_sent:

                        # Sum translation probabilities of f words over all e words
                        normalise[f_word] = 0.0
                        for e_word in e_sent:
                            normalise[f_word] += self.t[e_word][f_word]

                        # Update counts
                        for e_word in e_sent:
                            delta = self.t[e_word][f_word] / normalise[f_word]
                            pair_counts[(e_word, f_word)] += delta
                            word_counts[e_word] += delta

                # Maximisation step
                for e_word in self.e_vocab:
                    for f_word in self.f_vocab:
                        self.t[e_word][f_word] = pair_counts[(e_word, f_word)] / word_counts[e_word]

            elif self.model == 2:
                # EM for IBM2
                pair_counts = defaultdict(float)
                word_counts = defaultdict(float)
                jump_counts = np.zeros((1, 2 * self.max_jump), dtype=np.float)
                
                l = len(self.e_vocab)
                m = len(self.f_vocab)
                # Expectation step
                for e_sent, f_sent in get_corpus(e_file, f_file):

                    normalise = {}
                    #french word position j
                    #english word position i 
                    
                    for j, f_word in enumerate(f_sent):
                        # Sum translation probabilities of f words over all e words
                        normalise[f_word] = 0.0
                        for i, e_word in enumerate(e_sent):
                            normalise[f_word] += self.t[e_word][f_word] * self.jump[0, self.get_jump(i, j, l, m)]

                        # Update counts
                        for i, e_word in enumerate(e_sent):
                            idx = self.get_jump(i, j, l, m)
                         #   if normalise[f_word] == 0:
                         #       print('help!')
                         #       delta = 0
                         #   else:
                            delta = (self.t[e_word][f_word] * self.jump[0, idx]) / normalise[f_word]
                            
                            pair_counts[(e_word, f_word)] += delta
                            word_counts[e_word] += delta
                            jump_counts[0, idx] += delta
                
                # Maximisation step
                for e_word in self.e_vocab:
                    for f_word in self.f_vocab:
                        self.t[e_word][f_word] = pair_counts[(e_word, f_word)] / word_counts[e_word]
                self.jump = 1./float(np.sum(jump_counts)) * jump_counts


    def initialise_params(self):

        if self.model == 1:

            # Store t(f|e) as t[e][f]
            initial_value = 1.0/len(self.f_vocab)
            self.t = {e_word: {f_word: initial_value for f_word in self.f_vocab} for e_word in self.e_vocab}


        elif self.model == 2:
        # Initialise IBM2 parameters (some of which will be the same)
            if self.initialization == "uniform":
                # Store t(f|e) as t[e][f]
                initial_value = 1.0/len(self.f_vocab)
                self.t = {e_word: {f_word: initial_value for f_word in self.f_vocab} for e_word in self.e_vocab}
                
            elif self.initialization == "random":
                # random samples from Dirichlet distribution
                #alpha = (0.1,) * len(self.f_vocab)
                #initial_value = dirichlet(alpha, size=len(self.e_vocab)).T
                self.t = {e_word: {f_word: random.uniform(0.1,0.9) for f_word in self.f_vocab} for e_word in self.e_vocab}
            
            #initializing jump
            self.jump = 1. / (2 * self.max_jump) * np.ones((1, 2 * self.max_jump), dtype = np.float)
        
    def get_jump(self, i, j, l, m):
        """
        Align french word j to english word i. 
        Returns value in range [0, 2*max_jump] instead of [-max_jump, max_jump]
        to get sensible indices.
        """
        jump = int(i - floor(j * l / m)) + self.max_jump 
        if jump >= 2 * self.max_jump:
            return self.max_jump - 1
        if jump < 0:
            return 0
        else:
            return jump


def main():
    model = IBM()
    #model.train(e_file="mock/e", f_file="mock/f", iters=100)
    model.train()

    #print(model.t['b']['x'])
    print(model.t)
    #print("model_jump_shape:", model.jump.shape)
    

if __name__ == "__main__":
    main()

INFO:root:Creating English vocabulary...
INFO:root:Creating French vocabulary...
INFO:root:Initialising model parameters...
INFO:root:Training parameters with EM...


{'value': {'apprécier': 0.46101187102971325, 'anciens': 0.0, 'notamment': 0.0, 'nordique': 0.0, 'entrée': 0.0, 'universités': 0.0, 'devenus': 0.0, 'séance': 0.0, 'ordre': 0.0, 'millénaire': 0.0, 'cardiaque': 0.0, 'reconnus': 0.0, 'honorables': 0.0, 'qualité': 0.0, 'science': 0.0, 'qui': 2.33312933611784e-08, 'adresse': 0.0, 'indissociable': 0.0, 'cinq': 0.0, 'canada': 0.0, 'meilleure': 0.0, 'à': 0.376320391736894, 'culture': 0.0, 'créer': 0.0, 'assembler': 0.0, 'boudria': 0.0, 'les': 1.6541515472645986e-05, 'septembre': 0.0, 'soit': 0.0, 'était': 0.0, 'prorogé': 0.0, 'ralston': 0.0, 'demande': 0.0, 'rehaussant': 0.0, 'espace': 0.0, 'lucienne': 0.0, 'comités': 0.0, 'perspective': 0.0, 'gens': 0.0, 'succès': 0.0, 'étaient': 0.0, 'développement': 0.0, 'robillard': 0.0, 'officielle': 0.0, 'approche': 0.0, 'ses': 0.0, 'sont': 0.0, 'vivre': 0.0, 'devient': 0.0, 'puisent': 0.0, 'démocratique': 0.0, 'pléniers': 0.0, 'paix': 0.0, 'pacifique': 0.0, 'utilisation': 0.0, 'm': 0.0, 'messieurs': 0.0,