In [1]:
from __future__ import print_function, division
from collections import defaultdict, Counter
from tqdm import tqdm
from aer import read_naacl_alignments, AERSufficientStatistics
from random import random
from scipy import special
import numpy as np
import matplotlib.pyplot as plt
import pickle
import math
import os

## Read in the data

In [2]:
# Relative paths to the two sides of the parallel training corpus
# (.e = English, .f = French).
english_train = 'training/hansards.36.2.e'
french_train = 'training/hansards.36.2.f'
# Relative paths to the two sides of the validation (dev) sentences.
english_val = 'validation/dev.e'
french_val = 'validation/dev.f'
# NOTE(review): this module-level name is not read by any cell visible here
# (the functions below take or build their own `fname`) — confirm it is still needed.
fname = 'naacltest.txt'

def add_unk(original_sentences, threshold, ext_sentences=None):
    """Replace rare or out-of-vocabulary words with the '-UNK-' token, in place.

    Two modes, selected by ``ext_sentences``:

    * ``ext_sentences is None``: every word occurring at most ``threshold``
      times in ``original_sentences`` is replaced.
    * ``ext_sentences`` given: every word that does not occur anywhere in
      ``ext_sentences`` is replaced (``threshold`` is ignored in this mode).

    Args:
        original_sentences: list of sentences (lists of tokens); modified in place.
        threshold: maximum frequency for a word to count as rare.
        ext_sentences: optional iterable of sentences defining the known vocabulary.

    Returns:
        The same ``original_sentences`` list object, after replacement.
    """
    # Sets give O(1) membership tests; the corpus is scanned word by word below,
    # so a list here would make the whole pass quadratic in the vocabulary size.
    if ext_sentences is None:
        counts = Counter(word for sentence in original_sentences for word in sentence)
        rare_words = {word for word, count in counts.items() if count <= threshold}
        vocabulary = None
    else:
        rare_words = None
        vocabulary = {word for sentence in ext_sentences for word in sentence}

    for sentence in tqdm(original_sentences):
        for index, word in enumerate(sentence):
            if vocabulary is None:
                replace = word in rare_words
            else:
                replace = word not in vocabulary
            if replace:
                sentence[index] = '-UNK-'
    return original_sentences
    

def read_data(english_file, french_file, unk=False, threshold=1, ttype='training', eng_data=None, fre_data=None):
    """Read a sentence-aligned corpus and return it as a list of (english, french) pairs.

    Each English sentence is tokenised on whitespace and prefixed with the
    "NULL" token; each French sentence is tokenised on whitespace.

    When ``unk`` is true, both sides are passed through ``add_unk`` and the
    result is cached as a pickle named ``<ttype>_<threshold>_unk.e`` / ``.f``
    in the working directory; an existing cache file is loaded instead of
    recomputing.

    Args:
        english_file: path to the English side (UTF-8, one sentence per line).
        french_file: path to the French side (UTF-8, one sentence per line).
        unk: whether to apply -UNK- replacement.
        threshold: rare-word threshold forwarded to ``add_unk``.
        ttype: prefix used for the cache file names.
        eng_data: optional external English sentences forwarded to ``add_unk``.
        fre_data: optional external French sentences forwarded to ``add_unk``.

    Returns:
        list of ``(english_tokens, french_tokens)`` tuples.

    Raises:
        AssertionError: if the two sides contain a different number of sentences.
    """
    with open(english_file, 'r', encoding='utf8') as engf, open(french_file, 'r', encoding='utf8') as fref:
        english_sentences = [["NULL"] + line.split() for line in engf]
        french_sentences = [line.split() for line in fref]

    if unk:
        print("Adding the -UNK- token to the data.")
        cache_prefix = ttype + '_' + str(threshold)

        def load_or_build(cache_name, sentences, reference):
            # Reuse the pickled result when present; otherwise compute and store it.
            if os.path.isfile(cache_name):
                with open(cache_name, 'rb') as cache:
                    return pickle.load(cache)
            replaced = add_unk(sentences, threshold, reference)
            with open(cache_name, 'wb') as cache:
                pickle.dump(replaced, cache)
            return replaced

        english_sentences = load_or_build(cache_prefix + '_unk.e', english_sentences, eng_data)
        print("English data complete.")

        french_sentences = load_or_build(cache_prefix + '_unk.f', french_sentences, fre_data)
        print("French data complete")

    assert len(english_sentences) == len(french_sentences), 'data mismatch'
    return list(zip(english_sentences, french_sentences))

# Load the training corpus with -UNK- replacement of rare words
# (threshold left at its default of 1).
training_data = read_data(english_train, french_train, True)
# Unzip the pairs into (english_sentences, french_sentences).
ext_data = list(zip(*training_data))
# Validation words that do not occur in the (UNK-ed) training sentences become -UNK-.
validation_data = read_data(english_val, french_val, True, ttype='validation', eng_data=ext_data[0], fre_data=ext_data[1])

Adding the -UNK- token to the data.
English data complete.
French data complete
Adding the -UNK- token to the data.
English data complete.
French data complete


# AER Cell

In [3]:
def test(path, personal_sets=None, gold_path='validation/dev.wa.nonullalign'):
    """Compute the alignment error rate (AER) of predicted alignments.

    Args:
        path: NAACL-format file with predicted alignments; only read when
            ``personal_sets`` is None.
        personal_sets: optional list with one set of predicted links per
            sentence pair; when given, ``path`` is ignored.
        gold_path: NAACL-format file with the gold alignments.

    Returns:
        The value of ``AERSufficientStatistics.aer()`` after all sentence
        pairs have been added.
    """
    # 1. Read in gold alignments: one (sure, probable) pair per sentence.
    gold_sets = read_naacl_alignments(gold_path)

    # 2. Obtain the predictions, either from file or from the caller.
    if personal_sets is None:
        # Only the first element of each pair read from file is used as the
        # predicted link set.
        predictions = [set(links) for links, _ in read_naacl_alignments(path)]
    else:
        predictions = personal_sets

    # 3. Accumulate the sufficient statistics over the corpus and compute AER.
    # NOTE: zip stops at the shorter sequence, so surplus gold or predicted
    # sentences are silently ignored.
    metric = AERSufficientStatistics()
    for gold, pred in zip(gold_sets, predictions):
        metric.update(sure=gold[0], probable=gold[1], predicted=pred)
    return metric.aer()

# Hardcoded AER check over the 20 per-iteration alignment files.
# NOTE: the 'iteration<N>.txt' files are written by align_all from inside the
# training loops defined further down, so this cell only works after a
# training cell has been run.
for i in range(20):
    aer = test('iteration'+str(i)+'.txt')
    print(aer)

0.49562682215743437
0.3857421875
0.3209169054441261
0.32057416267942584
0.3215311004784689
0.323444976076555
0.323444976076555
0.3231357552581262
0.3212237093690249
0.31800766283524906
0.31800766283524906
0.3137065637065637
0.3137065637065637
0.31436837029893927
0.3153326904532304
0.3153326904532304
0.3153326904532304
0.3153326904532304
0.3153326904532304
0.3153326904532304


## IBM Model 1: Generative Process
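Step 1: Pick an alignment $a$ with probability $p(a \mid e, m) = \frac{1}{(l+1)^m}$, where $l$ is the length of the English sentence (without NULL) and $m$ the length of the French sentence.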

Step 2: Pick the French words with probability

$p(f \mid a, e, m) = \prod_{j=1}^{m} t(f_j \mid e_{a_j})$



$p(f, a \mid e, m) = p(a \mid e, m) \times p(f \mid a, e, m)$


In [6]:
def align_all(data, translate_dict, fname=None):
    """Create alignments for pairs of English and French sentences.

    The alignments are returned as one set of links per sentence pair and,
    when ``fname`` is given, also written to that file with one
    "<sentence number> <english position> <french position>" line per link
    (sentence numbers start at 1). NULL alignments are excluded.

    Args:
        data: iterable of (english_words, french_words) pairs
        translate_dict: dictionary with translation probabilities e to f
        fname: filename to save alignments in; None skips writing

    Returns:
        list of sets of (english position, french position) tuples
    """
    # Opening is guarded so the default fname=None no longer crashes, and the
    # handle is always closed (and flushed) even if aligning raises.
    out = open(fname, 'w') if fname is not None else None
    alignments = []
    try:
        for sentence_number, (english_words, french_words) in enumerate(data, start=1):
            alignment = align(english_words, french_words, translate_dict, False)
            if out is not None:
                for pos1, pos2 in alignment:
                    out.write("{} {} {}\n".format(sentence_number, pos1, pos2))
            alignments.append(set(alignment))
    finally:
        if out is not None:
            out.close()
    return alignments
    
def align(english_words, french_words, translate_dict, add_null=True):
    """Link every French word to its most probable English word.

    For each French word the English position with the strictly highest
    translation probability is chosen; ties keep the earliest position and a
    word without any positive-probability candidate falls back to position 0.

    Args:
        english_words: list of english words
        french_words: list of french words
        translate_dict: dictionary with translation probabilities e to f
        add_null: whether links to English position 0 are kept in the result

    Return:
        list of (english position, 1-based french position) tuples
    """
    links = []
    for french_position, french_word in enumerate(french_words, start=1):
        best_prob, best_index = 0.0, 0
        for english_index, english_word in enumerate(english_words):
            # Pairs missing from the dictionary are not candidates.
            if english_word not in translate_dict:
                continue
            if french_word not in translate_dict[english_word]:
                continue
            candidate = translate_dict[english_word][french_word]
            if candidate > best_prob:
                best_prob, best_index = candidate, english_index
        # Position 0 links are dropped unless explicitly requested.
        if add_null or best_index != 0:
            links.append((best_index, french_position))
    return links

def log_likelihood(data, translate_dict, add_constant=False):
    """Sum the log translation probabilities of the best alignment of each pair.

    For every sentence pair the alignment returned by ``align`` (NULL links
    included) is scored by adding ``log t(f | e)`` for each link. This is the
    log-probability of the single best alignment, not the likelihood
    marginalised over all alignments.

    Args:
        data: iterable of (english_words, french_words) pairs
        translate_dict: dictionary with translation probabilities e to f
        add_constant: also add the log alignment prior ``-m * log(l + 1)``
            per sentence pair

    Returns:
        float: log likelihood; ``-inf`` if a chosen link has zero or missing
        probability
    """
    total = 0.0
    for english_words, french_words in data:
        # align yields (english index, 1-based french position) tuples.
        for english_index, french_position in align(english_words, french_words, translate_dict, True):
            row = translate_dict.get(english_words[english_index])
            prob = row.get(french_words[french_position - 1], 0.0) if row else 0.0
            # A zero-probability link has log-probability -inf; math.log(0)
            # would raise instead.
            total += math.log(prob) if prob > 0 else float('-inf')
        if add_constant:
            # english_words is indexed from the NULL position 0, so its length
            # already equals l + 1; adding another 1 would be off by one.
            total -= len(french_words) * math.log(len(english_words))
    return total

def initialize_t(data, uniform=True):
    """Build the initial translation table from sentence-level co-occurrences.

    Every (english word, french word) pair that co-occurs in a sentence pair
    gets a weight (1, or a fresh random number when ``uniform`` is false);
    the weights of each English word are then normalised to sum to one.

    Args:
        data: list of tuples, english and french sentences
        uniform: boolean indicating initialisation type

    Returns:
        defaultdict(Counter) mapping english word -> french word -> probability
    """
    table = defaultdict(Counter)
    for english_sentence, french_sentence in tqdm(data):
        for english_word in english_sentence:
            for french_word in french_sentence:
                # A repeated co-occurrence overwrites the previous weight.
                table[english_word][french_word] = 1 if uniform else random()

    for row in table.values():
        total = sum(list(row.values()))
        for french_word in row:
            row[french_word] = row[french_word] / total
    return table

def EM_IBM1(data, validation, max_steps=20, translate_dict=None, epochs_trained=0):
    """Train IBM Model 1 translation probabilities with EM.

    After every iteration that changed at least one probability by more than
    1e-5, the validation alignments are written to ``iteration<N>.txt``, the
    log likelihood and AER are printed, and the table is pickled to
    ``translate_dicts/epoch_<N+1>.pickle``. Training stops early (without
    evaluating or saving) at the first iteration that changes nothing.

    Args:
        data: list of (english_words, french_words) training pairs
        validation: list of (english_words, french_words) validation pairs
        max_steps: maximum number of EM iterations to run
        translate_dict: optional table to continue training from; it is
            updated in place. When None a uniform table is initialised.
        epochs_trained: number of iterations already done, used as offset
            for the iteration numbering in file names and messages

    Returns:
        The trained translation table (english word -> french word -> prob).
    """
    print("Initializing translation dictionary.")
    if translate_dict is None:
        translate_dict = initialize_t(data)
    for iteration in range(epochs_trained, epochs_trained + max_steps):
        change = False
        fname = 'iteration' + str(iteration) + '.txt'
        counts = Counter()
        co_counts = defaultdict(Counter)

        print("Expectation step {}".format(iteration + 1))
        for e_s, f_s in tqdm(data):
            for f in f_s:
                # Look each probability up once; it is needed for both the
                # normaliser and the individual posteriors.
                probs = [translate_dict[e][f] for e in e_s]
                sum_of_probs = sum(probs)
                for e, prob in zip(e_s, probs):
                    delta = prob / sum_of_probs
                    co_counts[e][f] += delta
                    counts[e] += delta

        print("Maximisation step {}".format(iteration + 1))
        for e in co_counts:
            for f in co_counts[e]:
                new_value = co_counts[e][f] / counts[e]
                if abs(translate_dict[e][f] - new_value) > 1e-5:
                    change = True
                translate_dict[e][f] = new_value
        if not change:
            break
        # Writing the iteration files in NAACL format for AER use.
        alignments = align_all(validation, translate_dict, fname)
        ll = log_likelihood(data, translate_dict)
        aer = test("", alignments)
        print("Log likelihood: {}, AER: {}".format(ll, aer))
        # Create the checkpoint directory on demand and close the file
        # deterministically instead of leaking the handle.
        os.makedirs("translate_dicts", exist_ok=True)
        with open("translate_dicts/epoch_{}.pickle".format(iteration + 1), 'wb') as checkpoint:
            pickle.dump(translate_dict, checkpoint)
    return translate_dict

# Sanity check of the container type returned by read_data.
print(type(training_data))

<class 'list'>


In [18]:


# Train IBM Model 1 for up to 20 EM iterations, evaluating on the validation set
# after each one.
translate_dict = EM_IBM1(training_data, validation_data, 20)

Initializing translation dictionary.
Expectation step 1


100%|█████████████████████████████████| 231164/231164 [07:10<00:00, 537.07it/s]


Maximisation step 1
Log likelihood: -16445768.168144051, AER: 0.3664772727272727
Expectation step 2


100%|█████████████████████████████████| 231164/231164 [07:08<00:00, 539.15it/s]


Maximisation step 2
Log likelihood: -11694690.516029175, AER: 0.3279158699808795
Expectation step 3


100%|█████████████████████████████████| 231164/231164 [07:07<00:00, 541.07it/s]


Maximisation step 3
Log likelihood: -9767777.560276357, AER: 0.3209169054441261
Expectation step 4


100%|█████████████████████████████████| 231164/231164 [07:15<00:00, 530.65it/s]


Maximisation step 4
Log likelihood: -8852037.158920348, AER: 0.32057416267942584
Expectation step 5


100%|█████████████████████████████████| 231164/231164 [07:11<00:00, 535.33it/s]


Maximisation step 5
Log likelihood: -8320101.917196267, AER: 0.3215311004784689
Expectation step 6


100%|█████████████████████████████████| 231164/231164 [07:52<00:00, 489.54it/s]


Maximisation step 6
Log likelihood: -7975756.922772594, AER: 0.323444976076555
Expectation step 7


100%|█████████████████████████████████| 231164/231164 [10:49<00:00, 355.94it/s]


Maximisation step 7
Log likelihood: -7737641.475669495, AER: 0.323444976076555
Expectation step 8


100%|█████████████████████████████████| 231164/231164 [07:37<00:00, 504.89it/s]


Maximisation step 8
Log likelihood: -7564839.842448467, AER: 0.3231357552581262
Expectation step 9


100%|█████████████████████████████████| 231164/231164 [07:39<00:00, 502.90it/s]


Maximisation step 9
Log likelihood: -7434988.3206639, AER: 0.3212237093690249
Expectation step 10


100%|█████████████████████████████████| 231164/231164 [07:39<00:00, 502.93it/s]


Maximisation step 10
Log likelihood: -7334186.011778272, AER: 0.31800766283524906
Expectation step 11


100%|█████████████████████████████████| 231164/231164 [07:41<00:00, 501.33it/s]


Maximisation step 11
Log likelihood: -7254070.746853232, AER: 0.31800766283524906
Expectation step 12


100%|█████████████████████████████████| 231164/231164 [07:46<00:00, 495.73it/s]


Maximisation step 12
Log likelihood: -7189218.099847515, AER: 0.3137065637065637
Expectation step 13


100%|█████████████████████████████████| 231164/231164 [07:17<00:00, 527.97it/s]


Maximisation step 13
Log likelihood: -7135495.860541471, AER: 0.3137065637065637
Expectation step 14


100%|█████████████████████████████████| 231164/231164 [07:16<00:00, 529.78it/s]


Maximisation step 14
Log likelihood: -7090644.123121053, AER: 0.31436837029893927
Expectation step 15


100%|█████████████████████████████████| 231164/231164 [07:17<00:00, 528.62it/s]


Maximisation step 15
Log likelihood: -7052661.740290008, AER: 0.3153326904532304
Expectation step 16


100%|█████████████████████████████████| 231164/231164 [07:13<00:00, 533.24it/s]


Maximisation step 16
Log likelihood: -7020132.124201371, AER: 0.3153326904532304
Expectation step 17


100%|█████████████████████████████████| 231164/231164 [07:23<00:00, 521.73it/s]


Maximisation step 17
Log likelihood: -6992028.903662212, AER: 0.3153326904532304
Expectation step 18


100%|█████████████████████████████████| 231164/231164 [07:21<00:00, 523.28it/s]


Maximisation step 18
Log likelihood: -6967509.286361041, AER: 0.3153326904532304
Expectation step 19


100%|█████████████████████████████████| 231164/231164 [07:13<00:00, 533.08it/s]


Maximisation step 19
Log likelihood: -6945895.807228058, AER: 0.3153326904532304
Expectation step 20


100%|█████████████████████████████████| 231164/231164 [07:13<00:00, 533.61it/s]


Maximisation step 20
Log likelihood: -6926769.577004622, AER: 0.3153326904532304


In [None]:
def print_translation_probs(english_word, transdict):
    """Print the 20 most probable French translations of an English word.

    Args:
        english_word: the English word to look up
        transdict: translation table (english word -> french word -> prob)

    Each printed line is a ``(probability, french_word)`` tuple, highest
    probability first.
    """
    # Use the table that was passed in; the parameter used to be ignored in
    # favour of the global `translate_dict`.
    ranked = sorted(
        ((prob, french_word) for french_word, prob in transdict[english_word].items()),
        reverse=True,
    )
    for entry in ranked[:20]:
        print(entry)
        
# Inspect the learned translation distribution for one example word.
print_translation_probs('commissioners', translate_dict)

# Bayesian IBM1

In [None]:
from scipy.special import digamma, loggamma, gammaln

def elbo(data, t, f_vocab, alpha, lambdas):
    """Evidence lower bound for Bayesian IBM1 with a symmetric Dirichlet prior.

    The bound is the data term returned by ``log_likelihood(data, t)`` plus,
    for every English word ``e`` in ``t``, the negative KL divergence between
    the variational Dirichlet with parameters ``lambdas[e]`` and the
    Dirichlet(alpha) prior over ``f_vocab``:

        sum_f [ E[log theta_f] * (alpha - lambda_f) + lnG(lambda_f) - lnG(alpha) ]
        + lnG(alpha * |F|) - lnG(sum_f lambda_f)

    with ``E[log theta_f] = digamma(lambda_f) - digamma(sum_f lambda_f)``.

    Args:
        data: iterable of (english_words, french_words) pairs
        t: translation table (english word -> french word -> prob)
        f_vocab: collection of French word types (the Dirichlet support)
        alpha: concentration parameter of the symmetric Dirichlet prior
        lambdas: variational parameters (english word -> french word -> value);
            a pair without an entry is treated as having the prior value alpha

    Returns:
        float: the ELBO
    """
    vocab_size = len(f_vocab)
    log_gamma_alpha = gammaln(alpha)
    log_norm_prior = gammaln(alpha * vocab_size)
    total = log_likelihood(data, t)
    for e in tqdm(t):
        # .get avoids creating entries in a defaultdict; only stored values are
        # visited, so the table is never expanded to |E| x |F|.
        row = lambdas.get(e)
        stored = {} if row is None else {f: lam for f, lam in row.items() if f in f_vocab}
        # French words without a stored value contribute alpha each to the sum ...
        lambda_sum = sum(stored.values()) + alpha * (vocab_size - len(stored))
        digamma_sum = digamma(lambda_sum)
        # ... and exactly zero to the per-word terms (alpha - lambda = 0 and
        # the two log-gamma terms cancel), so they can be skipped.
        per_word = sum(
            (digamma(lam) - digamma_sum) * (alpha - lam) + gammaln(lam) - log_gamma_alpha
            for lam in stored.values()
        )
        total += per_word + log_norm_prior - gammaln(lambda_sum)
    return total

def VB_IBM1(data, validation, alpha, max_steps=20, translate_dict=None):
    """Train Bayesian IBM Model 1 with variational Bayes.

    Each iteration collects expected counts into the variational Dirichlet
    parameters ``lambdas`` (initialised at the prior value ``alpha``) and then
    sets ``t(f | e) = exp(digamma(lambda_ef) - digamma(sum_f' lambda_ef'))``,
    where the sum runs over the whole French vocabulary. After every
    iteration the validation alignments are written to ``iteration<N>.txt``
    and the ELBO and AER are printed.

    Only pairs that co-occur in ``data`` are stored and updated: the table is
    no longer expanded to every (english, french) combination, so pairs never
    seen in training keep having no entry.

    Args:
        data: list of (english_words, french_words) training pairs
        validation: list of (english_words, french_words) validation pairs
        alpha: concentration parameter of the symmetric Dirichlet prior
        max_steps: number of iterations to run
        translate_dict: optional table to start from; it is updated in place.
            When None a uniform table is initialised.

    Returns:
        The trained translation table (english word -> french word -> prob).
    """
    print("Initializing translation dictionary.")
    if translate_dict is None:
        translate_dict = initialize_t(data)
    f_vocab = {f for e in translate_dict for f in translate_dict[e]}
    vocab_size = len(f_vocab)
    for iteration in range(max_steps):
        fname = 'iteration' + str(iteration) + '.txt'
        # Start from the prior: the default must be `alpha`, not a fixed 0.05.
        lambdas = defaultdict(lambda: defaultdict(lambda: alpha))

        print("Expectation step {}".format(iteration + 1))
        for e_s, f_s in tqdm(data):
            for f in f_s:
                sum_of_probs = sum(translate_dict[e2][f] for e2 in e_s)
                for e in e_s:
                    lambdas[e][f] += translate_dict[e][f] / sum_of_probs

        print("Maximisation step {}".format(iteration + 1))
        for e, row in tqdm(lambdas.items()):
            # French words that never co-occurred with `e` still hold their
            # prior mass alpha; include it in the normaliser without storing
            # a value for each of them.
            lambda_sum = sum(row.values()) + alpha * max(vocab_size - len(row), 0)
            summation = digamma(lambda_sum)
            for f, lam in row.items():
                translate_dict[e][f] = np.exp(digamma(lam) - summation)

        # Writing the iteration files in NAACL format for AER use.
        alignments = align_all(validation, translate_dict, fname)
        eb = elbo(data, translate_dict, f_vocab, alpha, lambdas)
        print("Elbo: {}".format(eb))
        aer = test("", alignments)
        print("AER: {}".format(aer))
    return translate_dict

# Train Bayesian IBM1 with prior concentration alpha = 0.05 for 10 iterations.
# This rebinds `translate_dict`, replacing the table from the EM run above.
translate_dict = VB_IBM1(training_data, validation_data, 0.05, 10)

Initializing translation dictionary.



  0%|                                               | 0/231164 [00:00<?, ?it/s]
  0%|                                   | 127/231164 [00:00<03:11, 1209.49it/s]
  0%|                                   | 222/231164 [00:00<03:33, 1082.91it/s]
  0%|                                   | 308/231164 [00:00<03:50, 1003.26it/s]
  0%|                                   | 474/231164 [00:00<03:18, 1161.75it/s]
  0%|                                   | 568/231164 [00:00<03:26, 1118.11it/s]
  0%|                                   | 713/231164 [00:00<03:16, 1172.70it/s]
  0%|▏                                 | 1117/231164 [00:00<02:26, 1573.23it/s]
  1%|▏                                  | 1313/231164 [00:02<06:55, 553.77it/s]
  1%|▏                                  | 1454/231164 [00:02<06:33, 583.23it/s]
  1%|▏                                  | 1606/231164 [00:02<06:11, 617.69it/s]
  1%|▎                                  | 1745/231164 [00:02<06:01, 634.08it/s]
  1%|▎                                 

# IBM 2

In [None]:
def EM_IBM2(data, validation, initial_translation_estimate, max_steps=3):
    """Train IBM Model 2 (translation table plus jump distribution) with EM.

    The alignment posterior of a link is proportional to
    ``jump_prob(get_jump(...)) * t(f | e)``. Jump and translation values that
    have no entry yet fall back to 0.1. After every iteration the validation
    alignments are written to ``IBM2_iteration<N>.txt`` and the log
    likelihood and AER are printed.

    Args:
        data: list of (english_words, french_words) training pairs
        validation: list of (english_words, french_words) validation pairs
        initial_translation_estimate: starting translation table in the
            nested form english word -> french word -> prob; it is read but
            not modified
        max_steps: number of EM iterations to run

    Returns:
        tuple ``(translate_dict, jump_dict)``: the re-estimated nested
        translation table and a dict mapping jump value -> probability.
    """
    # NOTE(review): align_all scores links with the translation table only, so
    # the learned jump distribution is not used when producing the validation
    # alignments — confirm whether that is intended.
    default_prob = 0.1
    translate_dict = initial_translation_estimate
    jump_dict = {}
    for iteration in range(max_steps):
        fname = 'IBM2_iteration' + str(iteration) + '.txt'
        counts = Counter()
        co_counts = defaultdict(Counter)
        jump_counts = defaultdict(float)
        pos_counts = 0.0
        for e_s, f_s in tqdm(data):
            m = len(f_s)
            l = len(e_s)
            for i, f in enumerate(f_s):
                # Compute each unnormalised link weight once and reuse it for
                # both the normaliser and the posteriors.
                weighted = []
                for j, e in enumerate(e_s):
                    jump = get_jump(j, i, l, m)
                    # The table is nested (e -> f -> prob), like everywhere
                    # else in this file; .get avoids inserting new entries.
                    row = translate_dict.get(e)
                    translate_prob = default_prob if row is None else row.get(f, default_prob)
                    weighted.append((e, jump, jump_dict.get(jump, default_prob) * translate_prob))
                sum_of_probs = sum(weight for _, _, weight in weighted)
                for e, jump, weight in weighted:
                    prob = weight / sum_of_probs
                    co_counts[e][f] += prob
                    counts[e] += prob
                    jump_counts[jump] += prob
                    pos_counts += prob

        # Build fresh tables so the caller's initial estimate is left untouched.
        translate_dict = defaultdict(Counter)
        for e, row in co_counts.items():
            for f, count in row.items():
                translate_dict[e][f] = count / counts[e]
        jump_dict = {jump: count / pos_counts for jump, count in jump_counts.items()}

        # Writing the iteration files in NAACL format for AER use.
        alignments = align_all(validation, translate_dict, fname)
        # log_likelihood returns a single float.
        ll = log_likelihood(data, translate_dict)
        aer = test("", alignments)
        print("Log likelihood: {}, AER: {}".format(ll, aer))
    return translate_dict, jump_dict

def get_jump(eng_pos, fre_pos, eng_len, fre_len):
    """Return how far an English position lies from the scaled French position.

    The French position is mapped onto the English sentence by scaling it with
    the length ratio and rounding; the result is the signed offset of
    ``eng_pos`` from that mapped position.

    Args:
        eng_pos: position in the English sentence
        fre_pos: position in the French sentence
        eng_len: length of the English sentence
        fre_len: length of the French sentence (must be non-zero)

    Returns:
        The signed offset ``eng_pos - round(fre_pos * eng_len / fre_len)``.
    """
    scaled_position = fre_pos * eng_len / fre_len
    return eng_pos - round(scaled_position)

# Train IBM2 for 2 iterations, starting from the current `translate_dict`.
ibm2_transdict, ibm2_jumpdict = EM_IBM2(training_data, validation_data, translate_dict, 2)

In [None]:
# Inspect the IBM2 results: translations of one word and the jump distribution.
print_translation_probs('agriculture', ibm2_transdict)
# Sort the (jump, probability) items by jump; column 0 is the jump value and
# column 1 its probability.
jumps = np.array(sorted(ibm2_jumpdict.items()))
# NOTE(review): rows 65:85 are a hard-coded window — presumably chosen to
# surround jump 0 for this corpus; confirm against the actual jump range.
plt.plot(jumps[65:85,0], jumps[65:85,1])
plt.show()
# Sum of all jump probabilities.
print(sum(jumps[:,1]))

In [None]:
# Scratch check of the nested defaultdict whose missing entries default to 0.05.
a = defaultdict(lambda : defaultdict(lambda : 0.05))

In [None]:
# Looking up a missing pair returns the default and inserts it into both levels.
a["hi"]["bonjour"]