# Machine Translation Lab 3
# IBM Model 1: Training

### Some imports and helper functions...

In [3]:
import os
import sys
from collections import defaultdict

def probs_from_counts(counts, total_count, cutoff):
    '''
    Turn pairwise counts of word pairs into conditional probabilities P(c2|c1).

    Args:
        counts (dict): Nested mapping where counts[c1][c2] is the (possibly
            fractional) count of the word pair (c1, c2).
        total_count (dict): Mapping from each conditioning word c1 to its
            total count, used as the normalisation denominator.
        cutoff (float): Only probabilities strictly greater than this value
            are kept; all others are discarded.

    Returns:
        prob_dict (defaultdict): Nested mapping with prob_dict[c1][c2] equal
            to P(c2|c1). Pairs that were discarded or never counted read as
            0.0 (being a defaultdict, reading a missing key inserts it).
    '''
    # Probabilities of pairs (default value for each probability is zero)
    prob_dict = defaultdict(lambda: defaultdict(float))
    for c1, pair_counts in counts.items():
        if not pair_counts:
            # No pairs recorded for c1, so there is nothing to normalise.
            continue
        total = total_count[c1]
        if total == 0:
            # A zero total cannot be normalised; skip the word instead of
            # failing with a ZeroDivisionError.
            continue
        for c2, pair_count in pair_counts.items():
            prob = pair_count / total  # conditional probability P(c2|c1)
            if prob > cutoff:  # discard too low probabilities
                prob_dict[c1][c2] = prob
    return prob_dict

def write_probs(probs):
    '''
    Dump a table of pairwise probabilities to standard output.

    Every pair is written as one tab-separated line holding c1, c2 and the
    probability rounded to four decimals. The pairs of each c1 are followed
    by an empty line, and the whole table by two more empty lines.

    Args:
        probs (dict): Nested mapping where probs[c1][c2] is the probability
            of the word pair (c1, c2).

    Returns:
        None
    '''
    emit = sys.stdout.write
    for first, row in probs.items():
        for second, value in row.items():
            emit("%s\t%s\t%.4f\n" % (first, second, value))
        # An empty line separates the groups of different c1 words.
        emit("\n")
    emit("\n\n")

### Initialisation step 

In [6]:
# Probabilities that do not exceed this threshold are discarded
# (see probs_from_counts above).
prob_cutoff = 0.0000001

# Lower bound applied to probabilities during training: a pair whose
# probability is zero (or was discarded) still gets this tiny weight.
prob_smooth = 0.0000001


# Word pair counts: Clex[source_word][target_word]
Clex = defaultdict(lambda: defaultdict(float))
# Total count of each source word (for normalisation)
Cst = defaultdict(float)


# Initialisation step
# The corpus is streamed line by line instead of being read at once, because
# it can be very large; it therefore has to be re-opened for every pass.
# The `with` statement closes both files even if an error occurs.
with open(os.path.join("data", "example.source"), "r") as stxt, \
        open(os.path.join("data", "example.target"), "r") as ttxt:
    for (sline, tline) in zip(stxt, ttxt):  # for each sentence pair
        # Strip new line characters, split sentences on whitespace.
        swords = sline.strip().split()
        twords = tline.strip().split()

        # The NULL word lets target words align to no real source word.
        swords.append("NULL")

        # Initial uniform probability
        # (len(swords) is already J+1 because NULL has been added)
        Pin = 1.0/len(swords)

        # Collecting counts
        for sw in swords:
            for tw in twords:
                Clex[sw][tw] += Pin
                Cst[sw] += Pin

        # To train the reverse direction instead: append "NULL" to twords,
        # use Pin = 1.0/len(twords), and swap the roles of sw and tw when
        # collecting the counts.

# Calculate new probabilities
Plex = probs_from_counts(Clex, Cst, prob_cutoff)

# Take a look at the probabilities after initialisation:
sys.stdout.write("after initialisation:\n")
write_probs(Plex)

Clex

after initialisation:
la	the	0.3846
la	house	0.2692
la	big	0.1154
la	blue	0.1154
la	flower	0.1154

maison	the	0.4118
maison	house	0.4118
maison	big	0.1765

NULL	the	0.4286
NULL	house	0.1667
NULL	big	0.0714
NULL	blue	0.0714
NULL	flower	0.0714
NULL	dog	0.0952
NULL	cat	0.0952

grande	the	0.3333
grande	big	0.3333
grande	house	0.3333

fleur	the	0.3333
fleur	blue	0.3333
fleur	flower	0.3333

bleu	the	0.3333
bleu	blue	0.3333
bleu	flower	0.3333

le	the	0.5000
le	dog	0.2500
le	cat	0.2500

chien	the	0.5000
chien	dog	0.5000

chat	the	0.5000
chat	cat	0.5000





defaultdict(<function __main__.<lambda>()>,
            {'la': defaultdict(float,
                         {'the': 0.8333333333333333,
                          'house': 0.5833333333333333,
                          'big': 0.25,
                          'blue': 0.25,
                          'flower': 0.25}),
             'maison': defaultdict(float,
                         {'the': 0.5833333333333333,
                          'house': 0.5833333333333333,
                          'big': 0.25}),
             'NULL': defaultdict(float,
                         {'the': 1.4999999999999998,
                          'house': 0.5833333333333333,
                          'big': 0.25,
                          'blue': 0.25,
                          'flower': 0.25,
                          'dog': 0.3333333333333333,
                          'cat': 0.3333333333333333}),
             'grande': defaultdict(float,
                         {'the': 0.25, 'big': 0.25, 'house': 0.25}),
        

In [7]:
Cst

defaultdict(float,
            {'la': 2.1666666666666665,
             'maison': 1.4166666666666665,
             'NULL': 3.5000000000000004,
             'grande': 0.75,
             'fleur': 0.75,
             'bleu': 0.75,
             'le': 1.3333333333333333,
             'chien': 0.6666666666666666,
             'chat': 0.6666666666666666})

### Run model for set number of iterations

In [None]:
# Number of training iterations run by the loop below
iterations = 5

In [24]:
# Iterate n times, n = the given number of iterations.
# (The loop variable is deliberately not called `iter`, which would shadow
# the builtin of that name for the rest of the session.)
for iteration in range(iterations):
    # Clear the counts at the beginning of each iteration. Do not clear Plex:
    # it holds the probabilities from the previous pass.
    Clex.clear()
    Cst.clear()

    sys.stdout.write("iteration %d\n" % (iteration+1))

    # Re-open the corpus for this pass; `with` closes both files even if an
    # error occurs while reading.
    with open(os.path.join("data", "example.source"), "r") as stxt, \
            open(os.path.join("data", "example.target"), "r") as ttxt:
        for (sline, tline) in zip(stxt, ttxt):
            swords = sline.strip().split()
            twords = tline.strip().split()

            swords.append("NULL")
            #twords.append("NULL")

            for tw in twords:
                # Current probability of tw given each source word, floored
                # at prob_smooth to deal with zero probabilities. Their sum
                # is the normalisation factor for this target word.
                weights = []
                Psum = 0.0
                for sw in swords:
                    prob = Plex[sw][tw]
                    P = max(prob, prob_smooth)
                    weights.append(P)
                    Psum += P

                # Collect weighted counts for pairs (sw, tw). Psum is only
                # non-positive if the smoothing floor has been set to zero.
                if Psum > 0:
                    for sw, P in zip(swords, weights):
                        share = P/Psum
                        Clex[sw][tw] += share
                        Cst[sw] += share

    # After collecting counts from all sentences, update the probabilities:
    Plex = probs_from_counts(Clex, Cst, prob_cutoff)

    # Take a look at the probabilities after the finished iteration:
    write_probs(Plex)
    

iteration 1
la	the	0.3846
la	house	0.2692
la	big	0.1154
la	blue	0.1154
la	flower	0.1154

maison	the	0.4118
maison	house	0.4118
maison	big	0.1765

NULL	the	0.4286
NULL	house	0.1667
NULL	big	0.0714
NULL	blue	0.0714
NULL	flower	0.0714
NULL	dog	0.0952
NULL	cat	0.0952

grande	the	0.3333
grande	big	0.3333
grande	house	0.3333

fleur	the	0.3333
fleur	blue	0.3333
fleur	flower	0.3333

bleu	the	0.3333
bleu	blue	0.3333
bleu	flower	0.3333

le	the	0.5000
le	dog	0.2500
le	cat	0.2500

chien	the	0.5000
chien	dog	0.5000

chat	the	0.5000
chat	cat	0.5000



iteration 2
la	the	0.4554
la	house	0.3027
la	big	0.0919
la	blue	0.0750
la	flower	0.0750

maison	the	0.3557
maison	house	0.4943
maison	big	0.1501

NULL	the	0.6452
NULL	house	0.1439
NULL	big	0.0437
NULL	blue	0.0357
NULL	flower	0.0357
NULL	dog	0.0480
NULL	cat	0.0480

grande	the	0.2195
grande	big	0.4909
grande	house	0.2896

fleur	the	0.2238
fleur	blue	0.3881
fleur	flower	0.3881

bleu	the	0.2238
bleu	blue	0.3881
bleu	flower	0.3881

le	the	0.5420
le	dog	0.22

In [16]:
# Print the probabilities currently held in Plex, i.e. the table left by
# the most recently run training iteration.
sys.stdout.write("final probabilities:\n")
write_probs(Plex)

final probabilities:
la	the	0.6704
la	house	0.3128
la	big	0.0081
la	blue	0.0044
la	flower	0.0044

maison	the	0.0748
maison	house	0.9018
maison	big	0.0234

NULL	the	0.9919
NULL	house	0.0075
NULL	big	0.0002
NULL	blue	0.0001
NULL	flower	0.0001
NULL	dog	0.0001
NULL	cat	0.0001

grande	the	0.0075
grande	big	0.9300
grande	house	0.0625

fleur	the	0.0116
fleur	blue	0.4942
fleur	flower	0.4942

bleu	the	0.0116
bleu	blue	0.4942
bleu	flower	0.4942

le	the	0.7849
le	dog	0.1075
le	cat	0.1075

chien	the	0.0277
chien	dog	0.9723

chat	the	0.0277
chat	cat	0.9723



