In [13]:
# This file reads the model generated by GIZA++ and uses it to compute the
# alignment probability of any given sentences.
model_path = 'wmt18-de_en'

is_invs = "inv"


def _read_rows(path):
    """Yield the space-separated fields of every non-blank line in `path`.

    The file is decoded as UTF-8 explicitly. The vocabulary contains
    non-ASCII tokens (e.g. "▁Wenn", "▁ändern"), so relying on the platform's
    default encoding would mis-decode them or raise on non-UTF-8 locales.
    """
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line == '':
                continue
            yield line.split(' ')


# Read src and trg dictionaries.
# Each line is unpacked as three fields: id, word, count (count is unused).
src_dict_path = f"{model_path}/src_trg_{is_invs}swm.trn.src.vcb"
trg_dict_path = f"{model_path}/src_trg_{is_invs}swm.trn.trg.vcb"

src_dict = {int(idx): word for idx, word, _count in _read_rows(src_dict_path)}
src_word2id = {word: idx for idx, word in src_dict.items()}

trg_dict = {int(idx): word for idx, word, _count in _read_rows(trg_dict_path)}
trg_word2id = {word: idx for idx, word in trg_dict.items()}


# Read T table (Translation probability)
# Each line is of the following format:
# s_id t_id P(t_id/s_id)
# where:
#  s_id: is the unique id for the source token
#  t_id: is the unique id for the target token
#  P(t_id/s_id) the probability of translating s_id as t_id
t_table_path = f"{model_path}/src_trg_{is_invs}swm.t3.final"
t_table = {
    (int(src), int(trg)): float(prob)
    for src, trg, prob in _read_rows(t_table_path)
}


# Read Fertility table
# Each line in this file is of the following format:
# source_token_id p0 p1 p2 .... pn
# where p0 is the probability that the source token has zero fertility;
# p1, fertility one, ...., and n is the maximum possible fertility as
# defined in the program.
fertility_table_path = f"{model_path}/src_trg_{is_invs}swm.n3.final"
n_table = {
    int(fields[0]): [float(x) for x in fields[1:]]
    for fields in _read_rows(fertility_table_path)
}


# Read the distortion table
# Each row is unpacked below as: j i l m p
# where i, j, l, m are all integers and
#  j = position in target sentence
#  i = position in source sentence
#  l = length of source sentence
#  m = length of target sentence
# and p is the probability that a source word in position i is
# moved to position j in a pair of sentences of length l and m.
# NOTE(review): the earlier version of this comment gave the column order as
# "i j l m p(i | j, l, m)" while the code unpacked "j, i, l, m, p". The code's
# order is kept here; confirm the *.d3.final column order against the GIZA++
# README, and check that lookups into d_table use the same (j, i, l, m) order.
distortion_table_path = f"{model_path}/src_trg_{is_invs}swm.d3.final"
d_table = {
    (int(j), int(i), int(l), int(m)): float(p)
    for j, i, l, m, p in _read_rows(distortion_table_path)
}

In [14]:
def compute_alignment_prob(src_sent, trg_sent, src_len=100, normalize=False,
                           t_probs=None, n_probs=None, d_probs=None):
    """Score every (target token, source slot) pair of a sentence pair.

    For each target token a list of ``len(src_sent) + 1`` scores is built:
    slot 0 is the NULL source token (id 0), slots 1..len(src_sent) are the
    source tokens in order. Each score is the product
    ``translation * fertility * distortion``; any factor missing from its
    table counts as 0.

    Args:
        src_sent: sequence of source token ids.
        trg_sent: sequence of target token ids.
        src_len: value used as ``l`` in the distortion-table key. Defaults to
            100 because the original code hard-coded it with the note that
            the GIZA++ model in use fixes ``l`` to 100. Pass ``None`` to use
            ``len(src_sent)`` instead.
        normalize: when True, divide each target token's scores by their sum
            (all-zero rows stay zero). Defaults to False, matching the
            previous behaviour where normalisation was commented out.
        t_probs: mapping ``(src_id, trg_id) -> prob``; defaults to the
            notebook-level ``t_table``.
        n_probs: mapping ``src_id -> [p0, p1, ...]``; defaults to the
            notebook-level ``n_table``.
        d_probs: mapping of 4-tuples ``(pos, pos, l, m) -> prob``; defaults
            to the notebook-level ``d_table``.

    Returns:
        A list with one entry per target token, each a list of
        ``len(src_sent) + 1`` scores.
    """
    # Fall back to the tables loaded by the notebook only when the caller did
    # not inject their own (an empty dict is a valid injected table).
    t_probs = t_table if t_probs is None else t_probs
    n_probs = n_table if n_probs is None else n_probs
    d_probs = d_table if d_probs is None else d_probs

    l = len(src_sent) if src_len is None else src_len
    m = len(trg_sent)
    alignment_probs = []

    # NOTE(review): trg_pos is 0-based here while src_pos is 1-based, and the
    # distortion key is built as (src_pos, trg_pos, l, m). Confirm both against
    # the key order and position base used when the distortion table is loaded.
    for trg_pos, trg_id in enumerate(trg_sent):
        # Alignment with the NULL token (source id 0, source position 0).
        # NOTE(review): index 0 of the fertility row is the zero-fertility
        # probability; confirm this is the intended factor for NULL.
        null_prob = (
            t_probs.get((0, trg_id), 0)
            * n_probs.get(0, [0])[0]
            * d_probs.get((0, trg_pos, l, m), 0)
        )
        trg_probs = [null_prob]

        # Alignment with the real source tokens; positions start at 1 because
        # 0 is reserved for NULL.
        for src_pos, src_id in enumerate(src_sent, start=1):
            fert_row = n_probs.get(src_id, [0])
            # Uses the fertility-one probability; 0 when the row has no p1.
            fert_prob = fert_row[1] if len(fert_row) > 1 else 0
            trg_probs.append(
                t_probs.get((src_id, trg_id), 0)
                * fert_prob
                * d_probs.get((src_pos, trg_pos, l, m), 0)
            )

        if normalize:
            total = sum(trg_probs)
            trg_probs = [p / total if total > 0 else 0 for p in trg_probs]

        alignment_probs.append(trg_probs)

    return alignment_probs

In [15]:
# Example usage: score one pre-tokenised sentence pair.
src_text = "▁Wenn ▁Sie ▁einen ▁Namen ▁ändern , ▁dann ▁wird ▁der ▁andere ▁auch ▁automatisch ▁geändert ."
trg_text = "▁When ▁you ▁change ▁one , ▁the ▁other ▁will ▁automatically ▁be ▁changed ."

# Convert tokens to vocabulary ids. Tokens that are not in the vocabulary are
# dropped, so the id lists can be shorter than the token lists.
src_id = [
    src_word2id[token]
    for token in filter(src_word2id.__contains__, src_text.split(' '))
]
trg_id = [
    trg_word2id[token]
    for token in filter(trg_word2id.__contains__, trg_text.split(' '))
]

# One row per target token; each row holds the NULL slot followed by one
# score per source token.
alignment_probabilities = compute_alignment_prob(src_id, trg_id)
print(alignment_probabilities)

[[0, 0.0008590090157791882, 2.3628697345050067e-05, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.018384370745592783, 0.0, 2.374374453177285e-08, 0.0, 0.0, 2.8090289131258066e-05, 0.0, 0.0, 0.0, 0.0, 2.2557923237641612e-06, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.019715982166560068, 4.204802786774065e-08, 5.125400230639616e-06, 0.0, 0.0, 2.1456545387737697e-05, 0.0, 1.778533667050098e-07, 0.0, 0.0], [0.0, 0.0, 1.087968336639729e-08, 0.015965704451687573, 0.0, 0.0, 5.74490918781068e-08, 0.0, 0.0, 1.5225651955358712e-08, 2.7125545637657254e-05, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03357298204595233, 0.0, 0.0, 0.0, 0.0, 1.0606694716645022e-07, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 6.602352272025047e-05, 0.0, 0.0, 0.0, 0.0, 0.0, 0.025726274914731572, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4.895618539994404e-07, 0.03614139328557959, 3.858314496165736e-06, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 5.271971126564631e-06, 0.0, 0.0, 0