In [1]:
!git clone https://github.com/xutaima/jhu-mt-hw
%cd jhu-mt-hw/hw2
!ls

fatal: destination path 'jhu-mt-hw' already exists and is not an empty directory.
/content/jhu-mt-hw/hw2
align		  data	  ibm1_final_10k.a  ibm1_final.py  README.md
check-alignments  dice.a  ibm1_final_1k.a   jhu-mt-hw	   score-alignments


In [2]:
!python align -n 1000 > dice.a
!python score-alignments < dice.a

Training with Dice's coefficient.......................................
  Alignment 0  KEY: ( ) = guessed, * = sure, ? = possible
  ------------------------------------------------------------------------------------
 | *                                                                                   | chacun
 |    ?                                                                                | en
 |       ?                                                                             | lui
 |       ?                                                                             | -
 |       ?                                                                             | même
 |         (*)                           ( )                                           | est
 |             *                                                                       | très
 |               (*)         ( )            ( )   ( )            ( )         ( )( )    | complexe
 |                              

In [3]:
%%writefile ibm1_final.py
#!/usr/bin/env python
import optparse
import sys
from collections import defaultdict
import math

optparser = optparse.OptionParser()
optparser.add_option("-d", "--data", dest="train", default="data/hansards", help="Data filename prefix (default=data)")
optparser.add_option("-e", "--english", dest="english", default="e", help="Suffix of English filename (default=e)")
optparser.add_option("-f", "--french", dest="french", default="f", help="Suffix of French filename (default=f)")
optparser.add_option("-n", "--num_sentences", dest="num_sents", default=100000000000, type="int", help="Number of sentences to use for training and alignment")
optparser.add_option("-i", "--iterations", dest="iterations", default=10, type="int", help="Number of EM iterations (default=10)")
optparser.add_option("-t", "--threshold", dest="threshold", default=0.1, type="float", help="Threshold for alignment (default=0.1)")
(opts, _) = optparser.parse_args()

f_data = "%s.%s" % (opts.train, opts.french)
e_data = "%s.%s" % (opts.train, opts.english)

# load data
sys.stderr.write("Reading data...\n")
bitext = []
for (n, (f_sent, e_sent)) in enumerate(zip(open(f_data), open(e_data))):
    f_words = f_sent.strip().split()
    e_words = e_sent.strip().split()
    if len(f_words) > 0 and len(e_words) > 0:  # skip the empty
        bitext.append((f_words, e_words))
    if len(bitext) >= opts.num_sents:
        break

sys.stderr.write("Read %d sentence pairs\n" % len(bitext))

f_vocab = set()  # French
e_vocab = set()  # English

for (f_sent, e_sent) in bitext:
    f_vocab.update(f_sent)
    e_vocab.update(e_sent)

# initialize t(f|e)
t = defaultdict(lambda: defaultdict(float))
uniform_prob = 1.0 / len(e_vocab)

for f in f_vocab:
    for e in e_vocab:
        t[f][e] = uniform_prob

sys.stderr.write("French vocabulary size: %d\n" % len(f_vocab))
sys.stderr.write("English vocabulary size: %d\n" % len(e_vocab))

# initialize prob
sys.stderr.write("Initializing IBM Model 1...\n")
t = defaultdict(lambda: defaultdict(float))

uniform_prob = 1.0 / len(e_vocab)
for (f_sent, e_sent) in bitext:
    for f in f_sent:
        for e in e_sent:
            if t[f][e] == 0:
                t[f][e] = uniform_prob

# EM training
for iter_num in range(opts.iterations):
    sys.stderr.write("Iteration %d..." % (iter_num + 1))

    count = defaultdict(lambda: defaultdict(float))
    total = defaultdict(float)

    # E-step
    for (f_sent, e_sent) in bitext:
        # for every single French word
        for f in f_sent:
            # calculate z
            z = sum(t[f][e] for e in e_sent)

            # collect
            for e in e_sent:
                delta = t[f][e] / z if z > 0 else 0
                count[f][e] += delta
                total[e] += delta

    # M-step
    for f in f_vocab:
        for e in e_vocab:
            if total[e] > 0:
                t[f][e] = count[f][e] / total[e]
            else:
                t[f][e] = 0

    sys.stderr.write(" done\n")

sys.stderr.write("Generating alignments...\n")
for (f_sent, e_sent) in bitext:
    alignment = []
    for (i, f) in enumerate(f_sent):
        best_j = -1
        best_prob = opts.threshold
        for (j, e) in enumerate(e_sent):
            # adding diagonal bias
            distance_penalty = math.exp(-abs(i - j * len(f_sent) / len(e_sent)))
            prob = t[f][e] * distance_penalty

            if prob > best_prob:
                best_prob = prob
                best_j = j
        if best_j >= 0:
            alignment.append(f"{i}-{best_j}")
    print(" ".join(alignment))

Overwriting ibm1_final.py


In [4]:
# use 1000 sentences to train
!chmod +x ibm1_final.py
!python ibm1_final.py -n 1000 -i 10 -t 0.1 > ibm1_final_1k.a
!echo "Results with 1000 sentences:"
!python score-alignments < ibm1_final_1k.a

# process 10000 sentences in 4 batches of 2500 each
!echo "Processing 10000 sentences in batches..."

# first batch
!python ibm1_final.py -n 2500 -i 10 -t 0.1 > ibm1_batch1.a

# second batch
!tail -n +2501 data/hansards.e > temp.e
!tail -n +2501 data/hansards.f > temp.f
!head -n 2500 temp.e > batch2.e
!head -n 2500 temp.f > batch2.f
!python ibm1_final.py -d batch2 -n 2500 -i 10 -t 0.1 > ibm1_batch2.a

# third batch
!tail -n +5001 data/hansards.e > temp.e
!tail -n +5001 data/hansards.f > temp.f
!head -n 2500 temp.e > batch3.e
!head -n 2500 temp.f > batch3.f
!python ibm1_final.py -d batch3 -n 2500 -i 10 -t 0.1 > ibm1_batch3.a

# fourth batch
!tail -n +7501 data/hansards.e > temp.e
!tail -n +7501 data/hansards.f > temp.f
!head -n 2500 temp.e > batch4.e
!head -n 2500 temp.f > batch4.f
!python ibm1_final.py -d batch4 -n 2500 -i 10 -t 0.1 > ibm1_batch4.a


!cat ibm1_batch1.a ibm1_batch2.a ibm1_batch3.a ibm1_batch4.a > ibm1_final_10k.a
!rm temp.e temp.f batch*.e batch*.f ibm1_batch*.a
!echo "Results with 10000 sentences:"
!python score-alignments < ibm1_final_10k.a

Reading data...
Read 1000 sentence pairs
French vocabulary size: 3301
English vocabulary size: 2862
Initializing IBM Model 1...
Iteration 1... done
Iteration 2... done
Iteration 3... done
Iteration 4... done
Iteration 5... done
Iteration 6... done
Iteration 7... done
Iteration 8... done
Iteration 9... done
Iteration 10... done
Generating alignments...
Results with 1000 sentences:
  Alignment 0  KEY: ( ) = guessed, * = sure, ? = possible
  ------------------------------------------------------------------------------------
 | *                                                                                   | chacun
 |    ?                                                                                | en
 |       ?                                                                             | lui
 |       ?                                                                             | -
 |       ?                                                                             | même
 |    

In [13]:
!python ibm1_final.py -i 10 -t 0.1 > alignment.txt
!ls -la alignment.txt
!wc -l alignment.txt
!cp alignment.txt alignment
#download
from google.colab import files
files.download('alignment.txt')
files.download('alignment')

Reading data...
Read 100000 sentence pairs
/bin/bash: line 1: 95303 Killed                  python ibm1_final.py -i 10 -t 0.1 > alignment.txt
-rw-r--r-- 1 root root 0 Sep 12 00:27 alignment.txt
0 alignment.txt


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>