## Install libraries, modules

In [9]:
# !python -m spacy download es_core_news_sm

In [10]:
# !python -m spacy download en_core_web_sm

In [11]:
# pip install spacy

In [12]:
# pip install pandas

In [35]:
import spacy

from spacy.morphology import Morphology

import pandas as pd
import random

import json

import time

In [36]:
# Load the spaCy language pipelines (they must already be installed, e.g. with
# the `spacy download` commands at the top of the notebook).
# spaNLP is applied to the source text and engNLP to the target text below.

spaNLP = spacy.load("es_core_news_sm")
engNLP = spacy.load("en_core_web_sm")

## 1. Load raw texts

In [37]:
# Read the raw source and target texts in full.
# The encoding is given explicitly: without it open() falls back to the
# platform default, which is not UTF-8 everywhere and would then corrupt or
# reject the accented characters in the Spanish text.
with open('texts/rawsource.txt', 'r', encoding='utf-8') as f:
    sourcetxt = f.read()
with open('texts/rawtarget.txt', 'r', encoding='utf-8') as f:
    targettxt = f.read()

### Apply language model

In [38]:
# Run the spaCy pipelines over the complete texts; the resulting Docs provide
# the sentence boundaries (`.sents`) used by the sentenizing cell.
sourcedoc = spaNLP(sourcetxt)
targetdoc = engNLP(targettxt)

In [39]:
# Sentenize: keep only the text of every sentence spaCy detected, in order.
rawsrcsents = [sentence.text for sentence in sourcedoc.sents]
rawtgtsents = [sentence.text for sentence in targetdoc.sents]

## 2. Write standardized files (one line per sentence) for input to Bleualign

In [40]:
# These are the inputs to Bleualign: one sentence per line.
# Written explicitly as UTF-8 so the files do not depend on the platform's
# default encoding (the source sentences contain non-ASCII characters).
with open('sourcetextforbleualign.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(rawsrcsents))
with open('targettextforbleualign.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(rawtgtsents))

In [41]:
# Tokenized sentences for data output: for every raw sentence, a list with one
# {'text', 'lemma'} record per token.
srctokens = [
    [{'text' : tok.text, 'lemma' : tok.lemma_} for tok in spaNLP(sentence)]
    for sentence in rawsrcsents
]
tgttokens = [
    [{'text' : tok.text, 'lemma' : tok.lemma_} for tok in engNLP(sentence)]
    for sentence in rawtgtsents
]

### At this point, I also generate `translatedsource.txt` (a machine translation of the source sentences) using Google Translate and run Bleualign on the texts from my terminal.

# Run Bleualign

In [9]:
# !python setup.py install

In [10]:
# pip install translators --upgrade

In [45]:
import translators as ts

In [46]:
# Machine-translate every source sentence into English, one request per
# sentence, and report how long the whole pass took.
start = time.time()
translatedsourcesents = [
    ts.google(sentence, to_language = 'en') for sentence in rawsrcsents
]
end = time.time()
print(f'machine translation took {end-start} seconds')

machine translation took 212.9293007850647 seconds


In [47]:
# Save the machine translation, one sentence per line, for Bleualign's
# --srctotarget option. Explicit UTF-8 keeps the output independent of the
# platform's default encoding.
with open('translatedsource.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(translatedsourcesents))

In [294]:
# %%capture cap --no-stderr
# Run Bleualign as a shell command and time it. Inputs are the one-sentence-
# per-line source and target files plus the machine translation of the source
# (--srctotarget); `-o outputfile` sets the output name. The aligned sentences
# are read back later from `outputfile-s` and `outputfile-t`.
start = time.time()
!./bleualign.py -s sourcetextforbleualign.txt -t targettextforbleualign.txt --srctotarget translatedsource.txt -o outputfile --verbosity 2
end = time.time()
print(f'sentence alignment took {end-start} seconds')

reading in article 0: 
processing
computing alignment between srctotarget (file 0) and target text
Evaluating sentences with bleu
finished
searching for longest path of good alignments
finished
Wed Mar 30 15:26:57 2022
filling gaps
finished
Wed Mar 30 15:26:57 2022
Results of BLEU 1-to-1 alignment
[92m0: 0[1;m
[1;31m1: unaligned. best cand 108[1;m
[92m2: 2[1;m
[92m3: 3[1;m
[92m4: 6[1;m
[92m5: 7[1;m
[92m6: 8[1;m
[92m7: 9[1;m
[92m8: 10[1;m
[92m9: 11[1;m
[92m10: 12[1;m
[92m11: 13[1;m
[92m12: 16[1;m
[92m13: 17[1;m
[92m14: 18[1;m
[92m15: 19[1;m
[92m16: 20[1;m
[92m17: 22[1;m
[92m18: 23[1;m
[92m19: 24[1;m
[1;31m20: unaligned. best cand 39[1;m
[92m21: 25[1;m
[92m22: 26[1;m
[92m23: 28[1;m
[92m24: 29[1;m
[92m25: 31[1;m
[92m26: 32[1;m
[92m27: 33[1;m
[92m28: 35[1;m
[92m29: 36[1;m
[92m30: 37[1;m
[92m31: 38[1;m
[92m32: 40[1;m
[92m33: 41[1;m
[92m34: 42[1;m
[92m35: 43[1;m
[92m36: 44[1;m
[92m37: 45[1;m
[92m38: 46[1;m
[92m39:

In [284]:
# rawsrcsents[218], rawtgtsents[276]

In [285]:
# output = cap.stdout

# indexpairs = []
# split = output.split('finished with article')[0].split('alignment: ')[1:]
# for string in split:
#     string = string.replace('\r\n','')
#     strindices = string.split(' - ')
#     for srcidx in strindices[0].split(','):
#         for tgtidx in strindices[1].split(','):
#             if (int(srcidx) < len(rawsrcsents) - 1) and (int(tgtidx) < len(rawtgtsents) - 1):
#                 indexpairs.append((int(srcidx)+1, int(tgtidx) + 1))

In [286]:
# rawsrcsents[4], rawtgtsents[5:7]

## [START HERE] 3. Read sentence-aligned files (from Bleualign)

In [287]:
# Read Bleualign's output: line k of `outputfile-s` is aligned with line k of
# `outputfile-t`. Explicit UTF-8 avoids depending on the platform's default
# encoding for the accented source text.
# NOTE: split('\n') keeps a trailing empty string when a file ends with a
# newline; that entry never matches a raw sentence further down.
with open('outputfile-s', 'r', encoding='utf-8') as f:
    alignedsrc = f.read().split('\n')
with open('outputfile-t', 'r', encoding='utf-8') as f:
    alignedtgt = f.read().split('\n')

In [293]:
# Spot check: display one randomly chosen line of the aligned source file
# together with the line at the same position in the aligned target file.
i = random.choice(range(len(alignedsrc)))
alignedsrc[i], alignedtgt[i]

('Sé que de todos los problemas, ninguno lo inquietó y lo trabajó como el abismal problema del tiempo.',
 'I know that of all problems, none disquieted him more, and none concerned him more than the profound one of time.')

In [296]:
# Number of entries read from the aligned source file.
len(alignedsrc)

212

In [335]:
# Sentence-to-sentence alignment containers.
#   sentAlignments  - list of alignment records, filled by the matching loop
#   alignmentLookup - maps a source sentence index to a target sentence index

sentAlignments = list()
alignmentLookup = {}

def isthisamatch(sent1,sent2,alignedsent):
    """Tell whether `alignedsent` is `sent1` and `sent2` joined by one space.

    Both join orders are accepted, and surrounding whitespace is ignored on
    the joined candidates as well as on `alignedsent`.

    Returns True on a match, False otherwise.
    """
    wanted = alignedsent.strip()
    forward = (sent1 + ' ' + sent2).strip()
    backward = (sent2 + ' ' + sent1).strip()
    return wanted in (forward, backward)
    
def _findmatches(rawsents, alignedsent):
    """Find the raw sentence(s) that make up one line of Bleualign output.

    A line is either a single raw sentence or two raw sentences joined by a
    space (see `isthisamatch`). Returns a list of {'index', 'sent'} records:
    one record for a single-sentence match, two for a merged pair, and an
    empty list when nothing matches. If several candidates match, the one
    found last in the scan is returned.
    """
    wanted = alignedsent.strip()
    matches = []
    for i1, sent1 in enumerate(rawsents):
        if sent1.strip() == wanted:
            matches = [{'index' : i1, 'sent' : sent1}]
        else:
            for i2, sent2 in enumerate(rawsents):
                if isthisamatch(sent1, sent2, alignedsent):
                    matches = [{'index' : i1, 'sent' : sent1}, {'index' : i2, 'sent' : sent2}]
    return matches


for srcsent, tgtsent in zip(alignedsrc, alignedtgt):
    # The same search is used on both sides. The target side previously
    # compared a sentence with itself (tgt1, tgt1), so merged target
    # sentences were never recognised.
    srcmatches = _findmatches(rawsrcsents, srcsent)
    tgtmatches = _findmatches(rawtgtsents, tgtsent)

    # Record every source/target combination of the matched sentences.
    for s in srcmatches:
        for t in tgtmatches:
            sentAlignments.append({
                'alignedsentindices' : (s['index'], t['index']),
                'alignedsents' : (rawsrcsents[s['index']], rawtgtsents[t['index']])
            })

            # One target per source index: a later pairing overwrites an
            # earlier one for the same source sentence.
            alignmentLookup[s['index']] = t['index']

In [311]:
# Serialise the sentence alignments as pretty-printed, non-ASCII-escaped JSON.
sentAlignmentsText = json.dumps(sentAlignments, ensure_ascii=False, indent=4)
with open('sentAlignment3-28.json', 'w', encoding='utf-8') as f:
    f.write(sentAlignmentsText)

# EXAMPLE (optional): check that the sentence alignment works

In [319]:
# Check that it works: print a random aligned pair, source sentence first.
randSentAlign = random.choice(sentAlignments)
s, t = randSentAlign['alignedsents']
print(s, t, sep='\n')

Me apresuré; el próximo saldría a las nueve y media.
I hurried, for the next would not go until half past nine.


## 4.1 Parse word alignment using SimAlign (recommended: fast and high coverage)

In [56]:
# pip install simalign

In [57]:
from simalign import SentenceAligner
start = time.time()
# Create the word aligner. The embedding model and all alignment settings are
# chosen in the constructor; with matching_methods="mai" the aligner returns
# the 'mwmf', 'inter' and 'itermax' alignments (see the example output below).
# The timing covers constructing the aligner, including any model download.
myaligner = SentenceAligner(model="bert", token_type="bpe", matching_methods="mai")
end = time.time()
print(f'downloading word aligner tool took {end-start} seconds')

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
2022-03-29 09:32:48,750 - simalign.simalign - INFO - Initialized the EmbeddingLoader with model: bert-base-multilingual-cased


downloading word aligner tool took 8.547110080718994 seconds


## calculate word alignment with SimAlign

In [324]:
def _tokenrecords(doc):
    """Turn every token of a spaCy Doc into a JSON-serialisable dict.

    NOTE: 'tokenid' holds `token.idx`, which is the token's character offset
    within the parsed sentence, not its ordinal position.
    """
    return [{
        'tokenid' : token.idx,
        'pos' : token.pos_,
        'text' : token.text,
        'lemma' : token.lemma_,
        'features' : Morphology.feats_to_dict(str(token.morph))
    } for token in doc]


start = time.time()

wordAlignmentList = []

for i, srcsent in enumerate(rawsrcsents):
    if i % 25 == 0:
        currently = time.time()
        print(f"{i}/{len(rawsrcsents)} sentences parsed in {currently-start} s.")

    # Source sentences without an aligned target are skipped. The lookup is
    # explicit rather than a bare `except:`, so unrelated errors (for example
    # a missing alignmentLookup) are no longer silently swallowed, and it
    # runs before parsing so skipped sentences cost nothing.
    j = alignmentLookup.get(i)
    if j is None:
        continue

    srcDoc = spaNLP(srcsent)
    tgtDoc = engNLP(rawtgtsents[j])

    srcTokens = _tokenrecords(srcDoc)
    tgtTokens = _tokenrecords(tgtDoc)

    # SimAlign works on plain token strings.
    src = [t.text for t in srcDoc]
    tgt = [t.text for t in tgtDoc]

    alignments = myaligner.get_word_aligns(src, tgt)
    itermax = alignments['itermax']

    wordAlignmentList.append({
        'alignedwordindices' : itermax,
        'alignedwords' : [(src[s], tgt[t]) for s, t in itermax],
        'srctokens' : srcTokens,
        'tgttokens' : tgtTokens,
        'srcsentidx' : i,
        'tgtsentidx' : j,
    })
end = time.time()
print('parsed in',end-start,'s')

0/223 sentences parsed in 0.00034689903259277344 s.
25/223 sentences parsed in 25.049719095230103 s.
50/223 sentences parsed in 35.06890416145325 s.
75/223 sentences parsed in 48.04298210144043 s.
100/223 sentences parsed in 65.13521909713745 s.
125/223 sentences parsed in 100.23047614097595 s.
150/223 sentences parsed in 117.76058912277222 s.
150/223 sentences parsed in 117.82086110115051 s.
150/223 sentences parsed in 117.83392810821533 s.
parsed in 128.95235204696655 s


# Write to JSON or CSV

In [341]:
# Write the word alignments produced by the SimAlign loop. The list is named
# `wordAlignmentList`; the previous name `alignmentList` is not defined by any
# active cell and fails on a fresh kernel.
with open('wordAlignment3-28.json', 'w', encoding='utf-8') as f:
    json.dump(wordAlignmentList, f, ensure_ascii=False, indent=4)

In [353]:
# Re-tokenize every raw sentence with full token records (offset, POS, text,
# lemma, morphological features) and export sentences and tokens in their
# original order.
srctokens = [
    [
        {
            'tokenid' : tok.idx,
            'pos' : tok.pos_,
            'text' : tok.text,
            'lemma' : tok.lemma_,
            'features' : Morphology.feats_to_dict(str(tok.morph)),
        }
        for tok in spaNLP(sentence)
    ]
    for sentence in rawsrcsents
]
tgttokens = [
    [
        {
            'tokenid' : tok.idx,
            'pos' : tok.pos_,
            'text' : tok.text,
            'lemma' : tok.lemma_,
            'features' : Morphology.feats_to_dict(str(tok.morph)),
        }
        for tok in engNLP(sentence)
    ]
    for sentence in rawtgtsents
]

sentsInOrderJSON = {
    'srcSentsInOrder' : {'text' : rawsrcsents, 'tokens' : srctokens},
    'tgtSentsInOrder' : {'text' : rawtgtsents, 'tokens' : tgttokens},
}
with open('sentsInOrder3-28.json', 'w', encoding='utf-8') as f:
    json.dump(sentsInOrderJSON, f, ensure_ascii=False, indent=4)

# Examples of Alignment (don't need in Python)

In [354]:
# Display one randomly chosen word-alignment record for inspection.
random.choice(wordAlignmentList)

{'alignedwordindices': [(0, 9),
  (1, 0),
  (1, 1),
  (1, 12),
  (2, 3),
  (3, 4),
  (4, 5),
  (5, 7),
  (6, 8),
  (7, 15),
  (8, 10),
  (9, 11),
  (10, 13),
  (11, 14)],
 'alignedwords': [('»', '"'),
  ('Reflexioné', 'I'),
  ('Reflexioné', 'thought'),
  ('Reflexioné', 'is'),
  ('un', 'a'),
  ('momento', 'moment'),
  ('y', 'and'),
  ('repuse', 'replied'),
  (':', ':'),
  ('»', '"'),
  ('-La', 'The'),
  ('palabra', 'word'),
  ('ajedrez', 'chess'),
  ('.', '.')],
 'srctokens': [{'tokenid': 0,
   'pos': 'PUNCT',
   'text': '»',
   'lemma': '»',
   'features': {'PunctType': 'Colo'}},
  {'tokenid': 1,
   'pos': 'PROPN',
   'text': 'Reflexioné',
   'lemma': 'Reflexioné',
   'features': {}},
  {'tokenid': 12,
   'pos': 'DET',
   'text': 'un',
   'lemma': 'uno',
   'features': {'Definite': 'Ind',
    'Gender': 'Masc',
    'Number': 'Sing',
    'PronType': 'Art'}},
  {'tokenid': 15,
   'pos': 'NOUN',
   'text': 'momento',
   'lemma': 'momento',
   'features': {'Gender': 'Masc', 'Number': 'Sing'

In [None]:
 # (1, 0),
 #  (1, 1),
 #  (1, 12),
 #  (2, 3),
 #  (3, 4),
 #  (4, 5),
 #  (5, 7),
 #  (6, 8),
 #  (7, 15),
 #  (8, 10),
 #  (9, 11),

In [90]:
# import pickle

# with open('borges_word_alignment_3-21.pickle', 'wb') as handle:
#     pickle.dump(alignmentList, handle, protocol=pickle.HIGHEST_PROTOCOL)

## EXAMPLE (optional): SimAlign on a single sentence pair

In [108]:
# Align one random sentence pair and print every alignment SimAlign returns.
# `srcsents` / `tgtsents` are not defined anywhere in the notebook, so the
# pair is drawn from the Bleualign-aligned lines instead; empty lines (such as
# the trailing entry left by split('\n')) are excluded.
candidates = [
    k for k, (srcline, tgtline) in enumerate(zip(alignedsrc, alignedtgt))
    if srcline.strip() and tgtline.strip()
]
i = random.choice(candidates)
# Only tokenization is needed here, so the tokenizer is called directly.
srcDoc = spaNLP.tokenizer(alignedsrc[i])
tgtDoc = engNLP.tokenizer(alignedtgt[i])
src = [t.text for t in srcDoc]
tgt = [t.text for t in tgtDoc]
alignments = myaligner.get_word_aligns(src, tgt)

for match in alignments:
    print(match, ':', alignments[match])

mwmf : [(0, 0), (1, 1), (2, 2), (3, 3), (4, 5), (4, 7), (5, 4), (6, 15), (7, 6), (8, 9), (9, 8), (10, 10), (11, 11), (12, 12), (13, 13), (14, 14), (15, 16), (16, 16), (17, 20), (18, 17), (19, 18), (20, 22), (21, 21), (21, 23), (22, 19), (22, 24), (23, 25)]
inter : [(0, 0), (1, 1), (2, 2), (3, 3), (4, 5), (7, 6), (8, 9), (9, 8), (10, 10), (11, 11), (12, 12), (13, 13), (14, 14), (16, 16), (19, 18), (20, 22), (21, 23), (22, 24), (23, 25)]
itermax : [(0, 0), (1, 1), (2, 2), (3, 3), (4, 5), (5, 4), (6, 7), (7, 6), (8, 9), (9, 8), (10, 10), (11, 11), (12, 12), (13, 13), (14, 14), (15, 16), (16, 16), (18, 19), (19, 18), (20, 22), (21, 23), (22, 24), (23, 25)]


In [109]:
# For each matching method, print the aligned word pairs (source word, then
# target word), with a blank line after each method.
for indexpairs in alignments.values():
    for srcidx, tgtidx in indexpairs:
        print(src[srcidx], tgt[tgtidx])
    print()

Ts’ui Ts'ui
Pên Pen
fue was
un a
novelista novelist
novelista he
genial fine
, ,
pero but
también also
fue was
un a
hombre man
de of
letras letters
que who
sin doubtless
duda doubtless
no more
se ,
consideró considered
un a
mero than
mero mere
novelista himself
novelista novelist
. .

Ts’ui Ts'ui
Pên Pen
fue was
un a
novelista novelist
pero but
también also
fue was
un a
hombre man
de of
letras letters
que who
duda doubtless
consideró considered
un a
mero mere
novelista novelist
. .

Ts’ui Ts'ui
Pên Pen
fue was
un a
novelista novelist
genial fine
, he
pero but
también also
fue was
un a
hombre man
de of
letras letters
que who
sin doubtless
duda doubtless
se himself
consideró considered
un a
mero mere
novelista novelist
. .



In [None]:
# # get rid of white space at end
# your_data = zip(srcsents[:-1], tgtsents[:-1])

# start = time.time()

# alignmentList = []
# t = 0

# for sent_es_str, sent_en_str in your_data:
#     if t % 25 == 0:
#         currently = time.time()
#         print(f'{t/{len(srcsents)} sentences parsed in {currently-start} s.')

#     srcDoc = spaNLP(sent_es_str)
#     tgtDoc = engNLP(sent_en_str)
    
#     srcTokens = []
#     for token in srcDoc:
#         srcTokens.append({
#             'tokenid' : token.idx,
#             'pos' : token.pos_, 
#             'text' : token.text, 
#             'lemma' : token.lemma_,
#             'features' : Morphology.feats_to_dict(str(token.morph))
#         })

#     tgtTokens = []
#     for token in tgtDoc:
#         tgtTokens.append({
#             'tokenid' : token.idx,
#             'pos' : token.pos_, 
#             'text' : token.text, 
#             'lemma' : token.lemma_,
#             'features' : Morphology.feats_to_dict(str(token.morph))
#         })

#     src = [t.text for t in srcDoc]
#     tgt = [t.text for t in tgtDoc]
    
#     alignments = myaligner.get_word_aligns(src, tgt)
#     itermax = alignments['itermax']
#     try:
#         j = alignmentLookup[i]
#     except:
#         j = 'No Aligned Sentence'
    
    
#     alignmentList.append({
#         'alignedwordindices' : itermax,
#         'alignedwords' : [(src[s], tgt[t]) for s, t in itermax],
#         'srctokens' : srcTokens,
#         'tgttokens' : tgtTokens,
#         'srcsentidx' : i,
#         'tgtsentidx' : j,
#     })
    
#     t += 1
# end = time.time()
# print('parsed in',end-start,'s')