## Install libraries, modules

In [9]:
# !python -m spacy download es_core_news_sm

In [10]:
# !python -m spacy download en_core_web_sm

In [11]:
# pip install spacy

In [12]:
# pip install pandas

In [35]:
import spacy

from spacy.morphology import Morphology

import pandas as pd
import random

import json

import time

In [36]:
# Load the pretrained spaCy pipelines (they must already be installed, e.g. with
# `python -m spacy download <model>`): Spanish for the source text and English
# for the target text.
spaNLP, engNLP = (
    spacy.load(model_name)
    for model_name in ("es_core_news_sm", "en_core_web_sm")
)

## 1. Load raw texts

In [37]:
# Read the complete raw source (Spanish) and target (English) texts.
# The encoding is given explicitly so accented characters decode the same way on
# every platform instead of depending on the locale default.
# NOTE(review): assumes both raw files are stored as UTF-8 -- confirm.
with open('texts/rawsource.txt', 'r', encoding='utf-8') as f:
    sourcetxt = f.read()
with open('texts/rawtarget.txt', 'r', encoding='utf-8') as f:
    targettxt = f.read()

### Apply language model

In [38]:
# Run each full text through the pipeline of its own language; the resulting
# documents are used below for sentence splitting.
sourcedoc, targetdoc = spaNLP(sourcetxt), engNLP(targettxt)

In [39]:
# Split both documents into sentences, keeping only the sentence text, in
# document order.
rawsrcsents = [sentence.text for sentence in sourcedoc.sents]
rawtgtsents = [sentence.text for sentence in targetdoc.sents]

## 2. Write standardized files (one line per sentence) for input to Bleualign

In [40]:
# These two files are the inputs to Bleualign: the sentences joined with '\n',
# source and target separately.
# They are written as UTF-8 explicitly so the accented Spanish text does not
# depend on the platform's default encoding.
with open('sourcetextforbleualign.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(rawsrcsents))
with open('targettextforbleualign.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(rawtgtsents))

In [41]:
# Tokenized sentences for the data output: every sentence is re-parsed on its
# own and reduced to a list of {'text', 'lemma'} records, one per token.
srctokens = [
    [{'text' : tok.text, 'lemma' : tok.lemma_} for tok in spaNLP(sentence)]
    for sentence in rawsrcsents
]
tgttokens = [
    [{'text' : tok.text, 'lemma' : tok.lemma_} for tok in engNLP(sentence)]
    for sentence in rawtgtsents
]

In [42]:
# Save the sentences in their original order, with their tokens, for both
# languages. ensure_ascii=False keeps accented characters readable in the file.
sentsInOrder = {
    'srcSentsInOrder' : {'text' : rawsrcsents, 'tokens' : srctokens},
    'tgtSentsInOrder' : {'text' : rawtgtsents, 'tokens' : tgttokens},
}
with open('sentsInOrder3-28.json', 'w', encoding='utf-8') as outFile:
    json.dump(sentsInOrder, outFile, ensure_ascii=False, indent=4)

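### At this point, I also generate `translatedsource.txt` (a machine translation of the source sentences via Google Translate) and run Bleualign on the texts. The cells below do both steps from the notebook; Bleualign can also be run from a terminal.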

# Run Bleualign

In [9]:
# !python setup.py install

In [10]:
# pip install translators --upgrade

In [45]:
import translators as ts

In [46]:
# Machine-translate every Spanish sentence into English, one request per
# sentence, and report the wall-clock time. Bleualign later uses these
# translations as the bridge between source and target.
start = time.time()
translatedsourcesents = []
for spanishSentence in rawsrcsents:
    englishDraft = ts.google(spanishSentence, to_language = 'en')
    translatedsourcesents.append(englishDraft)
end = time.time()
print(f'machine translation took {end-start} seconds')

machine translation took 212.9293007850647 seconds


In [47]:
# Write the machine translations joined with '\n'; this file is passed to
# Bleualign through --srctotarget.
# UTF-8 is stated explicitly so the output does not depend on the platform's
# default encoding.
with open('translatedsource.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(translatedsourcesents))

In [48]:
# Run Bleualign as a shell command (IPython `!` syntax) and time it.
# It is given the source sentences (-s), the target sentences (-t) and the
# machine translation of the source (--srctotarget). The next section reads the
# result from 'outputfile-s' and 'outputfile-t', i.e. the -o value plus a suffix.
# The script is invoked as ./bleualign.py, so it must be present and executable
# in the current working directory.
start = time.time()
!./bleualign.py -s sourcetextforbleualign.txt -t targettextforbleualign.txt --srctotarget translatedsource.txt -o outputfile
end = time.time()
print(f'sentence alignment took {end-start} seconds')

reading in article 0: 
processing
computing alignment between srctotarget (file 0) and target text
Evaluating sentences with bleu
finished
searching for longest path of good alignments
finished
filling gaps
finished

finished with article


sentence alignment took 2.1918561458587646 seconds


## [START HERE] 3. Read sentence-aligned files (from Bleualign)

In [49]:
# Read the sentence-aligned output of Bleualign: line k of 'outputfile-s' is
# aligned with line k of 'outputfile-t'.
# The encoding is explicit so the text is decoded identically on every platform.
# NOTE(review): assumes Bleualign writes UTF-8 -- confirm.
# split('\n') is kept on purpose: when a file ends with a newline it yields a
# trailing '' element, which later cells account for.
with open('outputfile-s', 'r', encoding='utf-8') as f:
    srcsents = f.read().split('\n')
with open('outputfile-t', 'r', encoding='utf-8') as f:
    tgtsents = f.read().split('\n')

In [51]:
# Eyeball one randomly chosen aligned line pair (source, target).
linePositions = range(len(srcsents))
i = random.choice(linePositions)
(srcsents[i], tgtsents[i])

('Madden, en el departamento de Viktor Runeberg, quería decir el fin de nuestros afanes y -pero eso parecía muy secundario, o debía parecérmelo- también de nuestras vidas.',
 "Madden, in Viktor Runeberg's office, meant the end of all our work and - though this seemed a secondary matter, or should have seemed so to me - of our lives also.")

In [52]:
# Sentence-to-sentence alignment.
# Maps every Bleualign-aligned line pair back to indices in the raw sentence
# lists. Produces:
#   sentAlignments  - list of {'alignedsentindices', 'alignedsents'} records
#   alignmentLookup - raw Spanish sentence index -> raw English sentence index
#                     (the last English sentence wins when several map to one)
oneLineSpa, oneLineEng = rawsrcsents, rawtgtsents
alignedSpa, alignedEng = srcsents, tgtsents
sentAlignments = []
alignmentLookup = dict()

# Index the raw sentences by their stripped text for constant-time lookup.
# If a text occurs more than once the later index overwrites the earlier one,
# which is also what a full linear scan without `break` would return.
spaIndexByText = {line.strip(): idx for idx, line in enumerate(oneLineSpa)}
engIndexByText = {line.strip(): idx for idx, line in enumerate(oneLineEng)}

# English sentences that could not be found in the raw list. They are skipped
# rather than paired with whatever index the previous sentence had.
unmatchedEngSents = []

spaIndex = 0
for pairNumber, (alignSpaSent, alignEngSent) in enumerate(zip(alignedSpa, alignedEng)):
    if pairNumber % 50 == 0:
        print(f'{pairNumber}/{len(srcsents)} sentences parsed.')

    # An exact match gives the raw Spanish index. Without one (for example when
    # the aligned line does not equal a single raw sentence) fall back to
    # "previous index + 1", i.e. the running value of spaIndex.
    spaIndex = spaIndexByText.get(alignSpaSent.strip(), spaIndex)

    # The fallback guess may run past the end of the raw list; skip such lines
    # instead of raising IndexError.
    if spaIndex < len(oneLineSpa):
        # One aligned English line may hold several sentences; record each one.
        for engSent in engNLP(alignEngSent).sents:
            engIndex = engIndexByText.get(engSent.text.strip())
            if engIndex is None:
                unmatchedEngSents.append(engSent.text)
                continue
            sentAlignments.append({
                'alignedsentindices' : (spaIndex, engIndex),
                'alignedsents' : (oneLineSpa[spaIndex], oneLineEng[engIndex])
            })
            alignmentLookup[spaIndex] = engIndex
    spaIndex += 1

if unmatchedEngSents:
    print(f'{len(unmatchedEngSents)} English sentences had no exact match in the raw text and were skipped.')

0/212 sentences parsed.
50/212 sentences parsed.
100/212 sentences parsed.
150/212 sentences parsed.
200/212 sentences parsed.


In [53]:
# Persist the sentence alignment records as pretty-printed JSON;
# ensure_ascii=False keeps accented characters as-is in the file.
jsonOptions = {'ensure_ascii': False, 'indent': 4}
with open('sentAlignment3-28.json', 'w', encoding='utf-8') as outFile:
    json.dump(sentAlignments, outFile, **jsonOptions)

# Example (optional): check that the sentence alignment works

In [54]:
# Check it works: print one randomly chosen aligned pair, the Spanish sentence
# on the first line and the English sentence on the second.
randSentAlign = random.choice(sentAlignments)
print(*randSentAlign['alignedsents'], sep='\n')

"¿Ashgrove?", les pregunté a unos chicos en el andén.
I asked some children on the platform.


## 4.1 Parse word alignment using SimAlign (recommended: fast and high coverage)

In [56]:
# pip install simalign

In [57]:
from simalign import SentenceAligner

# Build the SimAlign word aligner and time its construction.
# The embedding model and all alignment settings are chosen in the constructor.
# With matching_methods="mai" the aligner returns three alignments per sentence
# pair, keyed 'mwmf', 'inter' and 'itermax' (see the example output further down).
start = time.time()
myaligner = SentenceAligner(
    model="bert",
    token_type="bpe",
    matching_methods="mai",
)
end = time.time()
print(f'downloading word aligner tool took {end-start} seconds')

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
2022-03-29 09:32:48,750 - simalign.simalign - INFO - Initialized the EmbeddingLoader with model: bert-base-multilingual-cased


downloading word aligner tool took 8.547110080718994 seconds


## calculate word alignment with SimAlign

In [58]:
def _token_records(doc):
    """Return one JSON-serialisable record per token of a spaCy ``Doc``.

    Each record holds the token's character offset within the parsed string
    ('tokenid'), its coarse part of speech, surface text, lemma and its
    morphological features as a dict.
    """
    return [
        {
            'tokenid' : token.idx,
            'pos' : token.pos_,
            'text' : token.text,
            'lemma' : token.lemma_,
            'features' : Morphology.feats_to_dict(str(token.morph))
        }
        for token in doc
    ]


# Word alignment with SimAlign for every sentence-aligned line pair.
# Blank pairs (such as the trailing '' left by split('\n')) are skipped by
# content rather than by a blind [:-1] slice, and each pair keeps its original
# position i in srcsents/tgtsents.
your_data = [
    (pairIndex, spaSent, engSent)
    for pairIndex, (spaSent, engSent) in enumerate(zip(srcsents, tgtsents))
    if spaSent.strip() or engSent.strip()
]
start = time.time()

alignmentList = []
for done, (i, sent_es_str, sent_en_str) in enumerate(your_data):
    if done % 25 == 0:
        currently = time.time()
        print(f'{done}/{len(your_data)} sentences parsed in {currently-start} s.')
    srcDoc = spaNLP(sent_es_str)
    tgtDoc = engNLP(sent_en_str)

    srcTokens = _token_records(srcDoc)
    tgtTokens = _token_records(tgtDoc)

    src = [t.text for t in srcDoc]
    tgt = [t.text for t in tgtDoc]

    # Only the 'itermax' alignment is stored: a list of
    # (source token index, target token index) pairs.
    alignments = myaligner.get_word_aligns(src, tgt)
    itermax = alignments['itermax']

    # Look up the aligned target sentence index. Only a missing key falls back
    # to the placeholder; any other error is allowed to surface.
    # NOTE(review): alignmentLookup is keyed by positions in rawsrcsents while
    # i is a position in srcsents (the Bleualign output). The two coincide only
    # if Bleualign kept every source sentence on its own line -- confirm.
    j = alignmentLookup.get(i, 'No Aligned Sentence')

    alignmentList.append({
        'alignedwordindices' : itermax,
        'alignedwords' : [(src[s], tgt[t]) for s, t in itermax],
        'srctokens' : srcTokens,
        'tgttokens' : tgtTokens,
        'srcsentidx' : i,
        'tgtsentidx' : j,
    })
end = time.time()
print('parsed in',end-start,'s')

0/212 sentences parsed in 0.003075122833251953 s.
25/212 sentences parsed in 29.386754035949707 s.
50/212 sentences parsed in 53.92922806739807 s.
75/212 sentences parsed in 71.35570216178894 s.
100/212 sentences parsed in 104.39477396011353 s.
125/212 sentences parsed in 120.35754227638245 s.
150/212 sentences parsed in 142.1159610748291 s.
175/212 sentences parsed in 160.01262426376343 s.
200/212 sentences parsed in 177.03865003585815 s.
parsed in 183.1890082359314 s


# Write to JSON or CSV

In [64]:
# Save every word-alignment record as pretty-printed JSON. ensure_ascii=False
# writes accented characters directly instead of as \u escapes.
with open('wordAlignment3-28.json', mode='w', encoding='utf-8') as wordAlignFile:
    json.dump(
        alignmentList,
        wordAlignFile,
        ensure_ascii=False,
        indent=4,
    )

# Examples of Alignment (don't need in Python)

In [107]:
# Display one randomly chosen word-alignment record (index pairs, word pairs,
# token records and sentence indices). The output is long; it is here only for
# manual inspection.
random.choice(alignmentList)

{'alignedwordindices': [(0, 0),
  (1, 1),
  (2, 3),
  (3, 4),
  (5, 5),
  (6, 7),
  (7, 8),
  (8, 12),
  (10, 13),
  (11, 9),
  (12, 14)],
 'alignedwords': [('El', 'The'),
  ('camino', 'road'),
  ('bajaba', 'descending'),
  ('y', 'and'),
  ('bifurcaba', 'branching'),
  (',', ','),
  ('entre', 'through'),
  ('las', 'the'),
  ('confusas', 'twilight'),
  ('praderas', 'meadows'),
  ('.', '.')],
 'srctokens': [{'tokenid': 0,
   'pos': 'DET',
   'text': 'El',
   'lemma': 'el',
   'features': {'Definite': 'Def',
    'Gender': 'Masc',
    'Number': 'Sing',
    'PronType': 'Art'}},
  {'tokenid': 3,
   'pos': 'NOUN',
   'text': 'camino',
   'lemma': 'camino',
   'features': {'Gender': 'Masc', 'Number': 'Sing'}},
  {'tokenid': 10,
   'pos': 'VERB',
   'text': 'bajaba',
   'lemma': 'bajar',
   'features': {'Mood': 'Ind',
    'Number': 'Sing',
    'Person': '3',
    'Tense': 'Imp',
    'VerbForm': 'Fin'}},
  {'tokenid': 17, 'pos': 'CCONJ', 'text': 'y', 'lemma': 'y', 'features': {}},
  {'tokenid': 1

In [90]:
# import pickle

# with open('borges_word_alignment_3-21.pickle', 'wb') as handle:
#     pickle.dump(alignmentList, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Example (optional): SimAlign on a single sentence pair

In [108]:
# Demo: run SimAlign on one random sentence pair and print the alignment
# produced by each matching method.
# Only positions where both sides have text are eligible. srcsents/tgtsents come
# from split('\n') and can end with an empty string, which would otherwise hand
# the aligner two empty token lists.
candidateIndices = [
    idx
    for idx, (spaSent, engSent) in enumerate(zip(srcsents, tgtsents))
    if spaSent.strip() and engSent.strip()
]
i = random.choice(candidateIndices)

# Tokenize only (no tagging or parsing); the aligner needs just the token texts.
srcDoc = spaNLP.tokenizer(srcsents[i])
tgtDoc = engNLP.tokenizer(tgtsents[i])
src = [t.text for t in srcDoc]
tgt = [t.text for t in tgtDoc]
alignments = myaligner.get_word_aligns(src, tgt)

# One line per matching method: method name, then its list of index pairs.
for match in alignments:
    print(match, ':', alignments[match])

mwmf : [(0, 0), (1, 1), (2, 2), (3, 3), (4, 5), (4, 7), (5, 4), (6, 15), (7, 6), (8, 9), (9, 8), (10, 10), (11, 11), (12, 12), (13, 13), (14, 14), (15, 16), (16, 16), (17, 20), (18, 17), (19, 18), (20, 22), (21, 21), (21, 23), (22, 19), (22, 24), (23, 25)]
inter : [(0, 0), (1, 1), (2, 2), (3, 3), (4, 5), (7, 6), (8, 9), (9, 8), (10, 10), (11, 11), (12, 12), (13, 13), (14, 14), (16, 16), (19, 18), (20, 22), (21, 23), (22, 24), (23, 25)]
itermax : [(0, 0), (1, 1), (2, 2), (3, 3), (4, 5), (5, 4), (6, 7), (7, 6), (8, 9), (9, 8), (10, 10), (11, 11), (12, 12), (13, 13), (14, 14), (15, 16), (16, 16), (18, 19), (19, 18), (20, 22), (21, 23), (22, 24), (23, 25)]


In [109]:
# For every matching method, print the aligned (source word, target word) pairs
# of the demo sentence, with a blank line between methods.
for indexPairs in alignments.values():
    for srcPos, tgtPos in indexPairs:
        print(src[srcPos], tgt[tgtPos])
    print()

Ts’ui Ts'ui
Pên Pen
fue was
un a
novelista novelist
novelista he
genial fine
, ,
pero but
también also
fue was
un a
hombre man
de of
letras letters
que who
sin doubtless
duda doubtless
no more
se ,
consideró considered
un a
mero than
mero mere
novelista himself
novelista novelist
. .

Ts’ui Ts'ui
Pên Pen
fue was
un a
novelista novelist
pero but
también also
fue was
un a
hombre man
de of
letras letters
que who
duda doubtless
consideró considered
un a
mero mere
novelista novelist
. .

Ts’ui Ts'ui
Pên Pen
fue was
un a
novelista novelist
genial fine
, he
pero but
también also
fue was
un a
hombre man
de of
letras letters
que who
sin doubtless
duda doubtless
se himself
consideró considered
un a
mero mere
novelista novelist
. .

