## Install libraries, modules

In [1]:
# !python -m spacy download es_core_news_sm

In [2]:
# !python -m spacy download en_core_web_sm

In [3]:
# !python -m spacy download ru_core_news_sm

In [4]:
# pip install spacy

In [5]:
# pip install pandas

In [1]:
import spacy

from spacy.morphology import Morphology

import pandas as pd
import random

import json

import time

In [4]:
# SPECIFY SOURCE LANGUAGE
# The model-loading cell below handles 'Spanish' and 'Russian'; the value is also
# used verbatim as the directory name in the texts/ and jsondata/ paths.
srclang = 'Spanish'

In [5]:
# Load the spaCy pipelines: one for the source language chosen in srclang and
# the English one used for the target text. The models must already be
# installed (see the commented-out download commands at the top).

if srclang == 'Spanish':
    sourceNLP = spacy.load("es_core_news_sm")
elif srclang == 'Russian':
    sourceNLP = spacy.load("ru_core_news_sm")
else:
    # Fail here with a clear message instead of a NameError on sourceNLP
    # in a later cell.
    raise ValueError(f"Unsupported srclang {srclang!r}; expected 'Spanish' or 'Russian'")

engNLP = spacy.load("en_core_web_sm")

## 1. Load raw texts

In [55]:
# Read the raw source and target texts and flatten each to a single line
# (newlines become spaces) so spaCy decides the sentence boundaries.
# The encoding is pinned to UTF-8: open() otherwise uses the platform's locale
# encoding, which can mis-decode accented or Cyrillic characters.
with open(f'texts/{srclang}/rawsource.txt', 'r', encoding='utf-8') as f:
    sourcetxt = f.read().replace('\n',' ')
with open(f'texts/{srclang}/rawtarget.txt', 'r', encoding='utf-8') as f:
    targettxt = f.read().replace('\n',' ')

### Apply language model

In [56]:
# Run each full text through its pipeline: source language first, then English.
sourcedoc, targetdoc = sourceNLP(sourcetxt), engNLP(targettxt)

In [57]:
# sentenize: keep the text of every sentence spaCy found in each document
rawsrcsents = [sentence.text for sentence in sourcedoc.sents]
rawtgtsents = [sentence.text for sentence in targetdoc.sents]

## 2. Write standardized files (one line per sentence) for input to Bleualign

In [58]:
# these are the inputs to bleualign: one sentence per line, in document order
# Written explicitly as UTF-8 so the files do not depend on the platform's
# locale encoding.
with open(f'texts/{srclang}/sourcetextforbleualign.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(rawsrcsents))
with open(f'texts/{srclang}/targettextforbleualign.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(rawtgtsents))

In [59]:
# tokenized sentences for data output: each sentence is re-parsed on its own and
# reduced to a list of {'text', 'lemma'} dicts, one per token
srctokens = [
    [{'text' : tok.text, 'lemma' : tok.lemma_} for tok in sourceNLP(sentence)]
    for sentence in rawsrcsents
]
tgttokens = [
    [{'text' : tok.text, 'lemma' : tok.lemma_} for tok in engNLP(sentence)]
    for sentence in rawtgtsents
]

# Run Bleualign

In [60]:
# !python setup.py install

In [61]:
# pip install translators --upgrade

In [62]:
import translators as ts

In [92]:
# Machine-translate every source sentence into English, one request per sentence.
# translatedsourcesents must stay line-aligned with rawsrcsents because it is
# later written one entry per line for Bleualign, so:
#   * a failed translation becomes an empty string (a '\n' placeholder would add
#     extra lines once the list is joined with '\n');
#   * newlines inside a translation are flattened to spaces.
start = time.time()
translatedsourcesents = []
for i, sent in enumerate(rawsrcsents):
    if i % 25 == 0:
        print(f'{i}/{len(rawsrcsents)} sents translated.')
    try:
        translatedsourcesents.append(ts.google(sent, to_language = 'en').replace('\n', ' '))
    except Exception as err:
        # Best effort: report the sentence and keep going. Catching Exception
        # (not a bare except) lets Ctrl-C / kernel interrupt stop the loop.
        print('problem on', sent, f'({type(err).__name__}: {err})')
        translatedsourcesents.append('')
end = time.time()
print(f'machine translation took {end-start} seconds')

0/226 sents translated.
25/226 sents translated.
50/226 sents translated.
75/226 sents translated.
100/226 sents translated.
125/226 sents translated.
problem on  
150/226 sents translated.
175/226 sents translated.
200/226 sents translated.
225/226 sents translated.
machine translation took 201.71470999717712 seconds


In [93]:
# Sanity check: the translation loop appends exactly one entry per source
# sentence, so these two lengths should be equal.
len(rawsrcsents), len(translatedsourcesents)

(226, 226)

In [94]:
# Write the machine translations one per line, explicitly as UTF-8 so the file
# does not depend on the platform's locale encoding.
with open(f'texts/{srclang}/translatedsource.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(translatedsourcesents))

In [95]:
# %%capture cap --no-stderr
start = time.time()
!./bleualign.py -s texts/spanish/sourcetextforbleualign.txt -t texts/spanish/targettextforbleualign.txt --srctotarget texts/spanish/translatedsource.txt -o texts/spanish/outputfile --verbosity 2
end = time.time()
print(f'sentence alignment took {end-start} seconds')

reading in article 0: 
processing
computing alignment between srctotarget (file 0) and target text
Evaluating sentences with bleu
finished
searching for longest path of good alignments
finished
Thu Apr  7 17:17:23 2022
filling gaps
finished
Thu Apr  7 17:17:23 2022
Results of BLEU 1-to-1 alignment
[92m0: 0[1;m
[1;31m1: unaligned. best cand 107[1;m
[92m2: 2[1;m
[92m3: 3[1;m
[92m4: 5[1;m
[92m5: 6[1;m
[92m6: 7[1;m
[92m7: 8[1;m
[1;31m8: unaligned. best cand 9[1;m
[92m9: 9[1;m
[92m10: 10[1;m
[92m11: 11[1;m
[92m12: 12[1;m
[92m13: 15[1;m
[92m14: 16[1;m
[92m15: 17[1;m
[92m16: 18[1;m
[92m17: 19[1;m
[92m18: 21[1;m
[92m19: 22[1;m
[92m20: 23[1;m
[1;31m21: unaligned. best cand 38[1;m
[92m22: 24[1;m
[92m23: 25[1;m
[92m24: 27[1;m
[92m25: 28[1;m
[92m26: 30[1;m
[92m27: 31[1;m
[92m28: 32[1;m
[92m29: 34[1;m
[92m30: 35[1;m
[92m31: 36[1;m
[92m32: 37[1;m
[92m33: 39[1;m
[92m34: 40[1;m
[92m35: 41[1;m
[92m36: 42[1;m
[92m37: 43[1;m
[92

## [START HERE] 3. Read sentence-aligned files (from Bleualign)

In [96]:
# Read Bleualign's aligned output: line k of outputfile-s is paired with line k
# of outputfile-t. Decoded explicitly as UTF-8 rather than with the platform's
# locale encoding.
with open(f'texts/{srclang}/outputfile-s', 'r', encoding='utf-8') as f:
    alignedsrc = f.read().split('\n')
with open(f'texts/{srclang}/outputfile-t', 'r', encoding='utf-8') as f:
    alignedtgt = f.read().split('\n')

In [99]:
# Spot check: show one randomly chosen aligned (source, target) line pair.
i = random.choice(range(len(alignedsrc)))
alignedsrc[i], alignedtgt[i]

('Si mi boca, antes que la deshiciera un balazo, pudiera gritar ese nombre de modo que lo oyeran en Alemania... Mi voz humana era muy pobre.',
 'If only my mouth, before it should be silenced by a bullet, could shout this name in such a way that it could be heard in Germany . . . My voice, my human voice, was weak.')

In [101]:
# sent to sent alignment
# Map every Bleualign-aligned line pair back to indices in the one-sentence-per-
# line lists (rawsrcsents / rawtgtsents). Produces:
#   sentAlignments  - list of {'indices': (srcIdx, tgtIdx), 'sents': (src, tgt)}
#   alignmentLookup - dict: source sentence index -> list of target indices
oneLineSpa, oneLineEng = rawsrcsents, rawtgtsents
alignedSpa, alignedEng = alignedsrc, alignedtgt
sentAlignments = []
alignmentLookup = dict()

# Stripped sentence text -> index, built once so each match is a dict lookup.
# When the same sentence text occurs more than once, the last occurrence wins.
spaIndexByText = {line.strip(): j for j, line in enumerate(oneLineSpa)}
engIndexByText = {line.strip(): i for i, line in enumerate(oneLineEng)}

spaIndex = 0
for alignSpaSent, alignEngSent in zip(alignedSpa, alignedEng):
    if spaIndex % 50 == 0:
        print(f'{spaIndex}/{len(rawsrcsents)} sentences parsed.')
    # An aligned target line may hold several sentences; split it again so each
    # one can be matched against the one-sentence-per-line target list.
    individualEngSents = [sent.text for sent in engNLP(alignEngSent).sents]
    if individualEngSents:
        # Without an exact text match, fall back to the running index
        # (index of the previous aligned source sentence + 1).
        spaIndex = spaIndexByText.get(alignSpaSent.strip(), spaIndex)
    for indEngSent in individualEngSents:
        engIndex = engIndexByText.get(indEngSent.strip())
        if engIndex is None or spaIndex >= len(oneLineSpa):
            # No reliable index for this pair: skip it instead of recording an
            # index left over from an earlier iteration (or hitting a NameError
            # when the very first sentence has no match).
            print(f'no match for aligned pair, skipped: {indEngSent!r}')
            continue
        sentAlignments.append({
            'indices' : (spaIndex, engIndex),
            'sents' : (oneLineSpa[spaIndex], oneLineEng[engIndex])
        })
        alignmentLookup.setdefault(spaIndex, []).append(engIndex)
    spaIndex += 1

0/226 sentences parsed.
50/226 sentences parsed.
100/226 sentences parsed.
150/226 sentences parsed.
200/226 sentences parsed.


In [126]:
# Persist the sentence-level alignments as indented JSON, keeping non-ASCII
# characters readable instead of \u-escaped.
sentAlignmentPath = f'jsondata/{srclang}/sentAlignment4-8.json'
with open(sentAlignmentPath, mode='w', encoding='utf-8') as jsonFile:
    json.dump(sentAlignments, jsonFile, indent=4, ensure_ascii=False)

# Example (optional): check that the sentence alignment works

In [118]:
# check it works: print the source and target sentence of one random alignment
randSentAlign = random.choice(sentAlignments)
s, t = randSentAlign['sents']
print(s, t, sep='\n')

Esa trama de tiempos que se aproximan, se bifurcan, se cortan o que secularmente se ignoran, abarca todas las posibilidades.
He believed in an infinite series of times, in a dizzily growing, ever spreading network of diverging, converging and parallel times.


## 4.1 Parse word alignment using SimAlign (recommended: fast and high coverage)

In [120]:
# pip install simalign

In [121]:
from simalign import SentenceAligner
# The timer wraps only the SentenceAligner constructor call below.
start = time.time()
# Create the word aligner used by the word-alignment cell further down.
# The embedding model and all alignment settings are chosen in the constructor.
# NOTE(review): "mai" presumably selects three matching methods, one of which is
# the 'itermax' result read later in this notebook - confirm against the SimAlign docs.
myaligner = SentenceAligner(model="bert", token_type="bpe", matching_methods="mai")
end = time.time()
print(f'downloading word aligner tool took {end-start} seconds')

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
2022-04-07 18:27:56,278 - simalign.simalign - INFO - Initialized the EmbeddingLoader with model: bert-base-multilingual-cased


downloading word aligner tool took 24.096473217010498 seconds


## Calculate word alignment with SimAlign

In [122]:
# Number of source sentences the word-alignment loop below iterates over.
len(rawsrcsents)

226

In [123]:
# Word-align every (source, target) sentence pair recorded in alignmentLookup
# with SimAlign and collect one record per pair in wordAlignmentList:
#   'alignedwordindices' - (srcTokenPos, tgtTokenPos) pairs from the 'itermax' result
#   'alignedwords'       - the same pairs as token texts
#   'srctokens'/'tgttokens' - per-token dicts (see _tokenRecords)
#   'srcsentidx'/'tgtsentidx' - indices into rawsrcsents / rawtgtsents


def _tokenRecords(doc):
    """Return one dict per token of a parsed spaCy doc.

    Each dict holds 'tokenid' (token.idx), 'pos', 'text', 'lemma' and
    'features' (the token's morphology converted to a dict).
    """
    return [{
        'tokenid' : token.idx,
        'pos' : token.pos_,
        'text' : token.text,
        'lemma' : token.lemma_,
        'features' : Morphology.feats_to_dict(str(token.morph))
    } for token in doc]


# NOTE(review): your_data is not used anywhere in this cell.
your_data = zip(rawsrcsents, rawtgtsents)

start = time.time()

wordAlignmentList = []

for i, srcsent in enumerate(rawsrcsents):
    if i % 25 == 0:
        currently = time.time()
        print(f"{i}/{len(rawsrcsents)} sentences parsed in {currently-start} s.")

    # Source sentences without any aligned target are skipped before the
    # (comparatively expensive) parse. A plain dict lookup is used so that
    # unrelated errors are not silently swallowed.
    jLst = alignmentLookup.get(i)
    if not jLst:
        continue

    srcDoc = sourceNLP(srcsent)
    srcTokens = _tokenRecords(srcDoc)
    src = [t.text for t in srcDoc]

    for j in jLst:
        tgtDoc = engNLP(rawtgtsents[j])
        tgtTokens = _tokenRecords(tgtDoc)
        tgt = [t.text for t in tgtDoc]

        alignments = myaligner.get_word_aligns(src, tgt)
        itermax = alignments['itermax']

        wordAlignmentList.append({
            'alignedwordindices' : itermax,
            'alignedwords' : [(src[s], tgt[t]) for s, t in itermax],
            'srctokens' : srcTokens,
            'tgttokens' : tgtTokens,
            'srcsentidx' : i,
            'tgtsentidx' : j,
        })
end = time.time()
print('parsed in',end-start,'s')

0/226 sentences parsed in 0.0003972053527832031 s.
25/226 sentences parsed in 30.3469979763031 s.
50/226 sentences parsed in 55.82429313659668 s.
75/226 sentences parsed in 77.6922881603241 s.
100/226 sentences parsed in 109.99845719337463 s.
125/226 sentences parsed in 123.02643013000488 s.
150/226 sentences parsed in 135.3704390525818 s.
175/226 sentences parsed in 158.83312106132507 s.
200/226 sentences parsed in 179.96948099136353 s.
225/226 sentences parsed in 190.1589879989624 s.
parsed in 190.40406203269958 s


# Write to JSON or CSV

In [128]:
# Persist the word-level alignments as indented JSON, keeping non-ASCII
# characters readable instead of \u-escaped.
wordAlignmentPath = f'jsondata/{srclang}/wordAlignment4-8.json'
with open(wordAlignmentPath, mode='w', encoding='utf-8') as jsonFile:
    json.dump(wordAlignmentList, jsonFile, indent=4, ensure_ascii=False)

In [129]:
# Debugging leftover: shows the last value bound to the loop variable srcsent
# by the word-alignment cell, i.e. the final source sentence.
srcsent

')'

In [130]:
def _sentenceRecords(sentences, nlp):
    """Parse each sentence with nlp and return a list of {'text', 'tokens'} dicts.

    'tokens' is a list with one dict per token: 'tokenid' (token.idx), 'pos',
    'text', 'lemma', 'features' (morphology as a dict) and 'linebreak' (always False).
    """
    records = []
    for sentence in sentences:
        tokenDicts = []
        for tok in nlp(sentence):
            tokenDicts.append({
                'tokenid' : tok.idx,
                'pos' : tok.pos_,
                'text' : tok.text,
                'lemma' : tok.lemma_,
                'features' : Morphology.feats_to_dict(str(tok.morph)),
                'linebreak' : False
            })
        records.append({'text' : sentence, 'tokens' : tokenDicts})
    return records


# Source sentences first, then target sentences, each in document order.
srctokens = _sentenceRecords(rawsrcsents, sourceNLP)
tgttokens = _sentenceRecords(rawtgtsents, engNLP)

# Bundle both sides and write them out as indented, non-ASCII-preserving JSON.
sentsInOrderJSON = {'srcSentsInOrder' : srctokens, 'tgtSentsInOrder' : tgttokens}
with open(f'jsondata/{srclang}/sentsInOrder4-8.json', mode='w', encoding='utf-8') as jsonFile:
    json.dump(sentsInOrderJSON, jsonFile, indent=4, ensure_ascii=False)