## Install libraries, modules

In [1]:
# !python -m spacy download es_core_news_sm

In [2]:
# !python -m spacy download en_core_web_sm

In [3]:
# !python -m spacy download ru_core_news_sm

In [4]:
# pip install spacy

In [5]:
# pip install pandas

In [16]:
import spacy

from spacy.morphology import Morphology

import pandas as pd
import random

import json

import time

In [17]:
# SPECIFY SOURCE LANGUAGE
# Must be 'Spanish' or 'Russian' -- the only values the model-loading cell
# below handles. The same string is also used to build the texts/<srclang>/
# input paths and the jsondata/<srclang>/ output paths.
srclang = 'Russian'

In [18]:
# Load the spaCy pipelines: one for the source language selected by `srclang`
# and the English pipeline used for the target text.
spacymodels = {
    'Spanish': "es_core_news_sm",
    'Russian': "ru_core_news_sm",
}
if srclang not in spacymodels:
    # Fail fast: without this check sourceNLP would simply stay undefined and
    # the first cell that uses it would raise a confusing NameError.
    raise ValueError(
        f"Unsupported srclang {srclang!r}; expected one of {sorted(spacymodels)}"
    )
sourceNLP = spacy.load(spacymodels[srclang])

engNLP = spacy.load("en_core_web_sm")

## 1. Load raw texts

In [47]:
# Read the raw source and target texts, replacing every newline with a space.
# The encoding is given explicitly (as the JSON-writing cells already do) so
# non-ASCII text is decoded the same way on every platform instead of with the
# locale default. NOTE(review): assumes the input files are UTF-8 -- confirm.
with open(f'texts/{srclang}/rawsource.txt', 'r', encoding='utf-8') as f:
    sourcetxt = f.read().replace('\n', ' ')
with open(f'texts/{srclang}/rawtarget.txt', 'r', encoding='utf-8') as f:
    targettxt = f.read().replace('\n', ' ')

### Apply language model

In [48]:
# Run each spaCy pipeline over its whole text: the source-language pipeline on
# the source text, the English pipeline on the target text.
sourcedoc, targetdoc = sourceNLP(sourcetxt), engNLP(targettxt)

In [49]:
# Sentence-split both parsed documents, keeping only the text of each sentence.
rawsrcsents = [sentence.text for sentence in sourcedoc.sents]
rawtgtsents = [sentence.text for sentence in targetdoc.sents]

## 2. Write standardized files (one line per sentence) for input to Bleualign

In [52]:
# These are the inputs to Bleualign: one sentence per line, source and target.
# The encoding is explicit so the non-ASCII source sentences are written as
# UTF-8 on every platform rather than in the locale's default encoding.
with open('sourcetextforbleualign.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(rawsrcsents))
with open('targettextforbleualign.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(rawtgtsents))

In [53]:
# Per-sentence token records (surface text + lemma) for data output.
# Each sentence is re-parsed on its own with the matching spaCy pipeline.
# NOTE(review): srctokens / tgttokens are assigned again, with more fields,
# in the last cell of this notebook before being written out.
srctokens = [
    [{'text' : tok.text, 'lemma' : tok.lemma_} for tok in sourceNLP(sentence)]
    for sentence in rawsrcsents
]
tgttokens = [
    [{'text' : tok.text, 'lemma' : tok.lemma_} for tok in engNLP(sentence)]
    for sentence in rawtgtsents
]

# Run Bleualign

In [54]:
# !python setup.py install

In [55]:
# pip install translators --upgrade

In [56]:
import translators as ts

Using United States server backend.


In [57]:
# Machine-translate every source sentence into English, one request per
# sentence, and time the whole pass.
start = time.time()
translatedsourcesents = []
for i, sent in enumerate(rawsrcsents):
    if i % 25 == 0:
        print(f'{i}/{len(rawsrcsents)} sents translated.')
    try:
        translation = ts.google(sent, to_language = 'en')
    except Exception as err:
        # Best effort: report the failure and carry on, but keep an empty
        # placeholder so that entry i still corresponds to source sentence i.
        # Dropping the entry would shift every later line of the translation
        # file relative to the source file handed to Bleualign.
        print('problem on', sent, '-', repr(err))
        translation = ''
    translatedsourcesents.append(translation)
end = time.time()
print(f'machine translation took {end-start} seconds')

machine translation took 301.34487771987915 seconds


In [58]:
# Write the machine translations, one per line, for Bleualign's --srctotarget
# input. Explicit UTF-8 so any non-ASCII characters in the translations are
# written identically on every platform.
with open('translatedsource.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(translatedsourcesents))

In [60]:
# %%capture cap --no-stderr
# Run Bleualign as a shell command and time it.
#   -s / -t        : the one-sentence-per-line source / target files written earlier
#   --srctotarget  : the machine translation of the source sentences
#   -o outputfile  : output prefix; the aligned sentences are read back later
#                    from 'outputfile-s' and 'outputfile-t'
start = time.time()
!./bleualign.py -s sourcetextforbleualign.txt -t targettextforbleualign.txt --srctotarget translatedsource.txt -o outputfile --verbosity 2
end = time.time()
print(f'sentence alignment took {end-start} seconds')

reading in article 0: 
processing
computing alignment between srctotarget (file 0) and target text
Evaluating sentences with bleu
finished
searching for longest path of good alignments
finished
Wed Apr  6 11:13:56 2022
filling gaps
finished
Wed Apr  6 11:13:56 2022
Results of BLEU 1-to-1 alignment
[92m0: 0[1;m
[92m1: 1[1;m
[92m2: 2[1;m
[92m3: 3[1;m
[92m4: 4[1;m
[92m5: 5[1;m
[1;31m6: unaligned. best cand 59[1;m
[92m7: 7[1;m
[92m8: 8[1;m
[92m9: 9[1;m
[92m10: 10[1;m
[92m11: 11[1;m
[92m12: 12[1;m
[92m13: 13[1;m
[92m14: 14[1;m
[92m15: 15[1;m
[92m16: 16[1;m
[92m17: 17[1;m
[92m18: 18[1;m
[92m19: 19[1;m
[92m20: 20[1;m
[1;31m21: unaligned. best cand [][1;m
[1;31m22: unaligned. best cand [][1;m
[92m23: 23[1;m
[92m24: 24[1;m
[92m25: 25[1;m
[92m26: 26[1;m
[1;31m27: unaligned. best cand [][1;m
[92m28: 28[1;m
[1;31m29: unaligned. best cand 194[1;m
[92m30: 31[1;m
[92m31: 32[1;m
[92m32: 33[1;m
[92m33: 34[1;m
[92m34: 35[1;m
[92m35: 

## [START HERE] 3. Read sentence-aligned files (from Bleualign)

In [61]:
# Read the sentence-aligned output: line k of 'outputfile-s' is paired with
# line k of 'outputfile-t'. Explicit UTF-8 so the non-ASCII source sentences
# are decoded the same way on every platform.
# NOTE(review): assumes Bleualign wrote these files as UTF-8 -- confirm.
with open('outputfile-s', 'r', encoding='utf-8') as f:
    alignedsrc = f.read().split('\n')
with open('outputfile-t', 'r', encoding='utf-8') as f:
    alignedtgt = f.read().split('\n')

In [71]:
# Spot-check: pick a random line number and display the source/target
# sentences found at that position in the two aligned lists.
i = random.choice(range(len(alignedsrc)))
alignedsrc[i], alignedtgt[i]

('Она поехала на лошадях, и он провожал ее.',
 'She went by carriage, and he accompanied her.')

In [72]:
# Number of entries read from the aligned source file.
len(alignedsrc)

318

In [73]:
# sent to sent alignment: map every aligned pair back to sentence indices in
# rawsrcsents / rawtgtsents, producing
#   sentAlignments  - list of {'indices': (src, tgt), 'sents': (src text, tgt text)}
#   alignmentLookup - dict: source sentence index -> list of target indices
oneLineSpa, oneLineEng = rawsrcsents, rawtgtsents
alignedSpa, alignedEng = alignedsrc, alignedtgt

# Stripped sentence text -> index. Building these once replaces a full scan of
# both lists for every aligned sentence. When the same text occurs more than
# once the last index wins, exactly as with the scans this replaces.
spaIndexByText = {line.strip(): j for j, line in enumerate(oneLineSpa)}
engIndexByText = {line.strip(): i for i, line in enumerate(oneLineEng)}

sentAlignments = []
alignmentLookup = dict()
spaIndex = 0
for alignSpaSent, alignEngSent in zip(alignedSpa, alignedEng):
    if spaIndex % 50 == 0:
        print(f'{spaIndex}/{len(rawsrcsents)} sentences parsed.')
    # Re-segment the aligned target line (presumably because one aligned line
    # can contain more than one target sentence).
    individualEngSents = [sent.text for sent in engNLP(alignEngSent).sents]
    if individualEngSents:
        # A source line that matches no raw sentence falls back to the running
        # position counter.
        spaIndex = spaIndexByText.get(alignSpaSent.strip(), spaIndex)
    for indEngSent in individualEngSents:
        engIndex = engIndexByText.get(indEngSent.strip())
        if engIndex is None or spaIndex >= len(oneLineSpa):
            # Never reuse the index left over from a previous sentence: that
            # would silently record a wrong alignment. Report and skip.
            print('no match for aligned pair, skipping:', alignSpaSent, '|', indEngSent)
            continue
        sentAlignments.append({
            'indices' : (spaIndex, engIndex),
            'sents' : (oneLineSpa[spaIndex], oneLineEng[engIndex])
        })
        alignmentLookup.setdefault(spaIndex, []).append(engIndex)
    spaIndex += 1

0/321 sentences parsed.
50/321 sentences parsed.
100/321 sentences parsed.
150/321 sentences parsed.
200/321 sentences parsed.
250/321 sentences parsed.
300/321 sentences parsed.


In [75]:
# Save the sentence-level alignments as indented JSON, keeping non-ASCII
# characters unescaped in a UTF-8 file.
sentalignpath = f'jsondata/{srclang}/sentAlignment3-28.json'
with open(sentalignpath, mode='w', encoding='utf-8') as f:
    json.dump(obj=sentAlignments, fp=f, ensure_ascii=False, indent=4)

# Example (optional) - check that it works

In [76]:
# Sanity check: print the source and target text of one randomly chosen
# sentence alignment, one per line.
randSentAlign = random.choice(sentAlignments)
s, t = randSentAlign['sents']
print(s, t, sep='\n')

Разве он любил тогда?
Had he been in love then?


## 4.1 Parse word alignment using SimAlign (recommended: fast and high coverage)

In [56]:
# pip install simalign

In [77]:
from simalign import SentenceAligner

# Build the word aligner once and report how long construction takes.
# The embedding model and every alignment setting are fixed here, in the
# constructor call.
start = time.time()
myaligner = SentenceAligner(
    model="bert",
    token_type="bpe",
    matching_methods="mai",
)
end = time.time()
print(f'downloading word aligner tool took {end-start} seconds')

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
2022-04-06 11:15:07,600 - simalign.simalign - INFO - Initialized the EmbeddingLoader with model: bert-base-multilingual-cased


downloading word aligner tool took 8.49353313446045 seconds


## Calculate word alignment with SimAlign

In [78]:
def _tokenRecords(doc):
    """Return one JSON-serializable dict per token of a spaCy ``doc``.

    Each record holds the token's ``idx`` (stored under 'tokenid'), its
    ``pos_`` tag, surface text, lemma, and its morphological features
    converted to a dict.

    NOTE(review): spaCy's ``Token.idx`` is a character offset, not the token's
    position (``Token.i``) -- confirm that is what consumers of 'tokenid'
    expect.
    """
    return [{
        'tokenid' : token.idx,
        'pos' : token.pos_,
        'text' : token.text,
        'lemma' : token.lemma_,
        'features' : Morphology.feats_to_dict(str(token.morph))
    } for token in doc]


# Word-align every sentence pair recorded in alignmentLookup and collect one
# record per (source sentence, target sentence) pair in wordAlignmentList.
start = time.time()

wordAlignmentList = []

for i, srcsent in enumerate(rawsrcsents):
    if i % 25 == 0:
        currently = time.time()
        print(f"{i}/{len(rawsrcsents)} sentences parsed in {currently-start} s.")

    # Source sentences with no sentence-level alignment are skipped *before*
    # any parsing is done for them. An explicit lookup replaces the former
    # bare `except`, which would also have hidden unrelated errors.
    jLst = alignmentLookup.get(i)
    if not jLst:
        continue

    srcDoc = sourceNLP(srcsent)
    srcTokens = _tokenRecords(srcDoc)
    src = [t.text for t in srcDoc]

    for j in jLst:
        tgtDoc = engNLP(rawtgtsents[j])
        tgtTokens = _tokenRecords(tgtDoc)
        tgt = [t.text for t in tgtDoc]

        # Only the 'itermax' result of the aligner is kept.
        alignments = myaligner.get_word_aligns(src, tgt)
        itermax = alignments['itermax']

        wordAlignmentList.append({
            'alignedwordindices' : itermax,
            'alignedwords' : [(src[s], tgt[t]) for s, t in itermax],
            'srctokens' : srcTokens,
            'tgttokens' : tgtTokens,
            'srcsentidx' : i,
            'tgtsentidx' : j,
        })
end = time.time()
print('parsed in',end-start,'s')

0/321 sentences parsed in 0.002349853515625 s.
25/321 sentences parsed in 60.17492198944092 s.
50/321 sentences parsed in 127.16363978385925 s.
75/321 sentences parsed in 242.4741849899292 s.
100/321 sentences parsed in 266.65494894981384 s.
125/321 sentences parsed in 299.299115896225 s.
150/321 sentences parsed in 357.85830998420715 s.
175/321 sentences parsed in 393.8537619113922 s.
200/321 sentences parsed in 419.0575439929962 s.
225/321 sentences parsed in 473.8860957622528 s.
250/321 sentences parsed in 503.7986629009247 s.
275/321 sentences parsed in 515.4980957508087 s.
300/321 sentences parsed in 549.3142309188843 s.
parsed in 577.4674389362335 s


# Write to JSON or CSV

In [80]:
# Save the word-level alignments as indented JSON, keeping non-ASCII
# characters unescaped in a UTF-8 file.
wordalignpath = f'jsondata/{srclang}/wordAlignment3-28.json'
with open(wordalignpath, mode='w', encoding='utf-8') as f:
    json.dump(obj=wordAlignmentList, fp=f, ensure_ascii=False, indent=4)

In [81]:
# Build per-sentence token records (idx stored as 'tokenid', POS, text, lemma,
# morphological features) for both languages: every source sentence first,
# then every target sentence, each parsed with its own spaCy pipeline.
srctokens = []
tgttokens = []
for pipeline, sentences, bucket in (
    (sourceNLP, rawsrcsents, srctokens),
    (engNLP, rawtgtsents, tgttokens),
):
    for sentence in sentences:
        bucket.append([
            {
                'tokenid' : tok.idx,
                'pos' : tok.pos_,
                'text' : tok.text,
                'lemma' : tok.lemma_,
                'features' : Morphology.feats_to_dict(str(tok.morph)),
            }
            for tok in pipeline(sentence)
        ])

# Bundle the in-order sentence texts with their token records and write the
# result as indented, UTF-8 encoded JSON.
sentsInOrderJSON = {
    'srcSentsInOrder' : {'text' : rawsrcsents, 'tokens' : srctokens},
    'tgtSentsInOrder' : {'text' : rawtgtsents, 'tokens' : tgttokens},
}
with open(f'jsondata/{srclang}/sentsInOrder3-28.json', mode='w', encoding='utf-8') as f:
    json.dump(obj=sentsInOrderJSON, fp=f, ensure_ascii=False, indent=4)