## Install libraries, modules

In [1]:
# !python -m spacy download es_core_news_sm

In [2]:
# !python -m spacy download en_core_web_sm

In [3]:
# !python -m spacy download ru_core_news_sm

In [4]:
# pip install spacy

In [5]:
# pip install pandas

In [6]:
import spacy

from spacy.morphology import Morphology

import pandas as pd
import random

import json

import time

In [7]:
# SPECIFY SOURCE LANGUAGE
# The model-loading cell only handles 'Spanish' and 'Russian'. The same value is also
# used as the sub-directory name under texts/ when the raw files are read.
srclang = 'Russian'

In [8]:
# Load the spaCy pipelines (the model packages themselves come from the download cells above).
# Maps each supported srclang value to the spaCy model package for that language.
SOURCE_MODELS = {
    'Spanish': "es_core_news_sm",
    'Russian': "ru_core_news_sm",
}

if srclang not in SOURCE_MODELS:
    # Fail here with a clear message; otherwise sourceNLP would simply stay undefined
    # and the first cell that uses it would die with a NameError.
    raise ValueError(f"Unsupported srclang {srclang!r}; expected one of {sorted(SOURCE_MODELS)}")

sourceNLP = spacy.load(SOURCE_MODELS[srclang])
engNLP = spacy.load("en_core_web_sm")

## 1. Load raw texts

In [15]:
# Read both raw texts and replace line breaks with spaces so each text is one long string.
# The encoding is given explicitly so reading non-ASCII text does not depend on the platform
# default (assumes the raw files are stored as UTF-8, like the JSON this notebook writes — confirm).
with open(f'texts/{srclang}/rawsource.txt', 'r', encoding='utf-8') as f:
    sourcetxt = f.read().replace('\n', ' ')
with open(f'texts/{srclang}/rawtarget.txt', 'r', encoding='utf-8') as f:
    targettxt = f.read().replace('\n', ' ')

### Apply language model

In [16]:
# Run the spaCy pipelines over the flattened texts; the sentence-splitting cell below
# iterates over .sents of the two resulting documents.
targetdoc = engNLP(targettxt)
sourcedoc = sourceNLP(sourcetxt)

In [17]:
# Split each parsed document into sentences, keeping only the sentence strings.
rawsrcsents = [sentence.text for sentence in sourcedoc.sents]
rawtgtsents = [sentence.text for sentence in targetdoc.sents]

## 2. Write standardized files (one line per sentence) for input to Bleualign

In [19]:
# these are the inputs to bleualign: one sentence per line, in document order.
# Written as UTF-8 explicitly so non-ASCII sentences do not depend on the platform default encoding.
with open('sourcetextforbleualign.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(rawsrcsents))
with open('targettextforbleualign.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(rawtgtsents))

In [22]:
# tokenized sentences for data output:
# for every sentence, a list of {'text', 'lemma'} records, one per spaCy token.
srctokens = [
    [{'text' : tok.text, 'lemma' : tok.lemma_} for tok in sourceNLP(sentence)]
    for sentence in rawsrcsents
]
tgttokens = [
    [{'text' : tok.text, 'lemma' : tok.lemma_} for tok in engNLP(sentence)]
    for sentence in rawtgtsents
]

### At this point, I also generate translatedsource.txt (a machine translation of the source text, made with Google Translate) and run Bleualign on the texts from my terminal.

# Run Bleualign

In [23]:
# !python setup.py install

In [24]:
# pip install translators --upgrade

In [25]:
import translators as ts

Using United States server backend.


In [None]:
# Machine-translate every source sentence to English, one request per sentence.
# The result is later written to translatedsource.txt and passed to Bleualign as
# --srctotarget, so it must keep exactly one entry per source sentence, in the same order.
start = time.time()
translatedsourcesents = []
for i, sent in enumerate(rawsrcsents):
    print(f'{i}/{len(rawsrcsents)} sents translated.')
    try:
        translation = ts.google(sent, to_language = 'en')
    except Exception as err:
        # Catch Exception, not everything, so the cell can still be interrupted.
        # Fall back to the untranslated sentence: dropping the entry would shift every
        # following line and break the line-by-line correspondence with the source file.
        print('problem on', sent, '-', repr(err))
        translation = sent
    translatedsourcesents.append(translation)
end = time.time()
print(f'machine translation took {end-start} seconds')

0/317 sents translated.
1/317 sents translated.
2/317 sents translated.
3/317 sents translated.
4/317 sents translated.
5/317 sents translated.
6/317 sents translated.
7/317 sents translated.
8/317 sents translated.
9/317 sents translated.
10/317 sents translated.
11/317 sents translated.
12/317 sents translated.
13/317 sents translated.
14/317 sents translated.
15/317 sents translated.
16/317 sents translated.
17/317 sents translated.
18/317 sents translated.
19/317 sents translated.
20/317 sents translated.
21/317 sents translated.
22/317 sents translated.
23/317 sents translated.
24/317 sents translated.
25/317 sents translated.


In [47]:
# Write the machine translation, one sentence per line, for Bleualign's --srctotarget option.
# UTF-8 is given explicitly so the output does not depend on the platform default encoding.
with open('translatedsource.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(translatedsourcesents))

In [9]:
# %%capture cap --no-stderr
start = time.time()
!./bleualign.py -s sourcetextforbleualign.txt -t targettextforbleualign.txt --srctotarget translatedsource.txt -o outputfile --verbosity 1
end = time.time()
print(f'sentence alignment took {end-start} seconds')

zsh:1: permission denied: ./bleualign.py
sentence alignment took 0.6176121234893799 seconds


In [284]:
# rawsrcsents[218], rawtgtsents[276]

In [285]:
# output = cap.stdout

# indexpairs = []
# split = output.split('finished with article')[0].split('alignment: ')[1:]
# for string in split:
#     string = string.replace('\r\n','')
#     strindices = string.split(' - ')
#     for srcidx in strindices[0].split(','):
#         for tgtidx in strindices[1].split(','):
#             if (int(srcidx) < len(rawsrcsents) - 1) and (int(tgtidx) < len(rawtgtsents) - 1):
#                 indexpairs.append((int(srcidx)+1, int(tgtidx) + 1))

In [286]:
# rawsrcsents[4], rawtgtsents[5:7]

## [START HERE] 3. Read sentence-aligned files (from Bleualign)

In [10]:
# Read Bleualign's output: line k of outputfile-s is aligned with line k of outputfile-t.
# UTF-8 is given explicitly so reading does not depend on the platform default encoding.
with open('outputfile-s', 'r', encoding='utf-8') as f:
    alignedsrc = f.read().split('\n')
with open('outputfile-t', 'r', encoding='utf-8') as f:
    alignedtgt = f.read().split('\n')

In [11]:
# Spot-check: show one randomly chosen aligned source/target pair.
i = random.randrange(len(alignedsrc))
(alignedsrc[i], alignedtgt[i])

('Stephen Albert prosiguió: »-No creo que su ilustre antepasado jugara ociosamente a las variaciones.',
 'Stephen Albert continued: "I do not think that your illustrious ancestor toyed idly with variations.')

In [12]:
# Number of entries read from outputfile-s. The file was split on '\n', so a trailing
# newline in the file contributes one empty entry.
len(alignedsrc)

212

In [24]:
# sent to sent alignment
# Map every aligned pair from Bleualign back to indices in the one-sentence-per-line lists.
# Produces:
#   sentAlignments  - list of {'indices': (srcIdx, tgtIdx), 'sents': (srcText, tgtText)}
#   alignmentLookup - dict: source sentence index -> list of aligned target sentence indices
oneLineSpa, oneLineEng = rawsrcsents, rawtgtsents
alignedSpa, alignedEng = alignedsrc, alignedtgt

# Stripped sentence text -> index. If the same text occurs more than once the last
# occurrence wins, which is what the linear scans this replaces also produced.
spaIndexByText = {line.strip(): idx for idx, line in enumerate(oneLineSpa)}
engIndexByText = {line.strip(): idx for idx, line in enumerate(oneLineEng)}

sentAlignments = []
alignmentLookup = dict()
spaIndex = 0
for alignSpaSent, alignEngSent in zip(alignedSpa, alignedEng):
    if spaIndex % 50 == 0:
        print(f'{spaIndex}/{len(rawsrcsents)} sentences parsed.')

    # Exact-match lookup of the source line. If it is not found verbatim, keep the running
    # guess (previous index + 1) that spaIndex already holds.
    spaIndex = spaIndexByText.get(alignSpaSent.strip(), spaIndex)

    # The running guess can step past the last sentence; skip the pair rather than raise IndexError.
    if spaIndex < len(oneLineSpa):
        # One aligned target line may hold several sentences; split it again with spaCy.
        for indEngSent in (sent.text for sent in engNLP(alignEngSent).sents):
            engIndex = engIndexByText.get(indEngSent.strip())
            if engIndex is None:
                # No exact match for this fragment. Skip it instead of reusing the index
                # left over from an earlier fragment (or hitting an unbound name).
                continue
            sentAlignments.append({
                'indices' : (spaIndex, engIndex),
                'sents' : (oneLineSpa[spaIndex], oneLineEng[engIndex])
            })
            alignmentLookup.setdefault(spaIndex, []).append(engIndex)
    spaIndex += 1

0/223 sentences parsed.
50/223 sentences parsed.
100/223 sentences parsed.
150/223 sentences parsed.
200/223 sentences parsed.


In [25]:
# Target-sentence indices aligned to source sentence 81
# (raises KeyError if that source sentence has no recorded alignment).
alignmentLookup[81]

[97, 98, 99]

In [27]:
# Save the sentence-level alignments as indented JSON, keeping non-ASCII characters unescaped.
with open('jsondata/sentAlignment3-28.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(sentAlignments, ensure_ascii=False, indent=4))

# Example (optional): check that the sentence alignment works

In [29]:
# check it works: print one randomly chosen aligned pair, source first, then target
randSentAlign = random.choice(sentAlignments)
s, t = randSentAlign['sents']
print(s, t, sep='\n')

Por eso, yo la había aceptado con plenitud, sin prestarle atención.
That was why I had accepted it fully, without paying it any attention.


## 4.1 Parse word alignment using SimAlign (recommended: fast and high coverage)

In [56]:
# pip install simalign

In [30]:
# Build the SimAlign aligner once and time how long construction takes.
from simalign import SentenceAligner
start = time.time()
# making an instance of our model.
# You can specify the embedding model and all alignment settings in the constructor.
# With matching_methods="mai", the single-pair example cell further down gets results
# back under the keys 'mwmf', 'inter' and 'itermax'.
myaligner = SentenceAligner(model="bert", token_type="bpe", matching_methods="mai")
end = time.time()
# NOTE: the message says "downloading", but the timed span is the whole constructor call.
print(f'downloading word aligner tool took {end-start} seconds')

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
2022-04-03 20:40:46,911 - simalign.simalign - INFO - Initialized the EmbeddingLoader with model: bert-base-multilingual-cased


downloading word aligner tool took 7.278181076049805 seconds


## calculate word alignment with SimAlign

In [32]:
# Word-align every sentence pair recorded in alignmentLookup with SimAlign ('itermax' matching)
# and collect one record per (source sentence, target sentence) pair in wordAlignmentList.

def tokenRecords(doc):
    """Return one JSON-serialisable dict per token of a spaCy Doc.

    Keys: 'tokenid', 'pos', 'text', 'lemma', 'features' (morphology as a dict).
    NOTE(review): 'tokenid' stores token.idx, the token's character offset within the
    sentence, whereas 'alignedwordindices' below are token positions. Confirm which of
    the two the consumer of this JSON expects.
    """
    return [{
        'tokenid' : token.idx,
        'pos' : token.pos_,
        'text' : token.text,
        'lemma' : token.lemma_,
        'features' : Morphology.feats_to_dict(str(token.morph))
    } for token in doc]


start = time.time()

wordAlignmentList = []

for i, srcsent in enumerate(rawsrcsents):
    if i % 25 == 0:
        currently = time.time()
        print(f"{i}/{len(rawsrcsents)} sentences parsed in {currently-start} s.")

    # Source sentences without an entry in alignmentLookup have nothing to align.
    # Check this before parsing so no spaCy work is spent on them.
    jLst = alignmentLookup.get(i)
    if not jLst:
        continue

    # Parse the source sentence once; it is reused for every aligned target sentence.
    srcDoc = sourceNLP(srcsent)
    srcTokens = tokenRecords(srcDoc)
    src = [t.text for t in srcDoc]

    for j in jLst:
        tgtDoc = engNLP(rawtgtsents[j])
        tgtTokens = tokenRecords(tgtDoc)
        tgt = [t.text for t in tgtDoc]

        # get_word_aligns returns one list of (srcPos, tgtPos) pairs per matching method.
        alignments = myaligner.get_word_aligns(src, tgt)
        itermax = alignments['itermax']

        wordAlignmentList.append({
            'alignedwordindices' : itermax,
            'alignedwords' : [(src[s], tgt[t]) for s, t in itermax],
            'srctokens' : srcTokens,
            'tgttokens' : tgtTokens,
            'srcsentidx' : i,
            'tgtsentidx' : j,
        })
end = time.time()
print('parsed in',end-start,'s')

0/223 sentences parsed in 0.0007836818695068359 s.
25/223 sentences parsed in 27.857510805130005 s.
50/223 sentences parsed in 62.76290583610535 s.
75/223 sentences parsed in 86.85836291313171 s.
100/223 sentences parsed in 121.79177784919739 s.
125/223 sentences parsed in 136.3282699584961 s.
150/223 sentences parsed in 155.86679196357727 s.
175/223 sentences parsed in 179.74194407463074 s.
200/223 sentences parsed in 201.679790019989 s.
parsed in 210.68713808059692 s


In [37]:
# Show token.idx next to the token text for the most recently parsed target sentence.
for token in tgtDoc:
    print(f'{token.idx} {token.text}')

0 -
2 Note
7 by
10 the
14 manuscript
25 editor
31 .


# Write to JSON or CSV

In [34]:
# Save the word-level alignment records as indented JSON, keeping non-ASCII characters unescaped.
with open('jsondata/wordAlignment3-28.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(wordAlignmentList, ensure_ascii=False, indent=4))

In [35]:
def sentenceTokenRecords(nlp, sentences):
    """Parse each sentence with `nlp` and return, per sentence, a list of token dicts.

    Every token dict has the keys 'tokenid' (token.idx), 'pos', 'text', 'lemma'
    and 'features' (the token's morphology converted to a dict).
    """
    perSentence = []
    for sentence in sentences:
        perSentence.append([{
            'tokenid' : token.idx,
            'pos' : token.pos_,
            'text' : token.text,
            'lemma' : token.lemma_,
            'features' : Morphology.feats_to_dict(str(token.morph))
        } for token in nlp(sentence)])
    return perSentence


# Token records for all source sentences, then all target sentences, in document order.
srctokens = sentenceTokenRecords(sourceNLP, rawsrcsents)
tgttokens = sentenceTokenRecords(engNLP, rawtgtsents)

# Bundle sentence texts and token records for both sides and save them as one JSON file.
sentsInOrderJSON = {
    'srcSentsInOrder' : {'text' : rawsrcsents, 'tokens' : srctokens},
    'tgtSentsInOrder' : {'text' : rawtgtsents, 'tokens' : tgttokens},
}
with open('jsondata/sentsInOrder3-28.json', 'w', encoding='utf-8') as f:
    json.dump(sentsInOrderJSON, f, ensure_ascii=False, indent=4)

# Examples of Alignment (don't need in Python)

In [377]:
# Inspect the token records of the last target sentence.
tgttokens[-1]

[{'tokenid': 0, 'pos': 'PUNCT', 'text': '-', 'lemma': '-', 'features': {}},
 {'tokenid': 2,
  'pos': 'VERB',
  'text': 'Note',
  'lemma': 'note',
  'features': {'VerbForm': 'Inf'}},
 {'tokenid': 7, 'pos': 'ADP', 'text': 'by', 'lemma': 'by', 'features': {}},
 {'tokenid': 10,
  'pos': 'DET',
  'text': 'the',
  'lemma': 'the',
  'features': {'Definite': 'Def', 'PronType': 'Art'}},
 {'tokenid': 14,
  'pos': 'NOUN',
  'text': 'manuscript',
  'lemma': 'manuscript',
  'features': {'Number': 'Sing'}},
 {'tokenid': 25,
  'pos': 'NOUN',
  'text': 'editor',
  'lemma': 'editor',
  'features': {'Number': 'Sing'}},
 {'tokenid': 31,
  'pos': 'PUNCT',
  'text': '.',
  'lemma': '.',
  'features': {'PunctType': 'Peri'}}]

In [None]:
 # (1, 0),
 #  (1, 1),
 #  (1, 12),
 #  (2, 3),
 #  (3, 4),
 #  (4, 5),
 #  (5, 7),
 #  (6, 8),
 #  (7, 15),
 #  (8, 10),
 #  (9, 11),

In [90]:
# import pickle

# with open('borges_word_alignment_3-21.pickle', 'wb') as handle:
#     pickle.dump(alignmentList, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Example (optional): SimAlign on a single sentence pair

In [108]:
# Pick one aligned sentence pair at random and print the output of every SimAlign matching method.
# The pair is taken from sentAlignments: the srcsents / tgtsents lists this cell used to index
# are not assigned in any live cell of this notebook, so the cell failed on a fresh kernel.
i = random.randrange(len(sentAlignments))
srcsent, tgtsent = sentAlignments[i]['sents']
# Only the tokenizers are needed here; the aligner just takes lists of token strings.
srcDoc = sourceNLP.tokenizer(srcsent)
tgtDoc = engNLP.tokenizer(tgtsent)
src = [t.text for t in srcDoc]
tgt = [t.text for t in tgtDoc]
alignments = myaligner.get_word_aligns(src, tgt)

for match in alignments:
    print(match, ':', alignments[match])

mwmf : [(0, 0), (1, 1), (2, 2), (3, 3), (4, 5), (4, 7), (5, 4), (6, 15), (7, 6), (8, 9), (9, 8), (10, 10), (11, 11), (12, 12), (13, 13), (14, 14), (15, 16), (16, 16), (17, 20), (18, 17), (19, 18), (20, 22), (21, 21), (21, 23), (22, 19), (22, 24), (23, 25)]
inter : [(0, 0), (1, 1), (2, 2), (3, 3), (4, 5), (7, 6), (8, 9), (9, 8), (10, 10), (11, 11), (12, 12), (13, 13), (14, 14), (16, 16), (19, 18), (20, 22), (21, 23), (22, 24), (23, 25)]
itermax : [(0, 0), (1, 1), (2, 2), (3, 3), (4, 5), (5, 4), (6, 7), (7, 6), (8, 9), (9, 8), (10, 10), (11, 11), (12, 12), (13, 13), (14, 14), (15, 16), (16, 16), (18, 19), (19, 18), (20, 22), (21, 23), (22, 24), (23, 25)]


In [109]:
# For each matching method, print the aligned word pairs (source word, target word),
# with a blank line after each method's block.
for pairs in alignments.values():
    for s, t in pairs:
        print(src[s], tgt[t])
    print()

Ts’ui Ts'ui
Pên Pen
fue was
un a
novelista novelist
novelista he
genial fine
, ,
pero but
también also
fue was
un a
hombre man
de of
letras letters
que who
sin doubtless
duda doubtless
no more
se ,
consideró considered
un a
mero than
mero mere
novelista himself
novelista novelist
. .

Ts’ui Ts'ui
Pên Pen
fue was
un a
novelista novelist
pero but
también also
fue was
un a
hombre man
de of
letras letters
que who
duda doubtless
consideró considered
un a
mero mere
novelista novelist
. .

Ts’ui Ts'ui
Pên Pen
fue was
un a
novelista novelist
genial fine
, he
pero but
también also
fue was
un a
hombre man
de of
letras letters
que who
sin doubtless
duda doubtless
se himself
consideró considered
un a
mero mere
novelista novelist
. .



In [None]:
# # get rid of white space at end
# your_data = zip(srcsents[:-1], tgtsents[:-1])

# start = time.time()

# alignmentList = []
# t = 0

# for sent_es_str, sent_en_str in your_data:
#     if t % 25 == 0:
#         currently = time.time()
#         print(f'{t/{len(srcsents)} sentences parsed in {currently-start} s.')

#     srcDoc = sourceNLP(sent_es_str)
#     tgtDoc = engNLP(sent_en_str)
    
#     srcTokens = []
#     for token in srcDoc:
#         srcTokens.append({
#             'tokenid' : token.idx,
#             'pos' : token.pos_, 
#             'text' : token.text, 
#             'lemma' : token.lemma_,
#             'features' : Morphology.feats_to_dict(str(token.morph))
#         })

#     tgtTokens = []
#     for token in tgtDoc:
#         tgtTokens.append({
#             'tokenid' : token.idx,
#             'pos' : token.pos_, 
#             'text' : token.text, 
#             'lemma' : token.lemma_,
#             'features' : Morphology.feats_to_dict(str(token.morph))
#         })

#     src = [t.text for t in srcDoc]
#     tgt = [t.text for t in tgtDoc]
    
#     alignments = myaligner.get_word_aligns(src, tgt)
#     itermax = alignments['itermax']
#     try:
#         j = alignmentLookup[i]
#     except:
#         j = 'No Aligned Sentence'
    
    
#     alignmentList.append({
#         'alignedwordindices' : itermax,
#         'alignedwords' : [(src[s], tgt[t]) for s, t in itermax],
#         'srctokens' : srcTokens,
#         'tgttokens' : tgtTokens,
#         'srcsentidx' : i,
#         'tgtsentidx' : j,
#     })
    
#     t += 1
# end = time.time()
# print('parsed in',end-start,'s')