## Install libraries, modules

In [1]:
# !python -m spacy download es_core_news_sm

In [2]:
# !python -m spacy download en_core_web_sm

In [3]:
# !python -m spacy download ru_core_news_sm

In [4]:
# pip install spacy

In [5]:
# pip install pandas

In [137]:
# pip install PyArabic

In [140]:
# !python -m spacy download xx_ent_wiki_sm

In [1]:
import spacy
from spacy.morphology import Morphology

import pyarabic.araby as araby
import pyarabic.number as number

import pandas as pd
import random

import json

import time

In [2]:
# SPECIFY SOURCE LANGUAGE
srclang = 'Arabic'

In [3]:
# install spacy lang models

if srclang == 'Spanish':
    sourceNLP = spacy.load("es_core_news_sm")
elif srclang == 'Russian':
    sourceNLP = spacy.load("ru_core_news_sm")
elif srclang == 'Arabic':
    sourceNLP = spacy.load("xx_ent_wiki_sm")
    sourceNLP.add_pipe('sentencizer')
    
engNLP = spacy.load("en_core_web_sm")

## 1. Load raw texts

In [4]:
with open(f'texts/{srclang}/rawsource.txt','r') as f:
     sourcetxt = f.read().replace('\n',' ').replace('\t','')
with open(f'texts/{srclang}/rawtarget.txt','r') as f:
     targettxt = f.read().replace('\n',' ')

### Apply language model

In [5]:
if srclang == 'Arabic':
    # sourcedoc = araby.sentence_tokenize(sourcetxt)
    sourcedoc = sourceNLP(sourcetxt)
else:
    sourcedoc = sourceNLP(sourcetxt)
targetdoc = engNLP(targettxt)

In [6]:
# sentenize
rawsrcsents = []
rawtgtsents = []

for sent in sourcedoc.sents:
    rawsrcsents.append(sent.text)
    
newrawsrcsents = []
for sent in rawsrcsents:
    if sent == '':
        continue
    partfound = False
    for part in ['Part I.','Part II.','Part III.','Part IV.','Part V.']:
        if part in sent:
            newrawsrcsents.append(part)
            newrawsrcsents.append(sent.split(part)[1].strip())
            partfound = True
    if not partfound:
        newrawsrcsents.append(sent)
rawsrcsents = newrawsrcsents
    
        
for sent in targetdoc.sents:
    rawtgtsents.append(sent.text)

newrawtgtsents = []
for sent in rawtgtsents:
    if sent == '':
        continue
    partfound = False
    for part in ['Part I.','Part II.','Part III.','Part IV.','Part V.']:
        if part in sent:
            newrawtgtsents.append(part)
            newrawtgtsents.append(sent.split(part)[1])
            partfound = True
    if not partfound:
        newrawtgtsents.append(sent)
rawtgtsents = newrawtgtsents

## 2. Write standardized files (one line per sentence) for input to Bleualign

In [9]:
# these are the inputs to bleualign
with open(f'texts/{srclang}/sourcetextforbleualign.txt','w') as f:
    f.write('\n'.join(rawsrcsents))
with open(f'texts/{srclang}/targettextforbleualign.txt','w') as f:
    f.write('\n'.join(rawtgtsents))

In [10]:
# tokenized sentences for data output
srctokens = []
if srclang == 'Arabic':
    for srcsent in rawsrcsents:
        tokens = araby.tokenize(srcsent)
        srctokens.append([{'text' : t, 'lemma' : t} for t in tokens])
else:
    for srcsent in rawsrcsents:
        tokens = sourceNLP(srcsent)
        srctokens.append([{'text' : t.text, 'lemma' : t.lemma_} for t in tokens])
tgttokens = []
for tgtsent in rawtgtsents:
    tokens = engNLP(tgtsent)
    tgttokens.append([{'text' : t.text, 'lemma' : t.lemma_} for t in tokens])

# Run Bleualign

In [231]:
# !python setup.py install

In [232]:
# pip install translators --upgrade

In [233]:
import translators as ts

In [179]:
start = time.time()
translatedsourcesents = []
for i, sent in enumerate(rawsrcsents[:3]):
    if i % 25 == 0:
        print(f'{i}/{len(rawsrcsents)} sents translated.')
    try:
        translatedsourcesents.append(ts.google(sent, to_language = 'en'))
    except:
        print('problem on',sent)
        translatedsourcesents.append('\n')
end = time.time()
print(f'machine translation took {end-start} seconds')

0/164 sents translated.
machine translation took 2.1140940189361572 seconds


In [181]:
translatedsourcesents

['Part I.',
 'For this day name, and can not put it where God put him from the month and Sunnis, but can not mention this day a particular time, but almost nearly.',
 'The biggest thought that this time was happening today in dawn or lover.']

In [237]:
if srclang not in ['Arabic','Russian']:
    with open(f'texts/{srclang}/translatedsource.txt','w') as f:
        f.write('\n'.join(' '))

In [11]:
# %%capture cap --no-stderr
start = time.time()
!./bleualign.py -s texts/russian/sourcetextforbleualign.txt -t texts/russian/targettextforbleualign.txt --srctotarget texts/russian/translatedsource.txt -o texts/russian/outputfile --verbosity 2
end = time.time()
print(f'sentence alignment took {end-start} seconds')

reading in article 0: 
processing
computing alignment between srctotarget (file 0) and target text
Evaluating sentences with bleu
finished
searching for longest path of good alignments
finished
Fri Apr 15 10:18:45 2022
filling gaps
finished
Fri Apr 15 10:18:45 2022
Results of BLEU 1-to-1 alignment
[92m0: 0[1;m
[92m1: 1[1;m
[92m2: 2[1;m
[92m3: 3[1;m
[92m4: 4[1;m
[92m5: 5[1;m
[92m6: 6[1;m
[92m7: 7[1;m
[92m8: 8[1;m
[92m9: 9[1;m
[92m10: 10[1;m
[92m11: 11[1;m
[92m12: 12[1;m
[92m13: 13[1;m
[92m14: 14[1;m
[92m15: 15[1;m
[92m16: 16[1;m
[92m17: 17[1;m
[92m18: 18[1;m
[92m19: 19[1;m
[92m20: 20[1;m
[92m21: 21[1;m
[92m22: 22[1;m
[92m23: 23[1;m
[92m24: 24[1;m
[92m25: 25[1;m
[92m26: 26[1;m
[92m27: 27[1;m
[92m28: 28[1;m
[92m29: 29[1;m
[92m30: 30[1;m
[92m31: 32[1;m
[92m32: 33[1;m
[92m33: 34[1;m
[92m34: 35[1;m
[92m35: 36[1;m
[92m36: 37[1;m
[92m37: 38[1;m
[92m38: 39[1;m
[92m39: 40[1;m
[92m40: 41[1;m
[92m41: 42[1;m
[92m42

## [START HERE] 3. Read sentence-aligned files (from Bleualign)

In [12]:
with open(f'texts/{srclang}/outputfile-s','r') as f:
    alignedsrc = f.read().split('\n')
with open(f'texts/{srclang}/outputfile-t','r') as f:
    alignedtgt = f.read().split('\n')

In [13]:
i = random.choice(range(len(alignedsrc)))
alignedsrc[i], alignedtgt[i]

(' على أنه لم يكن يستطيع أن يبلو من شاطئ هذه القناة مسافة بعيدة، فقد كان هذا الشاطئ محفوفًا عن يمينه وعن شماله بالخطر.',
 'However, he was not able to explore along the bank of the canal for a great distance, inasmuch as both to right and to left the way was fraught with danger.')

In [14]:
# sent to sent alignment
oneLinesrc, oneLineEng = rawsrcsents, rawtgtsents
alignedsrc, alignedEng = alignedsrc, alignedtgt
sentAlignments = []
alignmentLookup = dict()
srcIndex = 0
for alignsrcSent, alignEngSent in zip(alignedsrc, alignedEng):
    if srcIndex % 50 == 0:
        print(f'{srcIndex}/{len(rawsrcsents)} sentences parsed.')
    individualEngSents = [sent.text for sent in engNLP(alignEngSent).sents]
    for indEngSent in individualEngSents:
        for i, thisEngLine in enumerate(oneLineEng):
            if indEngSent.strip() == thisEngLine.strip():
                engIndex = i
        for j, thissrcLine in enumerate(oneLinesrc):
            if alignsrcSent.strip() == thissrcLine.strip():
                srcIndex = j
        sentAlignments.append({
            'indices' : (srcIndex, engIndex),
            'sents' : (oneLinesrc[srcIndex], oneLineEng[engIndex])
        })
        alignmentLookup.setdefault(srcIndex,[])
        alignmentLookup[srcIndex].append(engIndex)
    srcIndex += 1

0/164 sentences parsed.
100/164 sentences parsed.
150/164 sentences parsed.


In [15]:
with open(f'jsondata/{srclang}/sentAlignment4-11.json', 'w', encoding='utf-8') as f:
    json.dump(sentAlignments, f, ensure_ascii=False, indent=4)

# EXAMPLE DONT NEED - check it works

In [17]:
# chec, k it works
randSentAlign = random.choice(sentAlignments)
s, t = randSentAlign['sents']
print(s)
print(t)

 كان جده هذا ثقيل الظل بغيضًا إليه، وكان يقضي في البيت فصل الشتاء من كل سنة، وكان قد صلح ونسك حين اضطرته الحياة إلى الصلاح والنسك، فكان يصلي الخمس لأوقاتها، ولم يكن لسانه يفتر عن ذكر الله.
This grandfather of his was to him an unattractive and odious person, who used to spend every winter at the house.


## 4.1 Parse word alignment using SimAlign (recommended: fast and high coverage)

In [120]:
# pip install simalign

In [18]:
from simalign import SentenceAligner
start = time.time()
# making an instance of our model.
# You can specify the embedding model and all alignment settings in the constructor.
myaligner = SentenceAligner(model="bert", token_type="bpe", matching_methods="mai")
end = time.time()
print(f'downloading word aligner tool took {end-start} seconds')

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
2022-04-15 10:21:00,693 - simalign.simalign - INFO - Initialized the EmbeddingLoader with model: bert-base-multilingual-cased


downloading word aligner tool took 9.31760287284851 seconds


## calculate word alignment with SimAlign

In [19]:
len(rawsrcsents), len(rawtgtsents)

(164, 233)

In [20]:
# get rid of white space at end
your_data = zip(rawsrcsents, rawtgtsents)

start = time.time()

wordAlignmentList = []

for i, srcsent in enumerate(rawsrcsents):
    if i % 25 == 0:
        currently = time.time()
        print(f"{i}/{len(rawsrcsents)} sentences parsed in {currently-start} s.")
        
    srcTokens = []
    if srclang != 'Arabic':
        srcDoc = sourceNLP(srcsent)

        for token in srcDoc:
            srcTokens.append({
                'tokenid' : token.idx,
                'pos' : token.pos_, 
                'text' : token.text, 
                'lemma' : token.lemma_,
                'features' : Morphology.feats_to_dict(str(token.morph))
            })
    else:
        srcDoc = araby.tokenize(srcsent)
        for tidx, token in enumerate(srcDoc):
            srcTokens.append({
                'tokenid' : tidx,
                'pos' : 'N/A', 
                'text' : token, 
                'lemma' : token,
                'features' : 'N/A'
            })

    try:
        jLst = alignmentLookup[i]
    except:
        continue
        
    for j in jLst:
        tgtDoc = engNLP(rawtgtsents[j])

        tgtTokens = []
        for token in tgtDoc:
            tgtTokens.append({
                'tokenid' : token.idx,
                'pos' : token.pos_, 
                'text' : token.text, 
                'lemma' : token.lemma_,
                'features' : Morphology.feats_to_dict(str(token.morph))
            })

        if srclang != 'Arabic':
            src = [t.text for t in srcDoc]
        else:
            src = srcDoc
            
        tgt = [t.text for t in tgtDoc]

        alignments = myaligner.get_word_aligns(src, tgt)
        itermax = alignments['itermax']

        wordAlignmentList.append({
            'alignedwordindices' : itermax,
            'alignedwords' : [(src[s], tgt[t]) for s, t in itermax],
            'srctokens' : srcTokens,
            'tgttokens' : tgtTokens,
            'srcsentidx' : i,
            'tgtsentidx' : j,
        })
end = time.time()
print('parsed in',end-start,'s')

0/164 sentences parsed in 0.0006680488586425781 s.
25/164 sentences parsed in 153.3745470046997 s.
50/164 sentences parsed in 254.79790997505188 s.
75/164 sentences parsed in 297.28424286842346 s.
100/164 sentences parsed in 363.3640058040619 s.
125/164 sentences parsed in 449.0529410839081 s.
150/164 sentences parsed in 528.0997431278229 s.
parsed in 577.4362461566925 s


In [22]:
with open(f'jsondata/{srclang}/wordAlignment4-11.json', 'w',encoding='utf-8') as f:
    json.dump(wordAlignmentList, f, ensure_ascii=False, indent=4)

In [23]:
srctokens = []
tgttokens = []
for srcsent in rawsrcsents:
    srcdoc = sourceNLP(srcsent)
    senttokens = [{
            'tokenid' : token.idx,
            'pos' : token.pos_, 
            'text' : token.text, 
            'lemma' : token.lemma_,
            'features' : Morphology.feats_to_dict(str(token.morph)),
            'linebreak' : False
        } for token in srcdoc]
    srctokens.append({
        'text' : srcsent,
        'tokens' : senttokens
    })
for tgtsent in rawtgtsents:
    tgtdoc = engNLP(tgtsent)
    senttokens = [{
            'tokenid' : token.idx,
            'pos' : token.pos_, 
            'text' : token.text, 
            'lemma' : token.lemma_,
            'features' : Morphology.feats_to_dict(str(token.morph)),
            'linebreak' : False
        } for token in tgtdoc]
    tgttokens.append({
        'text' : tgtsent,
        'tokens' : senttokens
    })

sentsInOrderJSON = {'srcSentsInOrder' : srctokens, 'tgtSentsInOrder' : tgttokens}
with open(f'jsondata/{srclang}/sentsInOrder4-11.json', 'w', encoding='utf-8') as f:
    json.dump(sentsInOrderJSON, f, ensure_ascii=False, indent=4)

### Extract paragraph breaks

In [1]:
# importing the module
import json
 
# Opening JSON file
with open(f'jsondata/{srclang}/sentsInOrder4-7.json') as json_file:
    sentsInOrder = json.load(json_file)

In [24]:
srcLineBreaks = []
for i, s in enumerate(sentsInOrder['srcSentsInOrder']):
    for j, t in enumerate(s['tokens']):
        islinebreak = t['linebreak']
        if islinebreak:
            srcLineBreaks.append(str(i) +','+str(j))
with open('jsondata/spanish/srcLineBreaks.txt','w') as f:
    f.write('\n'.join(srcLineBreaks))

In [26]:
tgtLineBreaks = []
for i, s in enumerate(sentsInOrder['tgtSentsInOrder']):
    for j, t in enumerate(s['tokens']):
        islinebreak = t['linebreak']
        if islinebreak:
            tgtLineBreaks.append(str(i) +','+str(j))
with open('jsondata/spanish/tgtLineBreaks.txt','w') as f:
    f.write('\n'.join(tgtLineBreaks))