## Install libraries, modules

In [9]:
# !python -m spacy download es_core_news_sm

In [10]:
# !python -m spacy download en_core_web_sm

In [11]:
# pip install spacy

In [12]:
# pip install pandas

In [13]:
import pandas as pd
import random
import spacy
import time

In [14]:
# install spacy lang models

spaNLP = spacy.load("es_core_news_sm")
engNLP = spacy.load("en_core_web_sm")

## 1. Load raw texts

In [15]:
with open('texts/rawsource.txt','r') as f:
     sourcetxt = f.read()
with open('texts/rawtarget.txt','r') as f:
     targettxt = f.read()

### Apply language model

In [16]:
sourcedoc = spaNLP(sourcetxt)
targetdoc = engNLP(targettxt)

In [17]:
# sentenize
rawsrcsents = []
rawtgtsents = []
for sent in sourcedoc.sents:
    rawsrcsents.append(sent.text)
for sent in targetdoc.sents:
    rawtgtsents.append(sent.text)

## 2. Write standardized files (one line per sentence) for input to Bleualign

In [18]:
# these are the inputs to bleualign
with open('sourcetextforbleualign.txt','w') as f:
    f.write('\n'.join(rawsrcsents))
with open('targettextforbleualign.txt','w') as f:
    f.write('\n'.join(rawtgtsents))

### At this point, I also generate sourcetexttranslation.text using Google Translate and run Bleualign on the texts on my terminal.

# Run Bleualign

In [24]:
# !python setup.py install

In [25]:
# pip install translators --upgrade

In [26]:
import translators as ts

Using United States server backend.


In [None]:
start = time.time()
translatedsourcesents = []
for sent in rawsrcsents:
    translatedsourcesents.append(ts.google(sent, to_language = 'en'))
end = time.time()
print(f'machine translation took {end-start} seconds')

In [68]:
with open('translatedsource.txt','w') as f:
    f.write('\n'.join(translatedsourcesents))

In [70]:
!./bleualign.py -s sourcetextforbleualign.txt -t targettextforbleualign.txt --srctotarget translatedsource.txt -o outputfile

reading in article 0: 
processing
computing alignment between srctotarget (file 0) and target text
Evaluating sentences with bleu
finished
searching for longest path of good alignments
finished
filling gaps
finished

finished with article




## [START HERE] 3. Read sentence-aligned files (from Bleualign)

In [71]:
with open('outputfile-s','r') as f:
    srcsents = f.read().split('\n')
with open('outputfile-t','r') as f:
    tgtsents = f.read().split('\n')

In [75]:
i = random.choice(range(len(srcsents)))
srcsents[i], tgtsents[i]

('Madden era implacable.', 'Madden was implacable.')

In [78]:
# sent to sent alignment
oneLineSpa, oneLineEng = rawsrcsents, rawtgtsents
alignedSpa, alignedEng = srcsents, tgtsents
sentAlignments = []
spaIndex = 0
for alignSpaSent, alignEngSent in zip(alignedSpa, alignedEng):
    if spaIndex % 50 == 0:
        print(f'{spaIndex}/{len(srcsents)} sentences parsed.')
    individualEngSents = [sent.text for sent in engNLP(alignEngSent).sents]
    for indEngSent in individualEngSents:
        for i, thisEngLine in enumerate(oneLineEng):
            if indEngSent.strip() == thisEngLine.strip():
                engIndex = i
        for j, thisSpaLine in enumerate(oneLineSpa):
            if alignSpaSent.strip() == thisSpaLine.strip():
                spaIndex = j
        sentAlignments.append({
            'indices' : (spaIndex, engIndex),
            'sents' : (oneLineSpa[spaIndex], oneLineEng[engIndex])
        })
    spaIndex += 1

0/212 sentences parsed.
50/212 sentences parsed.
100/212 sentences parsed.
150/212 sentences parsed.
200/212 sentences parsed.


In [79]:
# chec, k it works
randSentAlign = random.choice(sentAlignments)
s, t = randSentAlign['sents']
print(s)
print(t)

Me pareció que el húmedo jardín que rodeaba la casa estaba saturado hasta lo infinito de invisibles personas.
It seemed to me that the dew-damp garden surrounding the house was infinitely saturated with invisible people.


In [80]:
import pickle

with open('borges_sent_alignment_3-21.pickle', 'wb') as handle:
    pickle.dump(sentAlignments, handle, protocol=pickle.HIGHEST_PROTOCOL)

## 4.1 Parse word alignment using SimAlign (recommended: fast and high coverage)

In [9]:
# pip install simalign

In [81]:
from simalign import SentenceAligner

# making an instance of our model.
# You can specify the embedding model and all alignment settings in the constructor.
myaligner = SentenceAligner(model="bert", token_type="bpe", matching_methods="mai")

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
2022-03-21 20:27:54,604 - simalign.simalign - INFO - Initialized the EmbeddingLoader with model: bert-base-multilingual-cased


In [8]:
# # The source and target sentences should be tokenized to words.
# src_sentence = ["This", "is", "a", "test", "."]
# trg_sentence = ["Das", "ist", "ein", "Test", "."]

# # The output is a dictionary with different matching methods.
# # Each method has a list of pairs indicating the indexes of aligned words (The alignments are zero-indexed).
# alignments = myaligner.get_word_aligns(src_sentence, trg_sentence)

# for matching_method in alignments:
#     print(matching_method, ":", alignments[matching_method])

## calculate word alignment with SimAlign

In [82]:
import time

In [87]:
your_data = zip(srcsents[:-1], tgtsents[:-1])
start = time.time()

alignmentList = []
i = 0
for sent_es_str, sent_en_str in your_data:
    if i % 10 == 0:
        print(f'{i}/{len(srcsents)} sentences parsed.')
    srcDoc = spaNLP.tokenizer(sent_es_str)
    tgtDoc = engNLP.tokenizer(sent_en_str)
    
    srcTokens = []
    for idx, token in enumerate(srcDoc):
        srcTokens.append({
            'tokenid' : idx
            'POS' : token.upos, 
            'text' : token.text, 
            'lemma' : token.lemma,
            'features' : token.feats
        })

    src = [t.text for t in srcDoc]
    tgt = [t.text for t in tgtDoc]
    alignments = myaligner.get_word_aligns(src, tgt)
    itermax = alignments['itermax']
    
    
    alignmentList.append({
        'indices' : itermax,
        'words' : [(src[s], tgt[t]) for s, t in itermax]

    })
    
    i += 1
end = time.time()
print('parsed in',end-start,'s')

0/212 sentences parsed.
10/212 sentences parsed.
20/212 sentences parsed.
30/212 sentences parsed.
40/212 sentences parsed.
50/212 sentences parsed.
60/212 sentences parsed.
70/212 sentences parsed.
80/212 sentences parsed.
90/212 sentences parsed.
100/212 sentences parsed.
110/212 sentences parsed.
120/212 sentences parsed.
130/212 sentences parsed.
140/212 sentences parsed.
150/212 sentences parsed.
160/212 sentences parsed.
170/212 sentences parsed.
180/212 sentences parsed.
190/212 sentences parsed.
200/212 sentences parsed.
210/212 sentences parsed.
parsed in 295.08400797843933 s


In [88]:
random.choice(alignmentList)

{'indices': [(0, 6),
  (1, 3),
  (2, 0),
  (2, 4),
  (3, 2),
  (5, 8),
  (6, 9),
  (7, 10),
  (8, 19),
  (9, 20),
  (10, 22),
  (11, 7),
  (11, 18),
  (12, 11),
  (13, 12),
  (14, 13),
  (15, 14),
  (16, 16),
  (17, 23),
  (18, 24),
  (19, 25),
  (19, 26),
  (20, 27),
  (22, 28),
  (23, 29),
  (23, 30),
  (23, 41),
  (24, 31),
  (24, 32),
  (25, 33),
  (26, 34),
  (27, 35),
  (28, 36),
  (29, 37),
  (30, 38),
  (31, 39),
  (32, 41),
  (35, 40),
  (36, 43),
  (37, 42)],
 'words': [('Leí', 'I'),
  ('con', 'without'),
  ('incomprensión', 'Eagerly'),
  ('incomprensión', 'understanding'),
  ('y', 'but'),
  ('estas', 'the'),
  ('palabras', 'words'),
  ('que', 'which'),
  ('con', 'with'),
  ('minucioso', 'a'),
  ('pincel', 'brush'),
  ('redactó', 'read'),
  ('redactó', 'written'),
  ('un', 'a'),
  ('hombre', 'man'),
  ('de', 'of'),
  ('mi', 'my'),
  ('sangre', 'blood'),
  (':', ':'),
  ('"', '"'),
  ('Dejo', 'I'),
  ('Dejo', 'leave'),
  ('a', 'to'),
  ('varios', 'various'),
  ('porvenires', '

### import to pickle (bytestream) for later access

In [90]:
import pickle

with open('borges_word_alignment_3-21.pickle', 'wb') as handle:
    pickle.dump(alignmentList, handle, protocol=pickle.HIGHEST_PROTOCOL)

### an example of simalign on a single pair

In [91]:
i = random.choice(range(len(srcsents)))
srcDoc = spaNLP.tokenizer(srcsents[i])
tgtDoc = engNLP.tokenizer(tgtsents[i])
src = [t.text for t in srcDoc]
tgt = [t.text for t in tgtDoc]
alignments = myaligner.get_word_aligns(src, tgt)

for match in alignments:
    print(match, ':', alignments[match])

mwmf : [(0, 0), (1, 2), (1, 3), (2, 4), (3, 5), (4, 6), (5, 1), (5, 7), (7, 8), (8, 9), (9, 10), (9, 12), (10, 21), (11, 11), (12, 13), (13, 14), (14, 15), (15, 16), (16, 17), (17, 18), (17, 19), (18, 20), (19, 21), (20, 22), (21, 23), (22, 24), (22, 25), (23, 26), (24, 27), (25, 28)]
inter : [(3, 5), (4, 6), (5, 7), (7, 8), (8, 9), (9, 10), (10, 11), (12, 13), (13, 14), (14, 15), (15, 16), (16, 17), (17, 18), (17, 19), (18, 20), (19, 21), (20, 22), (21, 23), (22, 24), (22, 25), (23, 26), (24, 27), (25, 28)]
itermax : [(0, 0), (3, 5), (4, 6), (5, 1), (5, 7), (7, 8), (8, 9), (9, 10), (9, 12), (10, 11), (12, 13), (13, 14), (14, 15), (15, 16), (16, 17), (17, 18), (17, 19), (18, 20), (19, 21), (20, 22), (21, 23), (22, 24), (22, 25), (23, 26), (24, 27), (25, 28)]


In [92]:
for match in alignments:
    for s, t in alignments[match]:
        print(src[s], tgt[t])
    print()

Lo I
hice out
hice my
, plan
porque because
yo I
sentía carried
sentía felt
el the
jefe Chief
temía had
temía fear
un uncountable
poco some
a of
los those
de of
mi my
raza race
-a ,
-a of
los those
innumerables uncountable
antepasados forebears
que whose
confluyen culmination
confluyen lies
en in
mí- me
. .

porque because
yo I
sentía felt
el the
jefe Chief
temía had
un some
a of
los those
de of
mi my
raza race
-a ,
-a of
los those
innumerables uncountable
antepasados forebears
que whose
confluyen culmination
confluyen lies
en in
mí- me
. .

Lo I
porque because
yo I
sentía carried
sentía felt
el the
jefe Chief
temía had
temía fear
un some
a of
los those
de of
mi my
raza race
-a ,
-a of
los those
innumerables uncountable
antepasados forebears
que whose
confluyen culmination
confluyen lies
en in
mí- me
. .



## 4.2 Parse aligned sentences using Astred word alignment (backup, not recommended)

In [13]:
# !pip install astred[stanza]

In [14]:
# !pip install git+https://github.com/BramVanroy/awesome-align.git@astred_compat

In [15]:
import time
from astred import AlignedSentences, Sentence

### Running Astred Word Alignment (https://github.com/BramVanroy/astred/issues/3)

In [16]:
from astred.aligned import AlignedSentences, Sentence
from astred.aligner import Aligner
from astred.utils import load_parser

In [17]:
nlp_en = load_parser("en", "stanza", is_tokenized=False, verbose=True)
nlp_es = load_parser("es", "stanza", is_tokenized=False, verbose=True)
aligner = Aligner()

your_data = zip(srcsents, tgtsents)

alignmentList = []
i = 0
success = 0
for sent_es_str, sent_en_str in your_data:
    if i % 10 == 0:
        print(f'{i}/{len(srcsents)} sentences parsed.')
    try:
#         joinedsents = sent_en_str.replace('.',';')
#         if joinedsents[-1] == ';':
#             joinedsents = joinedsents[:-1] + '.'
#         sent_en = Sentence.from_text(joinedsents, nlp_en)
        sent_en = Sentence.from_text(sent_en_str, nlp_en)
        sent_es = Sentence.from_text(sent_es_str, nlp_es)
        aligned = AlignedSentences(sent_es, sent_en, aligner=aligner)
        alignmentList.append({
            'spanish_text' : sent_es_str,
            'english_text' : sent_en_str,
            'spanish_nlp' : sent_es,
            'english_nlp' : sent_en,
            'alignment' : aligned
        })
        success += 1
    except:
        alignmentList.append({
            'spanish_text' : sent_es_str,
            'english_text' : sent_en_str,
            'spanish_nlp' : 'Error',
            'english_nlp' : 'Error',
            'alignment' : 'Error'
        })
    i += 1
#     if i == 20:
#         break

2022-02-26 08:06:19 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| pos       | combined |
| lemma     | combined |
| depparse  | combined |

2022-02-26 08:06:19 INFO: Use device: cpu
2022-02-26 08:06:19 INFO: Loading: tokenize
2022-02-26 08:06:19 INFO: Loading: pos
2022-02-26 08:06:21 INFO: Loading: lemma
2022-02-26 08:06:21 INFO: Loading: depparse
2022-02-26 08:06:23 INFO: Done loading processors!
2022-02-26 08:07:20 INFO: Loading these models for language: es (Spanish):
| Processor | Package |
-----------------------
| tokenize  | ancora  |
| mwt       | ancora  |
| pos       | ancora  |
| lemma     | ancora  |
| depparse  | ancora  |

2022-02-26 08:07:20 INFO: Use device: cpu
2022-02-26 08:07:20 INFO: Loading: tokenize
2022-02-26 08:07:20 INFO: Loading: mwt
2022-02-26 08:07:20 INFO: Loading: pos
2022-02-26 08:07:22 INFO: Loading: lemma
2022-02-26 08:07:22 INFO: Loading: depparse
2022-02-26 08:07:25 

0/219 sentences parsed.
10/219 sentences parsed.
20/219 sentences parsed.
30/219 sentences parsed.
40/219 sentences parsed.
50/219 sentences parsed.
60/219 sentences parsed.
70/219 sentences parsed.
80/219 sentences parsed.
90/219 sentences parsed.
100/219 sentences parsed.
110/219 sentences parsed.
120/219 sentences parsed.
130/219 sentences parsed.
140/219 sentences parsed.
150/219 sentences parsed.
160/219 sentences parsed.
170/219 sentences parsed.
180/219 sentences parsed.
190/219 sentences parsed.
200/219 sentences parsed.
210/219 sentences parsed.


## 5. Alignment done! View alignments below.

In [21]:
print(f'success rate of word alignment on sentences is {success}/{i} sents or {success/i}')

success rate of word alignment on sentences is 164/219 sents or 0.7488584474885844


### Sequence alignment AND word alignment (more importantly)

In [22]:
a = random.choice(alignmentList)
for pair in a['alignment'].aligned_seq_spans:
    print(pair[0].text, '\t\t'+pair[1].text)
print()
print()
for pair in a['alignment'].aligned_words:
    print(pair[0].text, '\t\t'+pair[1].text)

 		
 		as
 		as
 		possible
Aniquilado , trémulo , me encogí en la otra punta de el sillón , 		Shattered , trembling , I huddled in the distant corner of the seat ,
lejos 		far
de el temido cristal . 		from the fearful window .


[[NULL]] 		[[NULL]]
[[NULL]] 		as
[[NULL]] 		as
[[NULL]] 		possible
Aniquilado 		Shattered
, 		,
trémulo 		trembling
, 		,
me 		I
encogí 		huddled
en 		in
la 		the
otra 		distant
punta 		corner
de 		of
el 		the
sillón 		seat
, 		,
lejos 		far
de 		from
el 		the
temido 		fearful
cristal 		window
. 		.


In [23]:
### Write something not completely huge to the Pickle
pickleList = []
for a in alignmentList:
    if a['alignment'] != 'Error': 
        spanishTokens = [{
            'tokenid' : token.id, 
            'POS' : token.upos, 
            'text' : token.text, 
            'lemma' : token.lemma,
            'features' : token.feats
        } for token in a['spanish_nlp']]
        englishTokens = [{
            'tokenid' : token.id, 
            'POS' : token.upos, 
            'text' : token.text, 
            'lemma' : token.lemma,
            'features' : token.feats
        } for token in a['english_nlp']]
        alignmentTuples = [(pair[0].id, pair[1].id) for pair in a['alignment'].aligned_words]
    
    else:
        spanishTokens = [{
            'tokenid' : token.i, 
            'POS' : token.pos_, 
            'text' : token.text, 
            'lemma' : token.lemma_,
            'features' : 'Error'
        } for token in spaNLP(a['spanish_text'])]
        englishTokens = [{
            'tokenid' : token.i, 
            'POS' : token.pos_, 
            'text' : token.text, 
            'lemma' : token.lemma_,
            'features' : 'Error'
        } for token in engNLP(a['english_text'])]
        
        alignmentTuples = []
    
    # build list
    pickleList.append({
        'spanishRawText' : a['spanish_text'],
        'englishRawText' : a['english_text'],
        'spanishTokenList' : spanishTokens,
        'englishTokenList' : englishTokens,
        'alignmentTuples' : alignmentTuples
    })

In [32]:
p =random.choice(pickleList)
print(p['alignmentTuples'])

[]


In [25]:
import pickle

with open('alignment2-26.pickle', 'wb') as handle:
    pickle.dump(pickleList, handle, protocol=pickle.HIGHEST_PROTOCOL)

## ignore below

In [11]:
# jank version which worked ok but took SO long (8+ hrs)

# # alignment tool (takes a freaking long time)
# start = time.time()

# alignmentList = []
# success = 0
# total = 0
# for i in range(len(srcsents)):
#     if srcsents[i] == '' or tgtsents[i] == '':
#         alignmentList.append((srcsents[i],tgtsents[i]))
#         print(f'blank space on sentence {i}')
#     else:
#         try:
#             sent_sp = Sentence.from_text(srcsents[i], "es", is_tokenized=False)
#             sent_es = Sentence.from_text(tgtsents[i], "en", is_tokenized=False)
#             aligned = AlignedSentences(sent_sp, sent_es)
#             print(aligned.aligned_words)
#             alignmentList.append(aligned)
#             success += 1
#             print(f'successfully aligned sentence {i}')
#         except:
#             alignmentList.append((srcsents[i],tgtsents[i]))
#             print(f'spacy failure on sentence {i}')
#     total += 1
    
# end = time.time()
# print(f'only took {end-start} seconds or {(end-start)/60} minutes to run')