## Install libraries, modules

In [17]:
# !python -m spacy download es_core_news_sm

In [18]:
# !python -m spacy download en_core_web_sm

In [19]:
# pip install spacy

In [4]:
# pip install pandas

In [1]:
import pandas as pd
import random
import spacy
import time

In [2]:
# Load the spaCy language models (Spanish first, then English).
# Both models must already be installed, e.g. via `python -m spacy download <model>`.

spaNLP, engNLP = (
    spacy.load(model_name)
    for model_name in ("es_core_news_sm", "en_core_web_sm")
)

## 1. Load raw texts

In [15]:
# Read both raw texts into memory.
# The encoding is pinned so accented Spanish characters decode the same way on
# every platform instead of following the locale default
# (assumes the files are saved as UTF-8 -- TODO confirm).
with open('raw_english.txt', 'r', encoding='utf-8') as f:
    engtext = f.read()
with open('raw_spanish.txt', 'r', encoding='utf-8') as f:
    spatext = f.read()

### Apply language model

In [16]:
# Run each raw text through the pipeline for its language (Spanish first).
spadoc, engdoc = spaNLP(spatext), engNLP(engtext)

In [17]:
# sentenize: keep the text of every sentence span each pipeline produced
spasents = [span.text for span in spadoc.sents]
engsents = [span.text for span in engdoc.sents]

## 2. Write standardized files (one line per sentence) for input to Bleualign

In [10]:
# these are the inputs to bleualign (one sentence per line)
# The encoding is pinned so the accented Spanish text is written identically on
# every platform rather than in the locale's default encoding.
with open('sourcetext.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(spasents))
with open('targettext.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(engsents))

### Manual step: at this point I also generate sourcetexttranslation.text with Google Translate and run Bleualign on the texts in my terminal.

## [START HERE] 3. Read sentence-aligned files (from Bleualign)

In [3]:
# Read the sentence-aligned files: line k of one file pairs with line k of the other.
# The encoding is pinned so accented characters do not depend on the locale
# default (assumes the files are UTF-8 -- TODO confirm).
# NOTE: split('\n') yields a trailing '' entry if a file ends with a newline.
with open('sent_aligned_spanish.txt', 'r', encoding='utf-8') as f:
    srcsents = f.read().split('\n')
with open('sent_aligned_english.txt', 'r', encoding='utf-8') as f:
    tgtsents = f.read().split('\n')

In [18]:
# Display one randomly chosen aligned pair as a sanity check.
pairCount = len(srcsents)
i = random.choice(range(pairCount))
(srcsents[i], tgtsents[i])

('Reconocí, encuadernados en seda amarilla, algunos tomos manuscritos de la Enciclopedia Perdida que dirigió el Tercer Emperador de la Dinastía Luminosa y que no se dio nunca a la imprenta.',
 'I recognized some large volumes bound in yellow silk-manuscripts of the Lost Encyclopedia which was edited by the Third Emperor of the Luminous Dynasty. They had never been printed.')

In [42]:
# sent to sent alignment
# For every Bleualign-aligned pair, split the English side into individual
# sentences and record which one-sentence-per-line indices they correspond to.
# NOTE: spasents/engsents are produced by the sentence-splitting cell in
# section 1, so that cell must have been run in this kernel.
oneLineSpa, oneLineEng = spasents, engsents
alignedSpa, alignedEng = srcsents, tgtsents

# Stripped text -> line index. Later duplicates overwrite earlier ones, which
# matches a linear scan that keeps the last matching line.
spaLookup = {line.strip(): j for j, line in enumerate(oneLineSpa)}
engLookup = {line.strip(): k for k, line in enumerate(oneLineEng)}

sentAlignments = []
unmatchedEngSents = 0
spaIndex = 0
for pairNum, (alignSpaSent, alignEngSent) in enumerate(zip(alignedSpa, alignedEng)):
    # Progress counts aligned pairs; it is kept separate from spaIndex, which
    # jumps around as matches are found.
    if pairNum % 50 == 0:
        print(f'{pairNum}/{len(srcsents)} sentences parsed.')

    # If the aligned Spanish line is not found verbatim, fall back to the line
    # following the previous pair's line (spaIndex is advanced at loop end).
    spaIndex = spaLookup.get(alignSpaSent.strip(), spaIndex)

    for engSent in engNLP(alignEngSent).sents:
        engIndex = engLookup.get(engSent.text.strip())
        if engIndex is None or spaIndex >= len(oneLineSpa):
            # No trustworthy index: skip instead of reusing a stale one.
            unmatchedEngSents += 1
            continue
        sentAlignments.append({
            'indices' : (spaIndex, engIndex),
            'sents' : (oneLineSpa[spaIndex], oneLineEng[engIndex])
        })
    spaIndex += 1

if unmatchedEngSents:
    print(f'{unmatchedEngSents} English sentences could not be matched and were skipped.')

0/214 sentences parsed.
50/214 sentences parsed.
100/214 sentences parsed.
150/214 sentences parsed.


In [43]:
# check it works: print the two sides of one random sentence alignment
randSentAlign = random.choice(sentAlignments)
s, t = randSentAlign['sents']
print(s, t, sep='\n')

"¿Ashgrove?", les pregunté a unos chicos en el andén.
"Ashgrove?"


In [44]:
import pickle

# Save the sentence-level alignments so later sessions can reload them.
with open('borges_sent_alignment_3-12.pickle', mode='wb') as sentPickleFile:
    pickle.dump(sentAlignments, sentPickleFile, pickle.HIGHEST_PROTOCOL)

## 4.1 Parse word alignment using SimAlign (recommended: fast and high coverage)

In [9]:
# pip install simalign

In [5]:
from simalign import SentenceAligner

# Create the SimAlign aligner instance used for word alignment.
# The embedding model and all alignment settings are chosen here, in the constructor.
myaligner = SentenceAligner(
    model="bert",
    token_type="bpe",
    matching_methods="mai",
)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
2022-03-12 11:28:27,941 - simalign.simalign - INFO - Initialized the EmbeddingLoader with model: bert-base-multilingual-cased


In [8]:
# # The source and target sentences should be tokenized to words.
# src_sentence = ["This", "is", "a", "test", "."]
# trg_sentence = ["Das", "ist", "ein", "Test", "."]

# # The output is a dictionary with different matching methods.
# # Each method has a list of pairs indicating the indexes of aligned words (The alignments are zero-indexed).
# alignments = myaligner.get_word_aligns(src_sentence, trg_sentence)

# for matching_method in alignments:
#     print(matching_method, ":", alignments[matching_method])

## calculate word alignment with SimAlign

In [6]:
import time

In [7]:
# Word-align every sentence pair with SimAlign, keeping the 'itermax' result.
start = time.time()

alignmentList = []
for i, (sent_es_str, sent_en_str) in enumerate(zip(srcsents, tgtsents)):
    if i % 10 == 0:
        print(f'{i}/{len(srcsents)} sentences parsed.')

    # Tokenizer only: the aligner takes each sentence as a list of word strings.
    src = [tok.text for tok in spaNLP.tokenizer(sent_es_str)]
    tgt = [tok.text for tok in engNLP.tokenizer(sent_en_str)]

    if src and tgt:
        itermax = myaligner.get_word_aligns(src, tgt)['itermax']
    else:
        # Blank line on either side: there is nothing to align. An empty
        # record is stored so alignmentList stays index-aligned with srcsents.
        itermax = []

    alignmentList.append({
        'indices' : itermax,
        'words' : [(src[s], tgt[t]) for s, t in itermax]
    })

end = time.time()
print('parsed in',end-start,'s')

0/214 sentences parsed.
10/214 sentences parsed.
20/214 sentences parsed.
30/214 sentences parsed.
40/214 sentences parsed.
50/214 sentences parsed.
60/214 sentences parsed.
70/214 sentences parsed.
80/214 sentences parsed.
90/214 sentences parsed.
100/214 sentences parsed.
110/214 sentences parsed.
120/214 sentences parsed.
130/214 sentences parsed.
140/214 sentences parsed.
150/214 sentences parsed.
160/214 sentences parsed.
170/214 sentences parsed.
180/214 sentences parsed.
190/214 sentences parsed.
200/214 sentences parsed.
210/214 sentences parsed.
parsed in 644.7377479076385 s


In [10]:
# Spot-check one randomly chosen word-alignment record.
sampleAlignment = random.choice(alignmentList)
sampleAlignment

{'indices': [(0, 0),
  (1, 2),
  (2, 3),
  (3, 4),
  (4, 5),
  (5, 6),
  (6, 7),
  (7, 1),
  (8, 8),
  (9, 9)],
 'words': [('Alcé', 'I'),
  ('los', 'my'),
  ('ojos', 'eyes'),
  ('y', 'and'),
  ('la', 'the'),
  ('tenue', 'short'),
  ('pesadilla', 'nightmare'),
  ('se', 'lifted'),
  ('disipó', 'disappeared'),
  ('.', '.')]}

### Export to pickle (bytestream) for later access

In [33]:
import pickle

# Save the word-level alignments so later sessions can reload them.
with open('borges_word_alignment_3-12.pickle', mode='wb') as wordPickleFile:
    pickle.dump(alignmentList, wordPickleFile, pickle.HIGHEST_PROTOCOL)

### an example of simalign on a single pair

In [38]:
# Align one randomly chosen pair and list the index pairs from every matching method.
pairCount = len(srcsents)
i = random.choice(range(pairCount))
src = [tok.text for tok in spaNLP.tokenizer(srcsents[i])]
tgt = [tok.text for tok in engNLP.tokenizer(tgtsents[i])]
alignments = myaligner.get_word_aligns(src, tgt)

for method, indexPairs in alignments.items():
    print(method, ':', indexPairs)

mwmf : [(0, 0), (5, 21), (9, 32), (10, 3), (11, 9), (16, 13), (17, 15), (18, 1), (19, 2), (21, 4), (23, 5), (23, 31), (24, 6), (25, 7), (26, 8), (27, 11), (29, 10), (30, 12), (31, 33), (32, 14), (33, 16), (34, 17), (35, 18), (36, 19), (37, 20), (38, 22), (39, 23), (40, 24), (41, 25), (42, 26), (43, 27), (44, 28), (45, 29), (46, 30), (47, 31), (48, 34)]
inter : [(0, 0), (11, 9), (16, 13), (18, 1), (21, 4), (23, 5), (24, 6), (25, 7), (26, 8), (27, 11), (29, 10), (30, 12), (32, 14), (34, 17), (35, 18), (36, 19), (37, 20), (38, 22), (39, 23), (40, 24), (41, 25), (42, 26), (43, 27), (44, 28), (45, 29), (46, 30), (48, 32)]
itermax : [(0, 0), (10, 3), (11, 9), (12, 10), (15, 7), (16, 13), (17, 0), (18, 1), (19, 2), (21, 4), (23, 5), (24, 6), (25, 7), (26, 8), (27, 11), (28, 9), (29, 10), (30, 12), (32, 14), (33, 16), (34, 17), (35, 18), (36, 19), (37, 20), (38, 21), (38, 22), (39, 23), (40, 24), (41, 25), (42, 26), (43, 27), (44, 28), (45, 29), (46, 30), (47, 31), (48, 32), (48, 34)]


In [39]:
# For each matching method, print the aligned word pairs, then a blank separator line.
for indexPairs in alignments.values():
    for srcIdx, tgtIdx in indexPairs:
        print(src[srcIdx], tgt[tgtIdx])
    print()

En In
favorable very
, .
usted you
ha have
; .
en yet
otro another
, ,
, ,
atravesar crossing
atravesar phantom
el the
jardín garden
, ,
me me
encontrado found
muerto dead
; "
en In
otro another
, ,
yo I
digo say
estas these
mismas same
palabras words
, ,
pero but
soy am
un an
error error
, ,
un a
fantasma phantom
. 8

En In
ha have
; .
otro another
, ,
atravesar crossing
el the
jardín garden
, ,
me me
encontrado found
muerto dead
en In
, ,
yo I
digo say
estas these
mismas same
palabras words
, ,
pero but
soy am
un an
error error
, ,
un a
. .

En In
usted you
ha have
llegado found
casa garden
; .
en In
otro another
, ,
, ,
atravesar crossing
el the
jardín garden
, ,
me me
ha have
encontrado found
muerto dead
en In
otro another
, ,
yo I
digo say
estas these
mismas very
mismas same
palabras words
, ,
pero but
soy am
un an
error error
, ,
un a
fantasma phantom
. .
. 8



## 4.2 Parse aligned sentences using Astred word alignment (backup, not recommended)

In [13]:
# !pip install astred[stanza]

In [14]:
# !pip install git+https://github.com/BramVanroy/awesome-align.git@astred_compat

In [15]:
import time
from astred import AlignedSentences, Sentence

### Running Astred Word Alignment (https://github.com/BramVanroy/astred/issues/3)

In [16]:
from astred.aligned import AlignedSentences, Sentence
from astred.aligner import Aligner
from astred.utils import load_parser

In [17]:
# Load the stanza parsers for both languages and the astred aligner.
nlp_en = load_parser("en", "stanza", is_tokenized=False, verbose=True)
nlp_es = load_parser("es", "stanza", is_tokenized=False, verbose=True)
aligner = Aligner()

your_data = zip(srcsents, tgtsents)

alignmentList = []
# `i` (pairs attempted) and `success` (pairs aligned) are read again by the
# success-rate cell further down, so they stay module-level counters.
i = 0
success = 0
for sent_es_str, sent_en_str in your_data:
    if i % 10 == 0:
        print(f'{i}/{len(srcsents)} sentences parsed.')
    try:
        # Earlier experiment (disabled): replace '.' with ';' in the English
        # text before parsing -- presumably to keep it as a single sentence.
        sent_en = Sentence.from_text(sent_en_str, nlp_en)
        sent_es = Sentence.from_text(sent_es_str, nlp_es)
        aligned = AlignedSentences(sent_es, sent_en, aligner=aligner)
    except Exception as err:
        # Best effort: store a placeholder so alignmentList stays index-aligned
        # with the input pairs. Catching Exception (not a bare except) lets
        # Ctrl-C / kernel interrupts still stop the loop, and the failure
        # reason is kept for inspection.
        alignmentList.append({
            'spanish_text' : sent_es_str,
            'english_text' : sent_en_str,
            'spanish_nlp' : 'Error',
            'english_nlp' : 'Error',
            'alignment' : 'Error',
            'error' : repr(err)
        })
    else:
        alignmentList.append({
            'spanish_text' : sent_es_str,
            'english_text' : sent_en_str,
            'spanish_nlp' : sent_es,
            'english_nlp' : sent_en,
            'alignment' : aligned
        })
        success += 1
    i += 1

2022-02-26 08:06:19 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| pos       | combined |
| lemma     | combined |
| depparse  | combined |

2022-02-26 08:06:19 INFO: Use device: cpu
2022-02-26 08:06:19 INFO: Loading: tokenize
2022-02-26 08:06:19 INFO: Loading: pos
2022-02-26 08:06:21 INFO: Loading: lemma
2022-02-26 08:06:21 INFO: Loading: depparse
2022-02-26 08:06:23 INFO: Done loading processors!
2022-02-26 08:07:20 INFO: Loading these models for language: es (Spanish):
| Processor | Package |
-----------------------
| tokenize  | ancora  |
| mwt       | ancora  |
| pos       | ancora  |
| lemma     | ancora  |
| depparse  | ancora  |

2022-02-26 08:07:20 INFO: Use device: cpu
2022-02-26 08:07:20 INFO: Loading: tokenize
2022-02-26 08:07:20 INFO: Loading: mwt
2022-02-26 08:07:20 INFO: Loading: pos
2022-02-26 08:07:22 INFO: Loading: lemma
2022-02-26 08:07:22 INFO: Loading: depparse
2022-02-26 08:07:25 

0/219 sentences parsed.
10/219 sentences parsed.
20/219 sentences parsed.
30/219 sentences parsed.
40/219 sentences parsed.
50/219 sentences parsed.
60/219 sentences parsed.
70/219 sentences parsed.
80/219 sentences parsed.
90/219 sentences parsed.
100/219 sentences parsed.
110/219 sentences parsed.
120/219 sentences parsed.
130/219 sentences parsed.
140/219 sentences parsed.
150/219 sentences parsed.
160/219 sentences parsed.
170/219 sentences parsed.
180/219 sentences parsed.
190/219 sentences parsed.
200/219 sentences parsed.
210/219 sentences parsed.


## 5. Alignment done! View alignments below.

In [21]:
# Guard the ratio so a run with zero processed pairs reports 0.0 instead of
# raising ZeroDivisionError.
successRate = success / i if i else 0.0
print(f'success rate of word alignment on sentences is {success}/{i} sents or {successRate}')

success rate of word alignment on sentences is 164/219 sents or 0.7488584474885844


### Sequence alignment AND word alignment (more importantly)

In [22]:
# Only sample from pairs that aligned successfully: failed pairs store the
# string 'Error' under 'alignment', which has no aligned_seq_spans/aligned_words.
successfulAlignments = [entry for entry in alignmentList if entry['alignment'] != 'Error']

if not successfulAlignments:
    print('no successfully aligned sentences to display')
else:
    a = random.choice(successfulAlignments)
    # Sequence-span alignment first ...
    for pair in a['alignment'].aligned_seq_spans:
        print(pair[0].text, '\t\t'+pair[1].text)
    print()
    print()
    # ... then the word-level alignment.
    for pair in a['alignment'].aligned_words:
        print(pair[0].text, '\t\t'+pair[1].text)

 		
 		as
 		as
 		possible
Aniquilado , trémulo , me encogí en la otra punta de el sillón , 		Shattered , trembling , I huddled in the distant corner of the seat ,
lejos 		far
de el temido cristal . 		from the fearful window .


[[NULL]] 		[[NULL]]
[[NULL]] 		as
[[NULL]] 		as
[[NULL]] 		possible
Aniquilado 		Shattered
, 		,
trémulo 		trembling
, 		,
me 		I
encogí 		huddled
en 		in
la 		the
otra 		distant
punta 		corner
de 		of
el 		the
sillón 		seat
, 		,
lejos 		far
de 		from
el 		the
temido 		fearful
cristal 		window
. 		.


In [23]:
### Write something not completely huge to the Pickle

def _astredTokenRecord(token):
    """Flatten one token of an astred sentence into a plain dict."""
    return {
        'tokenid' : token.id,
        'POS' : token.upos,
        'text' : token.text,
        'lemma' : token.lemma,
        'features' : token.feats
    }


def _spacyTokenRecord(token):
    """Fallback record from a spaCy token, used when astred failed on the pair."""
    return {
        'tokenid' : token.i,
        'POS' : token.pos_,
        'text' : token.text,
        'lemma' : token.lemma_,
        'features' : 'Error'
    }


pickleList = []
for a in alignmentList:
    if a['alignment'] != 'Error':
        # Successful pair: take tokens and word alignment from astred.
        spanishTokens = [_astredTokenRecord(token) for token in a['spanish_nlp']]
        englishTokens = [_astredTokenRecord(token) for token in a['english_nlp']]
        alignmentTuples = [(pair[0].id, pair[1].id) for pair in a['alignment'].aligned_words]
    else:
        # Failed pair: re-tokenize the raw text with spaCy and store no alignment.
        spanishTokens = [_spacyTokenRecord(token) for token in spaNLP(a['spanish_text'])]
        englishTokens = [_spacyTokenRecord(token) for token in engNLP(a['english_text'])]
        alignmentTuples = []

    # build list
    pickleList.append({
        'spanishRawText' : a['spanish_text'],
        'englishRawText' : a['english_text'],
        'spanishTokenList' : spanishTokens,
        'englishTokenList' : englishTokens,
        'alignmentTuples' : alignmentTuples
    })

In [32]:
# Spot-check: print the alignment index pairs of one random entry
# (an empty list means the pair had no astred alignment).
p = random.choice(pickleList)
sampleTuples = p['alignmentTuples']
print(sampleTuples)

[]


In [25]:
import pickle

# Save the flattened astred results so later sessions can reload them.
with open('alignment2-26.pickle', mode='wb') as astredPickleFile:
    pickle.dump(pickleList, astredPickleFile, pickle.HIGHEST_PROTOCOL)

## ignore below

In [11]:
# jank version which worked ok but took SO long (8+ hrs)

# # alignment tool (takes a freaking long time)
# start = time.time()

# alignmentList = []
# success = 0
# total = 0
# for i in range(len(srcsents)):
#     if srcsents[i] == '' or tgtsents[i] == '':
#         alignmentList.append((srcsents[i],tgtsents[i]))
#         print(f'blank space on sentence {i}')
#     else:
#         try:
#             sent_sp = Sentence.from_text(srcsents[i], "es", is_tokenized=False)
#             sent_es = Sentence.from_text(tgtsents[i], "en", is_tokenized=False)
#             aligned = AlignedSentences(sent_sp, sent_es)
#             print(aligned.aligned_words)
#             alignmentList.append(aligned)
#             success += 1
#             print(f'successfully aligned sentence {i}')
#         except:
#             alignmentList.append((srcsents[i],tgtsents[i]))
#             print(f'spacy failure on sentence {i}')
#     total += 1
    
# end = time.time()
# print(f'only took {end-start} seconds or {(end-start)/60} minutes to run')