## Install libraries, modules

In [1]:
# !python -m spacy download es_core_news_sm

In [2]:
# !python -m spacy download en_core_web_sm

In [3]:
# !python -m spacy download ru_core_news_sm

In [4]:
# pip install spacy

In [5]:
# pip install pandas

In [137]:
# pip install PyArabic

In [140]:
# !python -m spacy download xx_ent_wiki_sm

In [1]:
import spacy
from spacy.morphology import Morphology

import pyarabic.araby as araby
import pyarabic.number as number

import pandas as pd
import random

import json

import time

In [2]:
# SPECIFY SOURCE LANGUAGE
srclang = 'Russian'

In [3]:
# install spacy lang models

if srclang == 'Spanish':
    sourceNLP = spacy.load("es_core_news_sm")
elif srclang == 'Russian':
    sourceNLP = spacy.load("ru_core_news_sm")
elif srclang == 'Arabic':
    sourceNLP = spacy.load("xx_ent_wiki_sm")
    sourceNLP.add_pipe('sentencizer')
    
engNLP = spacy.load("en_core_web_sm")

## 1. Load raw texts

In [4]:
with open(f'texts/{srclang}/rawsource.txt','r') as f:
     sourcetxt = f.read().replace('\n',' ').replace('\t','')
with open(f'texts/{srclang}/rawtarget.txt','r') as f:
     targettxt = f.read().replace('\n',' ')

### Apply language model

In [5]:
if srclang == 'Arabic':
    # sourcedoc = araby.sentence_tokenize(sourcetxt)
    sourcedoc = sourceNLP(sourcetxt)
else:
    sourcedoc = sourceNLP(sourcetxt)
targetdoc = engNLP(targettxt)

In [6]:
# sentenize
rawsrcsents = []
rawtgtsents = []

for sent in sourcedoc.sents:
    rawsrcsents.append(sent.text)
    
newrawsrcsents = []
for sent in rawsrcsents:
    if sent == '':
        continue
    partfound = False
    for part in ['Part I.','Part II.','Part III.','Part IV.','Part V.']:
        if part in sent:
            newrawsrcsents.append(part)
            newrawsrcsents.append(sent.split(part)[1].strip())
            partfound = True
    if not partfound:
        newrawsrcsents.append(sent)
rawsrcsents = newrawsrcsents
    
        
for sent in targetdoc.sents:
    rawtgtsents.append(sent.text)

newrawtgtsents = []
for sent in rawtgtsents:
    if sent == '':
        continue
    partfound = False
    for part in ['Part I.','Part II.','Part III.','Part IV.','Part V.']:
        if part in sent:
            newrawtgtsents.append(part)
            newrawtgtsents.append(sent.split(part)[1])
            partfound = True
    if not partfound:
        newrawtgtsents.append(sent)
rawtgtsents = newrawtgtsents

## 2. Write standardized files (one line per sentence) for input to Bleualign

In [117]:
# these are the inputs to bleualign
with open(f'texts/{srclang}/sourcetextforbleualign.txt','w') as f:
    f.write('\n'.join(rawsrcsents))
with open(f'texts/{srclang}/targettextforbleualign.txt','w') as f:
    f.write('\n'.join(rawtgtsents))

In [118]:
# tokenized sentences for data output
srctokens = []
if srclang == 'Arabic':
    for srcsent in rawsrcsents:
        tokens = araby.tokenize(srcsent)
        srctokens.append([{'text' : t, 'lemma' : t} for t in tokens])
else:
    for srcsent in rawsrcsents:
        tokens = sourceNLP(srcsent)
        srctokens.append([{'text' : t.text, 'lemma' : t.lemma_} for t in tokens])
tgttokens = []
for tgtsent in rawtgtsents:
    tokens = engNLP(tgtsent)
    tgttokens.append([{'text' : t.text, 'lemma' : t.lemma_} for t in tokens])

# Run Bleualign

In [119]:
# !python setup.py install

In [120]:
# pip install translators --upgrade

In [233]:
import translators as ts

In [179]:
start = time.time()
translatedsourcesents = []
for i, sent in enumerate(rawsrcsents[:3]):
    if i % 25 == 0:
        print(f'{i}/{len(rawsrcsents)} sents translated.')
    try:
        translatedsourcesents.append(ts.google(sent, to_language = 'en'))
    except:
        print('problem on',sent)
        translatedsourcesents.append('\n')
end = time.time()
print(f'machine translation took {end-start} seconds')

0/164 sents translated.
machine translation took 2.1140940189361572 seconds


In [181]:
translatedsourcesents

['Part I.',
 'For this day name, and can not put it where God put him from the month and Sunnis, but can not mention this day a particular time, but almost nearly.',
 'The biggest thought that this time was happening today in dawn or lover.']

In [237]:
if srclang not in ['Arabic','Russian']:
    with open(f'texts/{srclang}/translatedsource.txt','w') as f:
        f.write('\n'.join(' '))

In [11]:
# %%capture cap --no-stderr
start = time.time()
!./bleualign.py -s texts/russian/sourcetextforbleualign.txt -t texts/russian/targettextforbleualign.txt --srctotarget texts/russian/translatedsource.txt -o texts/russian/outputfile --verbosity 2
end = time.time()
print(f'sentence alignment took {end-start} seconds')

reading in article 0: 
processing
computing alignment between srctotarget (file 0) and target text
Evaluating sentences with bleu
finished
searching for longest path of good alignments
finished
Fri Apr 15 10:18:45 2022
filling gaps
finished
Fri Apr 15 10:18:45 2022
Results of BLEU 1-to-1 alignment
[92m0: 0[1;m
[92m1: 1[1;m
[92m2: 2[1;m
[92m3: 3[1;m
[92m4: 4[1;m
[92m5: 5[1;m
[92m6: 6[1;m
[92m7: 7[1;m
[92m8: 8[1;m
[92m9: 9[1;m
[92m10: 10[1;m
[92m11: 11[1;m
[92m12: 12[1;m
[92m13: 13[1;m
[92m14: 14[1;m
[92m15: 15[1;m
[92m16: 16[1;m
[92m17: 17[1;m
[92m18: 18[1;m
[92m19: 19[1;m
[92m20: 20[1;m
[92m21: 21[1;m
[92m22: 22[1;m
[92m23: 23[1;m
[92m24: 24[1;m
[92m25: 25[1;m
[92m26: 26[1;m
[92m27: 27[1;m
[92m28: 28[1;m
[92m29: 29[1;m
[92m30: 30[1;m
[92m31: 32[1;m
[92m32: 33[1;m
[92m33: 34[1;m
[92m34: 35[1;m
[92m35: 36[1;m
[92m36: 37[1;m
[92m37: 38[1;m
[92m38: 39[1;m
[92m39: 40[1;m
[92m40: 41[1;m
[92m41: 42[1;m
[92m42

## [START HERE] 3. Read sentence-aligned files (from Bleualign)

In [7]:
with open(f'texts/{srclang}/outputfile-s','r') as f:
    alignedsrc = f.read().split('\n')
with open(f'texts/{srclang}/outputfile-t','r') as f:
    alignedtgt = f.read().split('\n')

In [10]:
i = random.choice(range(len(alignedsrc)))
alignedsrc[i], alignedtgt[i]

('Не заставляйте же меня страдать еще больше!',
 'Don’t make me suffer still more!')

In [15]:
rawsrcsents[:5]

['Часть I (Дама с собачкой).',
 ' Говорили, что на набережной появилось новое лицо: дама с собачкой.',
 'Дмитрий Дмитрич Гуров, проживший в Ялте уже две недели и привыкший тут, тоже стал интересоваться новыми лицами.',
 'Сидя в павильоне у Верне, он видел, как по набережной прошла молодая дама, невысокого роста блондинка, в берете; за нею бежал белый шпиц.',
 'И потом он встречал ее в городском саду и на сквере по нескольку раз в день.']

In [17]:
alignedsrc[:5]

['Часть I (Дама с собачкой).',
 ' Говорили, что на набережной появилось новое лицо: дама с собачкой.',
 'Дмитрий Дмитрич Гуров, проживший в Ялте уже две недели и привыкший тут, тоже стал интересоваться новыми лицами.',
 'Сидя в павильоне у Верне, он видел, как по набережной прошла молодая дама, невысокого роста блондинка, в берете; за нею бежал белый шпиц.',
 'И потом он встречал ее в городском саду и на сквере по нескольку раз в день.']

In [27]:
# sent to sent alignment
# oneLinesrc, oneLineEng = rawsrcsents, rawtgtsents
# alignedsrc, alignedEng = alignedsrc, alignedtgt

sentAlignments = []
alignmentLookup = dict()

for alignsrcLine, aligntgtLine in zip(alignedsrc, alignedtgt):
    srcIndices = []
    tgtIndices = []
    for i, srcSent in enumerate(rawsrcsents):
        if srcSent in ['','\n',' ',')']:
            continue
        if srcSent in alignsrcLine:
            srcIndices.append(i)
    for j, tgtSent in enumerate(rawtgtsents):
        if tgtSent in ['','\n',' ',')','. .']:
            continue
        if tgtSent in aligntgtLine:
            tgtIndices.append(j)

    sentAlignments.append({
        'indices' : (srcIndices, tgtIndices),
        'sents' : (alignsrcLine, aligntgtLine)
    })
    for i in srcIndices:
        alignmentLookup.setdefault(i, [])
        for j in tgtIndices:
            alignmentLookup[i].append(j)
    
# for alignsrcSent, alignEngSent in zip(alignedsrc, alignedEng):
#     if srcIndex % 50 == 0:
#         print(f'{srcIndex}/{len(rawsrcsents)} sentences parsed.')
#     individualEngSents = [sent.text for sent in engNLP(alignEngSent).sents]
#     for indEngSent in individualEngSents:
#         for i, thisEngLine in enumerate(oneLineEng):
#             if indEngSent.strip() == thisEngLine.strip():
#                 engIndex = i
#         for j, thissrcLine in enumerate(oneLinesrc):
#             if alignsrcSent.strip() == thissrcLine.strip():
#                 srcIndex = j
#         sentAlignments.append({
#             'indices' : (srcIndex, engIndex),
#             'sents' : (oneLinesrc[srcIndex], oneLineEng[engIndex])
#         })
#         alignmentLookup.setdefault(srcIndex,[])
#         alignmentLookup[srcIndex].append(engIndex)
#     srcIndex += 1

In [29]:
alignmentLookup

{0: [0],
 1: [1],
 2: [2],
 3: [3],
 4: [4],
 5: [5],
 6: [6],
 7: [7],
 8: [8],
 9: [9],
 10: [10],
 11: [11],
 12: [12],
 13: [13],
 14: [14],
 15: [15],
 16: [16],
 17: [17],
 18: [18],
 19: [19],
 20: [20],
 21: [21],
 22: [22],
 23: [23],
 24: [24],
 25: [25],
 26: [26],
 27: [27],
 28: [28],
 29: [29],
 30: [30, 31],
 31: [32],
 32: [33],
 33: [34],
 34: [35],
 35: [36],
 36: [37],
 37: [38],
 38: [39],
 39: [40],
 40: [41],
 41: [42],
 42: [43],
 43: [44],
 45: [45, 46],
 46: [47],
 47: [48],
 48: [49],
 49: [51],
 50: [52],
 51: [53],
 52: [54],
 53: [55],
 54: [56],
 55: [57],
 56: [58],
 57: [59],
 58: [60],
 59: [61],
 60: [62],
 61: [63],
 62: [64],
 63: [65],
 64: [66],
 65: [67],
 66: [68, 69],
 67: [71],
 68: [72],
 69: [73],
 70: [74],
 71: [75],
 72: [76],
 73: [77],
 74: [78, 109],
 75: [79],
 76: [80],
 77: [81],
 78: [82],
 79: [83],
 80: [84],
 81: [85],
 82: [86, 87],
 83: [88],
 84: [89],
 85: [90],
 86: [91],
 87: [92],
 88: [93],
 89: [94],
 90: [95],
 91: [96,

In [127]:
with open(f'jsondata/{srclang}/sentAlignment4-18.json', 'w', encoding='utf-8') as f:
    json.dump(sentAlignments, f, ensure_ascii=False, indent=4)

# EXAMPLE DONT NEED - check it works

In [39]:
# chec, k it works
randSentAlign = random.choice(sentAlignments)
s, t = randSentAlign['sents']
print(s)
print(t)

В его наружности, в характере, во всей его натуре было что-то привлекательное, неуловимое, что располагало к нему женщин, манило их; он знал об этом, и самого его тоже какая-то сила влекла к ним.
In his appearance, in his character, in his whole nature there was something attractive and elusive that disposed women towards him and enticed them; he knew that, and he himself was attracted to them by some force.


## 4.1 Parse word alignment using SimAlign (recommended: fast and high coverage)

In [58]:
# pip install simalign

In [43]:
from simalign import SentenceAligner
start = time.time()
# making an instance of our model.
# You can specify the embedding model and all alignment settings in the constructor.
myaligner = SentenceAligner(model="bert", token_type="bpe", matching_methods="mai")
end = time.time()
print(f'downloading word aligner tool took {end-start} seconds')

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
2022-04-18 19:06:33,080 - simalign.simalign - INFO - Initialized the EmbeddingLoader with model: bert-base-multilingual-cased


downloading word aligner tool took 4.5657641887664795 seconds


## calculate word alignment with SimAlign

In [44]:
len(rawsrcsents), len(rawtgtsents)

(321, 344)

In [45]:
# get rid of white space at end
your_data = zip(rawsrcsents, rawtgtsents)

start = time.time()

wordAlignmentList = []

for i, srcsent in enumerate(rawsrcsents):
    if i % 25 == 0:
        currently = time.time()
        print(f"{i}/{len(rawsrcsents)} sentences parsed in {currently-start} s.")
        
    srcTokens = []
    if srclang != 'Arabic':
        srcDoc = sourceNLP(srcsent)

        for tid, token in enumerate(srcDoc):
            srcTokens.append({
                'tokenid' : tid,
                'pos' : token.pos_, 
                'text' : token.text, 
                'lemma' : token.lemma_,
                'features' : Morphology.feats_to_dict(str(token.morph))
            })
    else:
        srcDoc = araby.tokenize(srcsent)
        for tidx, token in enumerate(srcDoc):
            srcTokens.append({
                'tokenid' : tidx,
                'pos' : 'N/A', 
                'text' : token, 
                'lemma' : token,
                'features' : 'N/A'
            })

    try:
        jLst = alignmentLookup[i]
    except:
        continue
        
    for j in jLst:
        tgtDoc = engNLP(rawtgtsents[j])

        tgtTokens = []
        for tid, token in enumerate(tgtDoc):
            tgtTokens.append({
                'tokenid' : tid,
                'pos' : token.pos_, 
                'text' : token.text, 
                'lemma' : token.lemma_,
                'features' : Morphology.feats_to_dict(str(token.morph))
            })

        if srclang != 'Arabic':
            src = [t.text for t in srcDoc]
        else:
            src = srcDoc
            
        tgt = [t.text for t in tgtDoc]
        # alignments = myaligner.get_word_aligns(src, tgt)
        
        try:
            alignments = myaligner.get_word_aligns(src, tgt)
            itermax = alignments['itermax']

            wordAlignmentList.append({
                'alignedwordindices' : itermax,
                'alignedwords' : [(src[s], tgt[t]) for s, t in itermax],
                'srctokens' : srcTokens,
                'tgttokens' : tgtTokens,
                'srcsentidx' : i,
                'tgtsentidx' : j,
            })
        except:
            print('problem on', src, tgt)
end = time.time()
print('parsed in',end-start,'s')

0/321 sentences parsed in 0.0007679462432861328 s.
25/321 sentences parsed in 61.51004886627197 s.
50/321 sentences parsed in 96.67140698432922 s.
75/321 sentences parsed in 208.13037300109863 s.
100/321 sentences parsed in 224.94836902618408 s.
125/321 sentences parsed in 253.2319040298462 s.
150/321 sentences parsed in 305.49098110198975 s.
175/321 sentences parsed in 336.7091188430786 s.
200/321 sentences parsed in 360.60863995552063 s.
225/321 sentences parsed in 411.69376516342163 s.
250/321 sentences parsed in 440.1134970188141 s.
275/321 sentences parsed in 451.2287931442261 s.
300/321 sentences parsed in 483.06016993522644 s.
parsed in 508.19329619407654 s


In [59]:
wordAlignmentList[3]

{'alignedwordindices': [(0, 0),
  (1, 1),
  (2, 2),
  (2, 3),
  (3, 4),
  (4, 5),
  (4, 6),
  (5, 7),
  (6, 8),
  (7, 9),
  (10, 25),
  (11, 27),
  (12, 24),
  (13, 10),
  (13, 11),
  (14, 12),
  (15, 13),
  (16, 15),
  (16, 16),
  (17, 17),
  (19, 19),
  (19, 23),
  (20, 20),
  (21, 22),
  (22, 28),
  (23, 29),
  (24, 30),
  (25, 31),
  (26, 32),
  (26, 33),
  (27, 34),
  (28, 35)],
 'alignedwords': [('Сидя', 'Sitting'),
  ('в', 'in'),
  ('павильоне', 'a'),
  ('павильоне', 'pavilion'),
  ('у', 'at'),
  ('Верне', 'Vernet'),
  ('Верне', '’s'),
  (',', ','),
  ('он', 'he'),
  ('видел', 'saw'),
  ('по', 'along'),
  ('набережной', 'embankment'),
  ('прошла', 'walking'),
  ('молодая', 'a'),
  ('молодая', 'young'),
  ('дама', 'woman'),
  (',', ','),
  ('невысокого', 'very'),
  ('невысокого', 'tall'),
  ('роста', ','),
  (',', ','),
  (',', ','),
  ('в', 'in'),
  ('берете', 'beret'),
  (';', ';'),
  ('за', 'behind'),
  ('нею', 'her'),
  ('бежал', 'ran'),
  ('белый', 'a'),
  ('белый', 'white')

In [58]:
with open(f'jsondata/{srclang}/wordAlignment4-18.json', 'w',encoding='utf-8') as f:
    json.dump(wordAlignmentList, f, ensure_ascii=False, indent=4)

## Dump Sentence Order

In [53]:
with open(f'jsondata/{srclang}/srcLineBreaks.txt','r') as f:
    file = f.read().split('\n')
    srcLineBreaks = [(int(l.split(',')[0]), int(l.split(',')[1])) for l in file]
with open(f'jsondata/{srclang}/tgtLineBreaks.txt','r') as f:
    file = f.read().split('\n')
    tgtLineBreaks = [(int(l.split(',')[0]), int(l.split(',')[1])) for l in file]

In [54]:
srcLineBreaks[:3]

[(0, 7), (3, 28), (5, 30)]

In [55]:
srctokens = []
tgttokens = []
for i, srcsent in enumerate(rawsrcsents):
    srcdoc = sourceNLP(srcsent)
    
    tokenswithlinebreak = []
    for sentidx, tokenidx in srcLineBreaks:
        if sentidx == i:
            tokenswithlinebreak.append(tokenidx)
            print(sentidx, tokenswithlinebreak)
    
    senttokens = [{
            'tokenid' : t,
            'pos' : token.pos_, 
            'text' : token.text, 
            'lemma' : token.lemma_,
            'features' : Morphology.feats_to_dict(str(token.morph)),
            'linebreak' : t in tokenswithlinebreak
        } for t, token in enumerate(srcdoc)]
    srctokens.append({
        'text' : srcsent,
        'tokens' : senttokens
    })
for i, tgtsent in enumerate(rawtgtsents):
    tgtdoc = engNLP(tgtsent)
    
    tokenswithlinebreak = []
    for sentidx, tokenidx in tgtLineBreaks:
        if sentidx == i:
            tokenswithlinebreak.append(tokenidx)
    
    senttokens = [{
            'tokenid' : t,
            'pos' : token.pos_, 
            'text' : token.text, 
            'lemma' : token.lemma_,
            'features' : Morphology.feats_to_dict(str(token.morph)),
            'linebreak' : t in tokenswithlinebreak
        } for t, token in enumerate(tgtdoc)]
    tgttokens.append({
        'text' : tgtsent,
        'tokens' : senttokens
    })

sentsInOrderJSON = {'srcSentsInOrder' : srctokens, 'tgtSentsInOrder' : tgttokens}
with open(f'jsondata/{srclang}/sentsInOrder4-18.json', 'w', encoding='utf-8') as f:
    json.dump(sentsInOrderJSON, f, ensure_ascii=False, indent=4)

0 [7]
3 [28]
5 [30]
6 [24]
11 [35]
11 [35, 39]
14 [44]
16 [28]
19 [86]
22 [3]
23 [9]
24 [10]
26 [19]
27 [3]
28 [8]
29 [2]
30 [20]
34 [7]
42 [10]
46 [12]
47 [19]
48 [8]
53 [3]
56 [24]
59 [27]
61 [12]
64 [7]
65 [4]
66 [34]
66 [34, 44]
67 [4]
70 [152]
73 [26]
75 [8]
78 [9]
79 [39]
81 [8]
83 [3]
84 [4]
96 [25]
97 [38]
98 [15]
99 [11]
103 [12]
104 [8]
105 [31]
106 [43]
107 [6]
109 [4]
110 [16]
116 [73]
119 [16]
120 [11]
122 [2]
123 [4]
128 [30]
131 [3]
133 [4]
136 [15]
138 [2]
139 [16]
144 [5]
149 [30]
152 [4]
153 [7]
157 [34]
161 [9]
170 [14]
177 [28]
177 [28, 39]
178 [18]
179 [12]
179 [12, 16]
180 [2]
181 [12]
186 [51]
189 [16]
192 [14]
195 [5]
197 [13]
198 [19]
207 [23]
209 [24]
212 [8]
213 [18]
215 [5]
217 [8]
218 [16]
221 [14]
224 [3]
227 [43]
229 [17]
229 [17, 20]
235 [10]
236 [9]
238 [7]
239 [17]
243 [3]
245 [5]
246 [23]
249 [13]
250 [33]
255 [2]
256 [8]
264 [9]
266 [19]
267 [8]
271 [13]
274 [4]
276 [18]
277 [9]
283 [26]
287 [13]
289 [3]
291 [2]
293 [8]
294 [19]
297 [5]
298 [8]
300 [

In [57]:
srctokens[0]

{'text': 'Часть I (Дама с собачкой).',
 'tokens': [{'tokenid': 0,
   'pos': 'NOUN',
   'text': 'Часть',
   'lemma': 'часть',
   'features': {'Animacy': 'Inan',
    'Case': 'Nom',
    'Gender': 'Fem',
    'Number': 'Sing'},
   'linebreak': False},
  {'tokenid': 1,
   'pos': 'ADJ',
   'text': 'I',
   'lemma': 'i',
   'features': {},
   'linebreak': False},
  {'tokenid': 2,
   'pos': 'PUNCT',
   'text': '(',
   'lemma': '(',
   'features': {},
   'linebreak': False},
  {'tokenid': 3,
   'pos': 'PROPN',
   'text': 'Дама',
   'lemma': 'дама',
   'features': {'Animacy': 'Anim',
    'Case': 'Nom',
    'Gender': 'Fem',
    'Number': 'Sing'},
   'linebreak': False},
  {'tokenid': 4,
   'pos': 'ADP',
   'text': 'с',
   'lemma': 'с',
   'features': {},
   'linebreak': False},
  {'tokenid': 5,
   'pos': 'NOUN',
   'text': 'собачкой',
   'lemma': 'собачка',
   'features': {'Animacy': 'Inan',
    'Case': 'Ins',
    'Gender': 'Fem',
    'Number': 'Sing'},
   'linebreak': False},
  {'tokenid': 6,
   '

### Extract paragraph breaks

In [48]:
# importing the module
import json
 
# Opening JSON file
with open(f'jsondata/{srclang}/sentsInOrder.json') as json_file:
    sentsInOrder = json.load(json_file)

In [51]:
srcLineBreaks = []
for i, s in enumerate(sentsInOrder['srcSentsInOrder']):
    for tid, t in enumerate(s['tokens']):
        islinebreak = t['linebreak']
        j = tid
        if islinebreak:
            srcLineBreaks.append(str(i) +','+str(j))
with open(f'jsondata/{srclang}/srcLineBreaks.txt','w') as f:
    f.write('\n'.join(srcLineBreaks))

In [52]:
tgtLineBreaks = []
for i, s in enumerate(sentsInOrder['tgtSentsInOrder']):
    for tid, t in enumerate(s['tokens']):
        islinebreak = t['linebreak']
        j = tid
        if islinebreak:
            tgtLineBreaks.append(str(i) +','+str(j))
with open(f'jsondata/{srclang}/tgtLineBreaks.txt','w') as f:
    f.write('\n'.join(tgtLineBreaks))