## Spacy models installation / loading

In [1]:
!python -m spacy download en_core_web_sm
!python -m spacy download fr_core_news_sm

import spacy
en_model = spacy.load("en_core_web_sm")
fr_model = spacy.load("fr_core_news_sm")

if en_model and fr_model:
    print("Installation successful")
else:
    print("Installation error")

Collecting en-core-web-sm==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl (12.8 MB)
     --------------------------------------- 12.8/12.8 MB 11.3 MB/s eta 0:00:00
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')
Collecting fr-core-news-sm==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.4.0/fr_core_news_sm-3.4.0-py3-none-any.whl (16.3 MB)
     ---------------------------------------- 16.3/16.3 MB 7.8 MB/s eta 0:00:00
✔ Download and installation successful
You can now load the package via spacy.load('fr_core_news_sm')


  from .autonotebook import tqdm as notebook_tqdm


Installation successful


## Loading of the file with sentence pairs

In [43]:
import pandas as pd
data = pd.read_csv("en_fr_1000.csv") # change with the desired file (in the same folder, as a .csv)
data # check that everything's fine

Unnamed: 0,EN,FR,Bad,Unnamed: 3
0,In 1835 he met and befriended Taras Shevchenko.,En juillet 1835 il rencontre et se lie avec Ta...,False,
1,Edward kept a camel as a pet and as a young ma...,Édouard avait un chameau comme animal de compa...,False,
2,Although it represents a limited activity toda...,Aujourd’hui réduite l’activité ferroviaire a é...,False,
3,In 1979 Gebhard (with Alan B. Johnson) conclud...,En 1979 Gebhard (avec Alan B. Johnson) a concl...,False,
4,In 1998 the USAF's Lockheed U-2S fleet was fit...,En 1998 la flotte américaine de Lockheed U-2 e...,False,
...,...,...,...,...
1983,He was deployed to the Russian front.,Il est appelé sur le front russe.,False,
1984,The telescope is remotely controllable from mu...,Le télescope est contrôlé à distance grâce à d...,False,
1985,With armistice declared of June 18 1940 he ref...,À l'appel du 18 juin (1940) il refuse l'armist...,False,
1986,Faas Wilkes futbolista holandés.,Faas Wilkes footballeur néerlandais.,True,


#### POS parsing

In [44]:
en_parsed, fr_parsed = [], [] # list containing all of the POS-parsed sentences

for en_sentence, fr_sentence, is_bad in zip(data["EN"], data["FR"], data["Bad"]): # change columns names if needed
    en_s, fr_s = "", ""
    
    if is_bad == False:
        parsed_en_s = en_model(en_sentence)
        parsed_fr_s = fr_model(fr_sentence)
        
        for token in parsed_en_s:
            en_s += f"{token} {token.pos_}\n"
        
        for token in parsed_fr_s:
            fr_s += f"{token} {token.pos_}\n"
    
    en_parsed.append(en_s)
    fr_parsed.append(fr_s)

print(en_parsed[:10], fr_parsed[:10])

['In ADP\n1835 NUM\nhe PRON\nmet VERB\nand CCONJ\nbefriended VERB\nTaras PROPN\nShevchenko PROPN\n. PUNCT\n', 'Edward PROPN\nkept VERB\na DET\ncamel NOUN\nas ADP\na DET\npet NOUN\nand CCONJ\nas SCONJ\na DET\nyoung ADJ\nman NOUN\ntook VERB\na DET\nlion NOUN\nwith ADP\nhim PRON\non ADP\ncampaign NOUN\nto ADP\nScotland PROPN\n. PUNCT\n', 'Although SCONJ\nit PRON\nrepresents VERB\na DET\nlimited ADJ\nactivity NOUN\ntoday NOUN\nrail NOUN\ntransportation NOUN\nwas AUX\na DET\nparticularly ADV\nlarge ADJ\nmarket NOUN\nin ADP\nyears NOUN\ngone VERB\nby ADV\n: PUNCT\n442 NUM\n% NOUN\nof ADP\nall DET\nthe DET\nengines NOUN\nproduced VERB\nin ADP\nthe DET\nhistory NOUN\nof ADP\nS.E.M.T. PROPN\nwere AUX\ndestined VERB\nfor ADP\nlocomotives NOUN\n. PUNCT\n', "In ADP\n1979 NUM\nGebhard PROPN\n( PUNCT\nwith ADP\nAlan PROPN\nB. PROPN\nJohnson PROPN\n) PUNCT\nconcluded VERB\nthat SCONJ\nnone NOUN\nof ADP\nKinsey PROPN\n's PART\noriginal ADJ\nestimates NOUN\nwere AUX\nsignificantly ADV\naffected VERB\nb

In [45]:
data["POS EN"], data["POS FR"] = en_parsed, fr_parsed # change column name if needed
data # check that the annotations have been added

Unnamed: 0,EN,FR,Bad,Unnamed: 3,POS EN,POS FR
0,In 1835 he met and befriended Taras Shevchenko.,En juillet 1835 il rencontre et se lie avec Ta...,False,,In ADP\n1835 NUM\nhe PRON\nmet VERB\nand CCONJ...,En ADP\njuillet NOUN\n1835 NUM\nil PRON\nrenco...
1,Edward kept a camel as a pet and as a young ma...,Édouard avait un chameau comme animal de compa...,False,,Edward PROPN\nkept VERB\na DET\ncamel NOUN\nas...,Édouard PROPN\navait AUX\nun DET\nchameau NOUN...
2,Although it represents a limited activity toda...,Aujourd’hui réduite l’activité ferroviaire a é...,False,,Although SCONJ\nit PRON\nrepresents VERB\na DE...,Aujourd’hui ADV\nréduite VERB\nl’ SPACE\nactiv...
3,In 1979 Gebhard (with Alan B. Johnson) conclud...,En 1979 Gebhard (avec Alan B. Johnson) a concl...,False,,In ADP\n1979 NUM\nGebhard PROPN\n( PUNCT\nwith...,En ADP\n1979 NUM\nGebhard PROPN\n( PUNCT\navec...
4,In 1998 the USAF's Lockheed U-2S fleet was fit...,En 1998 la flotte américaine de Lockheed U-2 e...,False,,In ADP\n1998 NUM\nthe DET\nUSAF PROPN\n's PART...,En ADP\n1998 NUM\nla DET\nflotte NOUN\namérica...
...,...,...,...,...,...,...
1983,He was deployed to the Russian front.,Il est appelé sur le front russe.,False,,He PRON\nwas AUX\ndeployed VERB\nto ADP\nthe D...,Il PRON\nest AUX\nappelé VERB\nsur ADP\nle DET...
1984,The telescope is remotely controllable from mu...,Le télescope est contrôlé à distance grâce à d...,False,,The DET\ntelescope NOUN\nis AUX\nremotely ADV\...,Le DET\ntélescope NOUN\nest AUX\ncontrôlé VERB...
1985,With armistice declared of June 18 1940 he ref...,À l'appel du 18 juin (1940) il refuse l'armist...,False,,With SCONJ\narmistice NOUN\ndeclared VERB\nof ...,À ADP\nl' DET\nappel NOUN\ndu ADP\n18 NUM\njui...
1986,Faas Wilkes futbolista holandés.,Faas Wilkes footballeur néerlandais.,True,,,


In [42]:
#data = data[data["Bad"]==False] # removing bad pairs
data

Unnamed: 0,EN,FR,Bad,Unnamed: 3,POS EN,POS FR
0,In 1835 he met and befriended Taras Shevchenko.,En juillet 1835 il rencontre et se lie avec Ta...,False,,In ADP\n1835 NUM\nhe PRON\nmet VERB\nand CCONJ...,En ADP\njuillet NOUN\n1835 NUM\nil PRON\nrenco...
1,Edward kept a camel as a pet and as a young ma...,Édouard avait un chameau comme animal de compa...,False,,Edward PROPN\nkept VERB\na DET\ncamel NOUN\nas...,Édouard PROPN\navait AUX\nun DET\nchameau NOUN...
2,Although it represents a limited activity toda...,Aujourd’hui réduite l’activité ferroviaire a é...,False,,Although SCONJ\nit PRON\nrepresents VERB\na DE...,Aujourd’hui ADV\nréduite VERB\nl’ SPACE\nactiv...
3,In 1979 Gebhard (with Alan B. Johnson) conclud...,En 1979 Gebhard (avec Alan B. Johnson) a concl...,False,,In ADP\n1979 NUM\nGebhard PROPN\n( PUNCT\nwith...,En ADP\n1979 NUM\nGebhard PROPN\n( PUNCT\navec...
4,In 1998 the USAF's Lockheed U-2S fleet was fit...,En 1998 la flotte américaine de Lockheed U-2 e...,False,,In ADP\n1998 NUM\nthe DET\nUSAF PROPN\n's PART...,En ADP\n1998 NUM\nla DET\nflotte NOUN\namérica...
...,...,...,...,...,...,...
1982,Nitta pinned under the dead horse and unable t...,Nitta coincé sous le cheval mort et incapable ...,False,,Nitta PROPN\npinned VERB\nunder ADP\nthe DET\n...,Nitta PROPN\ncoincé VERB\nsous ADP\nle DET\nch...
1983,He was deployed to the Russian front.,Il est appelé sur le front russe.,False,,He PRON\nwas AUX\ndeployed VERB\nto ADP\nthe D...,Il PRON\nest AUX\nappelé VERB\nsur ADP\nle DET...
1984,The telescope is remotely controllable from mu...,Le télescope est contrôlé à distance grâce à d...,False,,The DET\ntelescope NOUN\nis AUX\nremotely ADV\...,Le DET\ntélescope NOUN\nest AUX\ncontrôlé VERB...
1985,With armistice declared of June 18 1940 he ref...,À l'appel du 18 juin (1940) il refuse l'armist...,False,,With SCONJ\narmistice NOUN\ndeclared VERB\nof ...,À ADP\nl' DET\nappel NOUN\ndu ADP\n18 NUM\njui...


### Word Alignment

In [21]:
%pip install simalign
from simalign import SentenceAligner

Note: you may need to restart the kernel to use updated packages.


In [50]:
en = data['EN']
fr = data['FR']

# making an instance of our model.
# You can specify the embedding model and all alignment settings in the constructor.
myaligner = SentenceAligner(model="bert", token_type="bpe", matching_methods="mai")

# List of alignments for all sentences
alignments = []

n = data[data.columns[0]].count()

for i in range(n):
    # Tokenize the sentence with spacy and then turn it into a list of tokens
    print(i)

    src_sentence = [str(token) for token in en_model(str(en[i]))]
    trg_sentence = [str(token) for token in fr_model(str(fr[i]))]

    # The source and target sentences should be tokenized to words.
    als = myaligner.get_word_aligns(src_sentence, trg_sentence)
    
    # The alignment is represented in the format like 0-0 1-1 ... where 0...m are indeces of the words.
    # The first index is from the source sentence, the second one is from the target
    alignments.append(' '.join(map(lambda x: f'{x[0]}-{x[1]}', als['itermax'])))

data['WA'] = alignments

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
2022-11-19 22:40:12,646 - simalign.simalign - INFO - Initialized the EmbeddingLoader with model: bert-base-multilingual-cased


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [51]:
data

Unnamed: 0,EN,FR,Bad,Unnamed: 3,POS EN,POS FR,WA
0,In 1835 he met and befriended Taras Shevchenko.,En juillet 1835 il rencontre et se lie avec Ta...,False,,In ADP\n1835 NUM\nhe PRON\nmet VERB\nand CCONJ...,En ADP\njuillet NOUN\n1835 NUM\nil PRON\nrenco...,0-0 0-1 1-2 2-3 3-4 4-5 5-6 5-7 5-8 6-9 7-10 8-11
1,Edward kept a camel as a pet and as a young ma...,Édouard avait un chameau comme animal de compa...,False,,Edward PROPN\nkept VERB\na DET\ncamel NOUN\nas...,Édouard PROPN\navait AUX\nun DET\nchameau NOUN...,0-0 1-1 2-2 3-3 4-4 5-6 6-5 7-8 8-9 9-10 10-11...
2,Although it represents a limited activity toda...,Aujourd’hui réduite l’activité ferroviaire a é...,False,,Although SCONJ\nit PRON\nrepresents VERB\na DE...,Aujourd’hui ADV\nréduite VERB\nl’ SPACE\nactiv...,0-0 1-2 2-14 4-1 5-3 6-0 7-4 9-6 10-5 12-7 13-...
3,In 1979 Gebhard (with Alan B. Johnson) conclud...,En 1979 Gebhard (avec Alan B. Johnson) a concl...,False,,In ADP\n1979 NUM\nGebhard PROPN\n( PUNCT\nwith...,En ADP\n1979 NUM\nGebhard PROPN\n( PUNCT\navec...,0-0 1-1 2-2 3-3 4-4 5-5 6-6 7-7 8-8 9-10 10-11...
4,In 1998 the USAF's Lockheed U-2S fleet was fit...,En 1998 la flotte américaine de Lockheed U-2 e...,False,,In ADP\n1998 NUM\nthe DET\nUSAF PROPN\n's PART...,En ADP\n1998 NUM\nla DET\nflotte NOUN\namérica...,0-0 1-1 2-2 3-4 4-5 5-6 6-7 7-3 8-8 9-9 10-10 ...
...,...,...,...,...,...,...,...
1983,He was deployed to the Russian front.,Il est appelé sur le front russe.,False,,He PRON\nwas AUX\ndeployed VERB\nto ADP\nthe D...,Il PRON\nest AUX\nappelé VERB\nsur ADP\nle DET...,0-0 1-1 2-2 3-3 4-4 5-6 6-5 7-7
1984,The telescope is remotely controllable from mu...,Le télescope est contrôlé à distance grâce à d...,False,,The DET\ntelescope NOUN\nis AUX\nremotely ADV\...,Le DET\ntélescope NOUN\nest AUX\ncontrôlé VERB...,0-0 1-1 2-2 3-3 4-3 5-4 8-6 9-10 10-10 11-11 1...
1985,With armistice declared of June 18 1940 he ref...,À l'appel du 18 juin (1940) il refuse l'armist...,False,,With SCONJ\narmistice NOUN\ndeclared VERB\nof ...,À ADP\nl' DET\nappel NOUN\ndu ADP\n18 NUM\njui...,0-0 1-12 2-13 3-3 4-5 5-4 5-6 6-7 7-9 8-10 9-1...
1986,Faas Wilkes futbolista holandés.,Faas Wilkes footballeur néerlandais.,True,,,,0-0 1-1 2-2 3-3 4-4


#### Export results

In [52]:
data.to_csv("en_fr_1000_pos_we.csv", index=False) # change output file name with desired name

#### Remove bad pairs

In [3]:
import pandas as pd
data = pd.read_csv("en_fr_1000_pos_we.csv")
cleaned_data = data[data["Bad"]==False]
print(cleaned_data)
cleaned_data.to_csv("en_fr_1788_pos_wa.csv", index=False)

                                                     EN  \
0       In 1835 he met and befriended Taras Shevchenko.   
1     Edward kept a camel as a pet and as a young ma...   
2     Although it represents a limited activity toda...   
3     In 1979 Gebhard (with Alan B. Johnson) conclud...   
4     In 1998 the USAF's Lockheed U-2S fleet was fit...   
...                                                 ...   
1982  Nitta pinned under the dead horse and unable t...   
1983              He was deployed to the Russian front.   
1984  The telescope is remotely controllable from mu...   
1985  With armistice declared of June 18 1940 he ref...   
1987  For a long time this path constituted the only...   

                                                     FR    Bad Unnamed: 3  \
0     En juillet 1835 il rencontre et se lie avec Ta...  False        NaN   
1     Édouard avait un chameau comme animal de compa...  False        NaN   
2     Aujourd’hui réduite l’activité ferroviaire a é...  Fal