<a href="https://colab.research.google.com/github/fatemeh-azadi/CLE-WordAlignments/blob/main/EvaluatingHuggingFaceModels.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%bash
# Install the Hugging Face `transformers` library and `sentencepiece` into the runtime.
# NOTE(review): neither package is version-pinned; the recorded output of this cell
# shows transformers 4.8.2 being resolved — consider pinning for reproducibility.
pip install transformers
pip install sentencepiece


Collecting transformers
  Downloading https://files.pythonhosted.org/packages/fd/1a/41c644c963249fd7f3836d926afa1e3f1cc234a1c40d80c5f03ad8f6f1b2/transformers-4.8.2-py3-none-any.whl (2.5MB)
Collecting huggingface-hub==0.0.12
  Downloading https://files.pythonhosted.org/packages/2f/ee/97e253668fda9b17e968b3f97b2f8e53aa0127e8807d24a547687423fe0b/huggingface_hub-0.0.12-py3-none-any.whl
Collecting tokenizers<0.11,>=0.10.1
  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
Collecting sacremoses
  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
Installing collected packages: huggingface-hub, tokenizers, sacremoses, transformers
Successfully installed huggingface-hub-0.0.12 sacremoses-0.0.45 

In [2]:
import torch
import sys
import numpy as np
from transformers import XLMRobertaTokenizer, XLMRobertaModel
from transformers import BertTokenizer, BertModel
from transformers import XLMTokenizer, XLMModel

SRC = "en"
TRG = "fa"


In [3]:
def dist(w1, w2):
  """Return the cosine similarity between the vectors w1 and w2.

  Despite the name, larger values mean the vectors are more alike.
  """
  inner = np.dot(w1, w2)
  norm1 = np.sqrt(np.dot(w1, w1))
  norm2 = np.sqrt(np.dot(w2, w2))
  return inner / (norm1 * norm2)

def getAlignment(V1, V2):
  """Compute the word alignment from a source sentence to a target sentence.

  Every row of V1 is linked to the row of V2 with the highest cosine
  similarity (the first such row wins on ties).

  Args:
    V1: embedding matrix of the source sentence, one row per token.
    V2: embedding matrix of the target sentence, one row per token.

  Returns:
    A string of "i-j " pairs (0-based, each followed by a space), one per
    source row that could be aligned. The string is empty when either
    sentence has no tokens. A source row whose similarity is undefined for
    every target (e.g. an all-zero vector) is left out instead of being
    written with the invalid index -1.
  """
  if len(V1) == 0 or len(V2) == 0:
    return ""
  src = np.asarray(V1)
  trg = np.asarray(V2)
  srcNorm = np.sqrt((src * src).sum(axis=1))
  trgNorm = np.sqrt((trg * trg).sum(axis=1))
  # A zero-length vector gives 0/0 = NaN here; that is handled just below.
  with np.errstate(divide="ignore", invalid="ignore"):
    sims = (src @ trg.T) / np.outer(srcNorm, trgNorm)
  # NaN can never be the best match; -inf makes argmax ignore it.
  sims = np.where(np.isnan(sims), -np.inf, sims)
  best = sims.argmax(axis=1)
  return "".join(
    "%d-%d " % (i, j) for i, j in enumerate(best) if sims[i, j] != -np.inf
  )

In [4]:
def _embedSentence(tokenizer, model, sentence, languageId=None):
  """Tokenize one sentence and run it through the model.

  Args:
    tokenizer: tokenizer returned by `from_pretrained`.
    model: model returned by `from_pretrained`.
    sentence: the raw sentence text.
    languageId: when not None, a `langs` tensor filled with this id (one entry
      per input position) is passed to the model as well.

  Returns:
    (tokens, embeddings): the token strings of every input id and the
    `last_hidden_state` of the single sentence as a numpy array. Both still
    contain the first and last positions, which the caller drops.
  """
  encoded = tokenizer(sentence, return_tensors="pt")
  tokens = [tokenizer._convert_id_to_token(i) for i in encoded.input_ids[0].tolist()]
  if languageId is None:
    output = model(**encoded)
  else:
    langs = torch.tensor([languageId] * encoded.input_ids.shape[1]).view(1, -1)
    output = model(**encoded, langs=langs)
  return tokens, output.last_hidden_state[0].detach().numpy()


def computeAlignments(srcFile, trgFile, alignFile, modelType):
  """Compute source-to-target and target-to-source subword alignments.

  The two files are read in parallel, line by line. For every sentence pair
  the subword tokens are written to `srcFile.bpe.modelType` and
  `trgFile.bpe.modelType`, and the alignments to `alignFile.S2T.modelType`
  and `alignFile.T2S.modelType`. The first and last position of every
  encoded sentence (presumably the tokenizer's boundary tokens) are dropped
  before writing and aligning.

  Args:
    srcFile: source file address.
    trgFile: target file address.
    alignFile: output prefix address.
    modelType: one of XLMR-Base/XLMR-Large/MBERT/XLM100/XLM15. For any other
      value a message is printed and nothing is written.

  All files are read and written as UTF-8.
  """
  checkpoints = {
    "XLMR-Base": (XLMRobertaTokenizer, XLMRobertaModel, 'xlm-roberta-base'),
    "XLMR-Large": (XLMRobertaTokenizer, XLMRobertaModel, 'xlm-roberta-large'),
    "MBERT": (BertTokenizer, BertModel, 'bert-base-multilingual-cased'),
    "XLM100": (XLMTokenizer, XLMModel, 'xlm-mlm-100-1280'),
    "XLM15": (XLMTokenizer, XLMModel, 'xlm-mlm-tlm-xnli15-1024'),
  }
  if modelType not in checkpoints:
    print("Model Type is not valid!")
    return

  tokenizerClass, modelClass, checkpoint = checkpoints[modelType]
  tokenizer = tokenizerClass.from_pretrained(checkpoint)
  model = modelClass.from_pretrained(checkpoint)

  # Only the XLM15 checkpoint is fed explicit language ids.
  languageIdSrc = None
  languageIdTrg = None
  if modelType == "XLM15":
    languageIdSrc = tokenizer.lang2id[SRC]
    languageIdTrg = tokenizer.lang2id[TRG]

  # torch.no_grad(): only the forward activations are needed, so no autograd
  # graph has to be kept while encoding.
  with open(srcFile, "r", encoding="utf-8") as inSrc, \
       open(trgFile, "r", encoding="utf-8") as inTrg, \
       open(alignFile + ".S2T." + modelType, "w", encoding="utf-8") as S2T_AlignFile, \
       open(alignFile + ".T2S." + modelType, "w", encoding="utf-8") as T2S_AlignFile, \
       open(srcFile + ".bpe." + modelType, "w", encoding="utf-8") as srcBPE, \
       open(trgFile + ".bpe." + modelType, "w", encoding="utf-8") as trgBPE, \
       torch.no_grad():
    for srcLine, trgLine in zip(inSrc, inTrg):
      src_bpe, src_embedding = _embedSentence(tokenizer, model, srcLine.strip(), languageIdSrc)
      trg_bpe, trg_embedding = _embedSentence(tokenizer, model, trgLine.strip(), languageIdTrg)

      srcBPE.write(' '.join(src_bpe[1:-1]))
      srcBPE.write("\n")
      trgBPE.write(' '.join(trg_bpe[1:-1]))
      trgBPE.write("\n")

      alignS2T = getAlignment(src_embedding[1:-1], trg_embedding[1:-1])
      alignT2S = getAlignment(trg_embedding[1:-1], src_embedding[1:-1])
      S2T_AlignFile.write(alignS2T + "\n")
      T2S_AlignFile.write(alignT2S + "\n")


In [5]:
def convertToGizaFormat(srcFile, trgFile, alignFile):
  """Rewrite an alignment file in GIZA format for symal (the Moses symmetrizer).

  For every sentence three lines are written to `alignFile.giza-format`:
  a "#" line, the source line, and a line that lists "NULL" followed by every
  target token, each with the 1-based source positions linked to it inside
  "({ ... })". Source positions that appear in no link are listed under NULL.

  Args:
    srcFile: source file address (whitespace-separated tokens).
    trgFile: target file address (whitespace-separated tokens).
    alignFile: alignment file address; each line holds 0-based "src-trg" pairs.

  Raises:
    IndexError: if a pair refers to a position beyond the end of its sentence.

  All files are read and written as UTF-8.
  """
  with open(srcFile, "r", encoding="utf-8") as inSrc, \
       open(trgFile, "r", encoding="utf-8") as inTrg, \
       open(alignFile, "r", encoding="utf-8") as inAlign, \
       open(alignFile + ".giza-format", "w", encoding="utf-8") as outFile:
    for lineNo, (lineSrc, lineTrg, lineAlign) in enumerate(zip(inSrc, inTrg, inAlign), start=1):
      srcLen = len(lineSrc.split())
      trgWords = lineTrg.split()

      linkedSrc = [[] for _ in trgWords]  # source positions per target token
      covered = [False] * srcLen          # source positions used by some link
      for pair in lineAlign.split():
        src, trg = pair.split('-')
        src = int(src)
        trg = int(trg)
        if src >= srcLen or trg >= len(trgWords):
          raise IndexError(
            "line %d: alignment %d-%d is outside the sentence (%d source / %d target tokens)"
            % (lineNo, src, trg, srcLen, len(trgWords)))
        linkedSrc[trg].append(src)
        covered[src] = True

      outFile.write("#\n")
      # Guarantee the newline even if the last source line lacks one.
      outFile.write(lineSrc.rstrip("\n") + "\n")

      record = ["NULL ({"]
      record.extend(" " + str(i + 1) for i in range(srcLen) if not covered[i])
      record.append(" })")
      for word, sources in zip(trgWords, linkedSrc):
        record.append(" " + word + " ({")
        record.extend(" " + str(s + 1) for s in sources)
        record.append(" })")
      record.append("\n")
      outFile.write("".join(record))

In [6]:
def _subwordToWordIndex(tokens, splitChar):
  """Return, for every subword token, the index of the word it belongs to.

  - splitChar "@": a token ending in "@@" continues into the next token.
  - splitChar "#": a token starting with "##" continues the previous token.
  - anything else: a token starting with "▁" opens a new word.
  The first word gets index 1; with "#" or "▁" marking, tokens that come
  before any word start get index 0.
  """
  indices = []
  if splitChar == "@":
    num = 1
    for w in tokens:
      indices.append(num)
      if not w.endswith("@@"):
        num += 1
    return indices

  num = 0
  for w in tokens:
    if splitChar == "#":
      startsWord = not w.startswith("##")
    else:
      startsWord = w.startswith("▁")
    if startsWord:
      num += 1
    indices.append(num)
  return indices


def convertToWordLevel(srcFile, trgFile, alignFile, splitChar, one = False):
  """Convert subword-level alignments to word-level alignments.

  Each subword index in `alignFile` is replaced by the index of the word it
  belongs to; duplicate links within a sentence are written once, in order of
  first appearance. The result is saved in `alignFile.final-alignment`.

  Args:
    srcFile: source file address (subword tokens).
    trgFile: target file address (subword tokens).
    alignFile: the bidirectional alignments file address after symmetrizing.
    splitChar: the character used for subword tokenization in srcFile and
      trgFile; "@" and "#" are handled explicitly, any other value selects
      the "▁" word-start convention.
    one: True if the alignFile indexes are 1-based, False if 0-based.

  Raises:
    IndexError: if an alignment index is beyond the end of its sentence.

  All files are read and written as UTF-8.
  """
  with open(srcFile, "r", encoding="utf-8") as inSrc, \
       open(trgFile, "r", encoding="utf-8") as inTrg, \
       open(alignFile, "r", encoding="utf-8") as inAlign, \
       open(alignFile + ".final-alignment", "w", encoding="utf-8") as outFile:
    for cnt, (lineAlign, lineSrc, lineTrg) in enumerate(zip(inAlign, inSrc, inTrg), start=1):
      srcIdx = _subwordToWordIndex(lineSrc.split(), splitChar)
      trgIdx = _subwordToWordIndex(lineTrg.split(), splitChar)

      alignmentsList = []
      seen = set()  # constant-time duplicate check; the list keeps the order
      for a in lineAlign.split():
        s, t = a.split("-")
        s = int(s)
        t = int(t)
        if one:
          s -= 1
          t -= 1
        if s >= len(srcIdx) or t >= len(trgIdx):
          raise IndexError(
            "line %d: alignment %s-%s is outside the sentence (%d source / %d target tokens)"
            % (cnt, s, t, len(srcIdx), len(trgIdx)))
        link = "%s-%s" % (srcIdx[s], trgIdx[t])
        if link not in seen:
          seen.add(link)
          alignmentsList.append(link)
      outFile.write(" ".join(alignmentsList) + "\n")


In [7]:
def getTokenizedInput(inputFile, bpeFile, outputFile):
  """Re-segment tokenizer output so that it lines up with the reference words.

  Besides BPE, a model tokenizer applies preprocessing that may split some
  words (e.g. zero-width characters in Farsi are replaced by spaces). For
  every reference word, tokens from the BPE file are concatenated until their
  combined length equals the word's length; the first token is written as is
  and the remaining ones are prefixed with "##" to mark that they must be
  joined to the previous token. Every written token is followed by a space.
  A token that is exactly "[UNK]" is accepted as a whole reference word.

  If a reference word cannot be matched, the sentence number, the word and
  the concatenated candidate are printed and processing stops; the output
  file then holds only what was written up to that point.

  Args:
    inputFile: test file address after removing zero-width characters.
    bpeFile: BPE output address after concatenating BPE tokens.
    outputFile: address of the file to write.

  All files are read and written as UTF-8.
  """
  with open(inputFile, "r", encoding='utf-8') as inMain, \
       open(bpeFile, "r", encoding='utf-8') as inBPE, \
       open(outputFile, "w", encoding='utf-8') as outFile:
    for cnt, (line1, line2) in enumerate(zip(inMain, inBPE), start=1):
      refWords = line1.split()
      pieces = line2.split()
      written = 0   # pieces already written to the output
      consumed = 0  # pieces already merged into a reference word
      for w1 in refWords:
        w2 = ""
        while len(w2) != len(w1) and consumed < len(pieces):
          w2 += pieces[consumed]
          consumed += 1
          if w2 == "[UNK]":
            break
        if len(w1) != len(w2) and w2 != "[UNK]":
          print(cnt, w1)
          print(w2)
          return
        outFile.write(pieces[written] + " ")
        written += 1
        while written < consumed:
          outFile.write("##" + pieces[written] + " ")
          written += 1
      outFile.write("\n")

In [8]:
## downloading aer.py for computing AER and symal for symmetrizing alignments
# Fetch the AER scoring script from the lilt/alignment-scripts repository.
!wget https://raw.githubusercontent.com/lilt/alignment-scripts/master/scripts/aer.py
# Clone Moses; its scripts directory and the symal binary are referenced by later cells.
!git clone https://github.com/moses-smt/mosesdecoder.git
%cd mosesdecoder/symal
# Build from inside the symal directory with the `b2` command.
!b2
# NOTE(review): the source path hard-codes a `gcc-7` build directory — presumably it
# has to match the compiler b2 used on this machine; verify before relying on it.
%cp bin/gcc-7/release/link-static/threading-multi/symal ../bin
# Return to the directory the notebook started from.
%cd ../..

--2021-07-07 18:02:29--  https://raw.githubusercontent.com/lilt/alignment-scripts/master/scripts/aer.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8848 (8.6K) [text/plain]
Saving to: ‘aer.py’


2021-07-07 18:02:29 (106 MB/s) - ‘aer.py’ saved [8848/8848]

Cloning into 'mosesdecoder'...
remote: Enumerating objects: 148070, done.[K
remote: Counting objects: 100% (498/498), done.[K
remote: Compressing objects: 100% (206/206), done.[K
remote: Total 148070 (delta 315), reused 433 (delta 289), pack-reused 147572[K
Receiving objects: 100% (148070/148070), 129.86 MiB | 22.12 MiB/s, done.
Resolving deltas: 100% (114341/114341), done.
/content/mosesdecoder/symal
Tip: install tcmalloc for faster threading.  See BUILD-INSTRUCTIONS.txt for more information.
Perfo

In [9]:
SCRIPTS_ROOTDIR = "/content/mosesdecoder/scripts/"
GIZA2BAL = SCRIPTS_ROOTDIR + "/training/giza2bal.pl"
SYMAL = SCRIPTS_ROOTDIR + "/../bin/symal"


srcText = "data/text.en"
trgText = "data/text.fa"

!mkdir "outputs"

# **XLMR-Large**

In [None]:
MODEL_TYPE = "XLMR-Large"

srcBPE = srcText + ".bpe." + MODEL_TYPE
trgBPE = trgText + ".bpe." + MODEL_TYPE

S2T_Alignment = "outputs/alignments.S2T." + MODEL_TYPE
T2S_Alignment = "outputs/alignments.T2S." + MODEL_TYPE

outFile = "outputs/alignments." + MODEL_TYPE

mkdir: cannot create directory ‘outputs’: File exists


In [None]:
computeAlignments(srcText, trgText, "outputs/alignments", MODEL_TYPE)


In [None]:
convertToGizaFormat(srcBPE, trgBPE, S2T_Alignment)
convertToGizaFormat(trgBPE, srcBPE, T2S_Alignment)

In [None]:
!$GIZA2BAL -d $T2S_Alignment".giza-format" -i $S2T_Alignment".giza-format" | $SYMAL -alignment="grow" -diagonal="yes" -final="yes" -both="yes" > $outFile".grow-diag-final-and"

symal: computing grow alignment: diagonal (1) final (1)both-uncovered (1)
skip=<0> counts=<1572>


In [None]:
convertToWordLevel(srcBPE, trgBPE, outFile + ".grow-diag-final-and", "_")

In [None]:
CHARS=(u"▁").encode("utf8")
print(CHARS)
#concat bpe tokens
!sed "s/ //g" data/text.fa.bpe."$MODEL_TYPE" > data/text.fa."$MODEL_TYPE".remove-bpe
!sed "s/ //g" data/text.en.bpe."$MODEL_TYPE" > data/text.en."$MODEL_TYPE".remove-bpe

!sed -i "s/\xe2\x96\x81/ /g" data/text.fa."$MODEL_TYPE".remove-bpe
!sed -i "s/\xe2\x96\x81/ /g" data/text.en."$MODEL_TYPE".remove-bpe

!cp data/text.en data/text.en."$MODEL_TYPE".remove-zw
# remove zero width space characters in reference file
!sed "s/\xe2\x80\x8c//g" data/text.fa > data/text.fa."$MODEL_TYPE".remove-zw
!sed -i "s/\xe2\x80\x8f//g" data/text.fa."$MODEL_TYPE".zw
#convert … to ... in reference file
!sed -i "s/\xe2\x80\xa6/.../g" data/text.fa."$MODEL_TYPE".zw
!sed -i "s/\xe2\x80\xa6/.../g" data/text.en."$MODEL_TYPE".zw

b'\xe2\x96\x81'


In [None]:
getTokenizedInput("data/text.fa." + MODEL_TYPE + ".zw", "data/text.fa." + MODEL_TYPE + ".remove-bpe", "data/text.fa." + MODEL_TYPE + ".tok")
getTokenizedInput("data/text.en." + MODEL_TYPE + ".zw", "data/text.en." + MODEL_TYPE + ".remove-bpe", "data/text.en." + MODEL_TYPE + ".tok")

In [None]:
convertToWordLevel(srcText + "." + MODEL_TYPE + ".tok", trgText + "." + MODEL_TYPE + ".tok", outFile + ".grow-diag-final-and.final-alignment", "#", True)

In [None]:
!python aer.py --oneHyp --source "$srcText" --target "$trgText" data//gold."$SRC"-"$TRG".aligned "$outFile".grow-diag-final-and.final-alignment.final-alignment

outputs/alignments.XLMR-Large.grow-diag-final-and.final-alignment.final-alignment: 50.3% (53.9%/46.2%/22087)
[((',', '.'), 244), (('the', 'در'), 89), ((',', '،'), 81), (('of', 'از'), 55), (('of', 'به'), 47), (('the', 'و'), 45), (('the', 'از'), 45), (('the', 'به'), 42), (('the', 'را'), 39), (('the', 'که'), 39)]
Internal Jumps: 2271, External Jumps: 6990
Source Coverage: 80.0%, Target Coverage: 80.3%


# **XLMR-Base**

In [10]:
MODEL_TYPE = "XLMR-Base"

In [11]:
srcBPE = srcText + ".bpe." + MODEL_TYPE
trgBPE = trgText + ".bpe." + MODEL_TYPE

S2T_Alignment = "outputs/alignments.S2T." + MODEL_TYPE
T2S_Alignment = "outputs/alignments.T2S." + MODEL_TYPE

outFile = "outputs/alignments." + MODEL_TYPE

In [12]:
computeAlignments(srcText, trgText, "outputs/alignments", MODEL_TYPE)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=5069051.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=9096718.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=512.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1115590446.0, style=ProgressStyle(descr…




Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:
convertToGizaFormat(srcBPE, trgBPE, S2T_Alignment)
convertToGizaFormat(trgBPE, srcBPE, T2S_Alignment)
!$GIZA2BAL -d $T2S_Alignment".giza-format" -i $S2T_Alignment".giza-format" | $SYMAL -alignment="grow" -diagonal="yes" -final="yes" -both="yes" > $outFile".grow-diag-final-and"

symal: computing grow alignment: diagonal (1) final (1)both-uncovered (1)
skip=<0> counts=<1572>


In [14]:
convertToWordLevel(srcBPE, trgBPE, outFile + ".grow-diag-final-and", "_")

In [15]:
#concat bpe tokens
!sed "s/ //g" data/text.fa.bpe."$MODEL_TYPE" > data/text.fa."$MODEL_TYPE".remove-bpe
!sed -i "s/\xe2\x96\x81/ /g" data/text.fa."$MODEL_TYPE".remove-bpe
!sed "s/ //g" data/text.en.bpe."$MODEL_TYPE" > data/text.en."$MODEL_TYPE".remove-bpe
!sed -i "s/\xe2\x96\x81/ /g" data/text.en."$MODEL_TYPE".remove-bpe

!cp data/text.en data/text.en."$MODEL_TYPE".remove-zw
# remove zero width characters in reference file
!sed "s/\xe2\x80\x8c//g" data/text.fa > data/text.fa."$MODEL_TYPE".remove-zw
!sed -i "s/\xe2\x80\x8f//g" data/text.fa."$MODEL_TYPE".remove-zw
#convert … to ... in reference file
!sed -i "s/\xe2\x80\xa6/.../g" data/text.fa."$MODEL_TYPE".remove-zw
!sed -i "s/\xe2\x80\xa6/.../g" data/text.en."$MODEL_TYPE".remove-zw

In [16]:
getTokenizedInput("data/text.fa." + MODEL_TYPE + ".remove-zw", "data/text.fa." + MODEL_TYPE + ".remove-bpe", "data/text.fa." + MODEL_TYPE + ".tok")
getTokenizedInput("data/text.en." + MODEL_TYPE + ".remove-zw", "data/text.en." + MODEL_TYPE + ".remove-bpe", "data/text.en." + MODEL_TYPE + ".tok")

convertToWordLevel(srcText + "." + MODEL_TYPE + ".tok", trgText + "." + MODEL_TYPE + ".tok", outFile + ".grow-diag-final-and.final-alignment", "#", True)

In [17]:
!python aer.py --oneHyp --source "$srcText" --target "$trgText" data//gold."$SRC"-"$TRG".aligned "$outFile".grow-diag-final-and.final-alignment.final-alignment

outputs/alignments.XLMR-Base.grow-diag-final-and.final-alignment.final-alignment: 43.3% (61.1%/52.9%/22288)
[(('the', 'به'), 131), (('the', 'در'), 92), ((',', '.'), 84), ((',', '،'), 77), (('the', 'از'), 68), (('of', 'از'), 63), (('the', 'که'), 55), (('to', 'را'), 52), (('the', 'و'), 42), (('to', 'به'), 37)]
Internal Jumps: 1744, External Jumps: 6563
Source Coverage: 82.1%, Target Coverage: 81.2%


# **XLM-100**


In [None]:
MODEL_TYPE = "XLM100"

srcBPE = srcText + ".bpe." + MODEL_TYPE
trgBPE = trgText + ".bpe." + MODEL_TYPE

S2T_Alignment = "outputs/alignments.S2T." + MODEL_TYPE
T2S_Alignment = "outputs/alignments.T2S." + MODEL_TYPE

outFile = "outputs/alignments." + MODEL_TYPE

In [None]:
computeAlignments(srcText, trgText, "outputs/alignments", MODEL_TYPE)


In [None]:
def convertBPEFormat(inputFile, outputFile):
  """Convert "</w>"-style BPE tokens to the "##" continuation convention.

  In the input a token ending in "</w>" closes a word. In the output that
  suffix is removed and every token that follows a token without "</w>"
  (i.e. continues the same word) is prefixed with "##". This includes the
  last token of a line.

  Args:
    inputFile: address of the file with "</w>"-marked tokens.
    outputFile: address of the file to write.

  Both files are handled as UTF-8.
  """
  with open(inputFile, "r", encoding="utf-8") as inFile, \
       open(outputFile, "w", encoding="utf-8") as outFile:
    for line in inFile:
      converted = []
      previousClosedWord = True  # the first token always starts a word
      for piece in line.split():
        closesWord = piece.endswith("</w>")
        if closesWord:
          piece = piece[0:-4]
        if not previousClosedWord:
          piece = "##" + piece
        converted.append(piece)
        previousClosedWord = closesWord
      outFile.write(' '.join(converted))
      outFile.write("\n")

convertBPEFormat(srcBPE, srcBPE + ".converted")
convertBPEFormat(trgBPE, trgBPE + ".converted")


In [None]:
convertToGizaFormat(srcBPE + ".converted", trgBPE + ".converted", S2T_Alignment)
convertToGizaFormat(trgBPE + ".converted", srcBPE + ".converted", T2S_Alignment)
!$GIZA2BAL -d $T2S_Alignment".giza-format" -i $S2T_Alignment".giza-format" | $SYMAL -alignment="grow" -diagonal="yes" -final="yes" -both="yes" > $outFile".grow-diag-final-and"

symal: computing grow alignment: diagonal (1) final (1)both-uncovered (1)
skip=<0> counts=<1572>


In [None]:
convertToWordLevel(srcBPE + ".converted", trgBPE + ".converted", outFile + ".grow-diag-final-and", "#")

In [None]:
#concat bpe tokens
!sed "s/ ##//g" data/text.fa.bpe."$MODEL_TYPE".converted > data/text.fa."$MODEL_TYPE".remove-bpe
!sed "s/ ##//g" data/text.en.bpe."$MODEL_TYPE".converted > data/text.en."$MODEL_TYPE".remove-bpe

!cp data/text.en data/text.en."$MODEL_TYPE".remove-zw
# remove zero width characters in reference file
!sed "s/\xe2\x80\x8c//g" data/text.fa > data/text.fa."$MODEL_TYPE".remove-zw
!sed -i "s/\xe2\x80\x8f//g" data/text.fa."$MODEL_TYPE".remove-zw
#remove اً
!sed -i "s/\xd9\x8b//g" data/text.fa."$MODEL_TYPE".remove-zw
#convert … to ... in reference file
!sed -i "s/\xe2\x80\xa6/.../g" data/text.fa."$MODEL_TYPE".remove-zw
!sed -i "s/\xe2\x80\xa6/.../g" data/text.en."$MODEL_TYPE".remove-zw

In [None]:
getTokenizedInput("data/text.fa." + MODEL_TYPE + ".remove-zw", "data/text.fa." + MODEL_TYPE + ".remove-bpe", "data/text.fa." + MODEL_TYPE + ".tok")
getTokenizedInput("data/text.en." + MODEL_TYPE + ".remove-zw", "data/text.en." + MODEL_TYPE + ".remove-bpe", "data/text.en." + MODEL_TYPE + ".tok")

convertToWordLevel(srcText + "." + MODEL_TYPE + ".tok", trgText + "." + MODEL_TYPE + ".tok", outFile + ".grow-diag-final-and.final-alignment", "#", True)

In [None]:
!python aer.py --oneHyp --source "$srcText" --target "$trgText" data//gold."$SRC"-"$TRG".aligned "$outFile".grow-diag-final-and.final-alignment.final-alignment

outputs/alignments.XLM100.grow-diag-final-and.final-alignment.final-alignment: 44.4% (58.0%/53.4%/23700)
[((',', '،'), 182), (('the', 'در'), 90), (('.', '،'), 86), (('the', 'از'), 68), (('of', 'از'), 47), (('a', 'به'), 45), (('to', 'به'), 43), (('the', 'را'), 40), (('the', 'به'), 39), (('the', 'که'), 38)]
Internal Jumps: 2084, External Jumps: 6937
Source Coverage: 85.8%, Target Coverage: 85.2%


# **MBERT**

In [None]:
MODEL_TYPE = "MBERT"

srcText = "data/text.en"
trgText = "data/text.fa"

srcBPE = srcText + ".bpe." + MODEL_TYPE
trgBPE = trgText + ".bpe." + MODEL_TYPE

S2T_Alignment = "outputs/alignments.S2T." + MODEL_TYPE
T2S_Alignment = "outputs/alignments.T2S." + MODEL_TYPE

outFile = "outputs/alignments." + MODEL_TYPE

!mkdir "outputs"

In [None]:
computeAlignments(srcText, trgText, "outputs/alignments", MODEL_TYPE)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
convertToGizaFormat(srcBPE, trgBPE, S2T_Alignment)
convertToGizaFormat(trgBPE, srcBPE, T2S_Alignment)
!$GIZA2BAL -d $T2S_Alignment".giza-format" -i $S2T_Alignment".giza-format" | $SYMAL -alignment="grow" -diagonal="yes" -final="yes" -both="yes" > $outFile".grow-diag-final-and"

symal: computing grow alignment: diagonal (1) final (1)both-uncovered (1)
skip=<0> counts=<1572>


In [None]:
convertToWordLevel(srcBPE, trgBPE, outFile + ".grow-diag-final-and", "#")

In [None]:
#concat bpe tokens
!sed "s/ ##//g" data/text.fa.bpe."$MODEL_TYPE" > data/text.fa."$MODEL_TYPE".remove-bpe
!sed "s/ ##//g" data/text.en.bpe."$MODEL_TYPE" > data/text.en."$MODEL_TYPE".remove-bpe

# remove zero width characters in reference file
!sed "s/\xe2\x80\x8c//g" data/text.fa > data/text.fa."$MODEL_TYPE".zw
!sed -i "s/\xe2\x80\x8f//g" data/text.fa."$MODEL_TYPE".zw
#convert … to ... in reference file
!sed -i "s/\xe2\x80\xa6/.../g" data/text.fa."$MODEL_TYPE".zw

In [None]:
getTokenizedInput("data/text.fa." + MODEL_TYPE + ".remove-zw", "data/text.fa." + MODEL_TYPE + ".remove-bpe", "data/text.fa." + MODEL_TYPE + ".tok")
getTokenizedInput("data/text.en", "data/text.en." + MODEL_TYPE + ".remove-bpe", "data/text.en." + MODEL_TYPE + ".tok")

convertToWordLevel(srcText + "." + MODEL_TYPE + ".tok", trgText + "." + MODEL_TYPE + ".tok", outFile + ".grow-diag-final-and.final-alignment", "#", True)

In [None]:
!python aer.py --oneHyp --source "$srcText" --target "$trgText" data//gold."$SRC"-"$TRG".aligned "$outFile".grow-diag-final-and.final-alignment.final-alignment

outputs/alignments.MBERT.grow-diag-final-and.final-alignment.final-alignment: 47.5% (53.9%/51.2%/24465)
[((',', '،'), 76), (('the', 'در'), 49), (('to', 'به'), 45), (('of', 'از'), 37), (('i', 'و'), 36), (('the', 'از'), 33), (('the', 'به'), 26), (('was', 'بود'), 25), (('the', '،'), 25), (('and', 'و'), 24)]
Internal Jumps: 1999, External Jumps: 5883
Source Coverage: 85.5%, Target Coverage: 81.6%
