**Notebook for aligning slovenian data to correct indices by finding the word directly in the text**

- import libraries and set values

In [14]:
import numpy as np
import pandas as pd
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer

INITIAL_VALUE = -1
CLS = 103
SEP = 104

dataset_name = "dev-v2.0"

**Find data directly in text**

In [15]:
i = 0
j = 0
df = pd.read_json(f"..\\data\\{dataset_name}_unaligned_SL.json")
for _, dset in df.iterrows():
    for row in dset['data']['paragraphs']:
        article_text = row['context']
        
        for qas in row['qas']:
            for ans in qas['answers']:
                ans_text = ans['text']

                if ans_text in article_text:
                    ans['answer_start'] = article_text.find(ans_text)
                    ans['answer_end'] = ans['answer_start'] + len(ans_text)
                    i += 1

                j += 1

            qas['answers'] = list(filter(lambda ans: ans['answer_start'] != -1, qas['answers']))
        row['qas'] = list(filter(lambda qas: len(qas['answers']) > 0, row['qas']))
    dset['data']['paragraphs'] = list(filter(lambda paragraph: len(paragraph['qas']) > 0, dset['data']['paragraphs']))

# df.to_json(f"..\\data\\aligned_data_SL\\{dataset_name}_aligned_directly_all_answers.json", indent=2)
df.to_json(f"..\\data\\aligned_data_SL\\{dataset_name}_aligned_directly.json", indent=2)
print(f"Accurately found {(i * 100 / j):.3f}% of answers in text")

Accurately found 43.162% of answers in text


**Find data with tokenization**

In [15]:
df = pd.read_json(f"..\\data\\{dataset_name}_unaligned_SL.json")
tokenizer = BertTokenizer.from_pretrained('../data/CroSloEngual_BERT/vocab_slocroeng.txt')
with open('../data/CroSloEngual_BERT/vocab_slocroeng.txt', 'r', encoding='utf-8') as fp:
    vocab = fp.read().split('\n')

i = 0
j = 0
for _, dset in df.iterrows():
    for row in dset['data']['paragraphs']:
        article_text = row['context']
        article_tokens = tokenizer(article_text)['input_ids']

        for qas in row['qas']:
            for ans in qas['answers']:
                ans_text = ans['text']
                ans_tokens = tokenizer(ans_text)['input_ids']
                ans_tokens.remove(SEP)
                ans_tokens.remove(CLS)

                ans_tokens = list(filter(lambda t: not vocab[t].startswith("##"), ans_tokens))
                first_token = ans_tokens[0]
                special_chars = [",", ".", ":", ";", "-", "_", "(", ")"]
                answer_text = ""
                should_append = False
                c = -1
                if all(a in article_tokens for a in ans_tokens):
                    for at in article_tokens:
                        

                        if at == first_token:
                            should_append = True
                        
                        if should_append:
                            t = vocab[at]

                            if t.startswith("##"):
                                answer_text += t[2:]
                            else:
                                c += 1
                                if c == len(ans_tokens):
                                    break

                                if t not in special_chars:
                                    answer_text += " " + t
                                else:
                                    answer_text += t

                    answer_text = answer_text.lstrip()
                    ans['answer_start'] = article_text.lower().find(answer_text.lower())
                    ans['answer_end'] = ans['answer_start'] + len(answer_text)
                    ans['text'] = article_text[ans['answer_start']:ans['answer_end']]
                    i += 1

                j += 1
                
            qas['answers'] = list(filter(lambda ans: ans['answer_start'] != -1, qas['answers']))
        row['qas'] = list(filter(lambda qas: len(qas['answers']) > 0, row['qas']))
    dset['data']['paragraphs'] = list(filter(lambda paragraph: len(paragraph['qas']) > 0, dset['data']['paragraphs']))

# df.to_json(f"..\\data\\aligned_data_SL\\{dataset_name}_aligned_tokenization_all_answers.json", indent=2)
df.to_json(f"..\\data\\aligned_data_SL\\{dataset_name}_aligned_tokenization.json", indent=2)
print(f"Accurately found {(i * 100 / j):.3f}% of answers in text")



Accurately found 67.112% of answers in text
