**Notebook for aligning slovenian data to correct indices by finding the word directly in the text**

- import libraries and set values

In [1]:
import numpy as np
import pandas as pd
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer

INITIAL_VALUE = -1

dataset_name = "dev-v2.0"

  from .autonotebook import tqdm as notebook_tqdm


**Find data directly in text**

In [10]:
i = 0
j = 0
df = pd.read_json(f"..\\data\\{dataset_name}_unaligned_SL.json")
for _, dset in df.iterrows():
    for row in dset['data']['paragraphs']:
        article_text = row['context']
        
        for qas in row['qas']:
            for ans in qas['answers']:
                ans_text = ans['text']

                if ans_text in article_text:
                    ans['answer_start'] = article_text.find(ans_text)
                    ans['answer_end'] = ans['answer_start'] + len(ans_text)
                    i += 1

                j += 1

            qas['answers'] = list(filter(lambda ans: ans['answer_start'] != -1, qas['answers']))
        row['qas'] = list(filter(lambda qas: len(qas['answers']) > 0, row['qas']))
    dset['data']['paragraphs'] = list(filter(lambda paragraph: len(paragraph['qas']) > 0, dset['data']['paragraphs']))

df.to_json(f"..\\data\\aligned_data_SL\\{dataset_name}_aligned_directly.json", indent=2)
print(f"Accurately found {(i * 100 / j):.3f}% of answers in text")

Accurately found 43.084% of answers in text


**Find data with tokenization**

In [28]:
tokenizer = BertTokenizer.from_pretrained('../data/bert_sl/sl-vocab.txt')
with open('../data/bert_sl/sl-vocab.txt', 'r', encoding='utf-8') as fp:
    vocab = fp.read().split('\n')

i = 0
j = 0
for _, dset in df.iterrows():
    for row in dset['data']['paragraphs']:
        article_text = row['context']
        article_tokens = tokenizer(article_text)['input_ids']

        for qas in row['qas']:
            for ans in qas['answers']:
                ans_text = ans['text']
                ans_tokens = tokenizer(ans_text)['input_ids']

                if all(a in article_text for a in ans_text):
                    # find answer from tokens below
                    ans['answer_start'] = article_text.find(ans_text)
                    ans['answer_end'] = ans['answer_start'] + len(ans_text)
                    i += 1

                j += 1
                
            qas['answers'] = list(filter(lambda ans: ans['answer_start'] != -1, qas['answers']))
        row['qas'] = list(filter(lambda qas: len(qas['answers']) > 0, row['qas']))
    dset['data']['paragraphs'] = list(filter(lambda paragraph: len(paragraph['qas']) > 0, dset['data']['paragraphs']))

df.to_json(f"..\\data\\aligned_data_SL\\{dataset_name}_aligned_tokenization.json", indent=2)
print(f"Accurately found {(i * 100 / j):.3f}% of answers in text")



Accurately found 90.439% of answers in text
