**Notebook for aligning slovenian data to correct indices by finding the word directly in the text**

- import libraries and set values

In [22]:
import json
import pandas as pd
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer

INITIAL_VALUE = -1
CLS = 103
SEP = 104

dataset_name = "train-v2.0"
remove_bad_data = True # set to False to keep all data, even if answers are invalid

**Find data directly in text**

In [13]:
i = 0
j = 0
df = pd.read_json(f"..\\data\\{dataset_name}_unaligned_SL.json")
for _, dset in df.iterrows():
    for row in dset['data']['paragraphs']:
        article_text = row['context']
        
        for qas in row['qas']:
            for ans in qas['answers']:
                ans_text = ans['text']

                if ans_text in article_text:
                    ans['answer_start'] = article_text.find(ans_text)
                    ans['answer_end'] = ans['answer_start'] + len(ans_text)
                    i += 1

                j += 1

            if remove_bad_data:    
                qas['answers'] = list(filter(lambda ans: ans['answer_start'] != -1, qas['answers']))
        if remove_bad_data:    
            row['qas'] = list(filter(lambda qas: len(qas['answers']) > 0, row['qas']))
    if remove_bad_data:    
        dset['data']['paragraphs'] = list(filter(lambda paragraph: len(paragraph['qas']) > 0, dset['data']['paragraphs']))

actual_data = df['data'].to_numpy()
with open(f"..\\data\\aligned_data_SL\\{dataset_name}_aligned_directly{'_all_answers' if not remove_bad_data else ''}.json", "w") as new_file:
    new_file.write(json.dumps({'data': list(actual_data)}, indent=2))

print(f"Accurately found {(i * 100 / j):.3f}% of answers in text")

Accurately found 43.162% of answers in text


**Find data with tokenization**

In [23]:
df = pd.read_json(f"..\\data\\{dataset_name}_unaligned_SL.json")
df_eng = pd.read_json(f"..\\data\\{dataset_name}.json")
tokenizer = BertTokenizer.from_pretrained('../data/CroSloEngual_BERT/vocab_slocroeng.txt')
with open('../data/CroSloEngual_BERT/vocab_slocroeng.txt', 'r', encoding='utf-8') as fp:
    vocab = fp.read().split('\n')

i = 0
j = 0
for df_idx, dset in df.iterrows():
    for row_idx, row in enumerate(dset['data']['paragraphs']):
        article_text = row['context']
        article_tokens = tokenizer(article_text)['input_ids']

        for qas_idx, qas in enumerate(row['qas']):
            for ans_idx, ans in enumerate(qas['answers']):
                ans_text = ans['text']
                ans_tokens = tokenizer(ans_text)['input_ids']
                ans_tokens.remove(SEP)
                ans_tokens.remove(CLS)

                ans_tokens = list(filter(lambda t: not vocab[t].startswith("##"), ans_tokens))
                first_token = ans_tokens[0]
                special_chars = [",", ".", ":", ";", "-", "_", "(", ")"]
                answer_text = ""
                should_append = False
                c = -1
                if all(a in article_tokens for a in ans_tokens):
                    for at in article_tokens:
                        if at == first_token:
                            should_append = True
                        
                        if should_append:
                            t = vocab[at]

                            if t.startswith("##"):
                                answer_text += t[2:]
                            else:
                                c += 1
                                if c == len(ans_tokens):
                                    eng_ans_start = df_eng.iloc[df_idx]['data']['paragraphs'][row_idx]['qas'][qas_idx]['answers'][ans_idx]['answer_start']
                                    eng_article_text = df_eng.iloc[df_idx]['data']['paragraphs'][row_idx]['context']
                                    answer_text = answer_text.lstrip()
                                    slo_ans_start = article_text.lower().find(answer_text.lower())

                                    if abs(eng_ans_start - slo_ans_start) < ((len(article_text) + len(eng_article_text)) / 2) * 0.27:
                                        ans['answer_start'] = slo_ans_start
                                        ans['answer_end'] = ans['answer_start'] + len(answer_text)
                                        ans['text'] = article_text[ans['answer_start']:ans['answer_end']]
                                        i += 1
                                        break
                                    else:
                                        answer_text = ""
                                        should_append = False
                                        c = -1
                                if should_append:
                                    if t not in special_chars:
                                        answer_text += " " + t
                                    else:
                                        answer_text += t

                j += 1

            if remove_bad_data:    
                qas['answers'] = list(filter(lambda ans: ans['answer_start'] != -1, qas['answers']))
        if remove_bad_data:
            row['qas'] = list(filter(lambda qas: len(qas['answers']) > 0, row['qas']))
    if remove_bad_data:
        dset['data']['paragraphs'] = list(filter(lambda paragraph: len(paragraph['qas']) > 0, dset['data']['paragraphs']))

actual_data = df['data'].to_numpy()
with open(f"..\\data\\aligned_data_SL\\{dataset_name}_aligned_tokenization{'_all_answers' if not remove_bad_data else ''}.json", "w") as new_file:
    new_file.write(json.dumps({'data': list(actual_data)}, indent=2))

print(f"Accurately found {(i * 100 / j):.3f}% of answers in text")



Accurately found 51.026% of answers in text
