**Notebook for aligning slovenian data to correct indices by finding the word in the same sentence as original and using Levenstein distance**

- import libraries and set values

In [None]:
import numpy as np
import json
import pandas as pd
import nltk
from difflib import ndiff

dataset_name = "train-v2.0"
remove_bad_data = True
nltk.download('punkt')

**Functions**

In [41]:
def find_direct_sentence(context, start):
    context_sentences = nltk.tokenize.sent_tokenize(context)

    sentence_start, sentence_end = -1, -1
    for sentence in context_sentences:
        sentence_start = context.find(sentence)
        sentence_end = sentence_start + len(sentence)

        if start >= sentence_start and start <= sentence_end:
            break
    
    return sentence_start, sentence_end

def levenstein_distance(token1, token2):
    distances = np.zeros((len(token1) + 1, len(token2) + 1))

    for t1 in range(len(token1) + 1):
        distances[t1][0] = t1

    for t2 in range(len(token2) + 1):
        distances[0][t2] = t2
        
    a = 0
    b = 0
    c = 0
    
    for t1 in range(1, len(token1) + 1):
        for t2 in range(1, len(token2) + 1):
            if (token1[t1-1] == token2[t2-1]):
                distances[t1][t2] = distances[t1 - 1][t2 - 1]
            else:
                a = distances[t1][t2 - 1]
                b = distances[t1 - 1][t2]
                c = distances[t1 - 1][t2 - 1]
                
                if (a <= b and a <= c):
                    distances[t1][t2] = a + 1
                elif (b <= a and b <= c):
                    distances[t1][t2] = b + 1
                else:
                    distances[t1][t2] = c + 1
    return distances[len(token1)][len(token2)]

def find_best_fit(sentence, answer):
    
    ans = ""
    dist = len(answer) + 10
    start_idx, end_idx = -1, -1
    for i in range(0, len(sentence) - len(answer)):
        ld = levenstein_distance(sentence[i:i+len(answer)], answer)
        if (ld < dist):
            ans = sentence[i:i+len(answer)]
            dist = ld
            start_idx = i
            end_idx = i + len(answer)
    
    # print(f'Real answer: {answer}')
    # print(f'Found answer: {ans}, distance: {dist}, start: {start_idx}, end: {end_idx}')
    # print()

    if (dist <= len(answer) / 2):
        return start_idx, end_idx, ans
    else:
        return -1, -1, ""

**Find data in a direct sentence using Levenstein distance**

In [42]:
i = 0
j = 0
df = pd.read_json(f"..\\data\\{dataset_name}_unaligned_original_indices_SL.json")

for _, dset in df.iterrows():
    for row in dset['data']['paragraphs']:
        article_text = row['context']

        for qas in row['qas']:
            for ans in qas['answers']:
                ans_text = ans['text']
                ans_start = ans['answer_start']
                ans_end = ans['answer_end']
                
                if ans_text in article_text:
                    ans['answer_start'] = article_text.find(ans_text)
                    ans['answer_end'] = ans['answer_start'] + len(ans_text)
                    i += 1
                else:
                    sentence_start, sentence_end = find_direct_sentence(article_text, ans_start)
                    direct_sentence = article_text[sentence_start:sentence_end]
                    
                    start, end, real_ans = find_best_fit(direct_sentence, ans_text)

                    if start != -1:
                        ans['answer_start'] = start + sentence_start
                        ans['answer_end'] = end + sentence_start
                        ans['text'] = real_ans
                        i += 1
                    else:
                        ans['answer_start'] = -1

                j += 1

            if remove_bad_data:    
                qas['answers'] = list(filter(lambda ans: ans['answer_start'] != -1, qas['answers']))
        if remove_bad_data:    
            row['qas'] = list(filter(lambda qas: len(qas['answers']) > 0, row['qas']))
    if remove_bad_data:    
        dset['data']['paragraphs'] = list(filter(lambda paragraph: len(paragraph['qas']) > 0, dset['data']['paragraphs']))

actual_data = df['data'].to_numpy()
with open(f"..\\data\\aligned_data_SL\\{dataset_name}_aligned_directly_levenstein{'_all_answers' if not remove_bad_data else ''}.json", "w") as new_file:
    new_file.write(json.dumps({'data': list(actual_data)}, indent=2))

print(f"Accurately found {(i * 100 / j):.3f}% of answers in text")

Accurately found 85.942% of answers in text
