In [2]:
import pandas as pd
import numpy as np
import torch
from transformers import BertForQuestionAnswering, BertTokenizer
import numpy as np

In [3]:
def inverse_hamming_distance(s1, s2):
    """Count the positions at which two sequences hold equal elements.

    This is the complement of the Hamming distance: a larger value means
    the two inputs agree in more places.

    Args:
        s1: First sequence (for example a string).
        s2: Second sequence.

    Returns:
        int: Number of indices ``i`` for which ``s1[i] == s2[i]``. When the
        inputs differ in length only their common prefix is compared.
    """
    # zip() stops at the shorter input, so a shorter s2 no longer triggers
    # the IndexError that indexing s2 with positions of s1 would raise.
    return sum(1 for left, right in zip(s1, s2) if left == right)

def levenshteinDistanceDP(token1, token2):
    """Compute the Levenshtein edit distance between two sequences.

    A table of shape ``(len(token1) + 1, len(token2) + 1)`` is filled
    bottom-up; cell ``[i, j]`` holds the distance between the first ``i``
    elements of ``token1`` and the first ``j`` elements of ``token2``.

    Args:
        token1: First sequence (for example a string).
        token2: Second sequence.

    Returns:
        The bottom-right cell of the table, i.e. the distance between the
        full inputs, as a NumPy floating-point scalar (the table is built
        with ``np.zeros``).
    """
    n_rows = len(token1) + 1
    n_cols = len(token2) + 1
    table = np.zeros((n_rows, n_cols))

    # Borders: turning a prefix into the empty sequence costs its length.
    table[:, 0] = np.arange(n_rows)
    table[0, :] = np.arange(n_cols)

    for i in range(1, n_rows):
        for j in range(1, n_cols):
            if token1[i - 1] == token2[j - 1]:
                # Equal elements: carry the diagonal value over unchanged.
                table[i, j] = table[i - 1, j - 1]
                continue
            # Otherwise pay one edit on top of the cheapest neighbour
            # (left, upper, or diagonal cell).
            cheapest = min(table[i, j - 1], table[i - 1, j], table[i - 1, j - 1])
            table[i, j] = cheapest + 1

    return table[n_rows - 1, n_cols - 1]

def find_answer(s1, s2, min_match_ratio=1/3):
    """Find the window of ``s1`` that best matches ``s2`` position by position.

    Every window of ``s1`` with the length of ``s2`` is scored with
    ``inverse_hamming_distance``; the highest-scoring window wins and ties
    are resolved in favour of the window that starts last.

    Args:
        s1: Sequence to search in (the haystack).
        s2: Sequence to look for (the needle).
        min_match_ratio: Minimum fraction of ``len(s2)`` positions that must
            agree for the best window to be accepted. Defaults to one third.

    Returns:
        tuple: ``(best_fit, best_index)`` where ``best_fit`` is the number of
        agreeing positions and ``best_index`` the start offset in ``s1``, or
        ``(-1, -1)`` when no acceptable window exists.
    """
    needle_len = len(s2)

    # An empty needle scores 0 against every window and would otherwise be
    # reported as a "match" at the very end of s1; a needle longer than the
    # haystack has no window at all. Neither is a usable result.
    if needle_len == 0 or needle_len > len(s1):
        return -1, -1

    best_fit = -1
    best_index = -1

    # Fast path for strings: a verbatim occurrence has the maximum possible
    # score, and rfind() returns the last one, which is exactly the window
    # the scan below would pick under its ">=" tie-breaking.
    exact_index = -1
    if isinstance(s1, str) and isinstance(s2, str):
        exact_index = s1.rfind(s2)

    if exact_index != -1:
        best_fit = needle_len
        best_index = exact_index
    else:
        for start in range(len(s1) - needle_len + 1):
            score = inverse_hamming_distance(s1[start:start + needle_len], s2)
            # ">=" on purpose: later windows win ties.
            if score >= best_fit:
                best_fit = score
                best_index = start

    # Reject matches in which too few positions agree.
    if best_fit < needle_len * min_match_ratio:
        return -1, -1
    return best_fit, best_index




In [6]:
# Forward slashes keep the relative path valid on Windows, Linux and macOS;
# a backslash-separated path only resolves on Windows.
SQUAD_PATH = '../data/dev-slo-v2.0-test.json'
OUTPUT_PATH = 'df.json'

# Keys of a question entry whose answer lists are re-aligned below.
ANSWER_KEYS = ('answers', 'plausible_answers')

squad = pd.read_json(SQUAD_PATH)
del squad['version']

# Re-align every answer span with the position at which its text is actually
# found in the paragraph context. The nested dicts are edited in place; this
# relies on each row Series referencing the same dict objects that `squad`
# holds, so the edits are present when `squad` is serialized below.
for _, dset in squad.iterrows():
    for paragraph in dset['data']['paragraphs']:
        context = paragraph['context']
        for qas in paragraph['qas']:
            for key in ANSWER_KEYS:
                # A question entry may lack either key; treat that as no answers.
                for ans in qas.get(key, []):
                    ans_to_find = ans['text']
                    _, idx = find_answer(context, ans_to_find)

                    # Update the start offset only when a match was found;
                    # otherwise keep the stored one. The end offset is
                    # always derived from the (possibly updated) start.
                    if idx != -1:
                        ans['answer_start'] = idx
                    ans['answer_end'] = ans['answer_start'] + len(ans_to_find)

with open(OUTPUT_PATH, 'w', encoding='utf-8') as file:
    squad.to_json(file, force_ascii=False)