In [3]:
# Load necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
from nltk.tokenize import sent_tokenize

# GLOBALS
# NOTE(review): MAX_SENTS is not referenced by any cell visible in this section.
# Presumably it is meant to hold a sentence-count limit that is filled in later -
# confirm against the rest of the notebook, or remove it if it is unused.
MAX_SENTS = 0

In [4]:
# Read the training and test splits; both files are parsed with orient='split'.
df_train, df_test = (
    pd.read_json(json_path, orient='split')
    for json_path in (r'../data/training.json', r'../data/test.json')
)

In [5]:
def get_correct_answers(df):
    """Extract the correct answer of every row from its ``choices`` list.

    Every entry of ``df['choices']`` is read as a list of dicts. The first dict
    whose ``'type'`` equals ``'Correct answer'`` supplies the answer: its
    ``'text'``, its ``'start'`` offset and, when its ``'extra'`` entry is a
    non-empty dict, the ``'comment'`` stored there (the answer reformulation).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``choices`` column.

    Returns
    -------
    tuple(list, list, list)
        ``(correct_answers, correct_answers_loc, answer_reformulations)``, each
        aligned with the rows of ``df``. The reformulation is ``False`` for
        rows whose correct answer carries no ``extra`` information.

    Raises
    ------
    IndexError
        If a row contains no choice of type ``'Correct answer'``.
    """
    correct_answers = []
    correct_answers_loc = []
    answer_reformulations = []

    for index, choices in df['choices'].items():
        # Pick the first choice flagged as correct; no per-row DataFrame needed.
        correct = next(
            (choice for choice in choices if choice.get('type') == 'Correct answer'),
            None,
        )
        if correct is None:
            raise IndexError(
                "row {!r}: no choice of type 'Correct answer' found".format(index)
            )

        # 'extra' may be None or missing entirely; both mean "no reformulation".
        extra = correct.get('extra')
        answer_reformulation = extra['comment'] if extra else False

        correct_answers.append(correct['text'])
        correct_answers_loc.append(correct['start'])
        answer_reformulations.append(answer_reformulation)

    return correct_answers, correct_answers_loc, answer_reformulations


In [6]:
# Attach the correct answer, its start offset and its reformulation (if any)
# as new columns on both splits.
for frame in (df_train, df_test):
    (frame['correct_answer'],
     frame['correct_answer_loc'],
     frame['answer_reformulation']) = get_correct_answers(frame)

In [8]:
# check in which sentence the answer can be found

def _sentence_index_at_offset(sentences, char_offset):
    """Estimate the index of the sentence that contains ``char_offset``.

    Sentence lengths are accumulated (plus one character per sentence,
    presumably for the separator between sentences) until the running total
    exceeds ``char_offset``. The count is only approximate because the text was
    normalised before tokenizing. If the offset lies beyond the accumulated
    length the last sentence is returned; ``None`` is returned only when there
    are no sentences at all.
    """
    tot_chars = 0
    for sent_idx, sent in enumerate(sentences):
        tot_chars += len(sent) + 1
        if tot_chars > char_offset:
            return sent_idx
    return len(sentences) - 1 if sentences else None


def collect_sentence_number_statistics(df, tokenizer=None):
    """Locate, for every row, the sentence of ``context`` that holds the answer.

    Two signals are combined: the character offset ``correct_answer_loc``
    (mapped to a sentence by accumulating sentence lengths) and a plain
    substring search of ``correct_answer`` in each sentence. When the
    offset-based sentence does not contain the answer text, the textual match
    closest to it is used instead (the first one on ties). If the answer text
    is found in no sentence an alert is printed and the offset-based sentence
    is kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns ``context``, ``correct_answer`` and
        ``correct_answer_loc``.
    tokenizer : callable, optional
        Function mapping a text to a sequence of sentence strings. Defaults to
        the module-level ``sent_tokenize``. A language-specific tokenizer can
        be passed, e.g. ``lambda t: sent_tokenize(t, language='swedish')``.

    Returns
    -------
    tuple of five lists, aligned with the rows of ``df``:
        idx_of_ans         - sentence index of the answer (None for empty text)
        sentences_with_ans - the sentence at that index (None for empty text)
        idx_of_ans_text    - all sentence indexes whose text contains the answer
        total_num_sents    - number of sentences in the context
        ans_loc_frac       - idx_of_ans / total_num_sents (NaN for empty text)
    """
    if tokenizer is None:
        tokenizer = sent_tokenize

    idx_of_ans = []
    sentences_with_ans = []
    idx_of_ans_text = []
    total_num_sents = []
    ans_loc_frac = []

    for _, row in df.iterrows():
        answer = row['correct_answer']
        answer_loc = int(row['correct_answer_loc'])

        # Turn line breaks into sentence boundaries before tokenizing.
        text = row['context']
        text = text.replace("\r\n", ". ")
        text = text.replace("\n", ". ")
        text = text.replace("..", ".")
        sentences = list(tokenizer(text))

        # Every sentence whose text contains the answer string.
        found_indexes = [
            sent_idx for sent_idx, sent in enumerate(sentences) if answer in sent
        ]

        # Sentence suggested by the character offset. Index 0 is a valid result,
        # so "not found" is represented by None rather than by falsiness.
        loc_idx = _sentence_index_at_offset(sentences, answer_loc)
        sentence_with_ans = sentences[loc_idx] if loc_idx is not None else None

        if loc_idx not in found_indexes:
            if found_indexes:
                # Trust the text: take the match closest to the offset estimate
                # (min() returns the first candidate on ties).
                estimate = loc_idx
                loc_idx = min(found_indexes, key=lambda i: abs(i - estimate))
                sentence_with_ans = sentences[loc_idx]
            else:
                print('ALERT - answer not found!')
                print('sentence by index: ', sentence_with_ans)
                print('answer: ', answer)

        idx_of_ans_text.append(found_indexes)
        sentences_with_ans.append(sentence_with_ans)
        idx_of_ans.append(loc_idx)
        total_num_sents.append(len(sentences))
        # An empty context has no sentences: report NaN instead of dividing by zero.
        if loc_idx is None:
            ans_loc_frac.append(float('nan'))
        else:
            ans_loc_frac.append(loc_idx / len(sentences))

    return idx_of_ans, sentences_with_ans, idx_of_ans_text, total_num_sents, ans_loc_frac

        
# Compute the sentence-location statistics for both splits and store them as columns.
for frame in (df_train, df_test):
    (frame['answer_location'],
     frame['sent_with_ans'],
     frame['answer_locations_text'],
     frame['paragraph_len'],
     frame['loc_frac']) = collect_sentence_number_statistics(frame)
df_train.head()

ALERT - answer not found!
sentence by index:  När speditören kontaktas av en kund börjar man med att skaffa sig kunskap om t.ex.
answer:  skaffa sig kunskap om t.ex. godsets storlek, värde och behov av transport
ALERT - answer not found!
sentence by index:  förordningen (2017:1179) om finansiering av kärntekniska .
answer:  finansiering av kärntekniska 
 restprodukter
ALERT - answer not found!
sentence by index:  FOI ska på uppdrag av Försvarets materielverk bedriva exportrelaterad verksamhet .
answer:  bedriva exportrelaterad verksamhet 
 inom försvarssektorn


Unnamed: 0,choices,context,question,correct_answer,correct_answer_loc,answer_reformulation,answer_location,sent_with_ans,answer_locations_text,paragraph_len,loc_frac
0,"[{'end': 224, 'extra': None, 'start': 175, 'te...",Visum för affärs- och konferensbesök\r\nOm du ...,Vad är ett visum?,ett tillstånd för att resa in och vara i ett land,175,False,2,Visum är ett tillstånd för att resa in och var...,[2],27,0.074074
1,"[{'end': 707, 'extra': None, 'start': 681, 'te...",Eget företag\r\nEfter beslut\r\nBeslutet skick...,Vad ska du ta med när du hämtar ditt beslut?,ditt pass,156,False,3,När du ska hämta ditt beslut ska du ta med dit...,"[3, 4]",20,0.15
2,"[{'end': 1165, 'extra': None, 'start': 1154, '...",Utbytesdoktorander\r\nIntervju\r\nDu kommer at...,Hur länge gäller ett uppehållstillstånd för be...,ett år,1619,False,21,Ett uppehållstillstånd för besök gäller som lä...,[21],29,0.724138
3,"[{'end': 598, 'extra': None, 'start': 589, 'te...",Eget företag\r\nNär vi har tagit emot din ansö...,Vad är ett uppehållstillståndskort?,ett bevis,589,False,7,Kortet är ett bevis på att du har tillstånd at...,[7],30,0.233333
4,"[{'end': 1932, 'extra': None, 'start': 1924, '...",Flytta tillbaka till Sverige\r\nSå ansöker du\...,Vad är ett uppehållstillståndskort?,ett bevis,673,False,9,Kortet är ett bevis på att du har tillstånd at...,[9],23,0.391304


In [48]:
# Persist both enriched DataFrames as pickle files.
for frame, pickle_path in (
    (df_train, "./data_frames/df_train.pkl"),
    (df_test, "./data_frames/df_test.pkl"),
):
    frame.to_pickle(pickle_path)