In [1]:
# Load necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# GLOBALS
# Module-level value initialised to 0. No executable statement in the cells
# shown here reads or updates it; presumably meant to track a maximum
# sentence count -- TODO confirm whether it is still needed.
MAX_SENTS = 0

In [2]:
# Read the training and test sets; both files are stored in pandas' "split" JSON layout
df_train = pd.read_json(
    path_or_buf=r'../data/training.json',
    orient='split',
)
df_test = pd.read_json(
    path_or_buf=r'../data/test.json',
    orient='split',
)

In [3]:
def get_correct_answers(df):
    """Extract the correct answer text and start offset for every row.

    Each entry of the 'choices' column is turned into a small DataFrame;
    the first record whose 'type' equals 'Correct answer' supplies the
    values.

    Returns a tuple ``(texts, starts)`` of two lists, one entry per row
    of ``df``, taken from the 'text' and 'start' fields respectively.
    """
    def _pick_correct(choices):
        # Tabulate the choices and keep the first one flagged as correct
        choice_table = pd.DataFrame(choices)
        is_correct = choice_table['type'] == 'Correct answer'
        first_hit = choice_table.loc[is_correct].iloc[0]
        return first_hit['text'], first_hit['start']

    picked = [_pick_correct(record['choices']) for _, record in df.iterrows()]
    texts = [text for text, _ in picked]
    starts = [start for _, start in picked]
    return texts, starts


In [4]:
# Attach the correct answer text and its start offset as two new columns on each frame
(
    df_train['correct_answer'],
    df_train['correct_answer_loc'],
) = get_correct_answers(df_train)
(
    df_test['correct_answer'],
    df_test['correct_answer_loc'],
) = get_correct_answers(df_test)

# Preview the first rows of the training frame
df_train.head()

Unnamed: 0,choices,context,question,correct_answer,correct_answer_loc
0,"[{'end': 224, 'extra': None, 'start': 175, 'te...",Visum för affärs- och konferensbesök\r\nOm du ...,Vad är ett visum?,ett tillstånd för att resa in och vara i ett land,175
1,"[{'end': 707, 'extra': None, 'start': 681, 'te...",Eget företag\r\nEfter beslut\r\nBeslutet skick...,Vad ska du ta med när du hämtar ditt beslut?,ditt pass,156
2,"[{'end': 1165, 'extra': None, 'start': 1154, '...",Utbytesdoktorander\r\nIntervju\r\nDu kommer at...,Hur länge gäller ett uppehållstillstånd för be...,ett år,1619
3,"[{'end': 598, 'extra': None, 'start': 589, 'te...",Eget företag\r\nNär vi har tagit emot din ansö...,Vad är ett uppehållstillståndskort?,ett bevis,589
4,"[{'end': 1932, 'extra': None, 'start': 1924, '...",Flytta tillbaka till Sverige\r\nSå ansöker du\...,Vad är ett uppehållstillståndskort?,ett bevis,673


In [13]:
# check in which sentence the answer can be found

def collect_sentence_number_statistics(df):
    """Find, for every row, which sentence of the context holds the answer.

    The 'context' text is split on '.' and each row is examined in two ways:
    by character offset ('correct_answer_loc') and by substring match of
    'correct_answer' against every sentence.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'context', 'correct_answer' and
        'correct_answer_loc'.

    Returns
    -------
    tuple of four lists, each with one entry per row of ``df``:
        idx_of_ans
            Index of the first sentence whose cumulative length (including
            its '.') exceeds the answer offset; None if the offset lies
            beyond the end of the text.
        idx_of_ans_text
            List of indexes of all sentences containing the answer text
            (the same text may occur in several sentences).
        total_num_sents
            Number of pieces produced by splitting on '.' (this includes a
            trailing empty piece when the text ends with '.').
        ans_loc_frac
            idx_of_ans divided by total_num_sents; NaN when idx_of_ans is
            None.
    """
    idx_of_ans = []
    idx_of_ans_text = []
    total_num_sents = []
    ans_loc_frac = []
    for _, row in df.iterrows():
        answer = row['correct_answer']
        answer_loc = int(row['correct_answer_loc'])
        # split the text into sentences
        sentences = row['context'].split('.')

        tot_chars = 0
        loc_idx = None
        found_indexes = []
        for sent_idx, sent in enumerate(sentences):
            tot_chars += len(sent) + 1  # +1 for the '.' removed by split
            # Compare against None explicitly: sentence index 0 is a valid
            # result but is falsy, so `not loc_idx` would let it be overwritten.
            if loc_idx is None and tot_chars > answer_loc:
                loc_idx = sent_idx
            if answer in sent:
                found_indexes.append(sent_idx)

        idx_of_ans_text.append(found_indexes)
        idx_of_ans.append(loc_idx)
        total_num_sents.append(len(sentences))
        # An offset past the end of the text leaves loc_idx as None; record NaN
        # rather than failing on None / int.
        if loc_idx is None:
            ans_loc_frac.append(float('nan'))
        else:
            ans_loc_frac.append(loc_idx / len(sentences))
    return idx_of_ans, idx_of_ans_text, total_num_sents, ans_loc_frac

        
# Store the per-row sentence statistics as four new columns on each frame
(
    df_train['answer_location'],
    df_train['answer_locations_text'],
    df_train['paragraph_len'],
    df_train['loc_frac'],
) = collect_sentence_number_statistics(df_train)
(
    df_test['answer_location'],
    df_test['answer_locations_text'],
    df_test['paragraph_len'],
    df_test['loc_frac'],
) = collect_sentence_number_statistics(df_test)

# Preview the first rows of the training frame
df_train.head()

Unnamed: 0,choices,context,question,correct_answer,correct_answer_loc,answer_locations,paragraph_len,loc_frac,answer_location,answer_locations_text
0,"[{'end': 224, 'extra': None, 'start': 175, 'te...",Visum för affärs- och konferensbesök\r\nOm du ...,Vad är ett visum?,ett tillstånd för att resa in och vara i ett land,175,1,19,0.052632,1,[1]
1,"[{'end': 707, 'extra': None, 'start': 681, 'te...",Eget företag\r\nEfter beslut\r\nBeslutet skick...,Vad ska du ta med när du hämtar ditt beslut?,ditt pass,156,1,18,0.055556,1,"[1, 2]"
2,"[{'end': 1165, 'extra': None, 'start': 1154, '...",Utbytesdoktorander\r\nIntervju\r\nDu kommer at...,Hur länge gäller ett uppehållstillstånd för be...,ett år,1619,16,23,0.695652,16,[16]
3,"[{'end': 598, 'extra': None, 'start': 589, 'te...",Eget företag\r\nNär vi har tagit emot din ansö...,Vad är ett uppehållstillståndskort?,ett bevis,589,5,17,0.294118,5,[5]
4,"[{'end': 1932, 'extra': None, 'start': 1924, '...",Flytta tillbaka till Sverige\r\nSå ansöker du\...,Vad är ett uppehållstillståndskort?,ett bevis,673,6,20,0.3,6,[6]


In [14]:
# Persist both enriched frames as pickle files in the current working directory
pd.to_pickle(df_train, "./df_train.pkl")
pd.to_pickle(df_test, "./df_test.pkl")