In [None]:
# Mount Google Drive at /content/drive; the dataset paths used later in this
# notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
import sys
import random

from tqdm import tqdm

import re
import string

import os

import shutil

import json

In [None]:
# Default input files for the two loader functions below (used as their
# `input_file_path` defaults): the train and dev json files on the mounted Drive.
squad_train = '/content/drive/MyDrive/data/train-v2.0.json'
squad_val = '/content/drive/MyDrive/data/dev-v2.0.json'

In [None]:
def squad_json_to_dataframe_train(input_file_path=squad_train, record_path = ['data','paragraphs','qas','answers'],
                           verbose = 1):
    """
    Flatten a SQuAD-style json file into a single DataFrame.

    input_file_path: path to the squad json file.
    record_path: path to deepest level in json file default value is
    ['data','paragraphs','qas','answers']
    verbose: 0 to suppress it default is 1

    Returns a DataFrame holding the question id (as the column produced by
    ``reset_index``), 'question', 'context', the answer-level columns found at
    ``record_path`` and 'c_id', an integer code identifying each distinct context.

    NOTE(review): the answer rows are aligned to the questions by question id
    through ``pd.concat(axis=1)``; this presumably relies on every question
    having at most one answer (unique ids) -- confirm for other input files.
    """
    if verbose:
        print("Reading the json file")
    # Context manager closes the handle deterministically; JSON is UTF-8 text,
    # so do not depend on the platform's default encoding.
    with open(input_file_path, encoding='utf-8') as json_file:
        file = json.load(json_file)
    if verbose:
        print("processing...")
    # Parse the three nesting levels of the json file: answers, questions, paragraphs.
    # `pd.json_normalize` is the supported top-level name (the `pd.io.json` alias is deprecated).
    js = pd.json_normalize(file, record_path)
    m = pd.json_normalize(file, record_path[:-1])
    r = pd.json_normalize(file, record_path[:-2])

    # Combine the levels into a single dataframe:
    # repeat each paragraph context once per question it owns ...
    idx = np.repeat(r['context'].values, r.qas.str.len())
    # ... and each question id once per answer it owns.
    ndx = np.repeat(m['id'].values, m['answers'].str.len())
    m['context'] = idx
    js['q_idx'] = ndx
    # `axis` must be passed by keyword: positional use is deprecated/removed in newer pandas.
    main = pd.concat(
        [m[['id', 'question', 'context']].set_index('id'), js.set_index('q_idx')],
        axis=1,
        sort=False,
    ).reset_index()
    main['c_id'] = main['context'].factorize()[0]
    if verbose:
        print("shape of the dataframe is {}".format(main.shape))
        print("Done")
    return main

In [None]:
def squad_json_to_dataframe_dev(input_file_path=squad_val, record_path = ['data','paragraphs','qas','answers'],
                           verbose = 1):
    """
    Flatten a SQuAD-style json file into one row per question, keeping the
    raw list of answers for each question in a single 'answers' column.

    input_file_path: path to the squad json file.
    record_path: path to deepest level in json file default value is
    ['data','paragraphs','qas','answers']; only the question level
    (record_path[:-1]) and the paragraph level (record_path[:-2]) are read here.
    verbose: 0 to suppress it default is 1

    Returns a DataFrame with columns 'id', 'question', 'context', 'answers'
    and 'c_id', an integer code identifying each distinct context.
    """
    if verbose:
        print("Reading the json file")
    # Context manager closes the handle deterministically; JSON is UTF-8 text,
    # so do not depend on the platform's default encoding.
    with open(input_file_path, encoding='utf-8') as json_file:
        file = json.load(json_file)
    if verbose:
        print("processing...")
    # Parse the question and paragraph levels of the json file.
    # `pd.json_normalize` is the supported top-level name (the `pd.io.json` alias is deprecated).
    m = pd.json_normalize(file, record_path[:-1])
    r = pd.json_normalize(file, record_path[:-2])

    # Repeat each paragraph context once per question it owns so it lines up with `m`.
    m['context'] = np.repeat(r['context'].values, r.qas.str.len())
    # Explicit copy so adding 'c_id' below never writes through to (or warns about) `m`.
    main = m[['id', 'question', 'context', 'answers']].copy()
    main['c_id'] = main['context'].factorize()[0]
    if verbose:
        print("shape of the dataframe is {}".format(main.shape))
        print("Done")
    return main

In [None]:
def find_ans_loc(context, ans, ans_start):
    """
    Convert a character-level answer position into a word-level one.

    The returned indices refer to the token list ``context.split()``.

    context: the full context string.
    ans: the answer text.
    ans_start: character offset of the answer inside ``context`` (int or a
    float holding an integer value).

    Returns ``(new_loc, new_length)``: the index of the whitespace-separated
    token in which the answer starts, and the number of tokens the answer
    spans (at least 1).
    """
    start = int(ans_start)
    prefix = context[:start]

    # Count tokens with split() rather than counting whitespace characters:
    # split() collapses runs of whitespace and ignores leading whitespace, which
    # is exactly how callers tokenise the context. Counting single characters
    # drifts by one for every doubled space or newline before the answer.
    new_loc = len(prefix.split())

    # When the answer begins in the middle of a token (e.g. right after an opening
    # bracket), the last token of the prefix is the token that contains the answer.
    starts_mid_token = (
        bool(prefix)
        and not prefix[-1].isspace()
        and start < len(context)
        and not context[start].isspace()
    )
    if starts_mid_token:
        new_loc -= 1

    # Same reasoning for the length; keep the historical minimum of one token.
    new_length = max(1, len(ans.split()))

    return new_loc, new_length

In [None]:
# Load the training split using the loader's default path (squad_train).
s_train = squad_json_to_dataframe_train()

Reading the json file
processing...


  from ipykernel import kernelapp as app
  app.launch_new_instance()


shape of the dataframe is (130319, 6)
Done


In [None]:
# A missing answer text marks a question without an answer; store that flag
# as its own column and preview the first rows.
null_text = s_train['text'].isnull()
s_train['no_answer'] = null_text
s_train.head(20)

Unnamed: 0,index,question,context,text,answer_start,c_id,no_answer
0,56be85543aeaaa14008c9063,When did Beyonce start becoming popular?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,in the late 1990s,269.0,0,False
1,56be85543aeaaa14008c9065,What areas did Beyonce compete in when she was...,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,singing and dancing,207.0,0,False
2,56be85543aeaaa14008c9066,When did Beyonce leave Destiny's Child and bec...,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,2003,526.0,0,False
3,56bf6b0f3aeaaa14008c9601,In what city and state did Beyonce grow up?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,"Houston, Texas",166.0,0,False
4,56bf6b0f3aeaaa14008c9602,In which decade did Beyonce become famous?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,late 1990s,276.0,0,False
5,56bf6b0f3aeaaa14008c9603,In what R&B group was she the lead singer?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,Destiny's Child,320.0,0,False
6,56bf6b0f3aeaaa14008c9604,What album made her a worldwide known artist?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,Dangerously in Love,505.0,0,False
7,56bf6b0f3aeaaa14008c9605,Who managed the Destiny's Child group?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,Mathew Knowles,360.0,0,False
8,56d43c5f2ccc5a1400d830a9,When did Beyoncé rise to fame?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,late 1990s,276.0,0,False
9,56d43c5f2ccc5a1400d830aa,What role did Beyoncé have in Destiny's Child?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,lead singer,290.0,0,False


In [41]:
def _make_squad_instance(example_id, part_id, question, context, start, stop, target):
    """Bundle the fields of one instance into the dict layout shared by all instances."""
    return {'example_id': example_id, 'part_id': part_id, 'question': question, 'context': context,
            'start': start, 'stop': stop, 'target': target}


def getSquadInstancesTrain(s_train=s_train, debug=False, instance_words_len=500, stride=128):
    """
    Turn the flattened training frame into a list of word-level instances.

    s_train: DataFrame with the columns 'index' (question id), 'question',
    'context', 'text', 'answer_start' and 'no_answer'.
    debug: when true, print a random sample of up to 100 generated instances.
    instance_words_len: contexts with more whitespace-separated words than this
    are cut into overlapping windows; each window holds at most
    ``instance_words_len - <number of question words>`` words.
    stride: number of words between the starts of consecutive windows (> 0).

    Returns a list of dicts with keys 'example_id', 'part_id', 'question',
    'context', 'start', 'stop', 'target'. 'start'/'stop' are indices into
    ``context.split()`` of the instance; 'target' is
      * 'SHORT'     - the answer span lies inside this instance,
      * 'NO'        - the answer text is "no"; the span is the whole instance context,
      * 'NO_ANSWER' - no answer, or the answer is not fully inside this window
                      (start = stop = 0).

    Raises ValueError if ``stride`` is not positive.
    """
    if stride <= 0:
        raise ValueError("stride must be a positive number of words")

    list_ins = []
    for _, row in s_train.iterrows():
        example_id = str(row['index'])
        question = row['question']
        # endswith() is safe for an empty question, unlike indexing question[-1].
        if question.endswith('?'):
            question = question[:-1]

        text = row['context']
        textSplit = text.split()

        an_start, an_stop = -1, -1
        yes_no_ans = False
        if not row['no_answer']:  # there is an answer
            ans = row['text']
            if ans.lower() != 'no':
                an_start, newlen = find_ans_loc(text, ans, row['answer_start'])
                an_stop = an_start + newlen
            else:
                # The only yes/no style answer handled is "no"; it is represented
                # by a span covering the whole context.
                an_start, an_stop = 0, len(textSplit)
                yes_no_ans = True
        has_span = an_start > -1 and an_stop > an_start

        if len(textSplit) > instance_words_len:  # long context: sliding windows
            part_len = instance_words_len - len(question.split())
            num_ins = (len(textSplit) - part_len) // stride + 1

            for part_id in range(num_ins + 1):
                part_start = part_id * stride
                part_stop = min(len(textSplit), part_start + part_len)
                part_split = textSplit[part_start:part_stop]
                part = ' '.join(part_split)

                if yes_no_ans and part_split:
                    # A "no" answer spans the whole context, so it can never fit inside
                    # a single window; testing it against the window bounds (as the
                    # SHORT case does) would make this label unreachable. Mirror the
                    # short-context rule instead: the span is the whole window.
                    start, stop, target = 0, len(part_split), 'NO'
                elif has_span and an_start >= part_start and an_stop <= part_stop:
                    # Answer fully inside this window: re-base the span on the window.
                    # (Being inside the window already implies it is no longer than it.)
                    start, stop, target = an_start - part_start, an_stop - part_start, 'SHORT'
                else:
                    start, stop, target = 0, 0, 'NO_ANSWER'
                list_ins.append(
                    _make_squad_instance(example_id, part_id, question, part, start, stop, target))
        else:  # context is short enough: a single instance with the original text
            if has_span:
                start, stop = an_start, an_stop
                target = 'NO' if yes_no_ans else 'SHORT'
            else:
                start, stop, target = 0, 0, 'NO_ANSWER'
            list_ins.append(
                _make_squad_instance(example_id, 0, question, text, start, stop, target))

    if debug and list_ins:
        # Print a random sample; cap the size so small inputs do not raise in random.sample.
        for ins in random.sample(list_ins, min(100, len(list_ins))):
            print(ins['example_id'])
            print(ins['question'])
            print(ins['context'])
            print(ins['target'], ins['context'].split()[ins['start']:ins['stop']])
            print('\n')
    return list_ins
                        

In [58]:
# Build the training instances from s_train (the function's default input);
# debug=True additionally prints a random sample of the generated instances.
list_ins = getSquadInstancesTrain(debug=True)

56dec6913277331400b4d726
What drug does the documentary Pumping Iron show Schwarzenegger using
Schwarzenegger admitted that he has "behaved badly sometimes" and apologized, but also stated that "a lot of [what] you see in the stories is not true". This came after an interview in adult magazine Oui from 1977 surfaced, in which Schwarzenegger discussed attending sexual orgies and using substances such as marijuana. Schwarzenegger is shown smoking a marijuana joint after winning Mr. Olympia in the 1975 documentary film Pumping Iron. In an interview with GQ magazine in October 2007, Schwarzenegger said, "[Marijuana] is not a drug. It's a leaf. My drug was pumping iron, trust me." His spokesperson later said the comment was meant to be a joke.
SHORT ['marijuana.']


5ad55eed5b96ef001a10acf2
How many iMac units sold in the first 193 days
In 1998, Apple introduced its new iMac which, like the original 128K Mac, was an all-in-one computer. Its translucent plastic case, originally Bondi blue an

In [59]:
# Number of generated instances (can exceed the number of questions, because
# long contexts yield several windowed instances each).
print(len(list_ins))

130364


In [60]:
# Shuffle in place with a dedicated, seeded generator so that the row order of
# the exported training file is reproducible after Restart & Run All, without
# touching the state of the global `random` module.
random.Random(42).shuffle(list_ins)

In [61]:
# Collect the instance dicts into a DataFrame: one row per instance, one column per dict key.
list_ins_df = pd.DataFrame(list_ins)

In [62]:
# Preview the first rows of the instance frame.
list_ins_df.head()

Unnamed: 0,example_id,part_id,question,context,start,stop,target
0,57282dda4b864d1900164663,0,What country joined Egypt and Syria against Is...,"On 21 May, Amer asked Nasser to order the Stra...",183,184,SHORT
1,57301a6b04bcaa1900d771a2,0,What is one example of antibiotics that may ha...,Antibiotics are screened for any negative effe...,184,191,SHORT
2,5a830f90e60761001a2eb345,0,What does recognizing the different characteri...,One of the ways to prevent or slow down the tr...,0,0,NO_ANSWER
3,57341d964776f419006618b7,0,About what percentage of the Native Americans ...,"Approximately 66,000 people of Native American...",161,162,SHORT
4,56cda9c462d2951400fa67e4,0,Who published the Twilight Princess comic book...,A Japan-exclusive manga series based on Twilig...,28,29,SHORT


In [63]:
# Write the instances to Drive as CSV, without the row index and with an
# explicit column order.
list_ins_df.to_csv(
    '/content/drive/MyDrive/data/fine_data/train_instance_squad.csv',
    columns=['example_id', 'part_id', 'question', 'context', 'start', 'stop', 'target'],
    index=False,
)

In [None]:
# Load the dev split using the loader's default path (squad_val).
s_dev = squad_json_to_dataframe_dev()

Reading the json file
processing...


  from ipykernel import kernelapp as app
  app.launch_new_instance()


shape of the dataframe is (11873, 5)
Done


In [None]:
# Preview the first rows of the dev frame; 'answers' holds the raw list of answer dicts.
s_dev.head()

Unnamed: 0,id,question,context,answers,c_id
0,56ddde6b9a695914005b9628,In what country is Normandy located?,The Normans (Norman: Nourmands; French: Norman...,"[{'text': 'France', 'answer_start': 159}, {'te...",0
1,56ddde6b9a695914005b9629,When were the Normans in Normandy?,The Normans (Norman: Nourmands; French: Norman...,"[{'text': '10th and 11th centuries', 'answer_s...",0
2,56ddde6b9a695914005b962a,From which countries did the Norse originate?,The Normans (Norman: Nourmands; French: Norman...,"[{'text': 'Denmark, Iceland and Norway', 'answ...",0
3,56ddde6b9a695914005b962b,Who was the Norse leader?,The Normans (Norman: Nourmands; French: Norman...,"[{'text': 'Rollo', 'answer_start': 308}, {'tex...",0
4,56ddde6b9a695914005b962c,What century did the Normans first gain their ...,The Normans (Norman: Nourmands; French: Norman...,"[{'text': '10th century', 'answer_start': 671}...",0


In [None]:
# Show the (rows, columns) of the train and dev frames, one per line.
print(s_train.shape, s_dev.shape, sep='\n')

(130319, 6)
(11873, 5)


In [None]:
# Inspect the raw answer list of the fourth dev question (position 3).
print(s_dev['answers'].iloc[3])

[{'text': 'Rollo', 'answer_start': 308}, {'text': 'Rollo', 'answer_start': 308}, {'text': 'Rollo', 'answer_start': 308}, {'text': 'Rollo', 'answer_start': 308}]
