In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive; the data paths configured in this
# notebook all live under that mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd
import sys
import random

from tqdm import tqdm

import string
import re

import os

import json

In [3]:
# Training split. The count is presumably the number of records in f_train
# (not verified anywhere in this notebook).
f_train = '/content/drive/MyDrive/data/simplified-nq-train.jsonl'
num_train_samples = 307_372

# Test split. The count matches the 346 lines tqdm reports when f_test is read.
f_test = '/content/drive/MyDrive/data/simplified-nq-test.jsonl'
num_test_samples = 346

# Submission file that is visualised against the test documents.
f_sub = '/content/drive/MyDrive/data/submission.csv'

In [4]:
def get_id_df(filename=f_test):
    """Collect the ``example_id`` of every record in a JSON-lines file.

    Parameters
    ----------
    filename : str
        Path to a file holding one JSON object per line, each with an
        ``example_id`` key. Defaults to ``f_test``.

    Returns
    -------
    pandas.DataFrame
        One row per record with a single ``example_id`` column; every id is
        converted to ``str``. An empty file yields an empty frame that still
        has the ``example_id`` column.
    """
    list_id = []
    # JSON text is UTF-8 by specification, so do not depend on the locale's
    # default encoding.
    with open(filename, encoding='utf-8') as f:
        for line in tqdm(f):
            # Skip blank lines (e.g. a trailing newline at end of file);
            # json.loads would raise on them.
            if not line.strip():
                continue
            data = json.loads(line)
            list_id.append({'example_id': str(data['example_id'])})
    return pd.DataFrame(list_id, columns=['example_id'])

In [5]:
def get_doc_df(filename=f_test):
    """Load every record of a JSON-lines file into a DataFrame.

    Parameters
    ----------
    filename : str
        Path to a file holding one JSON object per line. Defaults to
        ``f_test``.

    Returns
    -------
    pandas.DataFrame
        One row per JSON object; the columns are the keys found in the
        objects.
    """
    list_doc = []
    # JSON text is UTF-8 by specification, so do not depend on the locale's
    # default encoding.
    with open(filename, encoding='utf-8') as f:
        for line in tqdm(f):
            # Skip blank lines (e.g. a trailing newline at end of file);
            # json.loads would raise on them.
            if not line.strip():
                continue
            list_doc.append(json.loads(line))
    return pd.DataFrame(list_doc)

In [6]:
# Load every record of the test file into a DataFrame (one row per JSON line).
list_doc_df = get_doc_df(f_test)

346it [00:01, 323.39it/s]


In [7]:
# Preview the first rows of the loaded documents.
list_doc_df.head()

Unnamed: 0,example_id,question_text,document_text,long_answer_candidates
0,-1220107454853145579,who is the south african high commissioner in ...,"High Commission of South Africa , London - wik...","[{'end_token': 136, 'start_token': 18, 'top_le..."
1,8777415633185303067,the office episode when they sing to michael,Michael 's Last Dundies - wikipedia <H1> Micha...,"[{'end_token': 190, 'start_token': 23, 'top_le..."
2,4640548859154538040,what is the main idea of the cross of gold speech,Cross of gold speech - wikipedia <H1> Cross of...,"[{'end_token': 165, 'start_token': 12, 'top_le..."
3,-5316095317154496261,when was i want to sing in opera written,Wilkie Bard - wikipedia <H1> Wilkie Bard </H1>...,"[{'end_token': 105, 'start_token': 8, 'top_lev..."
4,-8752372642178983917,who does the voices in ice age collision course,Ice Age : Collision Course - Wikipedia <H1> Ic...,"[{'end_token': 287, 'start_token': 16, 'top_le..."


In [8]:
# Answer category name -> integer code; codes follow the order listed here.
AnswerType = {
    label: code
    for code, label in enumerate(('NO_ANSWER', 'YES', 'NO', 'SHORT', 'LONG'))
}

# Reverse lookup (integer code -> category name), derived from AnswerType so
# the two mappings cannot drift apart.
AnswerTypeRev = {code: label for label, code in AnswerType.items()}

# Matches one HTML tag: '<', the shortest possible run of characters
# (newlines excluded), then '>'.
cleanr = re.compile('<.*?>')
def clean_html(raw_html):
    """Make document text safe to paste into LaTeX.

    Literal dollar signs are escaped as ``\\$`` and every HTML tag is replaced
    by the placeholder ``$<tag>$``.

    Parameters
    ----------
    raw_html : str
        Text that may contain HTML tags and dollar signs.

    Returns
    -------
    str
        The text with dollar signs escaped and tags replaced.
    """
    # Use str.replace so '$' is treated literally. As a regex pattern '$' is
    # the end-of-string anchor, so re.sub('$', ...) left real dollar signs
    # untouched and appended a stray '\$' to the end of the text instead.
    cleantext = raw_html.replace('$', '\\$')
    # Escape first, then insert placeholders, so the '$' characters of the
    # placeholder itself stay unescaped.
    cleantext = cleanr.sub('$<tag>$', cleantext)
    return cleantext

In [9]:
# Read the submission file; visualizeSub reads its 'example_id' and
# 'PredictionString' columns.
sub = pd.read_csv(f_sub)

In [10]:
def _parse_token_span(prediction):
    """Parse a ``'start:stop'`` prediction string into a pair of ints.

    Returns ``(-1, -1)`` for an empty prediction or a ``YES``/``NO`` answer;
    slicing a token list with that pair yields an empty list.
    """
    if prediction in ('', 'YES', 'NO'):
        return -1, -1
    fields = prediction.split(':')
    return int(fields[0]), int(fields[1])


def visualizeSub(sub, list_doc_df, debug=False):
    r"""Print every prediction of a submission as a nested LaTeX list.

    For each row of ``sub`` whose id ends in ``_long`` this prints an
    ``\item`` holding an ``itemize`` environment with the question, the
    predicted long-answer text and the predicted short answer (either the
    text of a token span, or ``YES`` / ``NO``).

    Parameters
    ----------
    sub : pandas.DataFrame
        Submission table with ``example_id`` (``<id>_long`` / ``<id>_short``)
        and ``PredictionString`` columns. Missing predictions are treated as
        empty strings. The frame is not modified.
    list_doc_df : pandas.DataFrame
        Documents with ``example_id``, ``question_text`` and
        ``document_text`` columns.
    debug : bool, optional
        When true, also print the example id and the raw long/short
        prediction strings before each item.

    Raises
    ------
    IndexError
        If a long-answer row refers to an example id that is not present in
        ``list_doc_df``.
    """
    # fillna without inplace returns a new frame, so the caller's DataFrame
    # keeps its NaN values.
    sub = sub.fillna('')

    # Build both lookups once instead of scanning the frames for every row.
    # Keys are normalised to str so an integer-typed id column still matches.
    # On duplicate ids the first occurrence wins, matching the `.iloc[0]`
    # selection this replaces.
    sub_ids = sub['example_id'].astype(str).tolist()
    sub_predictions = sub['PredictionString'].astype(str).tolist()
    prediction_by_id = {}
    for key, prediction in zip(sub_ids, sub_predictions):
        prediction_by_id.setdefault(key, prediction)

    doc_position_by_id = {}
    for position, key in enumerate(list_doc_df['example_id'].astype(str)):
        doc_position_by_id.setdefault(key, position)

    suffix = '_long'
    for longid, longStr in zip(sub_ids, sub_predictions):
        # Only long-answer rows drive the output; short rows are looked up.
        if not longid.endswith(suffix):
            continue
        example_id = longid[:-len(suffix)]

        lan_start, lan_stop = _parse_token_span(longStr)

        # Find the corresponding short answer; a submission without the
        # matching short row is treated as "no short answer".
        sanStr = prediction_by_id.get(example_id + '_short', '')

        if debug:
            print(example_id)
            print(longStr)
            print(sanStr)

        san_start, san_stop = _parse_token_span(sanStr)

        # Find the corresponding document.
        if example_id not in doc_position_by_id:
            raise IndexError(
                'no document with example_id %r in list_doc_df' % example_id)
        doc = list_doc_df.iloc[doc_position_by_id[example_id]]

        # NOTE(review): spans index the whitespace tokens of the *cleaned*
        # text. That equals the raw token numbering only if no tag match
        # contains whitespace -- confirm against the data.
        doc_tokens = clean_html(doc['document_text']).split()

        print('\\item')
        print('\\begin{itemize}')
        print("\\item QUESTION: ", doc['question_text'])
        print("\\item LONG: ", ' '.join(doc_tokens[lan_start:lan_stop]))
        if sanStr == 'YES' or sanStr == 'NO':
            print("\\item SHORT: ", sanStr)
        else:
            print("\\item SHORT: ", ' '.join(doc_tokens[san_start:san_stop]))
        print(' \\end{itemize}')

In [39]:
# Print every prediction in `sub` as LaTeX items, resolving token spans
# against the documents in list_doc_df.
visualizeSub(sub, list_doc_df)

\item
\begin{itemize}
\item QUESTION:  association of producers that control supply and prices
\item LONG:  $<tag>$ International price fixing by private entities can be prosecuted under the antitrust laws of many countries . Examples of prosecuted international cartels are those that controlled the prices and output of lysine , citric acid , graphite electrodes , and bulk vitamins . $<tag>$
\item SHORT:  
 \end{itemize}
\item
\begin{itemize}
\item QUESTION:  norman cook you've come a long way
\item LONG:  $<tag>$ $<tag>$ Norman Cook -- performer , production $<tag>$ $<tag>$ Red Design -- photography $<tag>$ $<tag>$ Simon Thornton -- engineering , mixing , photography $<tag>$ $<tag>$ Eve -- provides the vocals for the song `` Cowboy '' . $<tag>$ $<tag>$ Freddy Fresh -- provides the vocal sample for the song `` Fucking in Heaven '' . $<tag>$ $<tag>$ Myriam Tisler -- provides the vocals for the song `` Radioactivity '' . $<tag>$ $<tag>$
\item SHORT:  
 \end{itemize}
\item
\begin{itemize}