In [4]:
import sys
import pandas as pd
import json
import numpy as np
import torch
from torch import Tensor
import pickle
from tqdm.auto import tqdm

from matplotlib import pyplot as plt

sys.path.append("../")
from src.log import myLogger
from src.repository.data_repository import DataRepository
from src.checkpoint.checkpoint import Checkpoint
from src.metrics.jaccard import jaccard

pd.set_option("display.max_columns", 300)
pd.set_option("display.max_rows", 600)

%load_ext autoreload
%autoreload 2

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))
%config InlineBackend.figure_formats = {'png', 'retina'}

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
logger = myLogger("../logs/temp.log", exp_id="e000", wdb_prj_id="temp", exp_config=None, use_wdb=False) 
dr = DataRepository(logger=logger, local_root_path="..")

2021-11-14 13:31:02,659 log.py               31   [INFO] [__init__] skip wandb init 
2021-11-14 13:31:02,659 log.py               31   [INFO] [__init__] skip wandb init 


In [6]:
import pickle

train_prep_df = dr.load_preprocessed_df(
          dataset_name="train",                                                                                                                                                                                                                                   
          class_name="BaselineKernelPreprocessorV1",
          tokenizer_name="XLMRobertaTokenizerFast",
          max_length=400,
          pad_on_right=True,
          stride=135,
          split=False,
          lstrip=False,
          use_language_as_question=False,
          add_overflowing_batch_id=False,
    )

mlqa_hindi_prep_df = dr.load_preprocessed_df(
          dataset_name="mlqa_hindi",                                                                                                                                                                                                                                   
          class_name="BaselineKernelPreprocessorV1",
          tokenizer_name="XLMRobertaTokenizerFast",
          max_length=400,
          pad_on_right=True,
          stride=135,
          split=False,
          lstrip=False,
          use_language_as_question=False,
          add_overflowing_batch_id=False,
    )

xquad_prep_df = dr.load_preprocessed_df(
          dataset_name="xquad",                                                                                                                                                                                                                                   
          class_name="BaselineKernelPreprocessorV1",
          tokenizer_name="XLMRobertaTokenizerFast",
          max_length=400,
          pad_on_right=True,
          stride=135,
          split=False,
          lstrip=False,
          use_language_as_question=False,
          add_overflowing_batch_id=False,
    )

prep_df = pd.concat([train_prep_df, mlqa_hindi_prep_df, xquad_prep_df], axis=0).reset_index(drop=True)
prep_df

2021-11-14 13:31:03,264 data_repository.py   262  [INFO] [load_preprocessed_df] loading data/preprocessed/train_BaselineKernelPreprocessorV1_XLMRobertaTokenizerFast_400_True_135_False_False_False_False.pkl ... 
2021-11-14 13:31:03,264 data_repository.py   262  [INFO] [load_preprocessed_df] loading data/preprocessed/train_BaselineKernelPreprocessorV1_XLMRobertaTokenizerFast_400_True_135_False_False_False_False.pkl ... 
2021-11-14 13:31:07,109 data_repository.py   269  [INFO] [load_preprocessed_df] done. 
2021-11-14 13:31:07,109 data_repository.py   269  [INFO] [load_preprocessed_df] done. 
2021-11-14 13:31:07,112 data_repository.py   262  [INFO] [load_preprocessed_df] loading data/preprocessed/mlqa_hindi_BaselineKernelPreprocessorV1_XLMRobertaTokenizerFast_400_True_135_False_False_False_False.pkl ... 
2021-11-14 13:31:07,112 data_repository.py   262  [INFO] [load_preprocessed_df] loading data/preprocessed/mlqa_hindi_BaselineKernelPreprocessorV1_XLMRobertaTokenizerFast_400_True_135_False

Unnamed: 0,id,context,question,answer_text,answer_start,language,top20_context,answer_text_count,input_ids,token_type_ids,attention_mask,sequence_ids,offset_mapping,overflowing_batch_id,duplicated_elems_num_with,part_answer_text_count,start_position,end_position,segmentation_position,is_contain_answer_text
0,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil,ஒரு சாதாரண வளர்ந்த ம,1,"[0, 69535, 81049, 37368, 153264, 12095, 52989,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",0,-1,1,27,27,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1
1,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil,ஒரு சாதாரண வளர்ந்த ம,1,"[0, 69535, 81049, 37368, 153264, 12095, 52989,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",1,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
2,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil,ஒரு சாதாரண வளர்ந்த ம,1,"[0, 69535, 81049, 37368, 153264, 12095, 52989,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",2,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
3,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil,ஒரு சாதாரண வளர்ந்த ம,1,"[0, 69535, 81049, 37368, 153264, 12095, 52989,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",3,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
4,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil,ஒரு சாதாரண வளர்ந்த ம,1,"[0, 69535, 81049, 37368, 153264, 12095, 52989,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",4,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22321,xquad_1185,विद्युत आवेश के परिवर्तन की समय दर के रूप में ...,इलेक्ट्रोस्टैटिक और चुंबकीय बल के योग के रूप क...,इलेक्ट्रोस्टैटिक बल,328,hindi,विद्युत आवेश के परिव,1,"[0, 234186, 2284, 17433, 3045, 71683, 871, 369...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",0,-1,1,116,121,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1
22322,xquad_1186,उस आयतन के लिए प्रासंगिक क्रॉस-सेक्शनल क्षेत्र...,संरचनाओं में तनाव का कारण क्या बनता है?,तनाव टेंसर,343,hindi,उस आयतन के लिए प्रास,1,"[0, 120018, 11846, 421, 129558, 641, 6701, 600...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",0,-1,1,128,131,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1
22323,xquad_1187,उस आयतन के लिए प्रासंगिक क्रॉस-सेक्शनल क्षेत्र...,किसी वस्तु के आयतन में क्रॉस सेक्शन क्षेत्र की...,दबाव की शर्तें,118,hindi,उस आयतन के लिए प्रास,1,"[0, 12820, 89773, 287, 34889, 41420, 421, 4761...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",0,-1,1,66,68,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1
22324,xquad_1188,उस आयतन के लिए प्रासंगिक क्रॉस-सेक्शनल क्षेत्र...,सामान्य ताकतों से क्या जुड़ा है?,दबाव की शर्तें,118,hindi,उस आयतन के लिए प्रास,1,"[0, 38338, 217186, 1302, 646, 6004, 158371, 10...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, None, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",0,-1,1,51,53,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1


#### load values

In [7]:
import gc
from collections import defaultdict
from tqdm.auto import tqdm

target_exp_ids = ["e072", "e073"]

checkpoints_info = {}

for target_exp_id in target_exp_ids:
    checkpoints_info[target_exp_id] = defaultdict(list)
    best_checkpoints = dr.best_checkpoint_filepaths(target_exp_id)
    for best_checkpoint in tqdm(best_checkpoints):
        exp_fold_checkpoint = dr.load_checkpoint_from_filepath(filepath_from_root=best_checkpoint, load_from_gcs=True, rm_local_after_load=True) 
        del exp_fold_checkpoint.model_state_dict
        del exp_fold_checkpoint.scheduler_state_dict
        del exp_fold_checkpoint.optimizer_state_dict
        gc.collect()
        checkpoints_info[target_exp_id]["val_ids"].extend(exp_fold_checkpoint.val_ids)
        checkpoints_info[target_exp_id]["val_start_logits"].extend(exp_fold_checkpoint.val_start_logits)
        checkpoints_info[target_exp_id]["val_end_logits"].extend(exp_fold_checkpoint.val_end_logits)
        del exp_fold_checkpoint
        gc.collect()

  0%|          | 0/5 [00:00<?, ?it/s]

2021-11-14 13:31:08,931 repository.py        167  [INFO] [__download_from_gcs] downloading data/checkpoint/e072/best_checkpoint/0_1_1.3986_0.6728.pkl from gs://kaggle-chaii-2021/../data/checkpoint/e072/best_checkpoint/0_1_1.3986_0.6728.pkl 
2021-11-14 13:31:08,931 repository.py        167  [INFO] [__download_from_gcs] downloading data/checkpoint/e072/best_checkpoint/0_1_1.3986_0.6728.pkl from gs://kaggle-chaii-2021/../data/checkpoint/e072/best_checkpoint/0_1_1.3986_0.6728.pkl 
2021-11-14 13:32:13,636 repository.py        176  [INFO] [__download_from_gcs] download done. 
2021-11-14 13:32:13,636 repository.py        176  [INFO] [__download_from_gcs] download done. 
2021-11-14 13:32:28,935 repository.py        167  [INFO] [__download_from_gcs] downloading data/checkpoint/e072/best_checkpoint/1_2_1.3330_0.6974.pkl from gs://kaggle-chaii-2021/../data/checkpoint/e072/best_checkpoint/1_2_1.3330_0.6974.pkl 
2021-11-14 13:32:28,935 repository.py        167  [INFO] [__download_from_gcs] download

  0%|          | 0/5 [00:00<?, ?it/s]

2021-11-14 13:36:37,786 repository.py        167  [INFO] [__download_from_gcs] downloading data/checkpoint/e073/best_checkpoint/0_2_14.0360_0.6704.pkl from gs://kaggle-chaii-2021/../data/checkpoint/e073/best_checkpoint/0_2_14.0360_0.6704.pkl 
2021-11-14 13:36:37,786 repository.py        167  [INFO] [__download_from_gcs] downloading data/checkpoint/e073/best_checkpoint/0_2_14.0360_0.6704.pkl from gs://kaggle-chaii-2021/../data/checkpoint/e073/best_checkpoint/0_2_14.0360_0.6704.pkl 
2021-11-14 13:37:29,253 repository.py        176  [INFO] [__download_from_gcs] download done. 
2021-11-14 13:37:29,253 repository.py        176  [INFO] [__download_from_gcs] download done. 
2021-11-14 13:37:38,012 repository.py        167  [INFO] [__download_from_gcs] downloading data/checkpoint/e073/best_checkpoint/1_3_16.3359_0.6945.pkl from gs://kaggle-chaii-2021/../data/checkpoint/e073/best_checkpoint/1_3_16.3359_0.6945.pkl 
2021-11-14 13:37:38,012 repository.py        167  [INFO] [__download_from_gcs] do

#### merge values

In [8]:
checkpoints_info.keys()

dict_keys(['e072', 'e073'])

In [9]:
# checkpoints_info["e049"]["val_ids"] == checkpoints_info["e059"]["val_ids"]

In [10]:
for target_exp_id in checkpoints_info.keys():
    checkpoints_info[target_exp_id]["val_start_logits"] = np.asarray(checkpoints_info[target_exp_id]["val_start_logits"])
    checkpoints_info[target_exp_id]["val_end_logits"] = np.asarray(checkpoints_info[target_exp_id]["val_end_logits"])
    
# for target_exp_id in checkpoints_info.keys():
#     checkpoints_info[target_exp_id]["start_logits"] = [np.asarray(logit) for logit in checkpoints_info[target_exp_id]["start_logits"]]
#     checkpoints_info[target_exp_id]["end_logits"] = [np.asarray(logit) for logit in checkpoints_info[target_exp_id]["end_logits"]]

In [11]:
ensembled_checkpoint_info = {
    "val_ids": None,
    "val_start_logits": None,
    "val_end_logits": None,
}

for target_exp_id in checkpoints_info.keys():
    if ensembled_checkpoint_info["val_ids"] is None:
        ensembled_checkpoint_info["val_ids"] = checkpoints_info[target_exp_id]["val_ids"]
    if ensembled_checkpoint_info["val_start_logits"] is None:
        ensembled_checkpoint_info["val_start_logits"] = checkpoints_info[target_exp_id]["val_start_logits"]
    else:
        ensembled_checkpoint_info["val_start_logits"] += checkpoints_info[target_exp_id]["val_start_logits"]
    if ensembled_checkpoint_info["val_end_logits"] is None:
        ensembled_checkpoint_info["val_end_logits"] = checkpoints_info[target_exp_id]["val_end_logits"]
    else:
        ensembled_checkpoint_info["val_end_logits"] += checkpoints_info[target_exp_id]["val_end_logits"]
        
        
ensembled_checkpoint_info["val_start_logits"] = [np.asarray(logit) for logit in checkpoints_info[target_exp_id]["val_start_logits"]]
ensembled_checkpoint_info["val_end_logits"] = [np.asarray(logit) for logit in checkpoints_info[target_exp_id]["val_end_logits"]]

In [12]:
checkpoint_df = pd.DataFrame()
checkpoint_df["id"] = ensembled_checkpoint_info["val_ids"]
checkpoint_df["start_logits"] = ensembled_checkpoint_info["val_start_logits"]
checkpoint_df["end_logits"] = ensembled_checkpoint_info["val_end_logits"]

checkpoint_df["overflowing_batch_id"] = None
bef_id = ""
overflowing_batch_id = 0
for i, row in checkpoint_df.iterrows():
    if str(row["id"]) != bef_id:
        overflowing_batch_id = 0
    checkpoint_df.loc[i, "overflowing_batch_id"] = overflowing_batch_id
    bef_id = row["id"]
    overflowing_batch_id += 1
checkpoint_df.head(10)

Unnamed: 0,id,start_logits,end_logits,overflowing_batch_id
0,903deec17,"[-6.871591091156006, -7.66694974899292, -9.131...","[-8.961920738220215, -12.13699722290039, -11.7...",0
1,903deec17,"[-6.353556156158447, -7.573686122894287, -8.77...","[-8.663551330566406, -12.235248565673828, -11....",1
2,903deec17,"[-5.727337837219238, -7.6956095695495605, -8.7...","[-7.9614105224609375, -12.012073516845703, -11...",2
3,903deec17,"[-6.6011643409729, -7.867605209350586, -8.9840...","[-8.839470863342285, -12.1362886428833, -11.83...",3
4,903deec17,"[-6.303467273712158, -7.26361608505249, -8.978...","[-8.5467529296875, -12.05208969116211, -11.755...",4
5,903deec17,"[-5.130573749542236, -6.862061977386475, -8.80...","[-7.419711112976074, -11.854751586914062, -11....",5
6,29d154b56,"[-5.35651159286499, -8.80470085144043, -9.7021...","[-7.449645519256592, -12.34675407409668, -11.3...",0
7,29d154b56,"[-5.7682204246521, -8.827058792114258, -9.9222...","[-7.772848606109619, -12.482477188110352, -11....",1
8,29d154b56,"[-5.013658046722412, -8.049748420715332, -9.64...","[-6.896134853363037, -12.419927597045898, -11....",2
9,29d154b56,"[-5.516550540924072, -8.561436653137207, -9.74...","[-7.415036201477051, -12.498138427734375, -11....",3


In [13]:
prep_df = prep_df.merge(checkpoint_df, on=["id", "overflowing_batch_id"], how="left")
prep_df

Unnamed: 0,id,context,question,answer_text,answer_start,language,top20_context,answer_text_count,input_ids,token_type_ids,attention_mask,sequence_ids,offset_mapping,overflowing_batch_id,duplicated_elems_num_with,part_answer_text_count,start_position,end_position,segmentation_position,is_contain_answer_text,start_logits,end_logits
0,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil,ஒரு சாதாரண வளர்ந்த ம,1,"[0, 69535, 81049, 37368, 153264, 12095, 52989,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",0,-1,1,27,27,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,"[-6.871591091156006, -7.66694974899292, -9.131...","[-8.961920738220215, -12.13699722290039, -11.7..."
1,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil,ஒரு சாதாரண வளர்ந்த ம,1,"[0, 69535, 81049, 37368, 153264, 12095, 52989,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",1,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,"[-6.353556156158447, -7.573686122894287, -8.77...","[-8.663551330566406, -12.235248565673828, -11...."
2,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil,ஒரு சாதாரண வளர்ந்த ம,1,"[0, 69535, 81049, 37368, 153264, 12095, 52989,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",2,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,"[-5.727337837219238, -7.6956095695495605, -8.7...","[-7.9614105224609375, -12.012073516845703, -11..."
3,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil,ஒரு சாதாரண வளர்ந்த ம,1,"[0, 69535, 81049, 37368, 153264, 12095, 52989,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",3,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,"[-6.6011643409729, -7.867605209350586, -8.9840...","[-8.839470863342285, -12.1362886428833, -11.83..."
4,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil,ஒரு சாதாரண வளர்ந்த ம,1,"[0, 69535, 81049, 37368, 153264, 12095, 52989,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",4,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,"[-6.303467273712158, -7.26361608505249, -8.978...","[-8.5467529296875, -12.05208969116211, -11.755..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22321,xquad_1185,विद्युत आवेश के परिवर्तन की समय दर के रूप में ...,इलेक्ट्रोस्टैटिक और चुंबकीय बल के योग के रूप क...,इलेक्ट्रोस्टैटिक बल,328,hindi,विद्युत आवेश के परिव,1,"[0, 234186, 2284, 17433, 3045, 71683, 871, 369...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",0,-1,1,116,121,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,"[-6.719956398010254, -8.365612983703613, -10.4...","[-8.62995719909668, -12.595864295959473, -11.3..."
22322,xquad_1186,उस आयतन के लिए प्रासंगिक क्रॉस-सेक्शनल क्षेत्र...,संरचनाओं में तनाव का कारण क्या बनता है?,तनाव टेंसर,343,hindi,उस आयतन के लिए प्रास,1,"[0, 120018, 11846, 421, 129558, 641, 6701, 600...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",0,-1,1,128,131,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,"[-3.6441473960876465, -8.561023712158203, -10....","[-5.722349643707275, -11.100050926208496, -10...."
22323,xquad_1187,उस आयतन के लिए प्रासंगिक क्रॉस-सेक्शनल क्षेत्र...,किसी वस्तु के आयतन में क्रॉस सेक्शन क्षेत्र की...,दबाव की शर्तें,118,hindi,उस आयतन के लिए प्रास,1,"[0, 12820, 89773, 287, 34889, 41420, 421, 4761...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",0,-1,1,66,68,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,"[-4.3488993644714355, -8.246306419372559, -9.5...","[-6.398498058319092, -12.459280014038086, -11...."
22324,xquad_1188,उस आयतन के लिए प्रासंगिक क्रॉस-सेक्शनल क्षेत्र...,सामान्य ताकतों से क्या जुड़ा है?,दबाव की शर्तें,118,hindi,उस आयतन के लिए प्रास,1,"[0, 38338, 217186, 1302, 646, 6004, 158371, 10...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, None, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",0,-1,1,51,53,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,"[-3.942087411880493, -8.996224403381348, -9.38...","[-6.257936477661133, -11.771550178527832, -11...."


In [14]:
from typing import List, Tuple

def get_context_part(
    offset_mapping: List[Tuple[int, int]], context: str
) -> str:
    s = 1_000_000_000
    e = 0
    for (offs, offe) in offset_mapping:
        if offs == -1:
            continue
        s = min(offs, s)
        e = max(offe, e)
    context_part = context[s:e]
    return s, e, context_part

def get_part_start_end_logit_score(start_char_index, end_char_index, offset_mapping, start_logit, end_logit):
    start_index = 0
    while offset_mapping[start_index][0] == -1:
        start_index += 1
    while offset_mapping[start_index][0] != -1 and offset_mapping[start_index][0] <= start_char_index:
        start_index += 1
    start_index -= 1
    end_index = len(offset_mapping) - 1
    while offset_mapping[end_index][1] == -1:
        end_index -= 1
    while offset_mapping[end_index][1] != -1 and offset_mapping[end_index][1] >= end_char_index:
        end_index -= 1
    end_index += 1
    score = start_logit[start_index] + end_logit[end_index]
    return score

In [15]:
# %debug
import json

for i, row in tqdm(prep_df.iterrows(), total=len(prep_df)):
    context_part_start_char_index, _, context_part = get_context_part(row["offset_mapping"], row["context"])
    char_index_score_dict = {}
    start_char_index = context_part.find(row["answer_text"])
    while start_char_index >= 0:
        score = get_part_start_end_logit_score(
            context_part_start_char_index+start_char_index,
            context_part_start_char_index+start_char_index+len(row["answer_text"]),
            row["offset_mapping"],
            row["start_logits"],
            row["end_logits"]
        )
        char_index_score_dict[context_part_start_char_index+start_char_index] = score

        context_part_start_char_index += (start_char_index+len(row["answer_text"]))
        context_part = context_part[start_char_index+len(row["answer_text"]):]
        start_char_index = context_part.find(row["answer_text"])
    prep_df.loc[i, "char_index_score_dict"] = json.dumps(char_index_score_dict)

  0%|          | 0/22326 [00:00<?, ?it/s]

In [16]:
prep_df.head(100)[["id", "question", "answer_text", "answer_start", "language", "overflowing_batch_id", "is_contain_answer_text", "char_index_score_dict"]]

Unnamed: 0,id,question,answer_text,answer_start,language,overflowing_batch_id,is_contain_answer_text,char_index_score_dict
0,903deec17,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil,0,1,"{""53"": 11.993648052215576}"
1,903deec17,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil,1,0,{}
2,903deec17,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil,2,0,{}
3,903deec17,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil,3,0,{}
4,903deec17,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil,4,0,{}
5,903deec17,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil,5,0,{}
6,d9841668c,காளிதாசன் எங்கு பிறந்தார்?,காசுமீரில்,2358,tamil,0,0,{}
7,d9841668c,காளிதாசன் எங்கு பிறந்தார்?,காசுமீரில்,2358,tamil,1,0,{}
8,d9841668c,காளிதாசன் எங்கு பிறந்தார்?,காசுமீரில்,2358,tamil,2,1,"{""2358"": 8.365867853164673, ""2531"": -14.385911..."
9,d9841668c,காளிதாசன் எங்கு பிறந்தார்?,காசுமீரில்,2358,tamil,3,0,"{""2653"": -8.351884007453918}"


In [17]:
def temp(x):
    y = json.loads(x)
    if len(y) == 0:
        return -100000000
    else:
        return max(y.values())
    
def temp2(x):
    y = json.loads(x)
    if len(y) == 0:
        return -100000000
    else:
        res_k = -1
        res_v = -1_000_000_000
        for k, v in y.items():
            if v > res_v:
                res_k = int(k)
                res_v = v
        return res_k
    
prep_df["max_score"] = prep_df.char_index_score_dict.apply(lambda x: temp(x))
prep_df["max_index"] = prep_df.char_index_score_dict.apply(lambda x: temp2(x))
prep_df

Unnamed: 0,id,context,question,answer_text,answer_start,language,top20_context,answer_text_count,input_ids,token_type_ids,attention_mask,sequence_ids,offset_mapping,overflowing_batch_id,duplicated_elems_num_with,part_answer_text_count,start_position,end_position,segmentation_position,is_contain_answer_text,start_logits,end_logits,char_index_score_dict,max_score,max_index
0,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil,ஒரு சாதாரண வளர்ந்த ம,1,"[0, 69535, 81049, 37368, 153264, 12095, 52989,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",0,-1,1,27,27,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,"[-6.871591091156006, -7.66694974899292, -9.131...","[-8.961920738220215, -12.13699722290039, -11.7...","{""53"": 11.993648052215576}",1.199365e+01,53
1,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil,ஒரு சாதாரண வளர்ந்த ம,1,"[0, 69535, 81049, 37368, 153264, 12095, 52989,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",1,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,"[-6.353556156158447, -7.573686122894287, -8.77...","[-8.663551330566406, -12.235248565673828, -11....",{},-1.000000e+08,-100000000
2,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil,ஒரு சாதாரண வளர்ந்த ம,1,"[0, 69535, 81049, 37368, 153264, 12095, 52989,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",2,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,"[-5.727337837219238, -7.6956095695495605, -8.7...","[-7.9614105224609375, -12.012073516845703, -11...",{},-1.000000e+08,-100000000
3,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil,ஒரு சாதாரண வளர்ந்த ம,1,"[0, 69535, 81049, 37368, 153264, 12095, 52989,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",3,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,"[-6.6011643409729, -7.867605209350586, -8.9840...","[-8.839470863342285, -12.1362886428833, -11.83...",{},-1.000000e+08,-100000000
4,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil,ஒரு சாதாரண வளர்ந்த ம,1,"[0, 69535, 81049, 37368, 153264, 12095, 52989,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",4,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,"[-6.303467273712158, -7.26361608505249, -8.978...","[-8.5467529296875, -12.05208969116211, -11.755...",{},-1.000000e+08,-100000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22321,xquad_1185,विद्युत आवेश के परिवर्तन की समय दर के रूप में ...,इलेक्ट्रोस्टैटिक और चुंबकीय बल के योग के रूप क...,इलेक्ट्रोस्टैटिक बल,328,hindi,विद्युत आवेश के परिव,1,"[0, 234186, 2284, 17433, 3045, 71683, 871, 369...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",0,-1,1,116,121,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,"[-6.719956398010254, -8.365612983703613, -10.4...","[-8.62995719909668, -12.595864295959473, -11.3...","{""328"": -9.274200439453125}",-9.274200e+00,328
22322,xquad_1186,उस आयतन के लिए प्रासंगिक क्रॉस-सेक्शनल क्षेत्र...,संरचनाओं में तनाव का कारण क्या बनता है?,तनाव टेंसर,343,hindi,उस आयतन के लिए प्रास,1,"[0, 120018, 11846, 421, 129558, 641, 6701, 600...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",0,-1,1,128,131,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,"[-3.6441473960876465, -8.561023712158203, -10....","[-5.722349643707275, -11.100050926208496, -10....","{""343"": 6.70177435874939}",6.701774e+00,343
22323,xquad_1187,उस आयतन के लिए प्रासंगिक क्रॉस-सेक्शनल क्षेत्र...,किसी वस्तु के आयतन में क्रॉस सेक्शन क्षेत्र की...,दबाव की शर्तें,118,hindi,उस आयतन के लिए प्रास,1,"[0, 12820, 89773, 287, 34889, 41420, 421, 4761...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",0,-1,1,66,68,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,"[-4.3488993644714355, -8.246306419372559, -9.5...","[-6.398498058319092, -12.459280014038086, -11....","{""118"": -8.283782720565796}",-8.283783e+00,118
22324,xquad_1188,उस आयतन के लिए प्रासंगिक क्रॉस-सेक्शनल क्षेत्र...,सामान्य ताकतों से क्या जुड़ा है?,दबाव की शर्तें,118,hindi,उस आयतन के लिए प्रास,1,"[0, 38338, 217186, 1302, 646, 6004, 158371, 10...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, None, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",0,-1,1,51,53,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,"[-3.942087411880493, -8.996224403381348, -9.38...","[-6.257936477661133, -11.771550178527832, -11....","{""118"": 1.8046899139881134}",1.804690e+00,118


In [16]:
prep_df.query("max_score < 0").max_score.sort_values()

1       -1.000000e+08
10182   -1.000000e+08
10183   -1.000000e+08
10186   -1.000000e+08
10187   -1.000000e+08
             ...     
17632   -2.779844e-02
22114   -1.381356e-02
15482   -1.350516e-02
12421   -8.942664e-03
5153    -7.085534e-03
Name: max_score, Length: 14419, dtype: float64

In [50]:
prep_df.groupby("id").max_score.max().sort_values().reset_index().head(900).tail(200)

Unnamed: 0,id,max_score
700,xquad_332,-1.451184
701,mlqa_hindi_4740,-1.45012
702,mlqa_hindi_4153,-1.447523
703,mlqa_hindi_2325,-1.441979
704,fd235557c,-1.415291
705,mlqa_hindi_546,-1.406173
706,mlqa_hindi_4588,-1.390697
707,mlqa_hindi_3557,-1.328216
708,33bf6b51c,-1.318875
709,4ca677224,-1.310635


In [56]:
prep_df.query("id == '1431f3af8'")

Unnamed: 0,id,context,question,answer_text,answer_start,language,top20_context,answer_text_count,input_ids,token_type_ids,attention_mask,sequence_ids,offset_mapping,overflowing_batch_id,duplicated_elems_num_with,part_answer_text_count,start_position,end_position,segmentation_position,is_contain_answer_text,start_logits,end_logits,char_index_score_dict,max_score,max_index,is_original
4948,1431f3af8,"தஞ்சைப் பிரகதீசுவரர் கோயில் என்றும், தஞ்சைப் ப...",தஞ்சாவூர் மாவட்டத்தின் மிகப்பெரிய கோயில் எது?,பிரகதீசுவரர் கோயில்,8,tamil,தஞ்சைப் பிரகதீசுவரர்,1,"[0, 8396, 32035, 7667, 3937, 36989, 72960, 216...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",0,-1,1,20,27,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,"[-5.921926021575928, -8.851225852966309, -10.0...","[-9.13615608215332, -12.774297714233398, -11.8...","{""8"": -0.474261999130249}",-0.474262,8,True
4949,1431f3af8,"தஞ்சைப் பிரகதீசுவரர் கோயில் என்றும், தஞ்சைப் ப...",தஞ்சாவூர் மாவட்டத்தின் மிகப்பெரிய கோயில் எது?,பிரகதீசுவரர் கோயில்,8,tamil,தஞ்சைப் பிரகதீசுவரர்,1,"[0, 8396, 32035, 7667, 3937, 36989, 72960, 216...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",1,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,"[-7.113161563873291, -8.894224166870117, -10.0...","[-9.615821838378906, -12.748430252075195, -11....",{},-100000000.0,-100000000,True
4950,1431f3af8,"தஞ்சைப் பிரகதீசுவரர் கோயில் என்றும், தஞ்சைப் ப...",தஞ்சாவூர் மாவட்டத்தின் மிகப்பெரிய கோயில் எது?,பிரகதீசுவரர் கோயில்,8,tamil,தஞ்சைப் பிரகதீசுவரர்,1,"[0, 8396, 32035, 7667, 3937, 36989, 72960, 216...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",2,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,"[-7.136600017547607, -8.96052074432373, -9.904...","[-9.58531379699707, -12.638053894042969, -11.5...",{},-100000000.0,-100000000,True
4951,1431f3af8,"தஞ்சைப் பிரகதீசுவரர் கோயில் என்றும், தஞ்சைப் ப...",தஞ்சாவூர் மாவட்டத்தின் மிகப்பெரிய கோயில் எது?,பிரகதீசுவரர் கோயில்,8,tamil,தஞ்சைப் பிரகதீசுவரர்,1,"[0, 8396, 32035, 7667, 3937, 36989, 72960, 216...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",3,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,"[-6.701624870300293, -9.02721881866455, -9.852...","[-9.404024124145508, -12.388155937194824, -11....",{},-100000000.0,-100000000,True
4952,1431f3af8,"தஞ்சைப் பிரகதீசுவரர் கோயில் என்றும், தஞ்சைப் ப...",தஞ்சாவூர் மாவட்டத்தின் மிகப்பெரிய கோயில் எது?,பிரகதீசுவரர் கோயில்,8,tamil,தஞ்சைப் பிரகதீசுவரர்,1,"[0, 8396, 32035, 7667, 3937, 36989, 72960, 216...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",4,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,"[-6.8931355476379395, -9.467649459838867, -10....","[-9.542146682739258, -12.438034057617188, -11....",{},-100000000.0,-100000000,True
4953,1431f3af8,"தஞ்சைப் பிரகதீசுவரர் கோயில் என்றும், தஞ்சைப் ப...",தஞ்சாவூர் மாவட்டத்தின் மிகப்பெரிய கோயில் எது?,பிரகதீசுவரர் கோயில்,8,tamil,தஞ்சைப் பிரகதீசுவரர்,1,"[0, 8396, 32035, 7667, 3937, 36989, 72960, 216...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",5,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,"[-7.54492712020874, -9.281617164611816, -10.03...","[-9.99003791809082, -12.497489929199219, -11.6...",{},-100000000.0,-100000000,True
4954,1431f3af8,"தஞ்சைப் பிரகதீசுவரர் கோயில் என்றும், தஞ்சைப் ப...",தஞ்சாவூர் மாவட்டத்தின் மிகப்பெரிய கோயில் எது?,பிரகதீசுவரர் கோயில்,8,tamil,தஞ்சைப் பிரகதீசுவரர்,1,"[0, 8396, 32035, 7667, 3937, 36989, 72960, 216...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",6,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,"[-7.14237117767334, -9.191513061523438, -10.01...","[-9.58694076538086, -12.534422874450684, -11.6...",{},-100000000.0,-100000000,True
4955,1431f3af8,"தஞ்சைப் பிரகதீசுவரர் கோயில் என்றும், தஞ்சைப் ப...",தஞ்சாவூர் மாவட்டத்தின் மிகப்பெரிய கோயில் எது?,பிரகதீசுவரர் கோயில்,8,tamil,தஞ்சைப் பிரகதீசுவரர்,1,"[0, 8396, 32035, 7667, 3937, 36989, 72960, 216...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",7,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,"[-7.266315460205078, -8.969326972961426, -9.98...","[-9.63282585144043, -12.521053314208984, -11.7...",{},-100000000.0,-100000000,True
4956,1431f3af8,"தஞ்சைப் பிரகதீசுவரர் கோயில் என்றும், தஞ்சைப் ப...",தஞ்சாவூர் மாவட்டத்தின் மிகப்பெரிய கோயில் எது?,பிரகதீசுவரர் கோயில்,8,tamil,தஞ்சைப் பிரகதீசுவரர்,1,"[0, 8396, 32035, 7667, 3937, 36989, 72960, 216...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",8,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,"[-6.477435111999512, -9.033400535583496, -10.0...","[-8.954018592834473, -12.546662330627441, -11....",{},-100000000.0,-100000000,True
4957,1431f3af8,"தஞ்சைப் பிரகதீசுவரர் கோயில் என்றும், தஞ்சைப் ப...",தஞ்சாவூர் மாவட்டத்தின் மிகப்பெரிய கோயில் எது?,பிரகதீசுவரர் கோயில்,8,tamil,தஞ்சைப் பிரகதீசுவரர்,1,"[0, 8396, 32035, 7667, 3937, 36989, 72960, 216...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",9,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,"[-7.067080020904541, -9.026237487792969, -10.0...","[-9.450307846069336, -12.549145698547363, -11....",{},-100000000.0,-100000000,True


In [58]:
row = prep_df.loc[4948]
print(row["is_contain_answer_text"])
print(row["question"])
print(row["answer_text"])
print(row["char_index_score_dict"])
print(row["offset_mapping"])
print(row["context"][0:1357])

1
தஞ்சாவூர் மாவட்டத்தின் மிகப்பெரிய கோயில் எது?
பிரகதீசுவரர் கோயில் 
{"8": -0.474261999130249}
[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (0, 1), (1, 4), (4, 7), (7, 11), (11, 12), (12, 14), (14, 16), (16, 18), (18, 20), (20, 27), (27, 35), (35, 36), (36, 38), (38, 41), (41, 44), (44, 50), (50, 57), (57, 60), (60, 63), (63, 70), (70, 72), (72, 79), (79, 81), (81, 84), (84, 87), (87, 92), (92, 93), (93, 97), (97, 100), (100, 107), (107, 110), (110, 113), (113, 115), (115, 118), (118, 121), (121, 128), (128, 130), (130, 138), (138, 143), (143, 150), (150, 152), (152, 155), (155, 157), (157, 160), (160, 161), (161, 162), (162, 164), (164, 170), (170, 173), (173, 176), (176, 179), (179, 182), (182, 184), (184, 187), (187, 188), (188, 190), (190, 194), (194, 198), (198, 202), (202, 204), (204, 205), (205, 208), (208, 211), (211, 214), (214, 216), (216, 222), (222, 

In [25]:
row["start_logits"]

array([  6.08315992,  -6.98116255,  -8.22501183,  -9.89751434,
        -7.30931473,  -9.82466316,  -8.08366489,  -9.06968117,
       -10.16252327,  -8.91719151,  -7.5209651 ,  -8.38506699,
        -8.21641445,  -8.85834312,  -9.4388895 ,  -9.32982254,
        -6.80536509,  -5.55888987,  -3.89713144,  -3.75928688,
        -2.27144718,  -5.44230604,  -4.99740744,  -6.66793633,
        -0.76670164,  -4.2261405 ,  -4.19980717,  -4.37232447,
        -7.47990942,  -6.68271875,  -8.54318619,  -8.03356838,
        -8.67862701,  -9.19072819,  -8.96863556,  -7.18331575,
        -8.57827759,  -8.72879314,  -7.10748148,  -7.09515524,
       -10.07513142,  -8.08981991,  -8.87723541,  -7.47184229,
        -8.83721542,  -9.44092751,  -9.15033913,  -9.03614426,
        -9.73720837,  -8.59174156,  -9.14120293,  -9.01697063,
        -7.80488348,  -8.84934902,  -7.65068865,  -8.67582035,
        -9.63269615,  -9.33382607,  -9.18977737,  -9.46857548,
        -8.47351646,  -8.09776306,  -8.7864809 ,  -7.98

In [179]:
row = prep_df.loc[10227]
print(row["is_contain_answer_text"])
print(row["question"])
print(row["answer_text"])
print(row["char_index_score_dict"])
print(row["offset_mapping"])
print(row["context"][5014:6244])

1
एंड्रयू हीथ लेजर ने सबसे पहले किस फिल्म में काम किया था?
ब्लैकरोच
{"5792": 0.13439003378152847}
[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (5014, 5017), (5017, 5020), (5020, 5023), (5023, 5029), (5029, 5033), (5033, 5040), (5040, 5042), (5042, 5043), (5043, 5045), (5045, 5048), (5048, 5054), (5054, 5055), (5055, 5057), (5057, 5067), (5067, 5070), (5070, 5073), (5073, 5074), (5074, 5077), (5077, 5079), (5079, 5080), (5080, 5083), (5083, 5086), (5086, 5090), (5090, 5095), (5095, 5096), (5096, 5097), (5097, 5101), (5101, 5105), (5105, 5109), (5109, 5113), (5114, 5120), (5122, 5127), (5127, 5130), (5130, 5134), (5135, 5138), (5138, 5143), (5143, 5146), (5146, 5150), (5150, 5154), (5154, 5161), (5161, 5167), (5167, 5170), (5170, 5178), (5178, 5183), (5183, 5186), (5186, 5190), (5190, 5193), (5193, 5195), (5195, 5198), (5198, 5204), (

In [176]:
row = prep_df.loc[5443]
print(row["is_contain_answer_text"])
print(row["question"])
print(row["answer_text"])
print(row["char_index_score_dict"])
print(row["offset_mapping"])
print(row["context"][14461:15614])

1
ताम्र एवं टीन के मिश्रधातु को क्या कहते हैं ?
कांसा
{"15541": -6.322589874267578}
[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (14461, 14464), (14464, 14465), (14465, 14472), (14472, 14477), (14477, 14481), (14481, 14484), (14484, 14487), (14487, 14489), (14489, 14491), (14491, 14494), (14494, 14496), (14496, 14497), (14497, 14499), (14499, 14501), (14501, 14505), (14505, 14509), (14509, 14513), (14513, 14515), (14515, 14518), (14518, 14523), (14523, 14528), (14528, 14532), (14532, 14533), (14533, 14538), (14538, 14541), (14541, 14543), (14543, 14547), (14547, 14550), (14550, 14556), (14556, 14559), (14559, 14563), (14563, 14567), (14567, 14570), (14570, 14575), (14575, 14576), (14576, 14580), (14580, 14582), (14582, 14587), (14587, 14593), (14593, 14596), (14596, 14599), (14599, 14605), (14605, 14608), (14608, 14609), (14609, 14612), (14612, 14613),

In [145]:
row["context"][4872-5:4872+len("कुनैन")]

'े ही कुनैन'

In [147]:
str(" हो। ")

' हो। '

In [128]:
row = prep_df.loc[5191]
print(row["question"])
print(row["answer_text"])
print(row["char_index_score_dict"])
print(row["offset_mapping"])
print(row["context"][0:1635])

எந்த ஆண்டில் இந்திய தேசிய காங்கிரஸ் நிறுவப்பட்டது?
1885
{"191": 12.250142097473145, "903": 9.944947481155396, "1320": 3.9632880091667175}
[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (0, 6), (6, 12), (12, 22), (22, 24), (24, 31), (31, 32), (32, 39), (39, 48), (48, 57), (57, 58), (58, 60), (60, 61), (61, 63), (63, 67), (67, 68), (68, 70), (70, 76), (76, 83), (83, 93), (93, 95), (95, 96), (96, 97), (97, 105), (105, 112), (112, 122), (122, 123), (123, 131), (131, 134), (134, 136), (136, 137), (137, 149), (149, 153), (153, 155), (155, 157), (157, 160), (160, 168), (168, 174), (174, 179), (179, 184), (184, 186), (186, 189), (189, 190), (190, 195), (195, 197), (197, 204), (204, 210), (210, 212), (212, 215), (215, 219), (219, 220), (220, 227), (227, 235), (235, 241), (241, 245), (245, 251), (251, 257), (257, 258), (258, 260), (260, 266), (266, 268), (268, 269), (269, 271), (271, 274), (274, 278), (278, 282),

In [132]:
row = prep_df.loc[5196]
print(row["question"])
print(row["answer_text"])
print(row["char_index_score_dict"])
print(row["offset_mapping"])
print(row["context"][1124:2539])

எந்த ஆண்டில் இந்திய தேசிய காங்கிரஸ் நிறுவப்பட்டது?
1885
{"6400": 7.515032052993774}
[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (4839, 4840), (4841, 4846), (4846, 4847), (4847, 4849), (4849, 4856), (4856, 4860), (4860, 4863), (4863, 4866), (4866, 4867), (4867, 4871), (4871, 4877), (4879, 4885), (4885, 4887), (4888, 4891), (4891, 4895), (4895, 4896), (4896, 4899), (4899, 4903), (4903, 4906), (4906, 4908), (4908, 4911), (4911, 4916), (4916, 4922), (4922, 4925), (4925, 4930), (4930, 4934), (4934, 4940), (4940, 4945), (4945, 4949), (4949, 4958), (4958, 4959), (4959, 4966), (4966, 4968), (4968, 4970), (4970, 4972), (4972, 4985), (4985, 4993), (4993, 4998), (4998, 5000), (5000, 5005), (5005, 5006), (5006, 5008), (5008, 5015), (5015, 5019), (5019, 5022), (5022, 5026), (5026, 5029), (5029, 5031), (5031, 5037), (5037, 5041), (5041, 5048), (5048, 5052), (5052, 5053), (5053, 5057), (5057, 5063), (5063, 5067), (

In [107]:
row = prep_df.loc[67]
print(row["offset_mapping"])
print(row["context"][:1317])

[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (0, 5), (5, 9), (9, 13), (13, 15), (15, 22), (22, 28), (28, 30), (30, 32), (32, 34), (34, 35), (35, 42), (42, 52), (52, 56), (56, 61), (61, 62), (62, 65), (65, 67), (67, 70), (70, 74), (74, 81), (81, 82), (82, 87), (87, 89), (89, 91), (91, 95), (95, 96), (96, 101), (101, 112), (112, 113), (113, 115), (115, 117), (117, 120), (120, 124), (124, 125), (125, 127), (127, 128), (128, 133), (133, 136), (136, 142), (142, 149), (149, 155), (155, 160), (160, 164), (164, 165), (165, 167), (167, 169), (169, 171), (171, 172), (172, 177), (177, 180), (180, 182), (182, 190), (190, 200), (200, 205), (205, 210), (210, 215), (215, 219), (219, 224), (224, 227), (227, 233), (233, 240), (240, 246), (246, 248), (248, 250), (250, 252), (252, 256), (256, 257), (257, 259), (259, 262), (262, 263), (263, 265), (265, 266), (266, 269), (269, 271), (271, 272), (272, 2

In [109]:
row = prep_df.loc[68]
print(row["offset_mapping"])
print(row["context"][878:2221])

[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (878, 880), (880, 885), (885, 886), (886, 889), (889, 891), (891, 893), (893, 895), (895, 896), (896, 899), (899, 901), (901, 908), (908, 911), (911, 914), (914, 917), (917, 919), (919, 921), (921, 926), (926, 932), (932, 935), (935, 936), (936, 939), (939, 941), (941, 945), (945, 953), (953, 954), (954, 957), (957, 961), (961, 963), (963, 966), (966, 967), (967, 968), (968, 971), (971, 975), (975, 978), (978, 979), (979, 982), (982, 986), (986, 990), (990, 991), (991, 996), (996, 998), (998, 1001), (1001, 1005), (1005, 1010), (1010, 1015), (1015, 1016), (1016, 1022), (1022, 1026), (1026, 1032), (1032, 1037), (1037, 1044), (1044, 1049), (1049, 1058), (1058, 1059), (1059, 1060), (1060, 1062), (1062, 1064), (1064, 1069), (1069, 1074), (1074, 1076), (1076, 1082), (1082, 1088), (1088, 1093), (1093, 1095), (1095, 1101), (1101, 1105), (1105, 1

In [114]:
row = prep_df.loc[47]
print(row["question"])
print(row["answer_text"])
print(row["char_index_score_dict"])
print(row["offset_mapping"])
print(row["context"][3473:4761])

திரிதடையங்களைப் பயன்படுத்திய முதல் நிறுவனம் எது?
IBM
{"4171": 3.6412532925605774, "4208": 5.769239664077759, "4352": -5.552682399749756, "4468": -9.261614799499512}
[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (3473, 3476), (3476, 3482), (3482, 3486), (3486, 3488), (3488, 3493), (3493, 3497), (3497, 3501), (3501, 3505), (3505, 3510), (3510, 3511), (3511, 3512), (3512, 3516), (3516, 3517), (3517, 3520), (3520, 3524), (3524, 3528), (3528, 3532), (3532, 3533), (3533, 3535), (3535, 3538), (3538, 3539), (3539, 3543), (3543, 3546), (3547, 3551), (3551, 3558), (3558, 3559), (3559, 3572), (3572, 3573), (3573, 3577), (3577, 3580), (3580, 3584), (3584, 3587), (3587, 3588), (3588, 3592), (3592, 3593), (3593, 3596), (3596, 3598), (3598, 3600), (3600, 3602), (3602, 3604), (3604, 3605), (3605, 3607), (3607, 3611), (3611, 3613), (3613, 3620), (3620, 3622), (3622, 3625), (3625, 3627), (3627, 3629)

In [119]:
row = prep_df.loc[14184]
print(row["question"])
print(row["answer_text"])
print(row["char_index_score_dict"])
print(row["offset_mapping"])
print(row["context"][817:4761])

लीजा रे की पहली फिल्म का नाम क्या था?
नेताजी
{"2017": 14.952022552490234}
[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (817, 818), (818, 821), (821, 824), (824, 829), (829, 832), (832, 834), (834, 837), (837, 841), (841, 843), (843, 848), (848, 852), (852, 856), (856, 861), (861, 865), (865, 869), (869, 874), (874, 882), (882, 883), (883, 888), (888, 889), (889, 894), (894, 897), (897, 901), (901, 904), (904, 906), (906, 909), (909, 912), (912, 914), (914, 916), (916, 919), (919, 924), (924, 927), (927, 931), (931, 934), (934, 940), (940, 945), (945, 950), (950, 954), (954, 955), (955, 958), (958, 963), (963, 967), (967, 970), (970, 980), (980, 986), (986, 989), (989, 995), (995, 998), (998, 1003), (1003, 1007), (1007, 1010), (1010, 1013), (1013, 1016), (1016, 1019), (1019, 1026), (1026, 1029), (1029, 1032), (1032, 1037), (1037, 1041), (1041, 1044), (1044, 1045), (1045, 1048), (1048, 1052), 

In [18]:
prep_df["is_original"] = ~prep_df["id"].str.contains("^xquad_|^mlqa_").values
prep_df.is_original.value_counts()

True     14188
False     8138
Name: is_original, dtype: int64

In [19]:
with open("../data/preprocessed/e072_e073_prep_df.pkl", "wb") as fout:
    pickle.dump(prep_df, fout)

## prep df として作ってしまおう。
 - 1 text batch 内では argmax を使う
 - 0 より小さい場合は無視する
 - 1 id 内に一つもない場合は以下の 2 通りを扱う
     - argmax を使う
     - 無視する

In [20]:
with open("../data/preprocessed/e072_e073_prep_df.pkl", "rb") as fin:
    prep_df = pickle.load(fin)
prep_df.head()

Unnamed: 0,id,context,question,answer_text,answer_start,language,top20_context,answer_text_count,input_ids,token_type_ids,attention_mask,sequence_ids,offset_mapping,overflowing_batch_id,duplicated_elems_num_with,part_answer_text_count,start_position,end_position,segmentation_position,is_contain_answer_text,start_logits,end_logits,char_index_score_dict,max_score,max_index,is_original
0,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil,ஒரு சாதாரண வளர்ந்த ம,1,"[0, 69535, 81049, 37368, 153264, 12095, 52989,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",0,-1,1,27,27,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,"[-6.871591091156006, -7.66694974899292, -9.131...","[-8.961920738220215, -12.13699722290039, -11.7...","{""53"": 11.993648052215576}",11.99365,53,True
1,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil,ஒரு சாதாரண வளர்ந்த ம,1,"[0, 69535, 81049, 37368, 153264, 12095, 52989,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",1,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,"[-6.353556156158447, -7.573686122894287, -8.77...","[-8.663551330566406, -12.235248565673828, -11....",{},-100000000.0,-100000000,True
2,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil,ஒரு சாதாரண வளர்ந்த ம,1,"[0, 69535, 81049, 37368, 153264, 12095, 52989,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",2,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,"[-5.727337837219238, -7.6956095695495605, -8.7...","[-7.9614105224609375, -12.012073516845703, -11...",{},-100000000.0,-100000000,True
3,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil,ஒரு சாதாரண வளர்ந்த ம,1,"[0, 69535, 81049, 37368, 153264, 12095, 52989,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",3,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,"[-6.6011643409729, -7.867605209350586, -8.9840...","[-8.839470863342285, -12.1362886428833, -11.83...",{},-100000000.0,-100000000,True
4,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil,ஒரு சாதாரண வளர்ந்த ம,1,"[0, 69535, 81049, 37368, 153264, 12095, 52989,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",4,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,"[-6.303467273712158, -7.26361608505249, -8.978...","[-8.5467529296875, -12.05208969116211, -11.755...",{},-100000000.0,-100000000,True


In [21]:
def get_max_start_char_index(char_index_score_dict, thresh):
    max_start_char_index = -1  # default -1
    largest_score = -100000000000
    for k, v in char_index_score_dict.items():
        if v < thresh:
            continue
        if v > largest_score:
            max_start_char_index = int(k)
            largest_score = v
    return max_start_char_index

def calc_start_end_position(start_char_index, end_char_index, offset_mapping):
    start_index = 0
    while offset_mapping[start_index][0] == -1:
        start_index += 1
    while offset_mapping[start_index][0] != -1 and offset_mapping[start_index][0] <= start_char_index:
        start_index += 1
    start_index -= 1
    end_index = len(offset_mapping) - 1
    while offset_mapping[end_index][1] == -1:
        end_index -= 1
    while offset_mapping[end_index][1] != -1 and offset_mapping[end_index][1] >= end_char_index:
        end_index -= 1
    end_index += 1
    return start_index, end_index

In [24]:
from copy import deepcopy

THRESH = -1.
# THRESH = 0.

reses = []
for i, row in tqdm(prep_df.iterrows(), total=len(prep_df)):
    copied_row = deepcopy(row)
    
    start_char_index = get_max_start_char_index(json.loads(copied_row["char_index_score_dict"]), thresh=THRESH)
    copied_row["answer_start"] = start_char_index
    end_char_index = start_char_index + len(row["answer_text"])
    
    offset_mapping = copied_row["offset_mapping"]
    if start_char_index < 0:
        copied_row["is_contain_answer_text"] = False
        copied_row["start_position"] = 0
        copied_row["end_position"] = 0
        copied_row["segmentation_position"] = [1] + [0] * (len(offset_mapping) - 1)
    else:
        copied_row["is_contain_answer_text"] = True
        start_position, end_position = calc_start_end_position(start_char_index, end_char_index, offset_mapping)
        copied_row["start_position"] = start_position
        copied_row["end_position"] = end_position
        copied_row["segmentation_position"] = [1 if start_position <= i and i <= end_position else 0 for i in range(len(offset_mapping))]
    reses.append(copied_row)

res_prep_df = pd.DataFrame([res_row.to_dict() for res_row in reses])
res_prep_df

  0%|          | 0/22326 [00:00<?, ?it/s]

Unnamed: 0,id,context,question,answer_text,answer_start,language,top20_context,answer_text_count,input_ids,token_type_ids,attention_mask,sequence_ids,offset_mapping,overflowing_batch_id,duplicated_elems_num_with,part_answer_text_count,start_position,end_position,segmentation_position,is_contain_answer_text,start_logits,end_logits,char_index_score_dict,max_score,max_index,is_original
0,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil,ஒரு சாதாரண வளர்ந்த ம,1,"[0, 69535, 81049, 37368, 153264, 12095, 52989,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",0,-1,1,27,27,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",True,"[-6.871591091156006, -7.66694974899292, -9.131...","[-8.961920738220215, -12.13699722290039, -11.7...","{""53"": 11.993648052215576}",1.199365e+01,53,True
1,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,-1,tamil,ஒரு சாதாரண வளர்ந்த ம,1,"[0, 69535, 81049, 37368, 153264, 12095, 52989,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",1,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",False,"[-6.353556156158447, -7.573686122894287, -8.77...","[-8.663551330566406, -12.235248565673828, -11....",{},-1.000000e+08,-100000000,True
2,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,-1,tamil,ஒரு சாதாரண வளர்ந்த ம,1,"[0, 69535, 81049, 37368, 153264, 12095, 52989,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",2,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",False,"[-5.727337837219238, -7.6956095695495605, -8.7...","[-7.9614105224609375, -12.012073516845703, -11...",{},-1.000000e+08,-100000000,True
3,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,-1,tamil,ஒரு சாதாரண வளர்ந்த ம,1,"[0, 69535, 81049, 37368, 153264, 12095, 52989,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",3,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",False,"[-6.6011643409729, -7.867605209350586, -8.9840...","[-8.839470863342285, -12.1362886428833, -11.83...",{},-1.000000e+08,-100000000,True
4,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,-1,tamil,ஒரு சாதாரண வளர்ந்த ம,1,"[0, 69535, 81049, 37368, 153264, 12095, 52989,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",4,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",False,"[-6.303467273712158, -7.26361608505249, -8.978...","[-8.5467529296875, -12.05208969116211, -11.755...",{},-1.000000e+08,-100000000,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22321,xquad_1185,विद्युत आवेश के परिवर्तन की समय दर के रूप में ...,इलेक्ट्रोस्टैटिक और चुंबकीय बल के योग के रूप क...,इलेक्ट्रोस्टैटिक बल,-1,hindi,विद्युत आवेश के परिव,1,"[0, 234186, 2284, 17433, 3045, 71683, 871, 369...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",0,-1,1,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",False,"[-6.719956398010254, -8.365612983703613, -10.4...","[-8.62995719909668, -12.595864295959473, -11.3...","{""328"": -9.274200439453125}",-9.274200e+00,328,False
22322,xquad_1186,उस आयतन के लिए प्रासंगिक क्रॉस-सेक्शनल क्षेत्र...,संरचनाओं में तनाव का कारण क्या बनता है?,तनाव टेंसर,343,hindi,उस आयतन के लिए प्रास,1,"[0, 120018, 11846, 421, 129558, 641, 6701, 600...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",0,-1,1,128,131,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",True,"[-3.6441473960876465, -8.561023712158203, -10....","[-5.722349643707275, -11.100050926208496, -10....","{""343"": 6.70177435874939}",6.701774e+00,343,False
22323,xquad_1187,उस आयतन के लिए प्रासंगिक क्रॉस-सेक्शनल क्षेत्र...,किसी वस्तु के आयतन में क्रॉस सेक्शन क्षेत्र की...,दबाव की शर्तें,-1,hindi,उस आयतन के लिए प्रास,1,"[0, 12820, 89773, 287, 34889, 41420, 421, 4761...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",0,-1,1,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",False,"[-4.3488993644714355, -8.246306419372559, -9.5...","[-6.398498058319092, -12.459280014038086, -11....","{""118"": -8.283782720565796}",-8.283783e+00,118,False
22324,xquad_1188,उस आयतन के लिए प्रासंगिक क्रॉस-सेक्शनल क्षेत्र...,सामान्य ताकतों से क्या जुड़ा है?,दबाव की शर्तें,118,hindi,उस आयतन के लिए प्रास,1,"[0, 38338, 217186, 1302, 646, 6004, 158371, 10...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, None, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",0,-1,1,51,53,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",True,"[-3.942087411880493, -8.996224403381348, -9.38...","[-6.257936477661133, -11.771550178527832, -11....","{""118"": 1.8046899139881134}",1.804690e+00,118,False


In [21]:
res_prep_df.head(100)

Unnamed: 0,id,context,question,answer_text,answer_start,language,top20_context,answer_text_count,input_ids,token_type_ids,attention_mask,sequence_ids,offset_mapping,overflowing_batch_id,duplicated_elems_num_with,part_answer_text_count,start_position,end_position,segmentation_position,is_contain_answer_text,start_logits,end_logits,char_index_score_dict,max_score,max_index,is_original
0,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil,ஒரு சாதாரண வளர்ந்த ம,1,"[0, 69535, 81049, 37368, 153264, 12095, 52989,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",0,-1,1,27,27,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",True,"[3.544491767883301, -7.573987007141113, -9.143...","[1.5507718324661255, -11.260123252868652, -11....","{""53"": 12.708677768707275}",12.70868,53,True
1,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,-1,tamil,ஒரு சாதாரண வளர்ந்த ம,1,"[0, 69535, 81049, 37368, 153264, 12095, 52989,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",1,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",False,"[5.329960823059082, -8.377323150634766, -9.419...","[3.5831198692321777, -11.672513961791992, -11....",{},-100000000.0,-100000000,True
2,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,-1,tamil,ஒரு சாதாரண வளர்ந்த ம,1,"[0, 69535, 81049, 37368, 153264, 12095, 52989,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",2,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",False,"[5.505677223205566, -8.656808853149414, -9.415...","[3.6840555667877197, -11.733305931091309, -11....",{},-100000000.0,-100000000,True
3,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,-1,tamil,ஒரு சாதாரண வளர்ந்த ம,1,"[0, 69535, 81049, 37368, 153264, 12095, 52989,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",3,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",False,"[5.036523342132568, -8.482492446899414, -9.488...","[3.365272283554077, -11.83347225189209, -11.64...",{},-100000000.0,-100000000,True
4,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,-1,tamil,ஒரு சாதாரண வளர்ந்த ம,1,"[0, 69535, 81049, 37368, 153264, 12095, 52989,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",4,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",False,"[5.30822229385376, -7.854609966278076, -9.2968...","[3.545640468597412, -11.60487174987793, -11.43...",{},-100000000.0,-100000000,True
5,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,-1,tamil,ஒரு சாதாரண வளர்ந்த ம,1,"[0, 69535, 81049, 37368, 153264, 12095, 52989,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",5,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",False,"[5.214144229888916, -7.426871299743652, -9.247...","[3.461940288543701, -11.324159622192383, -11.4...",{},-100000000.0,-100000000,True
6,d9841668c,காளிதாசன் (தேவநாகரி: कालिदास) சமஸ்கிருத இலக்கி...,காளிதாசன் எங்கு பிறந்தார்?,காசுமீரில்,-1,tamil,காளிதாசன் (தேவநாகரி:,3,"[0, 12751, 14622, 9654, 52881, 101514, 3686, 6...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, None, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",0,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",False,"[4.828103542327881, -9.154858589172363, -10.40...","[2.961247205734253, -12.344258308410645, -11.7...",{},-100000000.0,-100000000,True
7,d9841668c,காளிதாசன் (தேவநாகரி: कालिदास) சமஸ்கிருத இலக்கி...,காளிதாசன் எங்கு பிறந்தார்?,காசுமீரில்,-1,tamil,காளிதாசன் (தேவநாகரி:,3,"[0, 12751, 14622, 9654, 52881, 101514, 3686, 6...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, None, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",1,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",False,"[5.452599048614502, -9.358258247375488, -10.39...","[3.599719524383545, -12.596125602722168, -11.9...",{},-100000000.0,-100000000,True
8,d9841668c,காளிதாசன் (தேவநாகரி: कालिदास) சமஸ்கிருத இலக்கி...,காளிதாசன் எங்கு பிறந்தார்?,காசுமீரில்,-1,tamil,காளிதாசன் (தேவநாகரி:,3,"[0, 12751, 14622, 9654, 52881, 101514, 3686, 6...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, None, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",2,-1,3,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",False,"[5.315063953399658, -9.38316535949707, -10.414...","[3.4895339012145996, -12.656550407409668, -11....","{""2358"": -4.3671029806137085, ""2531"": -18.7519...",-4.367103,2358,True
9,d9841668c,காளிதாசன் (தேவநாகரி: कालिदास) சமஸ்கிருத இலக்கி...,காளிதாசன் எங்கு பிறந்தார்?,காசுமீரில்,-1,tamil,காளிதாசன் (தேவநாகரி:,3,"[0, 12751, 14622, 9654, 52881, 101514, 3686, 6...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, None, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",3,-1,1,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",False,"[5.25612735748291, -9.473699569702148, -10.509...","[3.4387102127075195, -12.419685363769531, -11....","{""2653"": -21.670771598815918}",-21.67077,2653,True


In [6]:
res_prep_df.query("is_original and part_answer_text_count > 2")

Unnamed: 0,id,context,question,answer_text,answer_start,language,top20_context,answer_text_count,input_ids,token_type_ids,attention_mask,sequence_ids,offset_mapping,overflowing_batch_id,duplicated_elems_num_with,part_answer_text_count,start_position,end_position,segmentation_position,is_contain_answer_text,start_logits,end_logits,char_index_score_dict,max_score,max_index,is_original
8,d9841668c,காளிதாசன் (தேவநாகரி: कालिदास) சமஸ்கிருத இலக்கி...,காளிதாசன் எங்கு பிறந்தார்?,காசுமீரில்,-1,tamil,காளிதாசன் (தேவநாகரி:,3,"[0, 12751, 14622, 9654, 52881, 101514, 3686, 6...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, None, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",2,-1,3,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",True,"[5.315063953399658, -9.38316535949707, -10.414...","[3.4895339012145996, -12.656550407409668, -11....","{""2358"": -4.3671029806137085, ""2531"": -18.7519...",-4.367103,2358,True
21,41660850a,"குழந்தையின் அழுகையை நிறுத்தவும், தூங்க வைக்கவ...",தமிழ்நாட்டில் குழந்தைகளை தூங்க வைக்க பாடும் பா...,தாலாட்டு,68,tamil,குழந்தையின் அழுகையை,27,"[0, 9708, 129729, 8938, 49936, 6819, 46018, 69...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",0,-1,9,38,40,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",False,"[3.464341640472412, -8.958144187927246, -9.736...","[1.1139813661575317, -11.528419494628906, -11....","{""68"": 13.48068380355835, ""96"": -1.32560634613...",13.480684,68,True
22,41660850a,"குழந்தையின் அழுகையை நிறுத்தவும், தூங்க வைக்கவ...",தமிழ்நாட்டில் குழந்தைகளை தூங்க வைக்க பாடும் பா...,தாலாட்டு,-1,tamil,குழந்தையின் அழுகையை,27,"[0, 9708, 129729, 8938, 49936, 6819, 46018, 69...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",1,-1,8,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",True,"[5.2053704261779785, -8.421758651733398, -9.62...","[3.4265682697296143, -11.456436157226562, -11....","{""1159"": -15.133623600006104, ""1300"": -17.6087...",-15.133624,1159,True
23,41660850a,"குழந்தையின் அழுகையை நிறுத்தவும், தூங்க வைக்கவ...",தமிழ்நாட்டில் குழந்தைகளை தூங்க வைக்க பாடும் பா...,தாலாட்டு,-1,tamil,குழந்தையின் அழுகையை,27,"[0, 9708, 129729, 8938, 49936, 6819, 46018, 69...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",2,-1,7,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",True,"[5.150566577911377, -8.491168975830078, -9.624...","[3.3948814868927, -11.475215911865234, -11.694...","{""1823"": -10.659355640411377, ""1964"": -19.1777...",-10.659356,1823,True
26,41660850a,"குழந்தையின் அழுகையை நிறுத்தவும், தூங்க வைக்கவ...",தமிழ்நாட்டில் குழந்தைகளை தூங்க வைக்க பாடும் பா...,தாலாட்டு,-1,tamil,குழந்தையின் அழுகையை,27,"[0, 9708, 129729, 8938, 49936, 6819, 46018, 69...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",5,-1,6,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",True,"[5.011085033416748, -8.686507225036621, -9.749...","[3.235741376876831, -11.558892250061035, -11.7...","{""4453"": -21.258883476257324, ""4631"": -19.2430...",-19.175587,4957,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13912,79b26e4d8,विश्वनाथ प्रताप सिंह भारत गणराज्य के आठवें प्र...,भारत के आठवें प्रधानमंत्री कौन थे?,विश्वनाथ प्रताप सिंह,-1,hindi,विश्वनाथ प्रताप सिंह,15,"[0, 3946, 287, 35175, 92463, 42775, 49329, 115...","[None, 0, 0, 0, 0, 0, 0, 0, 0, None, None, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",1,-1,6,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",True,"[4.599503040313721, -9.27696704864502, -10.137...","[3.1250529289245605, -10.93246078491211, -11.1...","{""1123"": -15.680858135223389, ""1461"": -14.5796...",-11.609926,2043,True
13917,79b26e4d8,विश्वनाथ प्रताप सिंह भारत गणराज्य के आठवें प्र...,भारत के आठवें प्रधानमंत्री कौन थे?,विश्वनाथ प्रताप सिंह,-1,hindi,विश्वनाथ प्रताप सिंह,15,"[0, 3946, 287, 35175, 92463, 42775, 49329, 115...","[None, 0, 0, 0, 0, 0, 0, 0, 0, None, None, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",6,-1,3,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",True,"[4.843196392059326, -8.57470417022705, -9.6302...","[3.193453788757324, -11.093741416931152, -10.9...","{""6028"": -11.795443058013916, ""6716"": -5.60433...",-5.604335,6716,True
14004,dcd67189c,रग्बी लीग विश्व कप राष्ट्रीय टीमों द्वारा खेला...,२०१७ में 'रग्बी लीग विश्व कप किस देश ने जीता था?,ऑस्ट्रेलिया,818,hindi,रग्बी लीग विश्व कप र,8,"[0, 95742, 421, 242, 1393, 20071, 12688, 19252...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",0,-1,5,253,253,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",False,"[0.6515074372291565, -9.002923965454102, -10.3...","[-1.3335314989089966, -10.113743782043457, -10...","{""289"": -9.433437824249268, ""436"": -0.78814081...",17.140829,818,True
14005,dcd67189c,रग्बी लीग विश्व कप राष्ट्रीय टीमों द्वारा खेला...,२०१७ में 'रग्बी लीग विश्व कप किस देश ने जीता था?,ऑस्ट्रेलिया,1695,hindi,रग्बी लीग विश्व कप र,8,"[0, 95742, 421, 242, 1393, 20071, 12688, 19252...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",1,-1,3,236,236,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",False,"[4.035395622253418, -9.715600967407227, -9.940...","[2.1665444374084473, -11.016805648803711, -10....","{""1604"": -7.072896480560303, ""1695"": 3.1218366...",3.121837,1695,True


In [7]:
prep_df.loc[14132:14132]

Unnamed: 0,id,context,question,answer_text,answer_start,language,top20_context,answer_text_count,input_ids,token_type_ids,attention_mask,sequence_ids,offset_mapping,overflowing_batch_id,duplicated_elems_num_with,part_answer_text_count,start_position,end_position,segmentation_position,is_contain_answer_text,start_logits,end_logits,char_index_score_dict,max_score,max_index,is_original
14132,26f356026,स्वामी निगमानन्द परमहंस (18 अगस्त 1880 - 29 नव...,स्वामी निगमानन्द परमहंस के तन्त्र गुरु कौन थे?,बामाक्षेपा,2691,hindi,स्वामी निगमानन्द परम,4,"[0, 122585, 103689, 144569, 968, 110013, 25784...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",3,-1,4,65,68,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,"[4.716207504272461, -8.972646713256836, -7.285...","[3.159884214401245, -11.494466781616211, -10.3...","{""2691"": -11.605035305023193, ""3293"": -5.28046...",-5.280469,3293,True


In [34]:
res_prep_df.loc[22219]["char_index_score_dict"]

'{"0": -0.03943347930908203, "47": -1.991658091545105, "158": 10.660694360733032, "219": -4.200814723968506, "309": -7.003398895263672, "452": -8.943944931030273, "507": -9.516861915588379, "552": -10.651233911514282}'

In [25]:
with open(f"../data/dataset/val_pseudo/e072_e073_{THRESH:.3f}_res_prep_df.pkl", "wb") as fout:
    pickle.dump(res_prep_df, fout)

In [8]:
# THRESH = -1.
# with open(f"../data/dataset/val_pseudo/e070_{THRESH:.3f}_res_prep_df.pkl", "rb") as fin:
#     res_prep_df = pickle.load(fin)