In [10]:
import sys
import pandas as pd
import json
import numpy as np
import torch
from torch import Tensor
import pickle
from tqdm.auto import tqdm

from matplotlib import pyplot as plt

sys.path.append("../")
from src.log import myLogger
from src.repository.data_repository import DataRepository
from src.checkpoint.checkpoint import Checkpoint
from src.metrics.jaccard import jaccard

pd.set_option("display.max_columns", 300)
pd.set_option("display.max_rows", 600)

%load_ext autoreload
%autoreload 2

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))
%config InlineBackend.figure_formats = {'png', 'retina'}

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
logger = myLogger("../logs/temp.log", exp_id="e000", wdb_prj_id="temp", exp_config=None, use_wdb=False) 
dr = DataRepository(logger=logger, local_root_path="..")

2021-11-13 23:35:46,628 log.py               31   [INFO] [__init__] skip wandb init 


In [25]:
import pickle

train_prep_df = dr.load_preprocessed_df(
          dataset_name="train",                                                                                                                                                                                                                                   
          class_name="BaselineKernelPreprocessorV1",
          tokenizer_name="XLMRobertaTokenizerFast",
          max_length=400,
          pad_on_right=True,
          stride=135,
          split=False,
          lstrip=False,
          use_language_as_question=False,
          add_overflowing_batch_id=False,
    )

mlqa_hindi_prep_df = dr.load_preprocessed_df(
          dataset_name="mlqa_hindi",                                                                                                                                                                                                                                   
          class_name="BaselineKernelPreprocessorV1",
          tokenizer_name="XLMRobertaTokenizerFast",
          max_length=400,
          pad_on_right=True,
          stride=135,
          split=False,
          lstrip=False,
          use_language_as_question=False,
          add_overflowing_batch_id=False,
    )

xquad_prep_df = dr.load_preprocessed_df(
          dataset_name="xquad",                                                                                                                                                                                                                                   
          class_name="BaselineKernelPreprocessorV1",
          tokenizer_name="XLMRobertaTokenizerFast",
          max_length=400,
          pad_on_right=True,
          stride=135,
          split=False,
          lstrip=False,
          use_language_as_question=False,
          add_overflowing_batch_id=False,
    )

prep_df = pd.concat([train_prep_df, mlqa_hindi_prep_df, xquad_prep_df], axis=0).reset_index(drop=True)
prep_df

2021-11-13 22:09:19,344 data_repository.py   253  [INFO] [load_preprocessed_df] loading data/preprocessed/train_BaselineKernelPreprocessorV1_XLMRobertaTokenizerFast_400_True_135_False_False_False_False.pkl ... 
2021-11-13 22:09:21,960 data_repository.py   260  [INFO] [load_preprocessed_df] done. 
2021-11-13 22:09:21,964 data_repository.py   253  [INFO] [load_preprocessed_df] loading data/preprocessed/mlqa_hindi_BaselineKernelPreprocessorV1_XLMRobertaTokenizerFast_400_True_135_False_False_False_False.pkl ... 
2021-11-13 22:09:22,961 data_repository.py   260  [INFO] [load_preprocessed_df] done. 
2021-11-13 22:09:22,964 data_repository.py   253  [INFO] [load_preprocessed_df] loading data/preprocessed/xquad_BaselineKernelPreprocessorV1_XLMRobertaTokenizerFast_400_True_135_False_False_False_False.pkl ... 
2021-11-13 22:09:23,161 data_repository.py   260  [INFO] [load_preprocessed_df] done. 


Unnamed: 0,id,context,question,answer_text,answer_start,language,top20_context,answer_text_count,input_ids,token_type_ids,attention_mask,sequence_ids,offset_mapping,overflowing_batch_id,duplicated_elems_num_with,part_answer_text_count,start_position,end_position,segmentation_position,is_contain_answer_text
0,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil,ஒரு சாதாரண வளர்ந்த ம,1,"[0, 69535, 81049, 37368, 153264, 12095, 52989,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",0,-1,1,27,27,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1
1,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil,ஒரு சாதாரண வளர்ந்த ம,1,"[0, 69535, 81049, 37368, 153264, 12095, 52989,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",1,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
2,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil,ஒரு சாதாரண வளர்ந்த ம,1,"[0, 69535, 81049, 37368, 153264, 12095, 52989,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",2,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
3,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil,ஒரு சாதாரண வளர்ந்த ம,1,"[0, 69535, 81049, 37368, 153264, 12095, 52989,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",3,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
4,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil,ஒரு சாதாரண வளர்ந்த ம,1,"[0, 69535, 81049, 37368, 153264, 12095, 52989,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",4,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22321,xquad_1185,विद्युत आवेश के परिवर्तन की समय दर के रूप में ...,इलेक्ट्रोस्टैटिक और चुंबकीय बल के योग के रूप क...,इलेक्ट्रोस्टैटिक बल,328,hindi,विद्युत आवेश के परिव,1,"[0, 234186, 2284, 17433, 3045, 71683, 871, 369...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",0,-1,1,116,121,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1
22322,xquad_1186,उस आयतन के लिए प्रासंगिक क्रॉस-सेक्शनल क्षेत्र...,संरचनाओं में तनाव का कारण क्या बनता है?,तनाव टेंसर,343,hindi,उस आयतन के लिए प्रास,1,"[0, 120018, 11846, 421, 129558, 641, 6701, 600...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",0,-1,1,128,131,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1
22323,xquad_1187,उस आयतन के लिए प्रासंगिक क्रॉस-सेक्शनल क्षेत्र...,किसी वस्तु के आयतन में क्रॉस सेक्शन क्षेत्र की...,दबाव की शर्तें,118,hindi,उस आयतन के लिए प्रास,1,"[0, 12820, 89773, 287, 34889, 41420, 421, 4761...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",0,-1,1,66,68,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1
22324,xquad_1188,उस आयतन के लिए प्रासंगिक क्रॉस-सेक्शनल क्षेत्र...,सामान्य ताकतों से क्या जुड़ा है?,दबाव की शर्तें,118,hindi,उस आयतन के लिए प्रास,1,"[0, 38338, 217186, 1302, 646, 6004, 158371, 10...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, None, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",0,-1,1,51,53,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1


#### load values

In [4]:
import gc
from collections import defaultdict
from tqdm.auto import tqdm

target_exp_ids = ["e070"]

checkpoints_info = {}

for target_exp_id in target_exp_ids:
    checkpoints_info[target_exp_id] = defaultdict(list)
    best_checkpoints = dr.best_checkpoint_filepaths(target_exp_id)
    for best_checkpoint in tqdm(best_checkpoints):
        exp_fold_checkpoint = dr.load_checkpoint_from_filepath(filepath_from_root=best_checkpoint, load_from_gcs=True, rm_local_after_load=True) 
        del exp_fold_checkpoint.model_state_dict
        del exp_fold_checkpoint.scheduler_state_dict
        del exp_fold_checkpoint.optimizer_state_dict
        gc.collect()
        checkpoints_info[target_exp_id]["val_ids"].extend(exp_fold_checkpoint.val_ids)
        checkpoints_info[target_exp_id]["val_start_logits"].extend(exp_fold_checkpoint.val_start_logits)
        checkpoints_info[target_exp_id]["val_end_logits"].extend(exp_fold_checkpoint.val_end_logits)
        del exp_fold_checkpoint
        gc.collect()

  0%|          | 0/5 [00:00<?, ?it/s]

2021-11-13 21:58:35,332 repository.py        167  [INFO] [__download_from_gcs] downloading data/checkpoint/e070/best_checkpoint/0_1_1.9333_0.6728.pkl from gs://kaggle-chaii-2021/../data/checkpoint/e070/best_checkpoint/0_1_1.9333_0.6728.pkl 
2021-11-13 21:59:29,633 repository.py        176  [INFO] [__download_from_gcs] download done. 
2021-11-13 21:59:43,668 repository.py        167  [INFO] [__download_from_gcs] downloading data/checkpoint/e070/best_checkpoint/1_1_1.6149_0.6885.pkl from gs://kaggle-chaii-2021/../data/checkpoint/e070/best_checkpoint/1_1_1.6149_0.6885.pkl 
2021-11-13 22:00:38,110 repository.py        176  [INFO] [__download_from_gcs] download done. 
2021-11-13 22:00:48,611 repository.py        167  [INFO] [__download_from_gcs] downloading data/checkpoint/e070/best_checkpoint/2_2_2.0890_0.6831.pkl from gs://kaggle-chaii-2021/../data/checkpoint/e070/best_checkpoint/2_2_2.0890_0.6831.pkl 
2021-11-13 22:01:42,131 repository.py        176  [INFO] [__download_from_gcs] download

#### merge values

In [26]:
checkpoints_info.keys()

dict_keys(['e070'])

In [27]:
# checkpoints_info["e049"]["val_ids"] == checkpoints_info["e059"]["val_ids"]

In [28]:
for target_exp_id in checkpoints_info.keys():
    checkpoints_info[target_exp_id]["val_start_logits"] = np.asarray(checkpoints_info[target_exp_id]["val_start_logits"])
    checkpoints_info[target_exp_id]["val_end_logits"] = np.asarray(checkpoints_info[target_exp_id]["val_end_logits"])
    
# for target_exp_id in checkpoints_info.keys():
#     checkpoints_info[target_exp_id]["start_logits"] = [np.asarray(logit) for logit in checkpoints_info[target_exp_id]["start_logits"]]
#     checkpoints_info[target_exp_id]["end_logits"] = [np.asarray(logit) for logit in checkpoints_info[target_exp_id]["end_logits"]]

In [29]:
ensembled_checkpoint_info = {
    "val_ids": None,
    "val_start_logits": None,
    "val_end_logits": None,
}

for target_exp_id in checkpoints_info.keys():
    if ensembled_checkpoint_info["val_ids"] is None:
        ensembled_checkpoint_info["val_ids"] = checkpoints_info[target_exp_id]["val_ids"]
    if ensembled_checkpoint_info["val_start_logits"] is None:
        ensembled_checkpoint_info["val_start_logits"] = checkpoints_info[target_exp_id]["val_start_logits"]
    else:
        ensembled_checkpoint_info["val_start_logits"] += checkpoints_info[target_exp_id]["val_start_logits"]
    if ensembled_checkpoint_info["val_end_logits"] is None:
        ensembled_checkpoint_info["val_end_logits"] = checkpoints_info[target_exp_id]["val_end_logits"]
    else:
        ensembled_checkpoint_info["val_end_logits"] += checkpoints_info[target_exp_id]["val_end_logits"]
        
        
ensembled_checkpoint_info["val_start_logits"] = [np.asarray(logit) for logit in checkpoints_info[target_exp_id]["val_start_logits"]]
ensembled_checkpoint_info["val_end_logits"] = [np.asarray(logit) for logit in checkpoints_info[target_exp_id]["val_end_logits"]]

In [30]:
checkpoint_df = pd.DataFrame()
checkpoint_df["id"] = ensembled_checkpoint_info["val_ids"]
checkpoint_df["start_logits"] = ensembled_checkpoint_info["val_start_logits"]
checkpoint_df["end_logits"] = ensembled_checkpoint_info["val_end_logits"]

checkpoint_df["overflowing_batch_id"] = None
bef_id = ""
overflowing_batch_id = 0
for i, row in checkpoint_df.iterrows():
    if str(row["id"]) != bef_id:
        overflowing_batch_id = 0
    checkpoint_df.loc[i, "overflowing_batch_id"] = overflowing_batch_id
    bef_id = row["id"]
    overflowing_batch_id += 1
checkpoint_df.head(10)

Unnamed: 0,id,start_logits,end_logits,overflowing_batch_id
0,903deec17,"[5.0701704025268555, -5.960593223571777, -7.40...","[2.313723087310791, -10.814351081848145, -10.9...",0
1,903deec17,"[6.620517730712891, -5.636728286743164, -7.391...","[4.0754289627075195, -10.411615371704102, -10....",1
2,903deec17,"[6.747103214263916, -5.517636299133301, -7.225...","[4.195484638214111, -10.406983375549316, -10.7...",2
3,903deec17,"[6.703310489654541, -5.5158371925354, -7.33090...","[4.214907646179199, -10.472651481628418, -10.7...",3
4,903deec17,"[6.8010969161987305, -4.386323928833008, -6.87...","[4.284533977508545, -10.148509979248047, -10.6...",4
5,903deec17,"[6.571201324462891, -4.953230857849121, -7.127...","[4.143421173095703, -10.245794296264648, -10.9...",5
6,29d154b56,"[4.954555511474609, -8.37631607055664, -8.8468...","[1.9546047449111938, -11.775449752807617, -10....",0
7,29d154b56,"[6.1438398361206055, -8.49733829498291, -8.985...","[3.496518850326538, -11.87124252319336, -10.95...",1
8,29d154b56,"[6.594204902648926, -7.593926429748535, -8.380...","[4.2898850440979, -11.591924667358398, -10.298...",2
9,29d154b56,"[6.514626502990723, -8.229415893554688, -8.700...","[4.196666240692139, -11.592212677001953, -10.5...",3


In [31]:
prep_df = prep_df.merge(checkpoint_df, on=["id", "overflowing_batch_id"], how="left")
prep_df

Unnamed: 0,id,context,question,answer_text,answer_start,language,top20_context,answer_text_count,input_ids,token_type_ids,attention_mask,sequence_ids,offset_mapping,overflowing_batch_id,duplicated_elems_num_with,part_answer_text_count,start_position,end_position,segmentation_position,is_contain_answer_text,start_logits,end_logits
0,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil,ஒரு சாதாரண வளர்ந்த ம,1,"[0, 69535, 81049, 37368, 153264, 12095, 52989,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",0,-1,1,27,27,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,"[5.0701704025268555, -5.960593223571777, -7.40...","[2.313723087310791, -10.814351081848145, -10.9..."
1,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil,ஒரு சாதாரண வளர்ந்த ம,1,"[0, 69535, 81049, 37368, 153264, 12095, 52989,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",1,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,"[6.620517730712891, -5.636728286743164, -7.391...","[4.0754289627075195, -10.411615371704102, -10...."
2,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil,ஒரு சாதாரண வளர்ந்த ம,1,"[0, 69535, 81049, 37368, 153264, 12095, 52989,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",2,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,"[6.747103214263916, -5.517636299133301, -7.225...","[4.195484638214111, -10.406983375549316, -10.7..."
3,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil,ஒரு சாதாரண வளர்ந்த ம,1,"[0, 69535, 81049, 37368, 153264, 12095, 52989,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",3,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,"[6.703310489654541, -5.5158371925354, -7.33090...","[4.214907646179199, -10.472651481628418, -10.7..."
4,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil,ஒரு சாதாரண வளர்ந்த ம,1,"[0, 69535, 81049, 37368, 153264, 12095, 52989,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",4,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,"[6.8010969161987305, -4.386323928833008, -6.87...","[4.284533977508545, -10.148509979248047, -10.6..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22321,xquad_1185,विद्युत आवेश के परिवर्तन की समय दर के रूप में ...,इलेक्ट्रोस्टैटिक और चुंबकीय बल के योग के रूप क...,इलेक्ट्रोस्टैटिक बल,328,hindi,विद्युत आवेश के परिव,1,"[0, 234186, 2284, 17433, 3045, 71683, 871, 369...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",0,-1,1,116,121,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,"[2.214484453201294, -2.326050043106079, -7.288...","[-0.6118912696838379, -8.885482788085938, -8.1..."
22322,xquad_1186,उस आयतन के लिए प्रासंगिक क्रॉस-सेक्शनल क्षेत्र...,संरचनाओं में तनाव का कारण क्या बनता है?,तनाव टेंसर,343,hindi,उस आयतन के लिए प्रास,1,"[0, 120018, 11846, 421, 129558, 641, 6701, 600...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",0,-1,1,128,131,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,"[3.1197197437286377, -4.8114519119262695, -9.3...","[-0.6221887469291687, -9.734058380126953, -9.8..."
22323,xquad_1187,उस आयतन के लिए प्रासंगिक क्रॉस-सेक्शनल क्षेत्र...,किसी वस्तु के आयतन में क्रॉस सेक्शन क्षेत्र की...,दबाव की शर्तें,118,hindi,उस आयतन के लिए प्रास,1,"[0, 12820, 89773, 287, 34889, 41420, 421, 4761...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",0,-1,1,66,68,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,"[1.590507984161377, -7.263235092163086, -8.281...","[-2.085319757461548, -11.977270126342773, -10...."
22324,xquad_1188,उस आयतन के लिए प्रासंगिक क्रॉस-सेक्शनल क्षेत्र...,सामान्य ताकतों से क्या जुड़ा है?,दबाव की शर्तें,118,hindi,उस आयतन के लिए प्रास,1,"[0, 38338, 217186, 1302, 646, 6004, 158371, 10...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, None, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",0,-1,1,51,53,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,"[2.6658051013946533, -5.117407321929932, -6.50...","[-1.514367699623108, -10.561481475830078, -10...."


In [32]:
from typing import List, Tuple

def get_context_part(
    offset_mapping: List[Tuple[int, int]], context: str
) -> str:
    s = 1_000_000_000
    e = 0
    for (offs, offe) in offset_mapping:
        if offs == -1:
            continue
        s = min(offs, s)
        e = max(offe, e)
    context_part = context[s:e]
    return s, e, context_part

def get_part_start_end_logit_score(start_char_index, end_char_index, offset_mapping, start_logit, end_logit):
    start_index = 0
    while offset_mapping[start_index][0] == -1:
        start_index += 1
    while offset_mapping[start_index][0] != -1 and offset_mapping[start_index][0] <= start_char_index:
        start_index += 1
    start_index -= 1
    end_index = len(offset_mapping) - 1
    while offset_mapping[end_index][1] == -1:
        end_index -= 1
    while offset_mapping[end_index][1] != -1 and offset_mapping[end_index][1] >= end_char_index:
        end_index -= 1
    end_index += 1
    score = start_logit[start_index] + end_logit[end_index]
    return score

In [33]:
# %debug
import json

for i, row in tqdm(prep_df.iterrows(), total=len(prep_df)):
    context_part_start_char_index, _, context_part = get_context_part(row["offset_mapping"], row["context"])
    char_index_score_dict = {}
    start_char_index = context_part.find(row["answer_text"])
    while start_char_index >= 0:
        score = get_part_start_end_logit_score(
            context_part_start_char_index+start_char_index,
            context_part_start_char_index+start_char_index+len(row["answer_text"]),
            row["offset_mapping"],
            row["start_logits"],
            row["end_logits"]
        )
        char_index_score_dict[context_part_start_char_index+start_char_index] = score

        context_part_start_char_index += (start_char_index+len(row["answer_text"]))
        context_part = context_part[start_char_index+len(row["answer_text"]):]
        start_char_index = context_part.find(row["answer_text"])
    prep_df.loc[i, "char_index_score_dict"] = json.dumps(char_index_score_dict)

  0%|          | 0/22326 [00:00<?, ?it/s]

In [34]:
prep_df.head(100)[["id", "question", "answer_text", "answer_start", "language", "overflowing_batch_id", "is_contain_answer_text", "char_index_score_dict"]]

Unnamed: 0,id,question,answer_text,answer_start,language,overflowing_batch_id,is_contain_answer_text,char_index_score_dict
0,903deec17,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil,0,1,"{""53"": 9.169572353363037}"
1,903deec17,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil,1,0,{}
2,903deec17,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil,2,0,{}
3,903deec17,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil,3,0,{}
4,903deec17,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil,4,0,{}
5,903deec17,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil,5,0,{}
6,d9841668c,காளிதாசன் எங்கு பிறந்தார்?,காசுமீரில்,2358,tamil,0,0,{}
7,d9841668c,காளிதாசன் எங்கு பிறந்தார்?,காசுமீரில்,2358,tamil,1,0,{}
8,d9841668c,காளிதாசன் எங்கு பிறந்தார்?,காசுமீரில்,2358,tamil,2,1,"{""2358"": 6.011011481285095, ""2531"": -1.0148405..."
9,d9841668c,காளிதாசன் எங்கு பிறந்தார்?,காசுமீரில்,2358,tamil,3,0,"{""2653"": -7.417436838150024}"


In [35]:
def temp(x):
    y = json.loads(x)
    if len(y) == 0:
        return -100000000
    else:
        return max(y.values())
    
def temp2(x):
    y = json.loads(x)
    if len(y) == 0:
        return -100000000
    else:
        res_k = -1
        res_v = -1_000_000_000
        for k, v in y.items():
            if v > res_v:
                res_k = int(k)
                res_v = v
        return res_k
    
prep_df["max_score"] = prep_df.char_index_score_dict.apply(lambda x: temp(x))
prep_df["max_index"] = prep_df.char_index_score_dict.apply(lambda x: temp2(x))
prep_df

Unnamed: 0,id,context,question,answer_text,answer_start,language,top20_context,answer_text_count,input_ids,token_type_ids,attention_mask,sequence_ids,offset_mapping,overflowing_batch_id,duplicated_elems_num_with,part_answer_text_count,start_position,end_position,segmentation_position,is_contain_answer_text,start_logits,end_logits,char_index_score_dict,max_score,max_index
0,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil,ஒரு சாதாரண வளர்ந்த ம,1,"[0, 69535, 81049, 37368, 153264, 12095, 52989,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",0,-1,1,27,27,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,"[5.0701704025268555, -5.960593223571777, -7.40...","[2.313723087310791, -10.814351081848145, -10.9...","{""53"": 9.169572353363037}",9.169572e+00,53
1,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil,ஒரு சாதாரண வளர்ந்த ம,1,"[0, 69535, 81049, 37368, 153264, 12095, 52989,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",1,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,"[6.620517730712891, -5.636728286743164, -7.391...","[4.0754289627075195, -10.411615371704102, -10....",{},-1.000000e+08,-100000000
2,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil,ஒரு சாதாரண வளர்ந்த ம,1,"[0, 69535, 81049, 37368, 153264, 12095, 52989,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",2,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,"[6.747103214263916, -5.517636299133301, -7.225...","[4.195484638214111, -10.406983375549316, -10.7...",{},-1.000000e+08,-100000000
3,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil,ஒரு சாதாரண வளர்ந்த ம,1,"[0, 69535, 81049, 37368, 153264, 12095, 52989,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",3,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,"[6.703310489654541, -5.5158371925354, -7.33090...","[4.214907646179199, -10.472651481628418, -10.7...",{},-1.000000e+08,-100000000
4,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil,ஒரு சாதாரண வளர்ந்த ம,1,"[0, 69535, 81049, 37368, 153264, 12095, 52989,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",4,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,"[6.8010969161987305, -4.386323928833008, -6.87...","[4.284533977508545, -10.148509979248047, -10.6...",{},-1.000000e+08,-100000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22321,xquad_1185,विद्युत आवेश के परिवर्तन की समय दर के रूप में ...,इलेक्ट्रोस्टैटिक और चुंबकीय बल के योग के रूप क...,इलेक्ट्रोस्टैटिक बल,328,hindi,विद्युत आवेश के परिव,1,"[0, 234186, 2284, 17433, 3045, 71683, 871, 369...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",0,-1,1,116,121,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,"[2.214484453201294, -2.326050043106079, -7.288...","[-0.6118912696838379, -8.885482788085938, -8.1...","{""328"": 4.255732536315918}",4.255733e+00,328
22322,xquad_1186,उस आयतन के लिए प्रासंगिक क्रॉस-सेक्शनल क्षेत्र...,संरचनाओं में तनाव का कारण क्या बनता है?,तनाव टेंसर,343,hindi,उस आयतन के लिए प्रास,1,"[0, 120018, 11846, 421, 129558, 641, 6701, 600...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",0,-1,1,128,131,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,"[3.1197197437286377, -4.8114519119262695, -9.3...","[-0.6221887469291687, -9.734058380126953, -9.8...","{""343"": 9.480067729949951}",9.480068e+00,343
22323,xquad_1187,उस आयतन के लिए प्रासंगिक क्रॉस-सेक्शनल क्षेत्र...,किसी वस्तु के आयतन में क्रॉस सेक्शन क्षेत्र की...,दबाव की शर्तें,118,hindi,उस आयतन के लिए प्रास,1,"[0, 12820, 89773, 287, 34889, 41420, 421, 4761...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",0,-1,1,66,68,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,"[1.590507984161377, -7.263235092163086, -8.281...","[-2.085319757461548, -11.977270126342773, -10....","{""118"": -0.5963906645774841}",-5.963907e-01,118
22324,xquad_1188,उस आयतन के लिए प्रासंगिक क्रॉस-सेक्शनल क्षेत्र...,सामान्य ताकतों से क्या जुड़ा है?,दबाव की शर्तें,118,hindi,उस आयतन के लिए प्रास,1,"[0, 38338, 217186, 1302, 646, 6004, 158371, 10...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, None, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",0,-1,1,51,53,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,"[2.6658051013946533, -5.117407321929932, -6.50...","[-1.514367699623108, -10.561481475830078, -10....","{""118"": 14.286912441253662}",1.428691e+01,118


In [36]:
prep_df.query("max_score < 0").max_score.sort_values()

1       -1.000000e+08
10188   -1.000000e+08
10189   -1.000000e+08
10190   -1.000000e+08
10191   -1.000000e+08
             ...     
1287    -1.407617e-02
7599    -1.351523e-02
8165    -1.203048e-02
19792   -8.420169e-03
18317   -1.250505e-03
Name: max_score, Length: 14206, dtype: float64

In [60]:
prep_df.groupby("id").max_score.max().sort_values().reset_index().head(600)

Unnamed: 0,id,max_score
0,mlqa_hindi_2354,-100000000.0
1,mlqa_hindi_562,-21.10788
2,mlqa_hindi_3241,-20.27896
3,33bf6b51c,-20.06819
4,ccd5473c0,-20.01588
5,mlqa_hindi_2317,-19.52712
6,mlqa_hindi_3444,-18.58867
7,mlqa_hindi_1204,-18.30945
8,xquad_753,-17.92348
9,mlqa_hindi_3908,-17.91267


In [26]:
prep_df.query("id == '915f023b0'")

Unnamed: 0,id,context,question,answer_text,answer_start,language,top20_context,answer_text_count,input_ids,token_type_ids,attention_mask,sequence_ids,offset_mapping,overflowing_batch_id,duplicated_elems_num_with,part_answer_text_count,start_position,end_position,segmentation_position,is_contain_answer_text,start_logits,end_logits,char_index_score_dict,max_score,max_index,is_original
8562,915f023b0,क्रिया योग की साधना करने वालों के द्वारा इसे ए...,क्रिया योग के संस्थापक कौन थे?,लाहिरी महाशय,143,hindi,क्रिया योग की साधना,7,"[0, 139058, 42194, 287, 178208, 49329, 11526, ...","[None, 0, 0, 0, 0, 0, 0, 0, None, None, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",0,-1,1,46,50,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,"[3.4175925254821777, -9.287590026855469, -9.82...","[1.1747922897338867, -11.198990821838379, -10....","{""143"": -1.5716326832771301}",-1.571633,143,True
8563,915f023b0,क्रिया योग की साधना करने वालों के द्वारा इसे ए...,क्रिया योग के संस्थापक कौन थे?,लाहिरी महाशय,143,hindi,क्रिया योग की साधना,7,"[0, 139058, 42194, 287, 178208, 49329, 11526, ...","[None, 0, 0, 0, 0, 0, 0, 0, None, None, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",1,-1,1,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,"[5.1523356437683105, -9.2976713180542, -9.6756...","[3.756939649581909, -10.71048641204834, -10.06...","{""1541"": -5.190345287322998}",-5.190345,1541,True
8564,915f023b0,क्रिया योग की साधना करने वालों के द्वारा इसे ए...,क्रिया योग के संस्थापक कौन थे?,लाहिरी महाशय,143,hindi,क्रिया योग की साधना,7,"[0, 139058, 42194, 287, 178208, 49329, 11526, ...","[None, 0, 0, 0, 0, 0, 0, 0, None, None, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",2,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,"[5.470852851867676, -9.1478910446167, -9.56985...","[3.883002281188965, -10.517669677734375, -9.78...",{},-100000000.0,-100000000,True
8565,915f023b0,क्रिया योग की साधना करने वालों के द्वारा इसे ए...,क्रिया योग के संस्थापक कौन थे?,लाहिरी महाशय,143,hindi,क्रिया योग की साधना,7,"[0, 139058, 42194, 287, 178208, 49329, 11526, ...","[None, 0, 0, 0, 0, 0, 0, 0, None, None, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",3,-1,2,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,"[5.736210823059082, -9.023700714111328, -9.458...","[4.162169933319092, -10.142075538635254, -9.69...","{""3923"": -7.963799476623535, ""4116"": -14.76287...",-7.963799,3923,True
8566,915f023b0,क्रिया योग की साधना करने वालों के द्वारा इसे ए...,क्रिया योग के संस्थापक कौन थे?,लाहिरी महाशय,143,hindi,क्रिया योग की साधना,7,"[0, 139058, 42194, 287, 178208, 49329, 11526, ...","[None, 0, 0, 0, 0, 0, 0, 0, None, None, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",4,-1,5,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,"[5.512853622436523, -9.200356483459473, -9.533...","[3.895507335662842, -10.453171730041504, -10.0...","{""3923"": -15.064462184906006, ""4116"": -15.4484...",-10.05311,4792,True
8567,915f023b0,क्रिया योग की साधना करने वालों के द्वारा इसे ए...,क्रिया योग के संस्थापक कौन थे?,लाहिरी महाशय,143,hindi,क्रिया योग की साधना,7,"[0, 139058, 42194, 287, 178208, 49329, 11526, ...","[None, 0, 0, 0, 0, 0, 0, 0, None, None, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",5,-1,3,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,"[4.148573398590088, -9.486236572265625, -9.835...","[2.419513702392578, -10.811320304870605, -10.3...","{""4562"": -4.252346515655518, ""4629"": -5.243105...",-1.745841,4792,True


In [28]:
row = prep_df.loc[8566]
print(row["is_contain_answer_text"])
print(row["question"])
print(row["answer_text"])
print(row["char_index_score_dict"])
print(row["offset_mapping"])
print(row["context"][3693:5050])

0
क्रिया योग के संस्थापक कौन थे?
लाहिरी महाशय
{"3923": -15.064462184906006, "4116": -15.44843578338623, "4562": -12.848931312561035, "4629": -12.067591190338135, "4792": -10.053106784820557}
[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (3693, 3694), (3694, 3695), (3695, 3699), (3699, 3704), (3704, 3705), (3705, 3710), (3710, 3712), (3712, 3714), (3714, 3719), (3719, 3722), (3722, 3725), (3725, 3731), (3731, 3732), (3732, 3737), (3737, 3739), (3739, 3740), (3740, 3742), (3742, 3746), (3746, 3750), (3750, 3752), (3752, 3753), (3753, 3756), (3756, 3758), (3758, 3760), (3760, 3762), (3762, 3765), (3765, 3770), (3770, 3773), (3773, 3776), (3776, 3777), (3777, 3779), (3779, 3785), (3785, 3788), (3788, 3792), (3792, 3795), (3795, 3800), (3800, 3803), (3803, 3806), (3806, 3808), (3808, 3812), (3812, 3816), (3816, 3819), (3819, 3826), (3826, 3828), (3828, 3832), (3832, 3835), (3835, 3839), (3839, 3840), (3840, 3841), (3841, 3845), (3845, 3

In [25]:
row["start_logits"]

array([  6.08315992,  -6.98116255,  -8.22501183,  -9.89751434,
        -7.30931473,  -9.82466316,  -8.08366489,  -9.06968117,
       -10.16252327,  -8.91719151,  -7.5209651 ,  -8.38506699,
        -8.21641445,  -8.85834312,  -9.4388895 ,  -9.32982254,
        -6.80536509,  -5.55888987,  -3.89713144,  -3.75928688,
        -2.27144718,  -5.44230604,  -4.99740744,  -6.66793633,
        -0.76670164,  -4.2261405 ,  -4.19980717,  -4.37232447,
        -7.47990942,  -6.68271875,  -8.54318619,  -8.03356838,
        -8.67862701,  -9.19072819,  -8.96863556,  -7.18331575,
        -8.57827759,  -8.72879314,  -7.10748148,  -7.09515524,
       -10.07513142,  -8.08981991,  -8.87723541,  -7.47184229,
        -8.83721542,  -9.44092751,  -9.15033913,  -9.03614426,
        -9.73720837,  -8.59174156,  -9.14120293,  -9.01697063,
        -7.80488348,  -8.84934902,  -7.65068865,  -8.67582035,
        -9.63269615,  -9.33382607,  -9.18977737,  -9.46857548,
        -8.47351646,  -8.09776306,  -8.7864809 ,  -7.98

In [179]:
row = prep_df.loc[10227]
print(row["is_contain_answer_text"])
print(row["question"])
print(row["answer_text"])
print(row["char_index_score_dict"])
print(row["offset_mapping"])
print(row["context"][5014:6244])

1
एंड्रयू हीथ लेजर ने सबसे पहले किस फिल्म में काम किया था?
ब्लैकरोच
{"5792": 0.13439003378152847}
[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (5014, 5017), (5017, 5020), (5020, 5023), (5023, 5029), (5029, 5033), (5033, 5040), (5040, 5042), (5042, 5043), (5043, 5045), (5045, 5048), (5048, 5054), (5054, 5055), (5055, 5057), (5057, 5067), (5067, 5070), (5070, 5073), (5073, 5074), (5074, 5077), (5077, 5079), (5079, 5080), (5080, 5083), (5083, 5086), (5086, 5090), (5090, 5095), (5095, 5096), (5096, 5097), (5097, 5101), (5101, 5105), (5105, 5109), (5109, 5113), (5114, 5120), (5122, 5127), (5127, 5130), (5130, 5134), (5135, 5138), (5138, 5143), (5143, 5146), (5146, 5150), (5150, 5154), (5154, 5161), (5161, 5167), (5167, 5170), (5170, 5178), (5178, 5183), (5183, 5186), (5186, 5190), (5190, 5193), (5193, 5195), (5195, 5198), (5198, 5204), (

In [176]:
row = prep_df.loc[5443]
print(row["is_contain_answer_text"])
print(row["question"])
print(row["answer_text"])
print(row["char_index_score_dict"])
print(row["offset_mapping"])
print(row["context"][14461:15614])

1
ताम्र एवं टीन के मिश्रधातु को क्या कहते हैं ?
कांसा
{"15541": -6.322589874267578}
[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (14461, 14464), (14464, 14465), (14465, 14472), (14472, 14477), (14477, 14481), (14481, 14484), (14484, 14487), (14487, 14489), (14489, 14491), (14491, 14494), (14494, 14496), (14496, 14497), (14497, 14499), (14499, 14501), (14501, 14505), (14505, 14509), (14509, 14513), (14513, 14515), (14515, 14518), (14518, 14523), (14523, 14528), (14528, 14532), (14532, 14533), (14533, 14538), (14538, 14541), (14541, 14543), (14543, 14547), (14547, 14550), (14550, 14556), (14556, 14559), (14559, 14563), (14563, 14567), (14567, 14570), (14570, 14575), (14575, 14576), (14576, 14580), (14580, 14582), (14582, 14587), (14587, 14593), (14593, 14596), (14596, 14599), (14599, 14605), (14605, 14608), (14608, 14609), (14609, 14612), (14612, 14613),

In [145]:
row["context"][4872-5:4872+len("कुनैन")]

'े ही कुनैन'

In [147]:
str(" हो। ")

' हो। '

In [128]:
row = prep_df.loc[5191]
print(row["question"])
print(row["answer_text"])
print(row["char_index_score_dict"])
print(row["offset_mapping"])
print(row["context"][0:1635])

எந்த ஆண்டில் இந்திய தேசிய காங்கிரஸ் நிறுவப்பட்டது?
1885
{"191": 12.250142097473145, "903": 9.944947481155396, "1320": 3.9632880091667175}
[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (0, 6), (6, 12), (12, 22), (22, 24), (24, 31), (31, 32), (32, 39), (39, 48), (48, 57), (57, 58), (58, 60), (60, 61), (61, 63), (63, 67), (67, 68), (68, 70), (70, 76), (76, 83), (83, 93), (93, 95), (95, 96), (96, 97), (97, 105), (105, 112), (112, 122), (122, 123), (123, 131), (131, 134), (134, 136), (136, 137), (137, 149), (149, 153), (153, 155), (155, 157), (157, 160), (160, 168), (168, 174), (174, 179), (179, 184), (184, 186), (186, 189), (189, 190), (190, 195), (195, 197), (197, 204), (204, 210), (210, 212), (212, 215), (215, 219), (219, 220), (220, 227), (227, 235), (235, 241), (241, 245), (245, 251), (251, 257), (257, 258), (258, 260), (260, 266), (266, 268), (268, 269), (269, 271), (271, 274), (274, 278), (278, 282),

In [132]:
row = prep_df.loc[5196]
print(row["question"])
print(row["answer_text"])
print(row["char_index_score_dict"])
print(row["offset_mapping"])
print(row["context"][1124:2539])

எந்த ஆண்டில் இந்திய தேசிய காங்கிரஸ் நிறுவப்பட்டது?
1885
{"6400": 7.515032052993774}
[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (4839, 4840), (4841, 4846), (4846, 4847), (4847, 4849), (4849, 4856), (4856, 4860), (4860, 4863), (4863, 4866), (4866, 4867), (4867, 4871), (4871, 4877), (4879, 4885), (4885, 4887), (4888, 4891), (4891, 4895), (4895, 4896), (4896, 4899), (4899, 4903), (4903, 4906), (4906, 4908), (4908, 4911), (4911, 4916), (4916, 4922), (4922, 4925), (4925, 4930), (4930, 4934), (4934, 4940), (4940, 4945), (4945, 4949), (4949, 4958), (4958, 4959), (4959, 4966), (4966, 4968), (4968, 4970), (4970, 4972), (4972, 4985), (4985, 4993), (4993, 4998), (4998, 5000), (5000, 5005), (5005, 5006), (5006, 5008), (5008, 5015), (5015, 5019), (5019, 5022), (5022, 5026), (5026, 5029), (5029, 5031), (5031, 5037), (5037, 5041), (5041, 5048), (5048, 5052), (5052, 5053), (5053, 5057), (5057, 5063), (5063, 5067), (

In [107]:
row = prep_df.loc[67]
print(row["offset_mapping"])
print(row["context"][:1317])

[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (0, 5), (5, 9), (9, 13), (13, 15), (15, 22), (22, 28), (28, 30), (30, 32), (32, 34), (34, 35), (35, 42), (42, 52), (52, 56), (56, 61), (61, 62), (62, 65), (65, 67), (67, 70), (70, 74), (74, 81), (81, 82), (82, 87), (87, 89), (89, 91), (91, 95), (95, 96), (96, 101), (101, 112), (112, 113), (113, 115), (115, 117), (117, 120), (120, 124), (124, 125), (125, 127), (127, 128), (128, 133), (133, 136), (136, 142), (142, 149), (149, 155), (155, 160), (160, 164), (164, 165), (165, 167), (167, 169), (169, 171), (171, 172), (172, 177), (177, 180), (180, 182), (182, 190), (190, 200), (200, 205), (205, 210), (210, 215), (215, 219), (219, 224), (224, 227), (227, 233), (233, 240), (240, 246), (246, 248), (248, 250), (250, 252), (252, 256), (256, 257), (257, 259), (259, 262), (262, 263), (263, 265), (265, 266), (266, 269), (269, 271), (271, 272), (272, 2

In [109]:
row = prep_df.loc[68]
print(row["offset_mapping"])
print(row["context"][878:2221])

[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (878, 880), (880, 885), (885, 886), (886, 889), (889, 891), (891, 893), (893, 895), (895, 896), (896, 899), (899, 901), (901, 908), (908, 911), (911, 914), (914, 917), (917, 919), (919, 921), (921, 926), (926, 932), (932, 935), (935, 936), (936, 939), (939, 941), (941, 945), (945, 953), (953, 954), (954, 957), (957, 961), (961, 963), (963, 966), (966, 967), (967, 968), (968, 971), (971, 975), (975, 978), (978, 979), (979, 982), (982, 986), (986, 990), (990, 991), (991, 996), (996, 998), (998, 1001), (1001, 1005), (1005, 1010), (1010, 1015), (1015, 1016), (1016, 1022), (1022, 1026), (1026, 1032), (1032, 1037), (1037, 1044), (1044, 1049), (1049, 1058), (1058, 1059), (1059, 1060), (1060, 1062), (1062, 1064), (1064, 1069), (1069, 1074), (1074, 1076), (1076, 1082), (1082, 1088), (1088, 1093), (1093, 1095), (1095, 1101), (1101, 1105), (1105, 1

In [114]:
row = prep_df.loc[47]
print(row["question"])
print(row["answer_text"])
print(row["char_index_score_dict"])
print(row["offset_mapping"])
print(row["context"][3473:4761])

திரிதடையங்களைப் பயன்படுத்திய முதல் நிறுவனம் எது?
IBM
{"4171": 3.6412532925605774, "4208": 5.769239664077759, "4352": -5.552682399749756, "4468": -9.261614799499512}
[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (3473, 3476), (3476, 3482), (3482, 3486), (3486, 3488), (3488, 3493), (3493, 3497), (3497, 3501), (3501, 3505), (3505, 3510), (3510, 3511), (3511, 3512), (3512, 3516), (3516, 3517), (3517, 3520), (3520, 3524), (3524, 3528), (3528, 3532), (3532, 3533), (3533, 3535), (3535, 3538), (3538, 3539), (3539, 3543), (3543, 3546), (3547, 3551), (3551, 3558), (3558, 3559), (3559, 3572), (3572, 3573), (3573, 3577), (3577, 3580), (3580, 3584), (3584, 3587), (3587, 3588), (3588, 3592), (3592, 3593), (3593, 3596), (3596, 3598), (3598, 3600), (3600, 3602), (3602, 3604), (3604, 3605), (3605, 3607), (3607, 3611), (3611, 3613), (3613, 3620), (3620, 3622), (3622, 3625), (3625, 3627), (3627, 3629)

In [119]:
row = prep_df.loc[14184]
print(row["question"])
print(row["answer_text"])
print(row["char_index_score_dict"])
print(row["offset_mapping"])
print(row["context"][817:4761])

लीजा रे की पहली फिल्म का नाम क्या था?
नेताजी
{"2017": 14.952022552490234}
[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (817, 818), (818, 821), (821, 824), (824, 829), (829, 832), (832, 834), (834, 837), (837, 841), (841, 843), (843, 848), (848, 852), (852, 856), (856, 861), (861, 865), (865, 869), (869, 874), (874, 882), (882, 883), (883, 888), (888, 889), (889, 894), (894, 897), (897, 901), (901, 904), (904, 906), (906, 909), (909, 912), (912, 914), (914, 916), (916, 919), (919, 924), (924, 927), (927, 931), (931, 934), (934, 940), (940, 945), (945, 950), (950, 954), (954, 955), (955, 958), (958, 963), (963, 967), (967, 970), (970, 980), (980, 986), (986, 989), (989, 995), (995, 998), (998, 1003), (1003, 1007), (1007, 1010), (1010, 1013), (1013, 1016), (1016, 1019), (1019, 1026), (1026, 1029), (1029, 1032), (1032, 1037), (1037, 1041), (1041, 1044), (1044, 1045), (1045, 1048), (1048, 1052), 

In [64]:
prep_df["is_original"] = ~prep_df["id"].str.contains("^xquad_|^mlqa_").values
prep_df.is_original.value_counts()

True     14188
False     8138
Name: is_original, dtype: int64

In [65]:
with open("../data/preprocessed/e070_prep_df.pkl", "wb") as fout:
    pickle.dump(prep_df, fout)

## prep df として作ってしまおう。
 - 1 text batch 内では argmax を使う
 - 0 より小さい場合は無視する
 - 1 id 内に一つもない場合は以下の 2 通りを扱う
     - argmax を使う
     - 無視する

In [23]:
with open("../data/preprocessed/e070_prep_df.pkl", "rb") as fin:
    prep_df = pickle.load(fin)
prep_df.head()

Unnamed: 0,id,context,question,answer_text,answer_start,language,top20_context,answer_text_count,input_ids,token_type_ids,attention_mask,sequence_ids,offset_mapping,overflowing_batch_id,duplicated_elems_num_with,part_answer_text_count,start_position,end_position,segmentation_position,is_contain_answer_text,start_logits,end_logits,char_index_score_dict,max_score,max_index,is_original
0,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil,ஒரு சாதாரண வளர்ந்த ம,1,"[0, 69535, 81049, 37368, 153264, 12095, 52989,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",0,-1,1,27,27,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,"[5.0701704025268555, -5.960593223571777, -7.40...","[2.313723087310791, -10.814351081848145, -10.9...","{""53"": 9.169572353363037}",9.169572,53,True
1,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil,ஒரு சாதாரண வளர்ந்த ம,1,"[0, 69535, 81049, 37368, 153264, 12095, 52989,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",1,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,"[6.620517730712891, -5.636728286743164, -7.391...","[4.0754289627075195, -10.411615371704102, -10....",{},-100000000.0,-100000000,True
2,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil,ஒரு சாதாரண வளர்ந்த ம,1,"[0, 69535, 81049, 37368, 153264, 12095, 52989,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",2,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,"[6.747103214263916, -5.517636299133301, -7.225...","[4.195484638214111, -10.406983375549316, -10.7...",{},-100000000.0,-100000000,True
3,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil,ஒரு சாதாரண வளர்ந்த ம,1,"[0, 69535, 81049, 37368, 153264, 12095, 52989,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",3,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,"[6.703310489654541, -5.5158371925354, -7.33090...","[4.214907646179199, -10.472651481628418, -10.7...",{},-100000000.0,-100000000,True
4,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil,ஒரு சாதாரண வளர்ந்த ம,1,"[0, 69535, 81049, 37368, 153264, 12095, 52989,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",4,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,"[6.8010969161987305, -4.386323928833008, -6.87...","[4.284533977508545, -10.148509979248047, -10.6...",{},-100000000.0,-100000000,True


In [8]:
def get_max_start_char_index(char_index_score_dict, thresh):
    max_start_char_index = -1  # default -1
    largest_score = -100000000000
    for k, v in char_index_score_dict.items():
        if v < thresh:
            continue
        if v > largest_score:
            max_start_char_index = int(k)
            largest_score = v
    return max_start_char_index

def calc_start_end_position(start_char_index, end_char_index, offset_mapping):
    start_index = 0
    while offset_mapping[start_index][0] == -1:
        start_index += 1
    while offset_mapping[start_index][0] != -1 and offset_mapping[start_index][0] <= start_char_index:
        start_index += 1
    start_index -= 1
    end_index = len(offset_mapping) - 1
    while offset_mapping[end_index][1] == -1:
        end_index -= 1
    while offset_mapping[end_index][1] != -1 and offset_mapping[end_index][1] >= end_char_index:
        end_index -= 1
    end_index += 1
    return start_index, end_index

In [29]:
from copy import deepcopy

THRESH = -1.

reses = []
for i, row in tqdm(prep_df.iterrows(), total=len(prep_df)):
    copied_row = deepcopy(row)
    
    start_char_index = get_max_start_char_index(json.loads(copied_row["char_index_score_dict"]), thresh=THRESH)
    copied_row["answer_start"] = start_char_index
    end_char_index = start_char_index + len(row["answer_text"])
    
    offset_mapping = copied_row["offset_mapping"]
    if start_char_index < 0:
        copied_row["is_contain_answer_text"] = True
        copied_row["start_position"] = 0
        copied_row["end_position"] = 0
        copied_row["segmentation_position"] = [1] + [0] * (len(offset_mapping) - 1)
    else:
        copied_row["is_contain_answer_text"] = False
        start_position, end_position = calc_start_end_position(start_char_index, end_char_index, offset_mapping)
        copied_row["start_position"] = start_position
        copied_row["end_position"] = end_position
        copied_row["segmentation_position"] = [1 if start_position <= i and i <= end_position else 0 for i in range(len(offset_mapping))]
    reses.append(copied_row)

res_prep_df = pd.DataFrame([res_row.to_dict() for res_row in reses])
res_prep_df

  0%|          | 0/22326 [00:00<?, ?it/s]

Unnamed: 0,id,context,question,answer_text,answer_start,language,top20_context,answer_text_count,input_ids,token_type_ids,attention_mask,sequence_ids,offset_mapping,overflowing_batch_id,duplicated_elems_num_with,part_answer_text_count,start_position,end_position,segmentation_position,is_contain_answer_text,start_logits,end_logits,char_index_score_dict,max_score,max_index,is_original
0,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil,ஒரு சாதாரண வளர்ந்த ம,1,"[0, 69535, 81049, 37368, 153264, 12095, 52989,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",0,-1,1,27,27,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",False,"[5.0701704025268555, -5.960593223571777, -7.40...","[2.313723087310791, -10.814351081848145, -10.9...","{""53"": 9.169572353363037}",9.169572e+00,53,True
1,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,-1,tamil,ஒரு சாதாரண வளர்ந்த ம,1,"[0, 69535, 81049, 37368, 153264, 12095, 52989,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",1,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",True,"[6.620517730712891, -5.636728286743164, -7.391...","[4.0754289627075195, -10.411615371704102, -10....",{},-1.000000e+08,-100000000,True
2,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,-1,tamil,ஒரு சாதாரண வளர்ந்த ம,1,"[0, 69535, 81049, 37368, 153264, 12095, 52989,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",2,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",True,"[6.747103214263916, -5.517636299133301, -7.225...","[4.195484638214111, -10.406983375549316, -10.7...",{},-1.000000e+08,-100000000,True
3,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,-1,tamil,ஒரு சாதாரண வளர்ந்த ம,1,"[0, 69535, 81049, 37368, 153264, 12095, 52989,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",3,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",True,"[6.703310489654541, -5.5158371925354, -7.33090...","[4.214907646179199, -10.472651481628418, -10.7...",{},-1.000000e+08,-100000000,True
4,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,-1,tamil,ஒரு சாதாரண வளர்ந்த ம,1,"[0, 69535, 81049, 37368, 153264, 12095, 52989,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",4,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",True,"[6.8010969161987305, -4.386323928833008, -6.87...","[4.284533977508545, -10.148509979248047, -10.6...",{},-1.000000e+08,-100000000,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22321,xquad_1185,विद्युत आवेश के परिवर्तन की समय दर के रूप में ...,इलेक्ट्रोस्टैटिक और चुंबकीय बल के योग के रूप क...,इलेक्ट्रोस्टैटिक बल,328,hindi,विद्युत आवेश के परिव,1,"[0, 234186, 2284, 17433, 3045, 71683, 871, 369...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",0,-1,1,116,121,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",False,"[2.214484453201294, -2.326050043106079, -7.288...","[-0.6118912696838379, -8.885482788085938, -8.1...","{""328"": 4.255732536315918}",4.255733e+00,328,False
22322,xquad_1186,उस आयतन के लिए प्रासंगिक क्रॉस-सेक्शनल क्षेत्र...,संरचनाओं में तनाव का कारण क्या बनता है?,तनाव टेंसर,343,hindi,उस आयतन के लिए प्रास,1,"[0, 120018, 11846, 421, 129558, 641, 6701, 600...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",0,-1,1,128,131,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",False,"[3.1197197437286377, -4.8114519119262695, -9.3...","[-0.6221887469291687, -9.734058380126953, -9.8...","{""343"": 9.480067729949951}",9.480068e+00,343,False
22323,xquad_1187,उस आयतन के लिए प्रासंगिक क्रॉस-सेक्शनल क्षेत्र...,किसी वस्तु के आयतन में क्रॉस सेक्शन क्षेत्र की...,दबाव की शर्तें,118,hindi,उस आयतन के लिए प्रास,1,"[0, 12820, 89773, 287, 34889, 41420, 421, 4761...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",0,-1,1,66,68,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",False,"[1.590507984161377, -7.263235092163086, -8.281...","[-2.085319757461548, -11.977270126342773, -10....","{""118"": -0.5963906645774841}",-5.963907e-01,118,False
22324,xquad_1188,उस आयतन के लिए प्रासंगिक क्रॉस-सेक्शनल क्षेत्र...,सामान्य ताकतों से क्या जुड़ा है?,दबाव की शर्तें,118,hindi,उस आयतन के लिए प्रास,1,"[0, 38338, 217186, 1302, 646, 6004, 158371, 10...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, None, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",0,-1,1,51,53,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",False,"[2.6658051013946533, -5.117407321929932, -6.50...","[-1.514367699623108, -10.561481475830078, -10....","{""118"": 14.286912441253662}",1.428691e+01,118,False


In [82]:
res_prep_df.head(100)

Unnamed: 0,id,context,question,answer_text,answer_start,language,top20_context,answer_text_count,input_ids,token_type_ids,attention_mask,sequence_ids,offset_mapping,overflowing_batch_id,duplicated_elems_num_with,part_answer_text_count,start_position,end_position,segmentation_position,is_contain_answer_text,start_logits,end_logits,char_index_score_dict,max_score,max_index,is_original
0,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil,ஒரு சாதாரண வளர்ந்த ம,1,"[0, 69535, 81049, 37368, 153264, 12095, 52989,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",0,-1,1,27,27,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",False,"[5.0701704025268555, -5.960593223571777, -7.40...","[2.313723087310791, -10.814351081848145, -10.9...","{""53"": 9.169572353363037}",9.169572,53,True
1,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,-1,tamil,ஒரு சாதாரண வளர்ந்த ம,1,"[0, 69535, 81049, 37368, 153264, 12095, 52989,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",1,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",True,"[6.620517730712891, -5.636728286743164, -7.391...","[4.0754289627075195, -10.411615371704102, -10....",{},-100000000.0,-100000000,True
2,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,-1,tamil,ஒரு சாதாரண வளர்ந்த ம,1,"[0, 69535, 81049, 37368, 153264, 12095, 52989,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",2,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",True,"[6.747103214263916, -5.517636299133301, -7.225...","[4.195484638214111, -10.406983375549316, -10.7...",{},-100000000.0,-100000000,True
3,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,-1,tamil,ஒரு சாதாரண வளர்ந்த ம,1,"[0, 69535, 81049, 37368, 153264, 12095, 52989,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",3,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",True,"[6.703310489654541, -5.5158371925354, -7.33090...","[4.214907646179199, -10.472651481628418, -10.7...",{},-100000000.0,-100000000,True
4,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,-1,tamil,ஒரு சாதாரண வளர்ந்த ம,1,"[0, 69535, 81049, 37368, 153264, 12095, 52989,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",4,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",True,"[6.8010969161987305, -4.386323928833008, -6.87...","[4.284533977508545, -10.148509979248047, -10.6...",{},-100000000.0,-100000000,True
5,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,-1,tamil,ஒரு சாதாரண வளர்ந்த ம,1,"[0, 69535, 81049, 37368, 153264, 12095, 52989,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",5,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",True,"[6.571201324462891, -4.953230857849121, -7.127...","[4.143421173095703, -10.245794296264648, -10.9...",{},-100000000.0,-100000000,True
6,d9841668c,காளிதாசன் (தேவநாகரி: कालिदास) சமஸ்கிருத இலக்கி...,காளிதாசன் எங்கு பிறந்தார்?,காசுமீரில்,-1,tamil,காளிதாசன் (தேவநாகரி:,3,"[0, 12751, 14622, 9654, 52881, 101514, 3686, 6...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, None, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",0,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",True,"[5.134627819061279, -7.827160358428955, -9.353...","[2.5055532455444336, -11.261157989501953, -10....",{},-100000000.0,-100000000,True
7,d9841668c,காளிதாசன் (தேவநாகரி: कालिदास) சமஸ்கிருத இலக்கி...,காளிதாசன் எங்கு பிறந்தார்?,காசுமீரில்,-1,tamil,காளிதாசன் (தேவநாகரி:,3,"[0, 12751, 14622, 9654, 52881, 101514, 3686, 6...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, None, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",1,-1,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",True,"[6.132116317749023, -7.9172797203063965, -8.88...","[3.6926043033599854, -11.67197036743164, -10.9...",{},-100000000.0,-100000000,True
8,d9841668c,காளிதாசன் (தேவநாகரி: कालिदास) சமஸ்கிருத இலக்கி...,காளிதாசன் எங்கு பிறந்தார்?,காசுமீரில்,2358,tamil,காளிதாசன் (தேவநாகரி:,3,"[0, 12751, 14622, 9654, 52881, 101514, 3686, 6...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, None, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",2,-1,3,209,212,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",False,"[5.931232929229736, -8.107956886291504, -9.340...","[3.544769525527954, -11.642298698425293, -11.1...","{""2358"": 6.011011481285095, ""2531"": -1.0148405...",6.011011,2358,True
9,d9841668c,காளிதாசன் (தேவநாகரி: कालिदास) சமஸ்கிருத இலக்கி...,காளிதாசன் எங்கு பிறந்தார்?,காசுமீரில்,-1,tamil,காளிதாசன் (தேவநாகரி:,3,"[0, 12751, 14622, 9654, 52881, 101514, 3686, 6...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, None, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",3,-1,1,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",True,"[6.049813270568848, -8.049707412719727, -9.259...","[3.70566463470459, -11.700212478637695, -11.11...","{""2653"": -7.417436838150024}",-7.417437,2653,True


In [31]:
res_prep_df.query("part_answer_text_count > 2")

Unnamed: 0,id,context,question,answer_text,answer_start,language,top20_context,answer_text_count,input_ids,token_type_ids,attention_mask,sequence_ids,offset_mapping,overflowing_batch_id,duplicated_elems_num_with,part_answer_text_count,start_position,end_position,segmentation_position,is_contain_answer_text,start_logits,end_logits,char_index_score_dict,max_score,max_index,is_original
8,d9841668c,காளிதாசன் (தேவநாகரி: कालिदास) சமஸ்கிருத இலக்கி...,காளிதாசன் எங்கு பிறந்தார்?,காசுமீரில்,2358,tamil,காளிதாசன் (தேவநாகரி:,3,"[0, 12751, 14622, 9654, 52881, 101514, 3686, 6...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, None, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",2,-1,3,209,212,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",False,"[5.931232929229736, -8.107956886291504, -9.340...","[3.544769525527954, -11.642298698425293, -11.1...","{""2358"": 6.011011481285095, ""2531"": -1.0148405...",6.011011,2358,True
21,41660850a,"குழந்தையின் அழுகையை நிறுத்தவும், தூங்க வைக்கவ...",தமிழ்நாட்டில் குழந்தைகளை தூங்க வைக்க பாடும் பா...,தாலாட்டு,68,tamil,குழந்தையின் அழுகையை,27,"[0, 9708, 129729, 8938, 49936, 6819, 46018, 69...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",0,-1,9,38,40,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",False,"[5.263872146606445, -5.018688201904297, -7.984...","[2.3657572269439697, -9.595905303955078, -10.6...","{""68"": 11.338393688201904, ""96"": 1.54693630337...",11.338394,68,True
22,41660850a,"குழந்தையின் அழுகையை நிறுத்தவும், தூங்க வைக்கவ...",தமிழ்நாட்டில் குழந்தைகளை தூங்க வைக்க பாடும் பா...,தாலாட்டு,-1,tamil,குழந்தையின் அழுகையை,27,"[0, 9708, 129729, 8938, 49936, 6819, 46018, 69...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",1,-1,8,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",True,"[6.457548141479492, -6.196562767028809, -7.997...","[4.086376667022705, -10.231048583984375, -10.6...","{""1159"": -6.539100170135498, ""1300"": -8.870328...",-2.831536,1823,True
23,41660850a,"குழந்தையின் அழுகையை நிறுத்தவும், தூங்க வைக்கவ...",தமிழ்நாட்டில் குழந்தைகளை தூங்க வைக்க பாடும் பா...,தாலாட்டு,1823,tamil,குழந்தையின் அழுகையை,27,"[0, 9708, 129729, 8938, 49936, 6819, 46018, 69...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",2,-1,7,70,72,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",False,"[6.434752941131592, -6.623223304748535, -8.039...","[4.052172660827637, -10.479355812072754, -10.5...","{""1823"": -0.24710583686828613, ""1964"": -10.139...",-0.247106,1823,True
26,41660850a,"குழந்தையின் அழுகையை நிறுத்தவும், தூங்க வைக்கவ...",தமிழ்நாட்டில் குழந்தைகளை தூங்க வைக்க பாடும் பா...,தாலாட்டு,-1,tamil,குழந்தையின் அழுகையை,27,"[0, 9708, 129729, 8938, 49936, 6819, 46018, 69...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",5,-1,6,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",True,"[6.3092875480651855, -5.981869697570801, -7.86...","[3.996511459350586, -10.042537689208984, -10.2...","{""4453"": -16.509401321411133, ""4631"": -5.46601...",-5.466014,4631,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22219,xquad_1087,"इस्लामवाद एक विवादास्पद अवधारणा है, क्योंकि यह...",इस्लामवाद के समर्थक उनके विचारों को क्या मानते...,इस्लाम,158,hindi,इस्लामवाद एक विवादास,8,"[0, 136695, 27122, 287, 186718, 24939, 12184, ...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",0,-1,8,49,49,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",False,"[0.9025558233261108, -6.218747138977051, -8.62...","[-2.9901509284973145, -11.008432388305664, -10...","{""0"": -0.03943347930908203, ""47"": -1.991658091...",10.660694,158,False
22282,xquad_1150,पहले मेथोडिस्ट पादरी वर्ग को इंग्लैंड के चर्च ...,पादरी आमतौर पर स्थानीय मंडलों में किस रूप में ...,पादरी,536,hindi,पहले मेथोडिस्ट पादरी,3,"[0, 6, 69650, 4010, 26046, 216881, 968, 17631,...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",0,-1,3,165,166,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",False,"[2.2730953693389893, -7.252089500427246, -3.52...","[-1.2939056158065796, -5.55315637588501, -7.35...","{""15"": -4.739900827407837, ""229"": -1.330254197...",1.226566,536,False
22301,xquad_1169,अरस्तू ने एक बल की अवधारणा को दार्शनिक चर्चा ...,बल की दार्शनिक चर्चा किसने प्रदान की?,अरस्तू,1,hindi,अरस्तू ने एक बल की,5,"[0, 39812, 471, 16822, 7407, 14910, 5725, 2952...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, Non...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",0,-1,5,16,18,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",False,"[3.8185818195343018, -7.841070652008057, -9.31...","[1.2277852296829224, -8.672074317932129, -9.01...","{""1"": 14.268604755401611, ""60"": 6.295892715454...",14.268605,1,False
22310,xquad_1176,बलों के लिए मूलभूत सिद्धांतों का विकास असमान व...,भौतिकविद किस तरह के आत्मनिर्भर मॉडल बनाने की क...,एकीकरण,-1,hindi,बलों के लिए मूलभूत स,4,"[0, 143777, 158600, 37828, 15107, 287, 58834, ...","[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, ...",0,-1,3,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",True,"[5.747521877288818, -7.2679667472839355, -9.54...","[3.622725486755371, -10.203123092651367, -8.35...","{""56"": -10.035277366638184, ""817"": -9.90228319...",-8.200345,1013,False


In [34]:
res_prep_df.loc[22219]["char_index_score_dict"]

'{"0": -0.03943347930908203, "47": -1.991658091545105, "158": 10.660694360733032, "219": -4.200814723968506, "309": -7.003398895263672, "452": -8.943944931030273, "507": -9.516861915588379, "552": -10.651233911514282}'

In [35]:
with open(f"../data/dataset/val_pseudo/e070_{THRESH:.3f}_res_prep_df.pkl", "wb") as fout:
    pickle.dump(res_prep_df, fout)