In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd
import sys
import random

from tqdm import tqdm

import string

import os

import json

In [3]:
!pip install -q transformers

[K     |████████████████████████████████| 3.1 MB 4.4 MB/s 
[K     |████████████████████████████████| 596 kB 63.4 MB/s 
[K     |████████████████████████████████| 56 kB 4.4 MB/s 
[K     |████████████████████████████████| 895 kB 52.3 MB/s 
[K     |████████████████████████████████| 3.3 MB 52.8 MB/s 
[?25h

In [4]:
from transformers import AutoTokenizer, BertConfig, TFBertModel

In [5]:
import tensorflow as tf
from tensorflow.keras.losses import sparse_categorical_crossentropy as sce
from tensorflow.keras.callbacks import Callback

In [6]:
val_ins = '/content/drive/MyDrive/data/fine_data/val_instance_128_clean.csv'
val_id = '/content/drive/MyDrive/data/fine_data/val_id.csv'

val_cand = '/content/drive/MyDrive/data/fine_data/val_mapping.csv'
gt = '/content/drive/MyDrive/data/fine_data/validation.csv'
gt_long = '/content/drive/MyDrive/data/fine_data/validation_long.csv'
gt_short = '/content/drive/MyDrive/data/fine_data/validation_short.csv'

In [7]:
f_train = '/content/drive/MyDrive/data/simplified-nq-train.jsonl'
f_test = '/content/drive/MyDrive/data/simplified-nq-test.jsonl'
num_train_samples = 307372
num_test_samples = 346

## **Functions to tokenize data**

In [8]:
model_name = 'bert-base-uncased'

tokenizer_lan = AutoTokenizer.from_pretrained(model_name)

tags = [ '``', '\'\'', '--']

print(tags)

special_tokens_dict = {'additional_special_tokens': tags}

num_added_toks = tokenizer_lan.add_special_tokens(special_tokens_dict)
print(num_added_toks)
print(len(tokenizer_lan))

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

['``', "''", '--']
3
30525


In [9]:
AnswerType = {
    'NO_ANSWER': 0,
    'YES': 1,
    'NO': 2,
    'SHORT' : 3,
    'LONG' : 4
}

def preprocess_data(data, tokenizer, debug=False): 
    progress = tqdm(data, total=len(data))
    x1 = []
    x2 = []
    x3 = []
    y = []
    for sam in progress:
        # part_id = sam['part_id']
        # part_tokens = f'[part={part_id}]'

        # context = part_tokens + " " + sam['context']
        tokenized_sam = tokenizer.encode_plus(sam['question'], sam['context'], 
                                              padding='max_length',
                                              truncation=True,
                                              max_length=512,
                                              add_special_tokens=True)
        
        x1.append(tf.cast(tokenized_sam['input_ids'], tf.int32))
        x2.append(tf.cast(tokenized_sam['token_type_ids'], tf.int32))
        x3.append(tf.cast(tokenized_sam['attention_mask'], tf.int32))

        y.append([sam['start'], sam['stop'], AnswerType[sam['target']]])

    x1 = tf.convert_to_tensor(x1)
    x2 = tf.convert_to_tensor(x2)
    x3 = tf.convert_to_tensor(x3)

    y = tf.convert_to_tensor(y)
    return x1, x2, x3, y


In [10]:
val_id_df = pd.read_csv(val_id)
val_id_list = val_id_df['example_id'].tolist()

val_cand_df = pd.read_pickle(val_cand)
val_cand_list = val_cand_df.to_dict('records') 

gt_df = pd.read_csv(gt)
gt_list = gt_df.to_dict('records')

gt_long_df = pd.read_csv(gt_long)
gt_long_list = gt_long_df.to_dict('records')

gt_short_df = pd.read_csv(gt_short)
gt_short_list = gt_short_df.to_dict('records')

In [11]:
val_ins_df = pd.read_csv(val_ins)
val_ins_list = val_ins_df.to_dict('records')
print(len(val_ins_list))

169721


In [12]:
def get_strategy():
    try:
        tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
        print('Running on TPU ', tpu_cluster_resolver.cluster_spec().as_dict()['worker'])
        tf.config.experimental_connect_to_cluster(tpu_cluster_resolver)
        tf.tpu.experimental.initialize_tpu_system(tpu_cluster_resolver)
        strategy = tf.distribute.experimental.TPUStrategy(tpu_cluster_resolver)
    except ValueError as e:
        print(e)
        print('No TPU detected')
        tpu = None
        strategy = tf.distribute.get_strategy()
    return strategy

In [13]:
def create_model_lan(tokenizer, model_name, debug=False):
    config = BertConfig()
    if debug:
        print(config)
    # encoder = TFBertModel(config)
    encoder = TFBertModel.from_pretrained(model_name)
    encoder.resize_token_embeddings(len(tokenizer))

    NUM_TARGET = 5
    class MyQAModel(tf.keras.Model):
        def __init__(self, *inputs, **kwargs):
            super().__init__(*inputs, **kwargs)            
            self.bert = encoder

            # self.dropout_start = tf.keras.layers.Dropout(0.1)
            # self.dropout_stop = tf.keras.layers.Dropout(0.1)
            # self.dropout_target = tf.keras.layers.Dropout(0.1)

            self.start_logits = tf.keras.layers.Dense(1)
            self.stop_logits = tf.keras.layers.Dense(1)
            
            self.target = tf.keras.layers.Dense(NUM_TARGET)

        def call(self, inputs, **kwargs):
            bert_res=self.bert(inputs[0], 
                               token_type_ids=inputs[1], 
                               attention_mask=inputs[2]
                               )
            
            # dropout_res1 = self.dropout_start(bert_res[0])
            dropout_res1 = bert_res[0]

            start_logits = tf.squeeze(self.start_logits(dropout_res1), -1)

            # dropout_res2 = self.dropout_stop(bert_res[0])
            dropout_res2 = bert_res[0]

            stop_logits = tf.squeeze(self.stop_logits(dropout_res2), -1)

            # dropout_res3 = self.dropout_target(bert_res[1])
            dropout_res3 = bert_res[1]
            
            targets = self.target(dropout_res3)
            
            paddings = tf.constant([[0, 0,], [0, 512-NUM_TARGET]])
            targets = tf.pad(targets, paddings)
            
            res = tf.stack([start_logits, stop_logits, targets], axis=1)
            return res
        
    model = MyQAModel()
    return model 

In [14]:
def getFineInsRes(raw_res, ins_list, debug=False):
    list_ins_res = []
    progress = tqdm(ins_list, total=len(ins_list))
    for rowid, row in enumerate(progress):
        example_id = row['example_id']
        part_id = row['part_id']

        res_start = tf.nn.softmax(raw_res[rowid][0]).numpy()
        res_stop = tf.nn.softmax(raw_res[rowid][1]).numpy()
        res_target = tf.nn.softmax(raw_res[rowid][2]).numpy()
        
        start = np.argmax(res_start)
        stop = np.argmax(res_stop)
        target = np.argmax(res_target)

        start_score = res_start[start]
        stop_score = res_stop[stop]

        start_CLS = res_start[0]
        stop_CLS = res_stop[0]
        
        ins_res = {}
        ins_res['example_id'] = example_id
        ins_res['part_id'] = part_id

        ins_res['start'] = start 
        ins_res['stop'] = stop
        ins_res['target'] = target

        ins_res['start_score'] = start_score
        ins_res['stop_score'] = stop_score
        
        ins_res['start_CLS'] = start_CLS
        ins_res['stop_CLS'] = stop_CLS
        if debug:
            if rowid == 101:
                print(row)
                print(ins_res)
        list_ins_res.append(ins_res)
    list_ins_res_df = pd.DataFrame(list_ins_res)
    return list_ins_res_df

In [15]:
def mergeDocumentRes(ins_df, val_id_df, threshold=0.0001, stride=128, debug=False):
    STRIDE = stride
    list_doc_lan = []
    for idx, doc in val_id_df.iterrows():
        doc_id = doc['example_id']
        ins_of_doc = ins_df.loc[ins_df['example_id'] == doc_id]
        
        start_ins = ins_of_doc.loc[ins_of_doc['start'] != 0]
        stop_ins = ins_of_doc.loc[ins_of_doc['stop'] != 0]
        all_non_zero = pd.concat([start_ins,stop_ins]).drop_duplicates()
        
        best_start = -1
        best_stop = -1
        best_target = 0
        best_score = threshold
                    
        for idx_ins, ins in all_non_zero.iterrows():
            ins_start = int(ins['start'])
            ins_stop = int(ins['stop'])
            ins_target = int(ins['target'])
            
            part_id = ins['part_id']
            part_start = part_id*STRIDE
            
            real_start = int(ins_start + part_start)
            real_stop = int(ins_stop + part_start)
            
            s_start = ins['start_score']
            s_stop = ins['stop_score']
            
            cls_start = ins['start_CLS']
            cls_stop = ins['stop_CLS']
            
            if real_stop > real_start:   
                if s_start - cls_start + s_stop - cls_stop > best_score:
                    best_score = s_start - cls_start + s_stop - cls_stop
                    best_start = real_start
                    best_stop = real_stop
                    best_target = ins_target

        doc_lan = {}
        doc_lan['example_id'] = doc_id
        doc_lan['start'] = best_start
        doc_lan['stop'] = best_stop
        doc_lan['target'] = best_target
        doc_lan['score'] = best_score
        
        if debug:
            if idx == 101:
                print(doc_lan)
        
        list_doc_lan.append(doc_lan)
    
    list_doc_lan_df = pd.DataFrame(list_doc_lan)
    return list_doc_lan_df

In [16]:
val_cand_df.head()

Unnamed: 0,example_id,new_candidates,old_candidates
0,4158175306918787233,"[{'end_token': 169, 'start_token': 6}, {'end_t...","[{'start_token': 8, 'top_level': True, 'end_to..."
1,-1957133654292017851,"[{'end_token': 138, 'start_token': 24}, {'end_...","[{'start_token': 26, 'top_level': True, 'end_t..."
2,-1273245364552286191,"[{'end_token': 146, 'start_token': 12}, {'end_...","[{'start_token': 14, 'top_level': True, 'end_t..."
3,-8663891343543535834,"[{'end_token': 188, 'start_token': 64}, {'end_...","[{'start_token': 66, 'top_level': True, 'end_t..."
4,8161832609608306276,"[{'end_token': 74, 'start_token': 6}, {'end_to...","[{'start_token': 8, 'top_level': True, 'end_to..."


In [17]:
AnswerTypeRev = {
    0: 'NO_ANSWER',
    1: 'YES',
    2: 'NO',
    3: 'SHORT',
    4: 'LONG'
}

def getSubmission(doc_res_df, doc_cand_df, threshold=0.0001, debug=False):
    doc_res_df.example_id = doc_res_df.example_id.astype(str)
    doc_cand_df.example_id = doc_cand_df.example_id.astype(str)
    if debug:
        print(doc_res_df.dtypes)
        print(doc_cand_df.dtypes)

    combine_df = pd.merge(doc_res_df, doc_cand_df, on='example_id')
    lines = []
    for id, doc in combine_df.iterrows():

        example_id = doc['example_id']
        long_id = str(example_id) + '_long'
        short_id = str(example_id) + '_short'

        line_long = {}
        line_long['example_id'] = long_id

        an_start = int(doc['start'])
        an_stop = int(doc['stop'])
        an_target = doc['target']
        an_score = doc['score']
        # print(an_start, an_stop, an_target, an_score)
        lan_start, lan_stop = -1, -1

        # find long answer 
        if an_start > 0 and an_stop > 0:
            candidates = doc['new_candidates']
            an_range = [*range(an_start, an_stop + 1, 1)]

            best_inter = 0.5
            shortest = 10000000000000
            best_id = 0
            for cidx, cand in enumerate(candidates):
                c_start = int(cand['start_token'])
                c_stop = int(cand['end_token'])

                c_range = [*range(c_start, c_stop + 1, 1)]
                inter = len(list(set(an_range)&set(c_range)))
            
                if float(inter) > best_inter:
                    best_id = cidx
                    best_inter = inter
                    shortest = len(c_range)
                elif inter == best_inter:
                    if shortest > len(c_range):
                        best_id = cidx
                        shortest = len(c_range)

            real_candidates = doc['old_candidates']
            lan_start = real_candidates[best_id]['start_token']
            lan_stop = real_candidates[best_id]['end_token']

            if debug:
                if id == 101:
                    print(lan_start, lan_stop)

        if lan_start > 0 and lan_stop > 0 and an_target != 0:
            long_string = str(lan_start) + ':' + str(lan_stop)
        else:
            long_string = ''


        line_long['PredictionString'] = long_string
        lines.append(line_long)

    lines_df = pd.DataFrame(lines)
    sorted_df = lines_df.sort_values('example_id')
    return sorted_df

In [18]:
def getResult(gt_df, res_df, debug=False):
    gt_df.fillna('', inplace=True)
    if gt_df.shape[0] != res_df.shape[0]:
        print('ERROR: Different number of rows')
        return -1.0
    
    TP, TN, FP, FN = 0, 0, 0, 0

    # list_incorrect = []

    gt_id = gt_df['example_id'].astype(str).tolist()
    res_id = res_df['example_id'].astype(str).tolist()

    if gt_id != res_id:
        print("ERROR: Example_id lists are not the same")
        return -1.0
    
    gt_res = gt_df['PredictionString'].tolist()
    res_res = res_df['PredictionString'].tolist()
    id_list = gt_df['example_id'].tolist()
    for i in range(len(gt_res)):
        if gt_res[i] == res_res[i]:
            if gt_res[i] != "":
                TP += 1
            else:
                TN += 1
        else:
            if res_res[i] == '':
                FN += 1
            else:
                FP += 1
    recall = float(TP)/float(TP+FN + 0.000001)
    precision = float(TP)/float(TP+FP + 0.000001)
    print("TP: ", TP)
    print("TN: ", TN)
    print("FP: ", FP)
    print("FN: ", FN)
    print("Recall: ", recall)
    print("Precision: ", precision)
    print("F1: ", float(2*recall*precision)/float(recall + precision + 0.00000001))


In [19]:
x_val1, x_val2, x_val3, y_val = preprocess_data(val_ins_list, tokenizer_lan)

100%|██████████| 169721/169721 [07:41<00:00, 367.65it/s]


In [21]:
import glob
list_files = glob.glob("/content/drive/MyDrive/data/models/clean-128-2/*")
print(list_files)

['/content/drive/MyDrive/data/models/clean-128-2/weights-01.h5', '/content/drive/MyDrive/data/models/clean-128-2/weights-02.h5', '/content/drive/MyDrive/data/models/clean-128-2/weights-03.h5', '/content/drive/MyDrive/data/models/clean-128-2/weights-04.h5', '/content/drive/MyDrive/data/models/clean-128-2/weights-05.h5', '/content/drive/MyDrive/data/models/clean-128-2/weights-06.h5', '/content/drive/MyDrive/data/models/clean-128-2/weights-07.h5', '/content/drive/MyDrive/data/models/clean-128-2/weights-08.h5', '/content/drive/MyDrive/data/models/clean-128-2/weights-09.h5', '/content/drive/MyDrive/data/models/clean-128-2/weights-10.h5', '/content/drive/MyDrive/data/models/clean-128-2/weights-11.h5', '/content/drive/MyDrive/data/models/clean-128-2/weights-12.h5', '/content/drive/MyDrive/data/models/clean-128-2/weights-13.h5', '/content/drive/MyDrive/data/models/clean-128-2/weights-14.h5', '/content/drive/MyDrive/data/models/clean-128-2/weights-15.h5', '/content/drive/MyDrive/data/models/cle

In [None]:
strategy = get_strategy()
with strategy.scope():
    lanModel = create_model_lan(tokenizer_lan, model_name)

    for onefile in list_files:
        weight_path = onefile
        print(weight_path)
        if os.path.isfile(weight_path):
            x = np.ones([1, 512], dtype=int)
            lanModel.predict([x, x, x])
            lanModel.load_weights(weight_path)

        opt = tf.keras.optimizers.Adam(learning_rate=0.00005)
        lossSCE = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
        metricSCA = tf.keras.metrics.SparseCategoricalAccuracy()
        lanModel.compile(optimizer=opt, loss=lossSCE, metrics=[metricSCA], run_eagerly=False)

        y_pred = lanModel.predict([x_val1, x_val2, x_val3], verbose=1) 
        fineResDf = getFineInsRes(y_pred, val_ins_list)
        list_threshold = [0.1]   
        for threshold in list_threshold:
            print('THRESHOLD: ', threshold)
            docAnsDf = mergeDocumentRes(fineResDf, val_id_df, threshold=threshold)
            sub = getSubmission(docAnsDf, val_cand_df)
            getResult(gt_long_df, sub)

INFO:absl:Entering into master device scope: /job:worker/replica:0/task:0/device:CPU:0


Running on TPU  ['10.82.230.106:8470']
INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Initializing the TPU system: grpc://10.82.230.106:8470


INFO:tensorflow:Initializing the TPU system: grpc://10.82.230.106:8470


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Found TPU system:


INFO:tensorflow:Found TPU system:


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)
DEBUG:filelock:Attempting to acquire lock 140602277960144 on /root/.cache/huggingface/transformers/775efbdc2152093295bc5824dee96da82a5f3c1f218dfface1b8cef3094bdf8f.c719a806caef7d36ec0185f14b3b5fa727d919f924abe35622b4b7147bfbb8c7.h5.lock
DEBUG:filelock:Lock 140602277960144 acquired on /root/.cache/huggingface/transformers/775efbdc2152093295bc5824dee96da82a5f3c1f218dfface1b8cef3094bdf8f.c719a806caef7d36ec0185f14b3b5fa727d919f924abe35622b4b7147bfbb8c7.h5.lock


Downloading:   0%|          | 0.00/511M [00:00<?, ?B/s]

DEBUG:filelock:Attempting to release lock 140602277960144 on /root/.cache/huggingface/transformers/775efbdc2152093295bc5824dee96da82a5f3c1f218dfface1b8cef3094bdf8f.c719a806caef7d36ec0185f14b3b5fa727d919f924abe35622b4b7147bfbb8c7.h5.lock
DEBUG:filelock:Lock 140602277960144 released on /root/.cache/huggingface/transformers/775efbdc2152093295bc5824dee96da82a5f3c1f218dfface1b8cef3094bdf8f.c719a806caef7d36ec0185f14b3b5fa727d919f924abe35622b4b7147bfbb8c7.h5.lock
Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification mo

/content/drive/MyDrive/data/models/clean-128-2/weights-01.h5


INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond_8/Identity:0' shape=(None, 512) dtype=int64>, <tf.Tensor 'cond_8/Identity_1:0' shape=(None, 512) dtype=int64>, <tf.Tensor 'cond_8/Identity_2:0' shape=(None, 512) dtype=int64>]
INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond_8/Identity:0' shape=(None, 512) dtype=int64>, <tf.Tensor 'cond_8/Identity_1:0' shape=(None, 512) dtype=int64>, <tf.Tensor 'cond_8/Identity_2:0' shape=(None, 512) dtype=int64>]
INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond_8/Identity:0' shape=(None, 512) dtype=int32>, <tf.Tensor 'cond_8/Identity_1:0' shape=(None, 512) dtype=int32>, <tf.Tensor 'cond_8/Identity_2:0' shape=(None, 512) dtype=int32>]




100%|██████████| 169721/169721 [10:32<00:00, 268.53it/s]


THRESHOLD:  0.1
TP:  616
TN:  612
FP:  1623
FN:  224
Recall:  0.7333333324603175
Precision:  0.27512282256582277
F1:  0.40012990808153515
/content/drive/MyDrive/data/models/clean-128-2/weights-02.h5


INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond_8/Identity:0' shape=(None, 512) dtype=int64>, <tf.Tensor 'cond_8/Identity_1:0' shape=(None, 512) dtype=int64>, <tf.Tensor 'cond_8/Identity_2:0' shape=(None, 512) dtype=int64>]
INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond_8/Identity:0' shape=(None, 512) dtype=int32>, <tf.Tensor 'cond_8/Identity_1:0' shape=(None, 512) dtype=int32>, <tf.Tensor 'cond_8/Identity_2:0' shape=(None, 512) dtype=int32>]




100%|██████████| 169721/169721 [10:21<00:00, 273.13it/s]


THRESHOLD:  0.1
TP:  683
TN:  460
FP:  1813
FN:  119
Recall:  0.8516209465690512
Precision:  0.27363782040319
F1:  0.41419041450339283
/content/drive/MyDrive/data/models/clean-128-2/weights-03.h5


INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond_8/Identity:0' shape=(None, 512) dtype=int64>, <tf.Tensor 'cond_8/Identity_1:0' shape=(None, 512) dtype=int64>, <tf.Tensor 'cond_8/Identity_2:0' shape=(None, 512) dtype=int64>]
INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond_8/Identity:0' shape=(None, 512) dtype=int32>, <tf.Tensor 'cond_8/Identity_1:0' shape=(None, 512) dtype=int32>, <tf.Tensor 'cond_8/Identity_2:0' shape=(None, 512) dtype=int32>]




100%|██████████| 169721/169721 [10:24<00:00, 271.96it/s]


THRESHOLD:  0.1
TP:  632
TN:  185
FP:  2219
FN:  39
Recall:  0.941877792933118
Precision:  0.22167660462235125
F1:  0.3588869927367862
/content/drive/MyDrive/data/models/clean-128-2/weights-04.h5


INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond_8/Identity:0' shape=(None, 512) dtype=int64>, <tf.Tensor 'cond_8/Identity_1:0' shape=(None, 512) dtype=int64>, <tf.Tensor 'cond_8/Identity_2:0' shape=(None, 512) dtype=int64>]
INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond_8/Identity:0' shape=(None, 512) dtype=int32>, <tf.Tensor 'cond_8/Identity_1:0' shape=(None, 512) dtype=int32>, <tf.Tensor 'cond_8/Identity_2:0' shape=(None, 512) dtype=int32>]




100%|██████████| 169721/169721 [10:09<00:00, 278.46it/s]


THRESHOLD:  0.1
TP:  670
TN:  215
FP:  2143
FN:  47
Recall:  0.9344490921416331
Precision:  0.23817987904792753
F1:  0.3796033959811555
/content/drive/MyDrive/data/models/clean-128-2/weights-05.h5


INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond_8/Identity:0' shape=(None, 512) dtype=int64>, <tf.Tensor 'cond_8/Identity_1:0' shape=(None, 512) dtype=int64>, <tf.Tensor 'cond_8/Identity_2:0' shape=(None, 512) dtype=int64>]
INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond_8/Identity:0' shape=(None, 512) dtype=int32>, <tf.Tensor 'cond_8/Identity_1:0' shape=(None, 512) dtype=int32>, <tf.Tensor 'cond_8/Identity_2:0' shape=(None, 512) dtype=int32>]




100%|██████████| 169721/169721 [09:59<00:00, 283.31it/s]


THRESHOLD:  0.1
TP:  667
TN:  253
FP:  2109
FN:  46
Recall:  0.9354838696557028
Precision:  0.2402737751295844
F1:  0.3823445078502251
/content/drive/MyDrive/data/models/clean-128-2/weights-06.h5


INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond_8/Identity:0' shape=(None, 512) dtype=int64>, <tf.Tensor 'cond_8/Identity_1:0' shape=(None, 512) dtype=int64>, <tf.Tensor 'cond_8/Identity_2:0' shape=(None, 512) dtype=int64>]
INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond_8/Identity:0' shape=(None, 512) dtype=int32>, <tf.Tensor 'cond_8/Identity_1:0' shape=(None, 512) dtype=int32>, <tf.Tensor 'cond_8/Identity_2:0' shape=(None, 512) dtype=int32>]




100%|██████████| 169721/169721 [09:58<00:00, 283.43it/s]


THRESHOLD:  0.1
TP:  643
TN:  171
FP:  2224
FN:  37
Recall:  0.9455882339035467
Precision:  0.2242762468698025
F1:  0.36255990647932174
/content/drive/MyDrive/data/models/clean-128-2/weights-07.h5


INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond_8/Identity:0' shape=(None, 512) dtype=int64>, <tf.Tensor 'cond_8/Identity_1:0' shape=(None, 512) dtype=int64>, <tf.Tensor 'cond_8/Identity_2:0' shape=(None, 512) dtype=int64>]
INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond_8/Identity:0' shape=(None, 512) dtype=int32>, <tf.Tensor 'cond_8/Identity_1:0' shape=(None, 512) dtype=int32>, <tf.Tensor 'cond_8/Identity_2:0' shape=(None, 512) dtype=int32>]




100%|██████████| 169721/169721 [10:10<00:00, 278.04it/s]


THRESHOLD:  0.1
TP:  645
TN:  171
FP:  2227
FN:  32
Recall:  0.9527326426104392
Precision:  0.22458217262375274
F1:  0.3634826678827551
/content/drive/MyDrive/data/models/clean-128-2/weights-08.h5


INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond_8/Identity:0' shape=(None, 512) dtype=int64>, <tf.Tensor 'cond_8/Identity_1:0' shape=(None, 512) dtype=int64>, <tf.Tensor 'cond_8/Identity_2:0' shape=(None, 512) dtype=int64>]
INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond_8/Identity:0' shape=(None, 512) dtype=int32>, <tf.Tensor 'cond_8/Identity_1:0' shape=(None, 512) dtype=int32>, <tf.Tensor 'cond_8/Identity_2:0' shape=(None, 512) dtype=int32>]




 58%|█████▊    | 97953/169721 [05:50<04:24, 270.84it/s]

In [None]:
y_pred = lanModel.predict([x_val1, x_val2, x_val3], verbose=1) 
fineResDf = getFineInsRes(y_pred, val_ins_list)

In [None]:
list_threshold = [0.1]   
for threshold in list_threshold:
    print('THRESHOLD: ', threshold)
    docAnsDf = mergeDocumentRes(fineResDf, val_id_df, threshold=threshold)
    sub = getSubmission(docAnsDf, val_cand_df)
    getResult(gt_long_df, sub)

## **Process with short answer**

### **Code to get short answer instance**

In [None]:
def getSanCandidate(sub, filename=f_train, debug=False):
    INSTANCE_WORDS_LEN = 500 
    STRIDE = 256 

    list_doc_lan_res = []
    for rowid, row in sub.iterrows():
        example_id = str(row['example_id']).replace('_long',"")
        lan_start, lan_stop = -1, -1

        if str(row['PredictionString']) != '':
            tokens = str(row['PredictionString']).split(':')
            lan_start = int(tokens[0])
            lan_stop = int(tokens[1]) 
            
        sam = {'example_id': example_id, 'lan_start': lan_start, 'lan_stop': lan_stop}
        list_doc_lan_res.append(sam)
        
    list_doc_lan_res_df = pd.DataFrame(list_doc_lan_res)

    set_id = set(list_doc_lan_res_df['example_id'].values.tolist())

    list_san_ins = []

    with open(filename) as f:
        progress = tqdm(f, total=num_train_samples)  
        for sam_count, line in enumerate(progress):
            if sam_count >= num_train_samples:
                break
                
            data = json.loads(line)
            example_id = str(data['example_id'])
            if example_id in set_id:
                # get lan result 
                ans = list_doc_lan_res_df.loc[list_doc_lan_res_df['example_id']==example_id]
                lan_start, lan_stop = -1, -1
                for rowid, row in ans.iterrows():
                    lan_start = row['lan_start']
                    lan_stop = row['lan_stop']
                if debug:
                    print(example_id, lan_start, lan_stop)
                doc_text = data['document_text']
                doc_text_split = doc_text.split()
                question = data['question_text']
                
                if lan_start > -1 and lan_stop > -1:
                    if lan_stop - lan_start <= INSTANCE_WORDS_LEN:
                        offset = (INSTANCE_WORDS_LEN - (lan_stop - lan_start))//2 
                        part_start = max(0,lan_start - offset)
                        part_stop = min(lan_stop + offset, len(doc_text_split))
                        part_split = doc_text_split[part_start:part_stop]
                        context = ' '.join(part_split)
                        ins = {'example_id': example_id, 'part_start': part_start, 'part_stop': part_stop, 
                               'question': question, 'context': context, 'start': 0, 'stop': 0, 'target': 'NO_ANSWER'}
                        list_san_ins.append(ins) 
                        if debug:
                            print(ins)
                    else: 
                    # in case found long answer is longer than context length limit then split the long answer into small parts
                    # and slide with stride 256
                        part_length = INSTANCE_WORDS_LEN
                        num_parts = (lan_stop - lan_start - INSTANCE_WORDS_LEN)//STRIDE + 1
                        for part_id in range(num_parts + 1):
                            part_start = lan_start + part_id*STRIDE
                            part_stop = min(len(doc_text_split), lan_start + part_id*STRIDE + part_length)
                            part_split = doc_text_split[part_start:part_stop]
                    
                            context = ' '.join(part_split)
                            ins = {'example_id': example_id, 'part_start': part_start, 'part_stop': part_stop, 
                               'question': question, 'context': context, 'start': 0, 'stop': 0, 'target': 'NO_ANSWER'}
                            list_san_ins.append(ins)
                            if debug:
                                print(ins)
    return list_san_ins            


In [None]:
list_san_ins = getSanCandidate(sub, debug=False)

In [None]:
print(len(list_san_ins))

In [None]:
list_san_ins_df = pd.DataFrame(list_san_ins)
list_san_ins_df.head(100)

In [None]:
list_san_ins_df.to_csv('/content/drive/MyDrive/data/fine_data/san_val_instance_0.1_thres.csv', index=False, 
                         columns=['example_id', 'part_start', 'part_stop', 'question', 'context', 'start', 'stop', 'target'])