In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd
import sys
import random

from tqdm import tqdm

import string

import os

import json

In [3]:
!pip install -q transformers

[K     |████████████████████████████████| 2.9 MB 5.2 MB/s 
[K     |████████████████████████████████| 596 kB 50.9 MB/s 
[K     |████████████████████████████████| 56 kB 4.2 MB/s 
[K     |████████████████████████████████| 895 kB 66.9 MB/s 
[K     |████████████████████████████████| 3.3 MB 28.8 MB/s 
[?25h

In [4]:
from transformers import AutoTokenizer, BertConfig, TFBertModel

In [5]:
import tensorflow as tf
from tensorflow.keras.losses import sparse_categorical_crossentropy as sce
from tensorflow.keras.callbacks import Callback

In [6]:
val_ins = '/content/drive/MyDrive/data/fine_data/val_instance_128_clean.csv'
val_id = '/content/drive/MyDrive/data/fine_data/val_id.csv'

val_cand = '/content/drive/MyDrive/data/fine_data/val_mapping.csv'
gt = '/content/drive/MyDrive/data/fine_data/validation.csv'
gt_long = '/content/drive/MyDrive/data/fine_data/validation_long.csv'
gt_short = '/content/drive/MyDrive/data/fine_data/validation_short.csv'

In [7]:
f_train = '/content/drive/MyDrive/data/simplified-nq-train.jsonl'
f_test = '/content/drive/MyDrive/data/simplified-nq-test.jsonl'
num_train_samples = 307372
num_test_samples = 346

In [8]:
val_id_df = pd.read_csv(val_id)
val_id_list = val_id_df['example_id'].tolist()

val_cand_df = pd.read_pickle(val_cand)
val_cand_list = val_cand_df.to_dict('records') 

gt_df = pd.read_csv(gt)
gt_list = gt_df.to_dict('records')

gt_long_df = pd.read_csv(gt_long)
gt_long_list = gt_long_df.to_dict('records')

gt_short_df = pd.read_csv(gt_short)
gt_short_list = gt_short_df.to_dict('records')

In [9]:
val_ins_df = pd.read_csv(val_ins)
val_ins_list = val_ins_df.to_dict('records')
print(len(val_ins_list))

169721


In [10]:
def get_strategy():
    try:
        tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
        print('Running on TPU ', tpu_cluster_resolver.cluster_spec().as_dict()['worker'])
        tf.config.experimental_connect_to_cluster(tpu_cluster_resolver)
        tf.tpu.experimental.initialize_tpu_system(tpu_cluster_resolver)
        strategy = tf.distribute.experimental.TPUStrategy(tpu_cluster_resolver)
    except ValueError as e:
        print(e)
        print('No TPU detected')
        tpu = None
        strategy = tf.distribute.get_strategy()
    return strategy

In [22]:
AnswerType = {
    'NO_ANSWER': 0,
    'YES': 1,
    'NO': 2,
    'SHORT' : 3,
    'LONG' : 4
}

def preprocess_data(data, tokenizer, debug=False): 
    progress = tqdm(data, total=len(data))
    x1 = []
    x2 = []
    x3 = []
    y = []
    for sam in progress:
        # part_id = sam['part_id']
        # part_tokens = f'[part={part_id}]'

        # context = part_tokens + " " + sam['context']
        tokenized_sam = tokenizer.encode_plus(sam['question'], sam['context'], 
                                              padding='max_length',
                                              truncation=True,
                                              max_length=512,
                                              add_special_tokens=True)
        
        x1.append(tf.cast(tokenized_sam['input_ids'], tf.int32))
        x2.append(tf.cast(tokenized_sam['token_type_ids'], tf.int32))
        x3.append(tf.cast(tokenized_sam['attention_mask'], tf.int32))

        y.append([sam['start'], sam['stop'], AnswerType[sam['target']]])

    x1 = tf.convert_to_tensor(x1)
    x2 = tf.convert_to_tensor(x2)
    x3 = tf.convert_to_tensor(x3)

    y = tf.convert_to_tensor(y)
    return x1, x2, x3, y


In [11]:
def getResult(gt_df, res_df, debug=False):
    gt_df.fillna('', inplace=True)
    if gt_df.shape[0] != res_df.shape[0]:
        print('ERROR: Different number of rows')
        return -1.0
    
    TP, TN, FP, FN = 0, 0, 0, 0

    # list_incorrect = []

    gt_id = gt_df['example_id'].astype(str).tolist()
    res_id = res_df['example_id'].astype(str).tolist()

    if gt_id != res_id:
        print("ERROR: Example_id lists are not the same")
        return -1.0
    
    gt_res = gt_df['PredictionString'].tolist()
    res_res = res_df['PredictionString'].tolist()
    id_list = gt_df['example_id'].tolist()
    for i in range(len(gt_res)):
        if gt_res[i] == res_res[i]:
            if gt_res[i] != "":
                TP += 1
            else:
                TN += 1
        else:
            if res_res[i] == '':
                FN += 1
            else:
                FP += 1
    recall = float(TP)/float(TP+FN + 0.000001)
    precision = float(TP)/float(TP+FP + 0.000001)
    print("TP: ", TP)
    print("TN: ", TN)
    print("FP: ", FP)
    print("FN: ", FN)
    print("Recall: ", recall)
    print("Precision: ", precision)
    print("F1: ", float(2*recall*precision)/float(recall + precision + 0.00000001))


## **Process with short answer**

### **Load saved short answer instance from file**

In [12]:
san_ins_df = pd.read_csv('/content/drive/MyDrive/data/fine_data/san_val_instance_0.1_thres.csv')
san_ins_list = san_ins_df.to_dict('records')
print(len(san_ins_list))

2218


In [13]:
model_name_san = '/content/drive/MyDrive/data/bert_base_uncased'
# model_name_san = 'bert-base-uncased'

tokenizer_san = AutoTokenizer.from_pretrained(model_name_san)

tags_san = ['<Dd>', '<Dl>', '<Dt>', '<H1>', '<H2>', '<H3>', '<Li>', '<Ol>', '<P>', '<Table>', '<Td>', '<Th>', '<Tr>', '<Ul>',
        '</Dd>', '</Dl>', '</Dt>', '</H1>', '</H2>', '</H3>', '</Li>', '</Ol>', '</P>', '</Table>', '</Td>', '</Th>', '</Tr>', '</Ul>',
        '<Th_colspan=', '</Th_colspan=', '``', '\'\'', '--']

print(tags_san)

special_tokens_dict = {'additional_special_tokens': tags_san}

num_added_toks = tokenizer_san.add_special_tokens(special_tokens_dict)
print(num_added_toks)
print(len(tokenizer_san))

['<Dd>', '<Dl>', '<Dt>', '<H1>', '<H2>', '<H3>', '<Li>', '<Ol>', '<P>', '<Table>', '<Td>', '<Th>', '<Tr>', '<Ul>', '</Dd>', '</Dl>', '</Dt>', '</H1>', '</H2>', '</H3>', '</Li>', '</Ol>', '</P>', '</Table>', '</Td>', '</Th>', '</Tr>', '</Ul>', '<Th_colspan=', '</Th_colspan=', '``', "''", '--']
33
30555


In [17]:
def create_model_san(tokenizer, model_name, debug=False):
    config = BertConfig()
    if debug:
        print(config)
    encoder = TFBertModel(config)
    # encoder = TFBertModel.from_pretrained(model_name)
    encoder.resize_token_embeddings(len(tokenizer))

    NUM_TARGET = 5
    class MyQAModel(tf.keras.Model):
        def __init__(self, *inputs, **kwargs):
            super().__init__(*inputs, **kwargs)            
            self.bert = encoder

            # self.dropout_start = tf.keras.layers.Dropout(0.1)
            # self.dropout_stop = tf.keras.layers.Dropout(0.1)
            # self.dropout_target = tf.keras.layers.Dropout(0.1)

            self.start_logits = tf.keras.layers.Dense(1)
            self.stop_logits = tf.keras.layers.Dense(1)
            
            self.target = tf.keras.layers.Dense(NUM_TARGET)

        def call(self, inputs, **kwargs):
            bert_res=self.bert(inputs[0], 
                               token_type_ids=inputs[1], 
                               attention_mask=inputs[2]
                               )
            
            # dropout_res1 = self.dropout_start(bert_res[0])
            dropout_res1 = bert_res[0]

            start_logits = tf.squeeze(self.start_logits(dropout_res1), -1)

            # dropout_res2 = self.dropout_stop(bert_res[0])
            dropout_res2 = bert_res[0]

            stop_logits = tf.squeeze(self.stop_logits(dropout_res2), -1)

            # dropout_res3 = self.dropout_target(bert_res[1])
            dropout_res3 = bert_res[1]
            
            targets = self.target(dropout_res3)
            
            paddings = tf.constant([[0, 0,], [0, 512-NUM_TARGET]])
            targets = tf.pad(targets, paddings)
            
            res = tf.stack([start_logits, stop_logits, targets], axis=1)
            return res
        
    model = MyQAModel()
    return model 

In [15]:
def getSanFineInsRes(raw_res, ins_list, debug=False):
    list_ins_res = []
    progress = tqdm(ins_list, total=len(ins_list))
    for rowid, row in enumerate(progress):
        example_id = row['example_id']
        part_start = row['part_start']

        res_start = tf.nn.softmax(raw_res[rowid][0]).numpy()
        res_stop = tf.nn.softmax(raw_res[rowid][1]).numpy()
        res_target = tf.nn.softmax(raw_res[rowid][2]).numpy()
        
        start = np.argmax(res_start)
        stop = np.argmax(res_stop)
        target = np.argmax(res_target)

        start_score = res_start[start]
        stop_score = res_stop[stop]

        start_CLS = res_start[0]
        stop_CLS = res_stop[0]
        
        ins_res = {}
        ins_res['example_id'] = example_id
        ins_res['part_start'] = part_start

        ins_res['san_start'] = start 
        ins_res['san_stop'] = stop
        ins_res['san_target'] = target

        ins_res['start_score'] = start_score
        ins_res['stop_score'] = stop_score
        
        ins_res['start_CLS'] = start_CLS
        ins_res['stop_CLS'] = stop_CLS
        if debug:
            if rowid == 101:
                print(row)
                print(ins_res)
        list_ins_res.append(ins_res)
    list_ins_res_df = pd.DataFrame(list_ins_res)
    return list_ins_res_df

In [18]:
strategy = get_strategy()
weight_path =  '/content/drive/MyDrive/data/models/san_2nd/weights-14.h5'
with strategy.scope():
    sanModel = create_model_san(model_name_san, tokenizer_san)

    if os.path.isfile(weight_path):
        x = np.ones([1, 512], dtype=int)
        sanModel.predict([x, x, x])
        sanModel.load_weights(weight_path)

    # optAdam = tf.keras.optimizers.Adam(learning_rate=0.00005)
    # lossSCE = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    # metricSCA = tf.keras.metrics.SparseCategoricalAccuracy()
    # sanModel.compile(optimizer=optAdam, loss=lossSCE, metrics=[metricSCA])

Running on TPU  ['10.11.216.66:8470']
INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Clearing out eager caches






INFO:tensorflow:Initializing the TPU system: grpc://10.11.216.66:8470


INFO:tensorflow:Initializing the TPU system: grpc://10.11.216.66:8470


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Found TPU system:


INFO:tensorflow:Found TPU system:


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)
INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond_8/Identity:0' shape=(None, 512) dtype=int64>, <tf.Tensor 'cond_8/Identity_1:0' shape=(None, 512) dtype=int64>, <tf.Tensor 'cond_8/Identity_2:0' shape=(None, 512) dtype=int64>]
INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond_8/Identity:0' shape=(None, 512) dtype=int64>, <tf.Tensor 'cond_8/Identity_1:0' shape=(None, 512) dtype=int64>, <tf.Tensor 'cond_8/Identity_2:0' shape=(None, 512) dtype=int64>]


In [19]:
def mergeDocumentResSan(ins_df, val_id_df, threshold=0.0001, debug=False):
    list_doc_lan = []
    for idx, doc in val_id_df.iterrows():
        doc_id = doc['example_id']
        ins_of_doc = ins_df.loc[ins_df['example_id'] == doc_id]
        
        start_ins = ins_of_doc.loc[ins_of_doc['san_start'] != 0]
        stop_ins = ins_of_doc.loc[ins_of_doc['san_stop'] != 0]
        all_non_zero = pd.concat([start_ins,stop_ins]).drop_duplicates()
        
        best_start = -1
        best_stop = -1
        best_target = 0
        best_score = threshold
                    
        for idx_ins, ins in all_non_zero.iterrows():
            ins_start = int(ins['san_start'])
            ins_stop = int(ins['san_stop'])
            ins_target = int(ins['san_target'])
            
            part_start = ins['part_start']
            
            real_start = int(ins_start + part_start)
            real_stop = int(ins_stop + part_start)
            
            s_start = ins['start_score']
            s_stop = ins['stop_score']
            
            cls_start = ins['start_CLS']
            cls_stop = ins['stop_CLS']
            
            if real_stop > real_start:   
                if s_start - cls_start + s_stop - cls_stop > best_score:
                    best_score = s_start - cls_start + s_stop - cls_stop
                    best_start = real_start
                    best_stop = real_stop
                    best_target = ins_target

        doc_lan = {}
        doc_lan['example_id'] = doc_id
        doc_lan['san_start'] = best_start
        doc_lan['san_stop'] = best_stop
        doc_lan['san_target'] = best_target
        doc_lan['score'] = best_score
        
        if debug:
            if idx == 101:
                print(doc_lan)
        
        list_doc_lan.append(doc_lan)
    
    list_doc_lan_df = pd.DataFrame(list_doc_lan)
    return list_doc_lan_df

In [20]:
AnswerTypeRev = {
    0: 'NO_ANSWER',
    1: 'YES',
    2: 'NO',
    3: 'SHORT',
    4: 'LONG'
}

def getSanSubmission(doc_res_df, threshold=0.0001, debug=False):
    doc_res_df.example_id = doc_res_df.example_id.astype(str)
    lines = []
    for id, doc in doc_res_df.iterrows():
        example_id = doc['example_id']
        short_id = str(example_id) + '_short'

        line_short = {}
        line_short['example_id'] = short_id

        an_start = int(doc['san_start'])
        an_stop = int(doc['san_stop'])
        an_target = int(doc['san_target'])
        an_score = float(doc['score'])

        if an_start > 0 and an_stop > 0 and an_stop - an_start < 30:
            short_string = str(an_start) + ':' + str(an_stop)
        else:
            short_string = ''

        if an_target == 1 or an_target == 2:
            short_string = AnswerTypeRev[an_target]


        line_short['PredictionString'] = short_string
        lines.append(line_short)

    lines_df = pd.DataFrame(lines)
    sorted_df = lines_df.sort_values('example_id')
    return sorted_df

In [23]:
x_san1, x_san2, x_san3, y_san = preprocess_data(san_ins_list, tokenizer_san)

100%|██████████| 2218/2218 [00:08<00:00, 268.25it/s]


In [24]:
y_pred_san = sanModel.predict([x_san1, x_san2, x_san3], verbose=1) 

INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond_8/Identity:0' shape=(None, 512) dtype=int32>, <tf.Tensor 'cond_8/Identity_1:0' shape=(None, 512) dtype=int32>, <tf.Tensor 'cond_8/Identity_2:0' shape=(None, 512) dtype=int32>]




InvalidArgumentError: ignored

In [None]:
fineSanResDf = getSanFineInsRes(y_pred_san, san_ins_list)

100%|██████████| 2755/2755 [00:10<00:00, 258.85it/s]


In [None]:
fineSanResDf.head(20)

Unnamed: 0,example_id,part_start,san_start,san_stop,san_target,start_score,stop_score,start_CLS,stop_CLS
0,4158175306918787233,219,0,0,4,0.160566,0.230596,0.160566,0.230596
1,-1957133654292017851,137,188,313,1,0.640268,0.442794,0.005081,0.005342
2,-1273245364552286191,110,279,278,4,0.210126,0.251557,0.01351,0.019804
3,-8663891343543535834,1770,123,125,3,0.177988,0.249999,0.021713,0.021871
4,8161832609608306276,0,137,139,3,0.460317,0.417052,0.000488,0.000546
5,-5682003170056099038,0,254,256,3,0.155353,0.137265,0.01203,0.011245
6,1612935385179441227,0,110,143,3,0.178247,0.152383,0.005447,0.006163
7,3665399692104500546,122,250,252,3,0.111621,0.289588,0.020194,0.0157
8,33744949014118896,184,327,331,3,0.26645,0.24692,0.024821,0.013834
9,350466672285576002,22,201,203,3,0.282921,0.295475,0.00184,0.00187


In [None]:
docSanResDf = mergeDocumentResSan(fineSanResDf, val_id_df, threshold=0.0001, debug=False)

In [None]:
docSanResDf.head(20)

Unnamed: 0,example_id,san_start,san_stop,san_target,score
0,-4415223548557328381,271,273,3,0.653292
1,-706775569507012323,148,152,3,0.180855
2,-8247431609310444236,544,545,3,0.462115
3,178097630653295544,520,522,3,0.114925
4,7206764020223190386,513,515,3,0.475483
5,-3896665866059218167,9648,9734,4,0.607028
6,3101272368142254978,11514,11515,3,0.572425
7,6096310511116140643,117,120,3,1.22069
8,-2352645543110896817,1324,1326,3,0.693021
9,-6829733378781657269,165,280,4,0.729159


In [None]:
sub = getSanSubmission(docSanResDf)

In [None]:
sub.head(20)

Unnamed: 0,example_id,PredictionString
1557,-1000683513392992283_short,
1949,-1006288523617376129_short,
699,-1008973731705801750_short,
374,-1009784761503721527_short,
2502,-1012377094725254051_short,448:452
1739,-1019378357338217101_short,236:240
1479,-1040350863390524476_short,6196:6199
863,-1042660772802635315_short,
2133,-1042822382384733999_short,
277,-1043602643596538133_short,


In [None]:
getResult(gt_short_df, sub, debug=False)

TP:  79
TN:  1303
FP:  1323
FN:  370
Recall:  0.17594654749232394
Precision:  0.056348074139552014
F1:  0.08535926149517804


In [None]:
gt_short_df.head(20)

Unnamed: 0,example_id,PredictionString
0,-1000683513392992283_short,
1,-1006288523617376129_short,
2,-1008973731705801750_short,
3,-1009784761503721527_short,1203:1204
4,-1012377094725254051_short,
5,-1019378357338217101_short,
6,-1040350863390524476_short,690:694
7,-1042660772802635315_short,5497:5504
8,-1042822382384733999_short,
9,-1043602643596538133_short,
