## **Handle data**

In [1]:
# Mount Google Drive at /content/drive; the data and model paths used below live under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Install the `transformers` package quietly (-q). NOTE(review): no version is pinned here.
!pip install -q transformers

[K     |████████████████████████████████| 2.9 MB 5.0 MB/s 
[K     |████████████████████████████████| 56 kB 4.0 MB/s 
[K     |████████████████████████████████| 596 kB 58.9 MB/s 
[K     |████████████████████████████████| 895 kB 50.3 MB/s 
[K     |████████████████████████████████| 3.3 MB 33.4 MB/s 
[?25h

## **Import**

In [3]:
import numpy as np
import pandas as pd
import sys
import random
from tqdm import tqdm
import re
import string
import os
import shutil
import json
from transformers import AutoTokenizer, TFBertMainLayer, TFBertForPreTraining, BertConfig, TFBertModel
import tensorflow as tf
from tensorflow.keras.losses import sparse_categorical_crossentropy as sce

In [4]:
# Locations of the simplified-nq JSONL files on the mounted Drive.
f_test = '/content/drive/MyDrive/data/simplified-nq-test.jsonl'
f_train = '/content/drive/MyDrive/data/simplified-nq-train.jsonl'
# Hard-coded sample counts. NOTE(review): presumably the number of examples in each
# file; neither constant is read by the code visible in this part of the notebook.
num_train_samples = 307372
num_test_samples = 346

In [5]:
# Read the example ids listed in val_id.csv as strings and keep them in a set for
# fast membership tests; both names are used as default arguments further down.
df_val_id = pd.read_csv('/content/drive/MyDrive/data/fine_data/val_id.csv', dtype=str)
set_val_id = set(df_val_id['example_id'].values.tolist())
print(len(set_val_id))

3075


In [6]:
def get_id_df(filename=f_test):
    """Collect the ``example_id`` of every JSON line found in ``filename``.

    Each line is parsed as JSON and its ``example_id`` is converted to ``str``.

    Returns:
        A DataFrame built from one ``{'example_id': <str>}`` record per line.
    """
    with open(filename) as handle:
        records = [{'example_id': str(json.loads(raw_line)['example_id'])}
                   for raw_line in tqdm(handle)]
    return pd.DataFrame(records)

In [7]:
# Answer-type label -> integer class index.
AnswerType = dict(NO_ANSWER=0, YES=1, NO=2, SHORT=3, LONG=4)

# Reverse lookup (integer class index -> label), derived from the table above.
AnswerTypeRev = {index: label for label, index in AnswerType.items()}

In [8]:
def preprocess_data(data, tokenizer, debug=False):
    """Tokenize question/context pairs and pack them into tensors.

    Every sample is encoded with ``tokenizer.encode_plus`` as a (question, context)
    pair, padded and truncated to 512 positions.

    Args:
        data: sized sequence of dicts with the keys ``question``, ``context``,
            ``start``, ``stop`` and ``target`` (``target`` is a key of ``AnswerType``).
        tokenizer: object exposing ``encode_plus``.
        debug: accepted but not used.

    Returns:
        ``(input_ids, token_type_ids, attention_mask, y)`` where the first three are
        stacks of the int32-cast encodings and ``y`` holds one
        ``[start, stop, answer_type_index]`` row per sample.
    """
    input_ids, segment_ids, attention_masks, labels = [], [], [], []
    feature_buckets = (('input_ids', input_ids),
                       ('token_type_ids', segment_ids),
                       ('attention_mask', attention_masks))

    for sample in tqdm(data, total=len(data)):
        encoded = tokenizer.encode_plus(sample['question'], sample['context'],
                                        padding='max_length',
                                        truncation=True,
                                        max_length=512,
                                        add_special_tokens=True)
        for key, bucket in feature_buckets:
            bucket.append(tf.cast(encoded[key], tf.int32))

        labels.append([sample['start'], sample['stop'], AnswerType[sample['target']]])

    return tuple(tf.convert_to_tensor(column)
                 for column in (input_ids, segment_ids, attention_masks, labels))

In [9]:
def get_strategy():
    """Return a TPU distribution strategy when a TPU can be set up, else the default one.

    The TPU path resolves the cluster, connects to it, initializes the TPU system and
    wraps the resolver in a ``TPUStrategy``. A ``ValueError`` raised anywhere in that
    sequence is printed and treated as "no TPU", in which case
    ``tf.distribute.get_strategy()`` is returned.

    Returns:
        A ``tf.distribute`` strategy object.
    """
    try:
        tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
        print('Running on TPU ', tpu_cluster_resolver.cluster_spec().as_dict()['worker'])
        tf.config.experimental_connect_to_cluster(tpu_cluster_resolver)
        tf.tpu.experimental.initialize_tpu_system(tpu_cluster_resolver)
        # Prefer the stable class; fall back to the experimental alias on TensorFlow
        # builds that do not expose tf.distribute.TPUStrategy.
        strategy_cls = (getattr(tf.distribute, 'TPUStrategy', None)
                        or tf.distribute.experimental.TPUStrategy)
        strategy = strategy_cls(tpu_cluster_resolver)
    except ValueError as e:
        print(e)
        print('No TPU detected')
        strategy = tf.distribute.get_strategy()
    return strategy

In [10]:
def mergeInstanceResult(test_res, list_test_ins):
    """Write the model's per-instance predictions into the instance dicts, in place.

    ``test_res[i]`` is indexed as ``[0]`` start scores, ``[1]`` stop scores and
    ``[2]`` answer-type scores for instance ``i``. For each instance the argmax
    position of every row, the score at that position, and the score at position 0
    of the start/stop rows (``start_CLS`` / ``stop_CLS``) are stored.

    Args:
        test_res: indexable prediction container, one entry per instance.
        list_test_ins: list of instance dicts; updated in place.

    Returns:
        The same ``list_test_ins`` object.
    """
    for position, instance in enumerate(list_test_ins):
        raw = test_res[position]
        start_row, stop_row, target_row = raw[0], raw[1], raw[2]

        best_start = np.argmax(start_row)
        best_stop = np.argmax(stop_row)
        best_target = np.argmax(target_row)

        instance.update({
            'start': best_start,
            'stop': best_stop,
            'target': best_target,
            'start_score': start_row[best_start],
            'stop_score': stop_row[best_stop],
            'target_score': target_row[best_target],
            'start_CLS': start_row[0],
            'stop_CLS': stop_row[0],
        })
    return list_test_ins

In [11]:
def mergeDocumentRes(ins_df, val_id_df=df_val_id, threshold=0.0001, stride=128, debug=False):
    """Reduce per-instance predictions to one best answer span per document.

    For every ``example_id`` in ``val_id_df`` the instances of that document whose
    predicted ``start`` or ``stop`` is non-zero are examined. An instance's span is
    shifted by its ``part_start`` and scored as
    ``start_score - start_CLS + stop_score - stop_CLS``. The span with the highest
    score wins, provided its shifted stop is greater than its shifted start and the
    score is strictly above ``threshold``.

    Args:
        ins_df: DataFrame with the columns ``example_id``, ``part_start``, ``start``,
            ``stop``, ``target``, ``start_score``, ``stop_score``, ``start_CLS`` and
            ``stop_CLS``.
        val_id_df: DataFrame with an ``example_id`` column; one output row is produced
            per row of this frame, in the same order.
        threshold: minimum score a span must exceed to be accepted.
        stride: not used by the computation; kept so existing calls keep working.
        debug: when true, print the record of the row whose index label is 101.

    Returns:
        DataFrame with the columns ``example_id``, ``start``, ``stop``, ``target`` and
        ``score``. Documents without an accepted span get ``start = stop = -1``,
        ``target = 0`` and ``score = threshold``.
    """
    # Split the instance table once. The original filtered the full table again for
    # every document, which costs (documents x instances) comparisons.
    instances_by_doc = {doc_id: group
                        for doc_id, group in ins_df.groupby('example_id', sort=False)}
    no_instances = ins_df.iloc[0:0]

    list_doc_lan = []
    for idx, doc in val_id_df.iterrows():
        doc_id = doc['example_id']
        ins_of_doc = instances_by_doc.get(doc_id, no_instances)

        # Same candidate order as before: rows with a non-zero start first, then the
        # rows that only have a non-zero stop (ties are resolved by first occurrence).
        start_ins = ins_of_doc.loc[ins_of_doc['start'] != 0]
        stop_ins = ins_of_doc.loc[ins_of_doc['stop'] != 0]
        all_non_zero = pd.concat([start_ins, stop_ins]).drop_duplicates()

        best_start = -1
        best_stop = -1
        best_target = 0
        best_score = threshold

        for idx_ins, ins in all_non_zero.iterrows():
            part_start = ins['part_start']

            # Offsets predicted inside the instance, shifted by the instance's start.
            real_start = int(int(ins['start']) + part_start)
            real_stop = int(int(ins['stop']) + part_start)

            if real_stop > real_start:
                # Start/stop scores measured relative to the position-0 scores.
                score = ins['start_score'] - ins['start_CLS'] + ins['stop_score'] - ins['stop_CLS']
                if score > best_score:
                    best_score = score
                    best_start = real_start
                    best_stop = real_stop
                    best_target = int(ins['target'])

        doc_lan = {
            'example_id': doc_id,
            'start': best_start,
            'stop': best_stop,
            'target': best_target,
            'score': best_score,
        }

        if debug and idx == 101:
            print(doc_lan)

        list_doc_lan.append(doc_lan)

    return pd.DataFrame(list_doc_lan)

## **Get instances (HTML tags cleaned) for long-answer prediction**

In [12]:
# Non-greedy pattern for a single `<...>` tag. DOTALL is not set, so a match never
# spans a newline.
cleanr = re.compile('<.*?>')
def clean_html(raw_html):
    """Return ``raw_html`` with every ``<...>`` tag replaced by the literal ``<tag>``."""
    return cleanr.sub('<tag>', raw_html)

def parseDataClean(filename=f_train, set_id=set_val_id, is_val=True, drop_noanswer_rate=0.95, drop_null_instances_rate=0.98, debug=False):
    """Cut the selected documents into overlapping, tag-free word windows.

    Every JSON line of ``filename`` whose ``example_id`` (as ``str``) is in ``set_id``
    is processed: markup tags are removed from ``document_text`` and the remaining
    words are sliced into windows of ``500 - len(question words)`` words that start
    every 128 words. Each window becomes one instance with placeholder labels
    (``start = stop = 0``, ``target = 'NO_ANSWER'``).

    Fixes over the previous version:
      * ``set_id`` is honoured; the global ``set_val_id`` used to be consulted
        regardless of the argument.
      * Documents much shorter than one window used to yield no instance at all,
        because the window count went negative. They now yield one window covering
        the whole document.

    Args:
        filename: path of the JSONL file to read.
        set_id: collection of example ids (strings) to keep.
        is_val, drop_noanswer_rate, drop_null_instances_rate, debug: accepted for
            interface compatibility; not used here.

    Returns:
        List of dicts with the keys ``example_id``, ``part_start``, ``part_stop``,
        ``question``, ``context``, ``start``, ``stop`` and ``target``.
    """
    INSTANCE_WORDS_LEN = 500
    STRIDE = 128
    list_instances = []

    with open(filename) as f:
        for line in tqdm(f):
            data = json.loads(line)
            example_id = str(data['example_id'])
            if example_id not in set_id:
                continue

            # Turn every markup tag into the literal '<tag>' and then drop those tokens.
            tagged_tokens = clean_html(data['document_text']).split()
            clean_doc = [token for token in tagged_tokens if token != '<tag>']

            question = data['question_text']
            part_len = INSTANCE_WORDS_LEN - len(question.split())

            # Index of the last window. Clamped at 0 so that short documents still
            # get the single window starting at word 0.
            last_part_id = max(0, (len(clean_doc) - part_len)//STRIDE + 1)

            for part_id in range(last_part_id + 1):
                part_start = part_id*STRIDE
                part_stop = min(len(clean_doc), part_start + part_len)

                list_instances.append({
                    'example_id': example_id, 'part_start': part_start, 'part_stop': part_stop,
                    'question': question, 'context': ' '.join(clean_doc[part_start:part_stop]),
                    'start': 0, 'stop': 0, 'target': 'NO_ANSWER'})
    return list_instances

In [13]:
def getMapping(set_id=set_val_id, filename=f_train):
    """Map long-answer candidate offsets to offsets in the tag-free document.

    For every JSON line of ``filename`` whose ``example_id`` (as ``str``) is in
    ``set_id``, each entry of ``long_answer_candidates`` gets a shifted copy of its
    ``start_token`` / ``end_token``: the offset minus the number of markup-tag tokens
    that precede it in the document.

    The tag counts come from a prefix table built once per document. The previous
    version re-counted the document prefix for every candidate, which is
    O(candidates x tokens).

    Args:
        set_id: collection of example ids (strings) to keep.
        filename: path of the JSONL file to read.

    Returns:
        List of dicts with ``example_id``, ``new_candidates`` (shifted offsets, in the
        same order as the originals) and ``old_candidates`` (the untouched list).
    """
    list_cand_maps = []
    with open(filename) as f:
        for line in tqdm(f):
            data = json.loads(line)
            example_id = str(data['example_id'])
            if example_id not in set_id:
                continue

            # Every markup tag becomes the literal token '<tag>'.
            doc_text_split = clean_html(data['document_text']).split()
            num_tokens = len(doc_text_split)

            # tags_before[k] == number of '<tag>' tokens among the first k tokens.
            tags_before = [0]
            for token in doc_text_split:
                tags_before.append(tags_before[-1] + (token == '<tag>'))

            list_candidates = data['long_answer_candidates']
            list_new_candidates = []
            for cand in list_candidates:
                cand_start = cand['start_token']
                cand_stop = cand['end_token']

                # slice.indices() normalises the offset exactly as the slice
                # doc_text_split[0:offset] would (clamping, negative offsets).
                start_prefix = slice(0, cand_start).indices(num_tokens)[1]
                stop_prefix = slice(0, cand_stop).indices(num_tokens)[1]

                list_new_candidates.append({
                    'end_token': cand_stop - tags_before[stop_prefix],
                    'start_token': cand_start - tags_before[start_prefix],
                })

            list_cand_maps.append({
                'example_id': example_id,
                'new_candidates': list_new_candidates,
                'old_candidates': list_candidates,
            })
    return list_cand_maps

In [14]:
def build_model(model_name, debug=False):
    """Assemble the question-answering model used for long-answer prediction.

    Loads a ``TFBertModel`` encoder and an ``AutoTokenizer`` from ``model_name``,
    registers three additional special tokens on the tokenizer, resizes the encoder's
    token embeddings to the tokenizer's new length and returns a new ``MyQAModel``
    that wraps the encoder with three dense heads.

    Args:
        model_name: name or path passed to ``from_pretrained``.
        debug: accepted but not used in this function.

    Returns:
        A ``MyQAModel`` instance (a ``tf.keras.Model`` subclass).
    """
    encoder = TFBertModel.from_pretrained(model_name)
    
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    # Strings registered with the tokenizer as additional special tokens.
    tags = ['``', '\'\'', '--']

    special_tokens_dict = {'additional_special_tokens': tags}

    # The return value is stored but not used afterwards.
    num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
    
    # Keep the embedding table in step with the enlarged tokenizer.
    encoder.resize_token_embeddings(len(tokenizer))

    # Output width of the answer-type head.
    NUM_TARGET = 5
    class MyQAModel(tf.keras.Model):
        """BERT encoder followed by start, stop and answer-type dense heads."""

        def __init__(self, *inputs, **kwargs):
            super().__init__(*inputs, **kwargs)            
            # NOTE(review): the order in which these layers are created may matter
            # when weights are loaded from an HDF5 file — confirm before reordering.
            self.bert = encoder

            # One score per sequence position for the span start and the span stop.
            self.start_logits = tf.keras.layers.Dense(1)
            self.stop_logits = tf.keras.layers.Dense(1)
            
            self.target = tf.keras.layers.Dense(NUM_TARGET)

        def call(self, inputs, **kwargs):
            """Run the encoder on ``inputs`` and stack the three head outputs.

            ``inputs`` is indexed as [0] input ids, [1] token type ids and
            [2] attention mask. The result stacks the start scores, the stop scores
            and the zero-padded answer-type scores along axis 1.
            """
            bert_res=self.bert(inputs[0], 
                               token_type_ids=inputs[1], 
                               attention_mask=inputs[2]
                               )
            
            # Encoder output 0 feeds the start/stop heads and output 1 the answer-type
            # head (presumably the per-token output and the pooled output — TODO confirm).
            # The commented-out dropout calls below are disabled; no dropout is applied.
            # dropout_res1 = self.dropout_start(bert_res[0])
            dropout_res1 = bert_res[0]

            # Drop the trailing size-1 axis produced by Dense(1).
            start_logits = tf.squeeze(self.start_logits(dropout_res1), -1)

            # dropout_res2 = self.dropout_stop(bert_res[0])
            dropout_res2 = bert_res[0]

            stop_logits = tf.squeeze(self.stop_logits(dropout_res2), -1)

            # dropout_res3 = self.dropout_target(bert_res[1])
            dropout_res3 = bert_res[1]
            
            targets = self.target(dropout_res3)
            
            # Right-pad the NUM_TARGET scores to width 512 so they can be stacked with
            # the start/stop scores; this assumes inputs of sequence length 512.
            paddings = tf.constant([[0, 0,], [0, 512-NUM_TARGET]])
            targets = tf.pad(targets, paddings)
            
            res = tf.stack([start_logits, stop_logits, targets], axis=1)
            return res
        
    model = MyQAModel()
    return model 

In [15]:
def getRawInstanceResults(list_test, verbose = True, debug = False):
    """Tokenize the instances and run the long-answer model on them.

    A tokenizer is loaded from the bert_base_uncased directory and extended with three
    special tokens. The instances are encoded with ``preprocess_data``, and a model
    from ``build_model`` is created inside the strategy returned by ``get_strategy``.
    The model is called once on a dummy ``[1, 512]`` batch before the weights file is
    loaded (presumably so its variables exist when the weights are read — TODO confirm),
    then compiled and used to predict all instances.

    Args:
        list_test: instance dicts as accepted by ``preprocess_data``.
        verbose: print progress messages when true.
        debug: accepted but not used.

    Returns:
        The array returned by ``predict`` for all instances.
    """
    if verbose:
        print('Getting raw result for all the instances generated from test file')

    model_name = '/content/drive/MyDrive/data/bert_base_uncased'
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    extra_tokens = {'additional_special_tokens': ['``', '\'\'', '--']}
    print(tokenizer.add_special_tokens(extra_tokens))
    print(len(tokenizer))

    input_ids, segment_ids, attention_masks, _labels = preprocess_data(list_test, tokenizer)
    if verbose:
        print("Finish tokenizing ", len(list_test), " data for the first model")
        print(input_ids.shape)
        print("Preparing model")

    with get_strategy().scope():
        qa_model = build_model(model_name)
        dummy_batch = np.ones([1, 512], dtype=int)
        qa_model.predict([dummy_batch, dummy_batch, dummy_batch])
        qa_model.load_weights('/content/drive/MyDrive/data/models/current/weights-02.h5')
        qa_model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=0.00005),
            loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
            metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
        )

    if verbose:
        print("Finish loading pretrained weights for the model")

    raw_predictions = qa_model.predict([input_ids, segment_ids, attention_masks], verbose=1)

    if verbose:
        print("Finish calculating raw result, get an array of size: ", raw_predictions.shape)
    return raw_predictions



In [16]:
def getSubmissionLan(doc_res_df, doc_cand_df, threshold=0.0001, debug=False):
    """Turn per-document answer spans into long-answer submission rows.

    The two frames are joined on ``example_id`` (compared as strings). For a document
    whose span has ``start > 0`` and ``stop > 0``, the entry of ``new_candidates`` that
    shares the most tokens with the span is selected; ties go to the shorter
    candidate. The matching entry of ``old_candidates`` supplies the reported offsets.
    A prediction string ``"start:end"`` is written only when both reported offsets are
    positive and ``target`` is not 0; otherwise it is the empty string.

    Fixes over the previous version:
      * When no candidate overlapped the span, candidate 0 was reported anyway, and an
        empty candidate list raised ``IndexError``. Such documents now get ``''``.
      * The overlap is computed arithmetically instead of building ranges and sets
        for every candidate.
      * The caller's frames are no longer modified.
      * An empty join returns an empty frame instead of failing in ``sort_values``.

    Args:
        doc_res_df: DataFrame with ``example_id``, ``start``, ``stop``, ``target``.
        doc_cand_df: DataFrame with ``example_id``, ``new_candidates`` and
            ``old_candidates`` (parallel lists of dicts with ``start_token`` and
            ``end_token``).
        threshold: accepted for interface compatibility; not used.
        debug: print dtypes and the offsets chosen for the row labelled 101.

    Returns:
        DataFrame with ``example_id`` (suffixed ``_long``) and ``PredictionString``,
        sorted by ``example_id``.
    """
    # Cast the join key on copies so the inputs stay untouched.
    res_df = doc_res_df.assign(example_id=doc_res_df['example_id'].astype(str))
    cand_df = doc_cand_df.assign(example_id=doc_cand_df['example_id'].astype(str))
    if debug:
        print(res_df.dtypes)
        print(cand_df.dtypes)

    combine_df = pd.merge(res_df, cand_df, on='example_id')
    lines = []
    for row_id, doc in combine_df.iterrows():
        an_start = int(doc['start'])
        an_stop = int(doc['stop'])
        an_target = doc['target']
        lan_start, lan_stop = -1, -1

        # find long answer
        if an_start > 0 and an_stop > 0:
            best_id = None            # no overlapping candidate found yet
            best_inter = 0.5          # i.e. at least one shared token is required
            shortest = float('inf')
            for cidx, cand in enumerate(doc['new_candidates']):
                c_start = int(cand['start_token'])
                c_stop = int(cand['end_token'])

                # Sizes of the inclusive ranges [c_start, c_stop] and of its
                # intersection with [an_start, an_stop]; empty ranges count as 0.
                c_len = max(0, c_stop - c_start + 1)
                inter = max(0, min(an_stop, c_stop) - max(an_start, c_start) + 1)

                if inter > best_inter:
                    best_id, best_inter, shortest = cidx, inter, c_len
                elif inter == best_inter and c_len < shortest:
                    best_id, shortest = cidx, c_len

            if best_id is not None:
                chosen = doc['old_candidates'][best_id]
                lan_start = chosen['start_token']
                lan_stop = chosen['end_token']

            if debug and row_id == 101:
                print(lan_start, lan_stop)

        if lan_start > 0 and lan_stop > 0 and an_target != 0:
            long_string = str(lan_start) + ':' + str(lan_stop)
        else:
            long_string = ''

        lines.append({'example_id': str(doc['example_id']) + '_long',
                      'PredictionString': long_string})

    lines_df = pd.DataFrame(lines, columns=['example_id', 'PredictionString'])
    return lines_df.sort_values('example_id')

## **Process short answer**

In [17]:
def getSanCandidate(sub, filename=f_train, debug=False):
    """Build short-answer instances around the predicted long answers.

    ``sub`` holds one long-answer prediction per document (``example_id`` suffixed with
    ``_long`` and a ``PredictionString`` of the form ``"start:end"`` or blank). For
    every document of ``filename`` that has a prediction:

      * if the long answer spans at most 500 tokens, one window is produced that
        extends the answer equally on both sides up to 500 tokens, clipped to the
        document;
      * otherwise windows of 500 tokens starting every 256 tokens from the answer
        start are produced.

    Changes over the previous version: the predictions are kept in a dict instead of
    being searched in a DataFrame for every matching line; prediction strings without
    a ``:`` (blank, ``nan``) are treated as "no long answer"; an empty ``sub`` returns
    an empty list without reading ``filename``.

    Args:
        sub: DataFrame with the columns ``example_id`` and ``PredictionString``.
        filename: path of the JSONL file holding the documents.
        debug: print the looked-up offsets and every generated instance.

    Returns:
        List of dicts with the keys ``example_id``, ``part_start``, ``part_stop``,
        ``question``, ``context``, ``start``, ``stop`` and ``target``.
    """
    INSTANCE_WORDS_LEN = 500
    STRIDE = 256

    # example_id -> (lan_start, lan_stop); (-1, -1) means no long answer was predicted.
    # A later row for the same id overwrites an earlier one.
    lan_by_id = {}
    for _, row in sub.iterrows():
        example_id = str(row['example_id']).replace('_long', "")
        lan_start, lan_stop = -1, -1

        prediction = str(row['PredictionString'])
        if ':' in prediction:
            tokens = prediction.split(':')
            lan_start = int(tokens[0])
            lan_stop = int(tokens[1])

        lan_by_id[example_id] = (lan_start, lan_stop)

    list_san_ins = []
    if not lan_by_id:
        return list_san_ins

    with open(filename) as f:
        for line in tqdm(f):
            data = json.loads(line)
            example_id = str(data['example_id'])
            if example_id not in lan_by_id:
                continue

            lan_start, lan_stop = lan_by_id[example_id]
            if debug:
                print(example_id, lan_start, lan_stop)
            if lan_start <= -1 or lan_stop <= -1:
                continue

            doc_text_split = data['document_text'].split()
            question = data['question_text']
            doc_len = len(doc_text_split)
            span_len = lan_stop - lan_start

            if span_len <= INSTANCE_WORDS_LEN:
                # Pad the answer symmetrically up to the window size.
                offset = (INSTANCE_WORDS_LEN - span_len)//2
                windows = [(max(0, lan_start - offset), min(lan_stop + offset, doc_len))]
            else:
                # The long answer exceeds the window size: slide over it with the
                # stride, starting at the answer start.
                num_parts = (span_len - INSTANCE_WORDS_LEN)//STRIDE + 1
                windows = [(lan_start + part_id*STRIDE,
                            min(doc_len, lan_start + part_id*STRIDE + INSTANCE_WORDS_LEN))
                           for part_id in range(num_parts + 1)]

            for part_start, part_stop in windows:
                context = ' '.join(doc_text_split[part_start:part_stop])
                ins = {'example_id': example_id, 'part_start': part_start, 'part_stop': part_stop,
                       'question': question, 'context': context, 'start': 0, 'stop': 0, 'target': 'NO_ANSWER'}
                list_san_ins.append(ins)
                if debug:
                    print(ins)
    return list_san_ins




In [18]:
def create_model_san(tokenizer_san, model_name_san, debug=False):
    """Assemble the question-answering model used for short-answer prediction.

    Loads a ``TFBertModel`` encoder from ``model_name_san``, resizes its token
    embeddings to ``len(tokenizer_san)`` and returns a new ``MyQAModel`` that wraps
    the encoder with three dense heads.

    Args:
        tokenizer_san: tokenizer whose length sets the embedding table size.
        model_name_san: name or path passed to ``TFBertModel.from_pretrained``.
        debug: when true, print a default ``BertConfig``.

    Returns:
        A ``MyQAModel`` instance (a ``tf.keras.Model`` subclass).
    """
    # Default configuration object; it is only printed in debug mode and is not
    # passed to the encoder.
    config = BertConfig()
    if debug:
        print(config)
    encoder = TFBertModel.from_pretrained(model_name_san)
    encoder.resize_token_embeddings(len(tokenizer_san))

    # Output width of the answer-type head.
    NUM_TARGET = 5
    class MyQAModel(tf.keras.Model):
        """BERT encoder followed by start, stop and answer-type dense heads."""

        def __init__(self, *inputs, **kwargs):
            super().__init__(*inputs, **kwargs)            
            # NOTE(review): the order in which these layers are created may matter
            # when weights are loaded from an HDF5 file — confirm before reordering.
            self.bert = encoder
            # One score per sequence position for the span start and the span stop.
            self.start_logits = tf.keras.layers.Dense(1)
            self.stop_logits = tf.keras.layers.Dense(1)
            
            self.target = tf.keras.layers.Dense(NUM_TARGET)

        def call(self, inputs, **kwargs):
            """Run the encoder on ``inputs`` and stack the three head outputs.

            ``inputs`` is indexed as [0] input ids, [1] token type ids and
            [2] attention mask. The result stacks the start scores, the stop scores
            and the zero-padded answer-type scores along axis 1.
            """
            bert_res=self.bert(inputs[0], 
                               token_type_ids=inputs[1], 
                               attention_mask=inputs[2]
                               )
            
            # Encoder output 0 feeds the start/stop heads and output 1 the answer-type
            # head (presumably the per-token output and the pooled output — TODO confirm).
            # Despite the variable names, no dropout is applied.
            dropout_res1 = bert_res[0]

            # Drop the trailing size-1 axis produced by Dense(1).
            start_logits = tf.squeeze(self.start_logits(dropout_res1), -1)

            dropout_res2 = bert_res[0]

            stop_logits = tf.squeeze(self.stop_logits(dropout_res2), -1)

            dropout_res3 = bert_res[1]
            
            targets = self.target(dropout_res3)
            
            # Right-pad the NUM_TARGET scores to width 512 so they can be stacked with
            # the start/stop scores; this assumes inputs of sequence length 512.
            paddings = tf.constant([[0, 0,], [0, 512-NUM_TARGET]])
            targets = tf.pad(targets, paddings)
            
            res = tf.stack([start_logits, stop_logits, targets], axis=1)
            return res
        
    model = MyQAModel()
    return model 

In [19]:
def getSanRawRes(list_san_ins, verbose=1):
    """Tokenize the short-answer instances and run the short-answer model on them.

    A tokenizer is loaded from the bert_base_uncased directory and extended with the
    markup tags and special strings listed below. The instances are encoded with
    ``preprocess_data``, and a model from ``create_model_san`` is created inside the
    strategy returned by ``get_strategy``. The model is called once on a dummy
    ``[1, 512]`` batch before the weights file is loaded (presumably so its variables
    exist when the weights are read — TODO confirm), then compiled and used to predict
    all instances.

    Args:
        list_san_ins: instance dicts as accepted by ``preprocess_data``.
        verbose: when truthy, print the two final progress messages (the earlier
            messages are always printed).

    Returns:
        The array returned by ``predict`` for all instances.
    """
    print("Getting raw result for short answer instance generated from found long answers")

    model_name_san = '/content/drive/MyDrive/data/bert_base_uncased'
    tokenizer_san = AutoTokenizer.from_pretrained(model_name_san)

    # Opening tags, closing tags, then the remaining special strings.
    opening_tags = ['<Dd>', '<Dl>', '<Dt>', '<H1>', '<H2>', '<H3>', '<Li>', '<Ol>', '<P>', '<Table>', '<Td>', '<Th>', '<Tr>', '<Ul>']
    closing_tags = ['</Dd>', '</Dl>', '</Dt>', '</H1>', '</H2>', '</H3>', '</Li>', '</Ol>', '</P>', '</Table>', '</Td>', '</Th>', '</Tr>', '</Ul>']
    other_tokens = ['<Th_colspan=', '</Th_colspan=', '``', '\'\'', '--']
    tags_san = opening_tags + closing_tags + other_tokens

    tokenizer_san.add_special_tokens({'additional_special_tokens': tags_san})
    print("Short answer vocab size: ", len(tokenizer_san))

    input_ids, segment_ids, attention_masks, _labels = preprocess_data(list_san_ins, tokenizer_san)
    print("Finish tokenizing ", len(list_san_ins), " instances for short answer candidates")
    print(input_ids.shape)

    with get_strategy().scope():
        san_model = create_model_san(tokenizer_san, model_name_san)
        dummy_batch = np.ones([1, 512], dtype=int)
        san_model.predict([dummy_batch, dummy_batch, dummy_batch])
        san_model.load_weights('/content/drive/MyDrive/data/models/san_2nd/weights-14.h5')
        san_model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=0.00005),
            loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
            metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
        )

    if verbose:
        print("Finish loading pretrained weights for the model for short answer")

    raw_predictions = san_model.predict([input_ids, segment_ids, attention_masks], verbose=1)

    if verbose:
        print("Finish calculating raw result, get an array of size: ", raw_predictions.shape)
    return raw_predictions

In [20]:
def getSanSubmission(doc_res_df, threshold=0.0001, debug=False):
    """Turn per-document answer spans into short-answer submission rows.

    For each row the prediction string is:
      * ``AnswerTypeRev[target]`` (``'YES'`` / ``'NO'``) when ``target`` is 1 or 2;
      * ``"start:stop"`` when both offsets are positive and ``target`` is not 4;
      * the empty string otherwise.

    Changes over the previous version: the caller's frame is no longer modified (the
    id column used to be cast to ``str`` in place), and an empty input returns an
    empty frame with the expected columns instead of failing in ``sort_values``.

    Args:
        doc_res_df: DataFrame with ``example_id``, ``start``, ``stop``, ``target``.
        threshold, debug: accepted for interface compatibility; not used.

    Returns:
        DataFrame with ``example_id`` (suffixed ``_short``) and ``PredictionString``,
        sorted by ``example_id``.
    """
    # Cast the ids on a copy. Doing it before iterating also keeps integer ids from
    # being upcast to float when a row is materialised.
    work_df = doc_res_df.assign(example_id=doc_res_df['example_id'].astype(str))

    lines = []
    for _, doc in work_df.iterrows():
        an_start = int(doc['start'])
        an_stop = int(doc['stop'])
        an_target = int(doc['target'])

        if an_target == 1 or an_target == 2:
            # Yes/no answers take precedence over any span.
            short_string = AnswerTypeRev[an_target]
        elif an_start > 0 and an_stop > 0 and an_target != 4:
            short_string = str(an_start) + ':' + str(an_stop)
        else:
            short_string = ''

        lines.append({'example_id': str(doc['example_id']) + '_short',
                      'PredictionString': short_string})

    lines_df = pd.DataFrame(lines, columns=['example_id', 'PredictionString'])
    return lines_df.sort_values('example_id')

## **From this point on: running the pipeline (test)**

In [21]:
# Build the candidate offset mappings with getMapping's default id set and input file.
lan_map = getMapping()

307373it [06:24, 799.58it/s] 


In [22]:
# One row per document: example_id, new_candidates, old_candidates.
list_mappings_df = pd.DataFrame(lan_map)
list_mappings_df.head()

Unnamed: 0,example_id,new_candidates,old_candidates
0,4158175306918787233,"[{'end_token': 169, 'start_token': 6}, {'end_t...","[{'start_token': 8, 'top_level': True, 'end_to..."
1,-1957133654292017851,"[{'end_token': 138, 'start_token': 24}, {'end_...","[{'start_token': 26, 'top_level': True, 'end_t..."
2,-1273245364552286191,"[{'end_token': 146, 'start_token': 12}, {'end_...","[{'start_token': 14, 'top_level': True, 'end_t..."
3,-8663891343543535834,"[{'end_token': 188, 'start_token': 64}, {'end_...","[{'start_token': 66, 'top_level': True, 'end_t..."
4,8161832609608306276,"[{'end_token': 74, 'start_token': 6}, {'end_to...","[{'start_token': 8, 'top_level': True, 'end_to..."


In [23]:
# Cut the selected documents into instances, then get the model's raw scores for them.
list_all_ins= parseDataClean()
all_ins_res = getRawInstanceResults(list_all_ins)

307373it [05:23, 950.74it/s]


Getting raw result for all the instances generated from test file
3
30525


100%|██████████| 169721/169721 [08:49<00:00, 320.75it/s]
INFO:absl:Entering into master device scope: /job:worker/replica:0/task:0/device:CPU:0


Finish tokenizing  169721  data for the first model
(169721, 512)
Preparing model
Running on TPU  ['10.28.157.74:8470']
INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Initializing the TPU system: grpc://10.28.157.74:8470


INFO:tensorflow:Initializing the TPU system: grpc://10.28.157.74:8470


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Found TPU system:


INFO:tensorflow:Found TPU system:


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)
Some layers from the model checkpoint at /content/drive/MyDrive/data/bert_base_uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at /content/drive/MyDrive/data/bert_base_uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further

Finish loading pretrained weights for the model


INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond_8/Identity:0' shape=(None, 512) dtype=int32>, <tf.Tensor 'cond_8/Identity_1:0' shape=(None, 512) dtype=int32>, <tf.Tensor 'cond_8/Identity_2:0' shape=(None, 512) dtype=int32>]


Finish calculating raw result, get an array of size:  (169721, 3, 512)


In [24]:
# Write the argmax predictions and scores into the instance dicts (in place) and tabulate them.
list_fine_res_all_ins = mergeInstanceResult(all_ins_res, list_all_ins)
fine_res_all_ins_df = pd.DataFrame(list_fine_res_all_ins)

In [25]:
# Preview the per-instance predictions.
fine_res_all_ins_df.head()

Unnamed: 0,example_id,part_start,part_stop,question,context,start,stop,target,start_score,stop_score,target_score,start_CLS,stop_CLS
0,4158175306918787233,0,491,the most common system of land title in australia,Torrens title - wikipedia Torrens title Proper...,170,174,3,4.398574,3.093508,16.519304,1.324222,0.284956
1,4158175306918787233,128,619,the most common system of land title in australia,appropriation riparian Lateral and subjacent s...,42,43,3,3.799255,2.48083,16.187399,1.496315,0.498197
2,4158175306918787233,256,747,the most common system of land title in australia,", instead of by the use of deeds . The Registr...",204,208,3,3.289289,1.839854,16.44734,1.883022,0.605597
3,4158175306918787233,384,875,the most common system of land title in australia,"of the then colony , though some attribute the...",79,81,3,3.003526,1.479181,16.242163,2.03284,0.852072
4,4158175306918787233,512,1003,the most common system of land title in australia,series of documents ) . The State guarantees t...,0,0,3,4.181863,2.881199,13.895632,4.181863,2.881199


In [26]:
# Keep the best-scoring span per document.
docAnsDf = mergeDocumentRes(fine_res_all_ins_df)

In [27]:
# Preview the per-document spans.
docAnsDf.head()

Unnamed: 0,example_id,start,stop,target,score
0,-4415223548557328381,254,258,3,6.591407
1,-706775569507012323,140,145,3,6.543525
2,-8247431609310444236,232,233,3,5.622023
3,178097630653295544,375,377,3,7.955732
4,7206764020223190386,484,489,3,5.023816


In [28]:
# Match each document's span to a long-answer candidate and format the submission rows.
subLan = getSubmissionLan(docAnsDf, list_mappings_df)

In [29]:
# Preview the long-answer submission rows.
subLan.head(20)

Unnamed: 0,example_id,PredictionString
1557,-1000683513392992283_long,18:142
1949,-1006288523617376129_long,24:173
699,-1008973731705801750_long,257:465
374,-1009784761503721527_long,1198:1253
2502,-1012377094725254051_long,4600:4850
1739,-1019378357338217101_long,199:253
1479,-1040350863390524476_long,285:302
863,-1042660772802635315_long,4244:4324
2133,-1042822382384733999_long,151:227
277,-1043602643596538133_long,77:119


In [30]:
# Build short-answer instances around the predicted long answers.
list_san_ins = getSanCandidate(subLan, debug=False)

307373it [04:43, 1086.03it/s]


In [31]:
# Get the short-answer model's raw scores for those instances.
sanRawRes = getSanRawRes(list_san_ins)

Getting raw result for short answer instance generated from found long answers
Short answer vocab size:  30555


100%|██████████| 2833/2833 [00:11<00:00, 243.71it/s]


Finish tokenizing  2833  instances for short answer candidates
(2833, 512)
Running on TPU  ['10.28.157.74:8470']
INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Clearing out eager caches






INFO:tensorflow:Initializing the TPU system: grpc://10.28.157.74:8470


INFO:tensorflow:Initializing the TPU system: grpc://10.28.157.74:8470


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Found TPU system:


INFO:tensorflow:Found TPU system:


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)
Some layers from the model checkpoint at /content/drive/MyDrive/data/bert_base_uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at /content/drive/MyDrive/data/bert_base_uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further

Finish loading pretrained weights for the model for short answer


INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond_8/Identity:0' shape=(None, 512) dtype=int32>, <tf.Tensor 'cond_8/Identity_1:0' shape=(None, 512) dtype=int32>, <tf.Tensor 'cond_8/Identity_2:0' shape=(None, 512) dtype=int32>]


Finish calculating raw result, get an array of size:  (2833, 3, 512)


In [32]:
# Combine the raw short-answer model outputs with their candidate instances,
# then tabulate the merged records (one row per instance; see the preview below).
list_fine_res_san_ins = mergeInstanceResult(sanRawRes, list_san_ins)
fine_res_san_ins_df = pd.DataFrame(list_fine_res_san_ins)

In [33]:
# Preview the first rows of the per-instance short-answer results.
fine_res_san_ins_df.head()

Unnamed: 0,example_id,part_start,part_stop,question,context,start,stop,target,start_score,stop_score,target_score,start_CLS,stop_CLS
0,4158175306918787233,2936,3435,the most common system of land title in australia,"-- these interests , although even possibly un...",289,291,3,2.087946,2.700563,17.098423,-8.953727,-11.174394
1,-1957133654292017851,538,1037,will xbox 360 disc work on xbox one,emulates an exact replica of its predecessor '...,307,400,1,4.123751,2.724857,17.86956,-4.887974,-7.42928
2,-1273245364552286191,110,610,when do you have to identify a body,</Ul> </Td> </Tr> <Tr> <Td> Criminalistics ( h...,278,307,4,3.05117,3.186876,17.974148,-6.172495,-6.962911
3,-8663891343543535834,2883,3382,who won i am a celebrity get me out of here au...,</Td> <Td> 42 </Td> <Td> Evicted 10th </Td> </...,243,245,3,4.852525,5.032586,19.962137,-8.03173,-10.307142
4,8161832609608306276,0,407,who was the first person to be executed,William Kemmler - wikipedia <H1> William Kemml...,137,140,3,4.298324,4.928468,19.921936,-7.839705,-10.490671


In [34]:
# Reduce the per-instance short-answer results to document-level predictions,
# reusing the same helper as for long answers.
# The preview below shows the resulting columns: example_id, start, stop, target, score.
docSanAnsDf = mergeDocumentRes(fine_res_san_ins_df)

In [35]:
# Preview the first rows of the document-level short-answer predictions.
docSanAnsDf.head()

Unnamed: 0,example_id,start,stop,target,score
0,-4415223548557328381,270,273,3,24.017219
1,-706775569507012323,69,89,3,24.715335
2,-8247431609310444236,544,545,3,22.010135
3,178097630653295544,516,518,3,23.170139
4,7206764020223190386,513,516,3,16.479665


In [36]:
# Build the short-answer submission rows (example_id suffixed with `_short`;
# PredictionString is `start:stop` or blank, per the preview below).
# NOTE(review): the meaning of threshold=0.2 is defined inside getSanSubmission and
# is not visible here -- presumably a cutoff below which the prediction is left
# blank; confirm, and consider moving it to a named constant.
subSan = getSanSubmission(docSanAnsDf, threshold=0.2)

In [37]:
# Preview up to 20 short-answer submission rows.
subSan.head(20)

Unnamed: 0,example_id,PredictionString
1557,-1000683513392992283_short,
1949,-1006288523617376129_short,
699,-1008973731705801750_short,433:442
374,-1009784761503721527_short,1040:1041
2502,-1012377094725254051_short,
1739,-1019378357338217101_short,238:251
1479,-1040350863390524476_short,286:291
863,-1042660772802635315_short,4263:4297
2133,-1042822382384733999_short,
277,-1043602643596538133_short,


In [38]:
# Stack the long- and short-answer rows into a single submission frame, then
# order it by example_id so each document's `_long` and `_short` rows sit together.
sub = pd.concat([subLan, subSan], axis=0)
sub_sorted = sub.sort_values(by='example_id', ascending=True)

In [39]:
# Preview the combined submission, sorted by example_id.
sub_sorted.head(20)

Unnamed: 0,example_id,PredictionString
1557,-1000683513392992283_long,18:142
1557,-1000683513392992283_short,
1949,-1006288523617376129_long,24:173
1949,-1006288523617376129_short,
699,-1008973731705801750_long,257:465
699,-1008973731705801750_short,433:442
374,-1009784761503721527_long,1198:1253
374,-1009784761503721527_short,1040:1041
2502,-1012377094725254051_long,4600:4850
2502,-1012377094725254051_short,


In [40]:
# Persist the validation submission: only the two submission columns, no index.
sub_sorted[['example_id', 'PredictionString']].to_csv(
    '/content/drive/MyDrive/data/fine_data/val_submission.csv',
    index=False,
)

In [41]:
# Load the ground-truth validation labels that the submission is scored against.
# pandas reads empty CSV cells as NaN by default; getResult replaces those with ''
# before comparing.
gt = '/content/drive/MyDrive/data/fine_data/validation.csv'
gt_df = pd.read_csv(gt)

In [42]:
def getResult(gt_df, res_df, debug=False):
    """Score a submission against the ground truth and print the metrics.

    Rows are compared positionally, so both frames must list the same
    ``example_id`` values in the same order. A row counts as:

    * TP - prediction equals the ground truth and is non-empty
    * TN - prediction and ground truth are both empty
    * FN - prediction is empty but the ground truth is not
    * FP - prediction is non-empty and differs from the ground truth

    Missing values (NaN/None) in either ``PredictionString`` column are
    treated as the empty string, so a frame reloaded from CSV scores the same
    as the in-memory one. Neither input frame is modified.

    Args:
        gt_df: DataFrame with ``example_id`` and ``PredictionString`` columns
            holding the ground truth.
        res_df: DataFrame with the same two columns holding the predictions.
        debug: Unused; kept so existing callers keep working.

    Returns:
        float: the F1 score, or -1.0 if the frames differ in row count or in
        their ``example_id`` sequence.
    """
    if gt_df.shape[0] != res_df.shape[0]:
        print('ERROR: Different number of rows')
        return -1.0

    gt_id = gt_df['example_id'].astype(str).tolist()
    res_id = res_df['example_id'].astype(str).tolist()
    if gt_id != res_id:
        print("ERROR: Example_id lists are not the same")
        return -1.0

    # Normalise blanks on copies of BOTH columns: NaN never equals '', so an
    # un-normalised blank prediction would be miscounted as a false positive.
    gt_res = gt_df['PredictionString'].fillna('').tolist()
    res_res = res_df['PredictionString'].fillna('').tolist()

    TP, TN, FP, FN = 0, 0, 0, 0
    for expected, predicted in zip(gt_res, res_res):
        if expected == predicted:
            if expected != "":
                TP += 1
            else:
                TN += 1
        elif predicted == '':
            FN += 1
        else:
            FP += 1

    # Small epsilons keep the ratios defined when a denominator would be zero.
    recall = float(TP)/float(TP+FN + 0.000001)
    precision = float(TP)/float(TP+FP + 0.000001)
    f1 = float(2*recall*precision)/float(recall + precision + 0.00000001)
    print("TP: ", TP)
    print("TN: ", TN)
    print("FP: ", FP)
    print("FN: ", FN)
    print("Recall: ", recall)
    print("Precision: ", precision)
    print("F1: ", f1)
    return f1


In [43]:
# Score the validation submission against the ground truth; prints the
# TP/TN/FP/FN counts followed by recall, precision and F1.
getResult(gt_df, sub_sorted)

TP:  1018
TN:  1469
FP:  3327
FN:  336
Recall:  0.7518463805377795
Precision:  0.2342922899345702
F1:  0.3572556551393762
