In [1]:
import json
import os
import time
import datetime
from collections import defaultdict

import torch
from bunch import Bunch
from pytorch_transformers import BertTokenizer, BertModel, WarmupLinearSchedule, AdamW
from dataset import TrainTRECDataset, TestTRECDataset
from model import TRECCARModel
from torch import nn, optim
from torch.utils.data import DataLoader
from transformers import get_linear_schedule_with_warmup
import random
import numpy as np
import pandas as pd
import logging
import warnings

# Notebook-wide diagnostics: show INFO-level log records, hide Python warnings.
logging.basicConfig(level=logging.INFO)
warnings.filterwarnings('ignore')

# JSON file holding the experiment configuration loaded by a later cell.
CONFIG_FILE = "config.json"

# Pick the compute device: the first CUDA GPU when one is visible, else the CPU.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda:0")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: TITAN X (Pascal)


In [2]:
def get_config_from_json(json_file):
    """
    Load the experiment configuration from a JSON file.

    :param json_file: path of the JSON configuration file to read
    :return: tuple ``(config, config_dict)``; ``config_dict`` is the object
             parsed by ``json.load`` and ``config`` is that same dictionary
             wrapped in a ``Bunch``
    """
    # JSON text is UTF-8; say so explicitly instead of inheriting the
    # platform's locale encoding, which breaks on non-ASCII values.
    with open(json_file, 'r', encoding='utf-8') as config_file:
        config_dict = json.load(config_file)

    # Wrap the dictionary with the bunch lib; later cells read the top-level
    # sections as attributes (e.g. config.data[...], config.training[...]).
    config = Bunch(config_dict)

    return config, config_dict


def format_time(elapsed_time):
    """
    Render a duration given in seconds as the string form of a
    ``datetime.timedelta`` (for example ``0:01:05``).

    The input is rounded to a whole number of seconds before formatting.
    """
    whole_seconds = int(round(elapsed_time))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [3]:
# Load the experiment configuration and seed every random number generator in
# use (Python, NumPy, PyTorch CPU and CUDA) with the configured seed.
config, _ = get_config_from_json(CONFIG_FILE)
seed_val = config.cmd_args['seed']
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
os.makedirs(config.data['results_dir'], exist_ok=True)

# Loading Tokenizer
tokenizer = BertTokenizer.from_pretrained(config["bert_token_file"], cache_dir=config.data['pretrained_download_dir'])
dataset = TrainTRECDataset(config.data['train_data'], config, is_train=True, bert_tokenizer=tokenizer)
# pin_memory follows the device this notebook actually selected at start-up,
# not a string comparison against the configured device name, so it stays
# correct when the configured name and the available hardware disagree.
train_dataloader = DataLoader(dataset=dataset,
                              batch_size=config.training["train_batch_size"],
                              pin_memory=device.type == 'cuda',
                              num_workers=config.training['num_workers'],
                              shuffle=True)
n_train_batches = len(train_dataloader)
print("Number of train batches : ", n_train_batches)

INFO:pytorch_transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at ../pretrained_download_dir/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084


Number of train batches :  1


In [4]:
# Creating instance of BertModel
net = TRECCARModel(config, freeze_bert=True)
net.to(device)

# Pairwise ranking loss averaged over the batch. The `size_average` argument is
# deprecated in torch.nn losses; reduction='mean' is its documented equivalent
# for size_average=True.
criterion = nn.MarginRankingLoss(margin=1, reduction='mean')
opti = AdamW(net.parameters(),
             lr=2e-5,  # args.learning_rate - default is 5e-5, our notebook had 2e-5
             eps=1e-8,  # args.adam_epsilon  - default is 1e-8.
             correct_bias=False
             )
# opti = optim.Adam(net.parameters(), lr=2e-5)

# Alternative kept for reference: separate weight decay for bias / LayerNorm weights.
# no_decay = ['bias', 'LayerNorm.weight']
# optimizer_grouped_parameters = [
#     {'params': [p for n, p in net.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
#     {'params': [p for n, p in net.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
# ]
# optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5)

INFO:pytorch_transformers.modeling_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at ../pretrained_download_dir/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.7156163d5fdc189c3016baca0775ffce230789d7fa2a42ef516483e4ca884517
INFO:pytorch_transformers.modeling_utils:Model config {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "pad_token_id": 0,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "vocab_size": 30522
}

INFO:pytorch_transformers.modeling_u

In [5]:
# Epoch count and display interval, both read from the "training" config section.
num_epochs = config.training['epochs']
display_step = config['training']['display_step']

# The linear schedule is sized as (batches per epoch) x (epochs), with no warmup.
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(
    opti,
    num_warmup_steps=0,  # Default value in run_glue.py
    num_training_steps=total_steps,
)
# scheduler = WarmupLinearSchedule(opti, warmup_steps=config.training["warmup_proportion"],
#                                  t_total=config.training["total_training_steps"])

In [6]:
# Echo the directory that this run's results are written to.
config.data['results_dir']

'./exp2'

In [7]:

training_stats = []
history = defaultdict(list)

resume_epoch = 0
print_every = 1 # 10
save_every = 200 # 1000

# Per-batch loss log and checkpoint directory, both under the results directory.
train_log_path = os.path.join(config.data['results_dir'], 'train-log-epoch.txt')
checkpoint_dir = os.path.join(config.data['results_dir'], 'ranking-pytorch-model')
os.makedirs(checkpoint_dir, exist_ok=True)

print('Initializing ...')
print("Training...")

total_t0 = time.time()
for epoch_idx in range(resume_epoch, num_epochs):
    # ========================================
    #               Training
    # ========================================
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_idx + 1, num_epochs))
    print('Training...')
    t0 = time.time()
    total_train_loss = 0     # summed over the whole epoch; feeds the epoch average below
    total_loss = 0           # summed since the last progress line
    batches_since_print = 0  # number of batches folded into total_loss
    net.train()
    for batch_idx, train_batch_data in enumerate(train_dataloader):
        # Clear gradients from the previous step. The optimizer was built from
        # net.parameters(), so this covers every parameter of the model.
        opti.zero_grad()

        # Move every field of the batch to the compute device.
        pos_ids, pos_mask, pos_type_ids, \
        neg_ids, neg_mask, neg_type_ids, \
        seqA_len, posSeqB_len, negSeqB_len, \
        label = [field.to(device) for field in train_batch_data]

        # Score the "pos" and the "neg" inputs with the same network.
        pos_net_output = net(pos_ids, attn_masks=pos_mask, type_ids=pos_type_ids)
        neg_net_output = net(neg_ids, attn_masks=neg_mask, type_ids=neg_type_ids)
        # TODO: do i need a softmax or not ?

        # Computing loss
        # NOTE(review): label is used as the MarginRankingLoss target; presumably
        # it is +1 so that the positive input is ranked higher - confirm in dataset.py.
        loss = criterion(pos_net_output, neg_net_output, label.float())
        batch_loss = loss.item()
        total_train_loss += batch_loss
        total_loss += batch_loss
        batches_since_print += 1

        # Back propagating the gradients
        loss.backward()
        if config.training['gradient_clipping']['use']:
            torch.nn.utils.clip_grad_norm_(net.parameters(), config.training['gradient_clipping']['clip_value'])

        # Optimization step, then advance the learning-rate schedule. total_steps
        # was computed as len(train_dataloader) * num_epochs, i.e. it counts
        # batches, so the schedule has to move once per batch, not once per epoch.
        opti.step()
        scheduler.step()

        # Append "epoch <TAB> batch <TAB> loss" to the loss log. The file is
        # reopened for every batch so the log survives an interrupted run.
        with open(train_log_path, 'a') as f:
            f.write('{}\t{}\t{}\t\n'.format(epoch_idx + 1, batch_idx + 1, batch_loss))

        training_stats.append(
            {
                'epoch': epoch_idx + 1,
                'batch': batch_idx + 1,
                'step': (epoch_idx * n_train_batches) + batch_idx + 1,
                'Training Loss': batch_loss,
            }
        )

        if batch_idx % print_every == 0:  # Print progress
            # Average over the batches actually accumulated since the last line
            # (the first window of an epoch holds a single batch).
            total_loss_avg = total_loss / batches_since_print
            elapsed = format_time(time.time() - t0)
            print('| TRAIN SET | Epoch [{:02d}/{:02d}], Step [{:04d}/{:04d}], Loss: {:.4f} | Elapsed: {:}'
                  .format(epoch_idx + 1, num_epochs, batch_idx + 1, int(n_train_batches), total_loss_avg, elapsed))
            total_loss = 0
            batches_since_print = 0

        # Save a checkpoint every save_every batches and after the last batch of the epoch.
        if (batch_idx == n_train_batches - 1) or ((batch_idx + 1) % save_every == 0):
            torch.save(net.state_dict(),
                       os.path.join(checkpoint_dir,
                                    'epoch-{}.batch-{}.{}.pt'.format(epoch_idx + 1, batch_idx + 1, 'checkpoint')))

    avg_train_loss = total_train_loss / len(train_dataloader)
    training_time = format_time(time.time() - t0)
    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))

# One row per optimizer step: global step number and that batch's loss.
history_df = pd.DataFrame({"step": [e['step'] for e in training_stats],
                           "Training Loss": [e['Training Loss'] for e in training_stats]})
# history_df.to_csv(os.path.join(config.data['results_dir'], "history.csv"), index=False)

print("")
print("Training complete!")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time() - total_t0)))

# Outside "experiment" mode, also dump the whole model object.
if config.cmd_args['mode'] != "experiment":
    torch.save(net, os.path.join(config.data['results_dir'], "model-dump.pkl"))

Initializing ...
Training...

Training...


ValueError: Caught ValueError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/raid6/home/ramraj/anaconda3/envs/ir-research-py3.8/lib/python3.8/site-packages/torch/utils/data/_utils/worker.py", line 202, in _worker_loop
    data = fetcher.fetch(index)
  File "/raid6/home/ramraj/anaconda3/envs/ir-research-py3.8/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 44, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/raid6/home/ramraj/anaconda3/envs/ir-research-py3.8/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 44, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/raid6/home/ramraj/2021/ir/entity-reranking/best-text-BERT/dataset.py", line 152, in __getitem__
    negInst = self.data_df[self.data_df['qID'] != qID].sample(n=3, replace=True, random_state=1).iloc[0]
  File "/raid6/home/ramraj/anaconda3/envs/ir-research-py3.8/lib/python3.8/site-packages/pandas/core/generic.py", line 5350, in sample
    locs = rs.choice(axis_length, size=n, replace=replace, p=weights)
  File "mtrand.pyx", line 903, in numpy.random.mtrand.RandomState.choice
ValueError: a must be greater than 0 unless no samples are taken


In [None]:
# Persist the per-step training loss table in the results directory.
history_csv_path = os.path.join(config.data['results_dir'], "history.csv")
history_df.to_csv(history_csv_path, index=False)

In [7]:
# history_df = pd.DataFrame({"step": [e['step'] for e in training_stats],
#                            "Training Loss": [e['Training Loss'].cpu() for e in training_stats]})
# # history_df.to_csv(os.path.join(config.data['results_dir'], "history.csv"), index=False)

In [None]:
# ========================================
#               NOT Validation, Just Testing
# ========================================
# The "Training complete" report and the model dump are produced by the
# training cell; repeating them here would report an elapsed time that includes
# the idle time between cells and would write the model dump a second time.
print("")
print("Testing...")
t0 = time.time()

test_dataset = TestTRECDataset(config.data['test_data'], config, is_train=False, bert_tokenizer=tokenizer)
# pin_memory follows the device selected at start-up. Evaluation does not need
# shuffling; a fixed order keeps the prediction rows reproducible.
test_dataloader = DataLoader(dataset=test_dataset,
                             batch_size=config.training["test_batch_size"],
                             pin_memory=device.type == 'cuda',
                             num_workers=config.training['num_workers'],
                             shuffle=False)
n_test_batches = len(test_dataloader)
print("Number of test batches : ", n_test_batches, "\n")
net.eval()

In [None]:
# Accumulators for the (query id, passage id, score) triples of the whole test set.
qID_list, paraID_list, pScore_list = [], [], []
t1 = time.time()
for batch_idx, test_batch_data in enumerate(test_dataloader):
    input_seq, input_mask, input_type_ids, label, qID, passageID, seqA_len, seqB_len = test_batch_data

    # Move the tensors to the compute device (label, qID and passageID stay as delivered).
    input_seq = input_seq.to(device)
    input_mask = input_mask.to(device)
    input_type_ids = input_type_ids.to(device)
    seqA_len = seqA_len.to(device)
    seqB_len = seqB_len.to(device)

    # Forward pass without gradient tracking; scores are copied back as a NumPy array.
    with torch.no_grad():
        net_output = net(input_seq, attn_masks=input_mask, type_ids=input_type_ids)
        net_output = net_output.detach().cpu().numpy()

    # Record one entry per item of the batch.
    n_items = len(qID)
    qID_list.extend(qID[i] for i in range(n_items))
    paraID_list.extend(passageID[i] for i in range(n_items))
    pScore_list.extend(net_output[i] for i in range(n_items))

    elapsed = format_time(time.time() - t1)
    if batch_idx % 50 == 0:
        print('  Batch {:>5,}  of  {:>5,}  :  processed    Elapsed: {:}.'.format(batch_idx, n_test_batches, elapsed))

# Convert the per-item scores to plain Python floats and assemble the predictions table.
pScore_list = list(map(float, pScore_list))
predicted_df = pd.DataFrame({"qID": qID_list,
                             "pID": paraID_list,
                             "pScore": pScore_list}, columns=["qID", "pID", "pScore"])
if config.cmd_args['mode'] != "experiment":
    predicted_df.to_csv(os.path.join(config.data['results_dir'], "predictions.csv"))
print()


In [None]:
# ================================================
#               Reverse Sorting Relevance
# ================================================
# Write one line per (query, passage) pair, ordered by descending score within
# each query, to the ranked run file in the results directory.
predicted_df = predicted_df[['qID', 'pID', 'pScore']]
grouped_pred_df = predicted_df.groupby(["qID"])
num_queries = len(grouped_pred_df)
missing_q_sets = 0
save_ranked_file = os.path.join(config.data['results_dir'], "ranked.test.relevance.txt")
with open(save_ranked_file, 'w') as write_file:
    for q_cnt, (name, row_group) in enumerate(grouped_pred_df, start=1):
        # Highest score first.
        ranked_group = row_group.sort_values(by='pScore', ascending=False, inplace=False)

        # Count the queries that do not have exactly 100 scored passages.
        if len(ranked_group) != 100:
            missing_q_sets += 1

        # Ranks are 1-based and restart for every query.
        for rank_cnt, (row_idx, row) in enumerate(ranked_group.iterrows(), start=1):
            write_file.write("%s\tQ0\t%s\t%s\t%s\trchan31\n" % (row["qID"], row["pID"], rank_cnt, row["pScore"]))

        if q_cnt % 100 == 0:
            print("Finished composing for query number : %s / %s" % (q_cnt, num_queries))
print()
print("Missing query-doc pairs : ", missing_q_sets)
print("Done train, val, and test !!!")

In [12]:
# List the files in the exp1 directory.
# NOTE(review): config.data['results_dir'] displayed './exp2' earlier in this notebook - confirm exp1 is the run meant to be inspected.
! ls exp1

ranked.test.relevance.txt


In [13]:
# Show the last five lines of the exp1 ranked run file as a format check.
! tail -5 exp1/ranked.test.relevance.txt

enwiki:Yellowstone%20National%20Park/Recreation	Q0	cbf3d5427fd8d7956c027fddd46e14b3779aa94b	96	2.5369160175323486	rchan31
enwiki:Yellowstone%20National%20Park/Recreation	Q0	e2d8413a1f00b1a8c29bea03766fbd7bb5d7b309	97	2.5269246101379395	rchan31
enwiki:Yellowstone%20National%20Park/Recreation	Q0	97596ad40e87e4528aac5f7b22005884819853d1	98	2.51876163482666	rchan31
enwiki:Yellowstone%20National%20Park/Recreation	Q0	8b3652a6bc32fb5b9a0b8efebb02f5609a43d075	99	2.4430224895477295	rchan31
enwiki:Yellowstone%20National%20Park/Recreation	Q0	c13cea34c7beba80a6026cc1721a1bad12e77497	100	2.0023608207702637	rchan31


In [18]:
# List the qrels files available in the qrelsY1-test.V2.0 directory.
! ls ../Eval/qrelsY1-test.V2.0

automatic-test.pages.cbor-hierarchical.qrels
lenient.benchmarkY1test.cbor.hierarchical.qrels
manual.benchmarkY1test.cbor.hierarchical.qrels


In [19]:
# Run trec_eval on the exp1 ranked run file against the automatic hierarchical qrels.
! ../Eval/trec_eval-master/trec_eval ../Eval/qrelsY1-test.V2.0/automatic-test.pages.cbor-hierarchical.qrels exp1/ranked.test.relevance.txt

runid                 	all	rchan31
num_q                 	all	2254
num_ret               	all	225156
num_rel               	all	6192
num_rel_ret           	all	2375
map                   	all	0.0935
gm_map                	all	0.0034
Rprec                 	all	0.0572
bpref                 	all	0.4689
recip_rank            	all	0.1524
iprec_at_recall_0.00  	all	0.1565
iprec_at_recall_0.10  	all	0.1562
iprec_at_recall_0.20  	all	0.1461
iprec_at_recall_0.30  	all	0.1229
iprec_at_recall_0.40  	all	0.0981
iprec_at_recall_0.50  	all	0.0941
iprec_at_recall_0.60  	all	0.0684
iprec_at_recall_0.70  	all	0.0665
iprec_at_recall_0.80  	all	0.0591
iprec_at_recall_0.90  	all	0.0585
iprec_at_recall_1.00  	all	0.0585
P_5                   	all	0.0491
P_10                  	all	0.0388
P_15                  	all	0.0331
P_20                  	all	0.0290
P_30                  	all	0.0234
P_100                 	all	0.0105
P_200                 	all	0.0053
P_500                 	al