In [1]:
import warnings
warnings.filterwarnings('ignore')

import io
import random
import numpy as np
import mxnet as mx
import gluonnlp as nlp
from bert import data, model
import time
from mxnet.gluon.utils import split_and_load 

In [2]:
# Seed every random number generator the notebook touches (Python, NumPy, MXNet).
random.seed(100)
np.random.seed(100)
mx.random.seed(10000)

# One context per GPU (devices 0..7).
# If no GPU is available use `ctx = [mx.cpu()]` instead; keep it a list, because
# the inference loop passes `ctx` to `split_and_load` as its list of contexts.
ctx = [mx.gpu(device_id) for device_id in range(8)]

In [3]:
# Load the pretrained 'bert_12_768_12' encoder together with its vocabulary,
# using the cased BioBERT v1.1 PubMed weights. The original author noted
# 'book_corpus_wiki_en_uncased' as the alternative dataset name.
# Only the encoder and its pooler are requested: no decoder, no classifier head.
bert_base, vocabulary = nlp.model.get_model(
    'bert_12_768_12',
    dataset_name='biobert_v1.1_pubmed_cased',
    pretrained=True,
    ctx=ctx,
    use_pooler=True,
    use_decoder=False,
    use_classifier=False,
)

In [4]:
# Two-class classification head stacked on the pretrained encoder.
bert_classifier = model.classification.BERTClassifier(
    bert_base,
    num_classes=2,
    dropout=0.1,
)
# The encoder was loaded with pretrained weights, so only the new head is initialised.
bert_classifier.classifier.initialize(ctx=ctx, init=mx.init.Normal(0.02))
bert_classifier.hybridize(static_alloc=True)

# Accuracy metric and softmax cross-entropy loss for classification.
# Neither is referenced by the inference cells shown in this notebook.
metric = mx.metric.Accuracy()

loss_function = mx.gluon.loss.SoftmaxCELoss()
loss_function.hybridize(static_alloc=True)

In [5]:
# Restore the fine-tuned weights for the whole classifier (encoder + head).
# Earlier checkpoint tried by the author: 'bisai/epoch54399_acc0.90652.params'
bert_classifier.load_parameters(filename='bisai/512new/epoch121599.params')

In [6]:
# Raw tab-separated input.
#   - the first line of the file is discarded (presumably a header row; confirm)
#   - from every remaining row, columns 3, 4 and 0 are kept, in that order
# Swap the file name to switch between the 'val_bert' and
# 'test_bert_train_val' inputs mentioned by the original author.
field_separator = nlp.data.Splitter('\t')
num_discard_samples = 1
field_indices = [3, 4, 0]
data_train_raw = nlp.data.TSVDataset(
    filename='val_bert_10.tsv',
    field_separator=field_separator,
    num_discard_samples=num_discard_samples,
    field_indices=field_indices,
)

In [8]:
# Cased tokenizer (lower=False) built on the vocabulary returned with the model.
bert_tokenizer = nlp.data.BERTTokenizer(vocabulary, lower=False)
max_len = 512
# NOTE(review): "UNKOWN" is kept verbatim; presumably it has to match the label
# text stored in the TSV, so confirm before correcting the spelling.
all_labels = ["0", "UNKOWN"]

# Transform options:
#   pair=True      -> examples are sentence pairs (False for single sentences)
#   class_labels   -> label vocabulary (None for a regression task)
#   has_label=True -> a label field is present (False for label-free inference)
pair = True
transform = data.transform.BERTDatasetTransform(
    bert_tokenizer,
    max_len,
    class_labels=all_labels,
    has_label=True,
    pad=True,
    pair=pair,
)

data_train = data_train_raw.transform(transform)

# Sanity check: show the vocabulary and the ids of the special tokens.
print('vocabulary used for tokenization = \n%s'%vocabulary)
for special_token in (vocabulary.padding_token, vocabulary.cls_token, vocabulary.sep_token):
    print('%s token id = %s'%(special_token, vocabulary[special_token]))

# Sanity check: show the four fields of one transformed example.
sample_id = 5
sample = data_train[sample_id]
print('token ids = \n%s'%sample[0])
print('valid length = \n%s'%sample[1])
print('segment ids = \n%s'%sample[2])
print('label = \n%s'%sample[3])

vocabulary used for tokenization = 
Vocab(size=28996, unk="[UNK]", reserved="['[CLS]', '[SEP]', '[MASK]', '[PAD]']")
[PAD] token id = 0
[CLS] token id = 101
[SEP] token id = 102
token ids = 
[  101 21902  6078  1105  8304 14758  1103   148 12240  1104   159  2271
  2069  1116  1213 23645  6580  1133  1292  1127  8443  1107  3201 20562
   117  1105  7377  1105  3605 14737  2522 10581  5660  1606  6145  2114
  1104 11960  1116  1150  1125  2331  7372  1106  6860  1105  1127  8443
  1121   170  1661  1107  4666   117  3066   164   164   115   115   108
   108   115   115   166   166   119   102  1212  1103  7762  1206  8234
  1105  2629 15156   119  1188  2526 26856  1103  8550  1104  8234  1105
  2629 15156  1606   170  1326  1104  5136   119 11300 15022  1105  2629
 15156  1132  5708  3393  1439  1103  4073  8057  5822  4746  8297   119
 11300 15022  1110  3393  1107  2538  1104  1103  3154  1104   123 22496
  6142  2629 15156  1110  3393  1107  2538  1104  1103  2629  1104  1141
  9108

In [9]:
# Number of examples in the transformed dataset (the recorded run shows 343120).
len(data_train)

343120

In [10]:
# No-op scratch cell: evaluates the literal 1 and affects no later cell.
1

1

In [11]:
# Batching for inference over `data_train`; no training happens in this notebook.
# Each batch of 240 examples is later sharded across the devices in `ctx`.
# shuffle=False is the DataLoader default and is spelled out here because the
# scores are later reshaped into fixed-size groups, which relies on input order.
batch_size = 240
bert_dataloader = mx.gluon.data.DataLoader(
    data_train,
    batch_size=batch_size,
    shuffle=False,
    num_workers=40,
    pin_memory=True,
    prefetch=20000,
)

# Progress-report cadences, in batches. The inference loop only reads
# `test_log_interval`.
test_log_interval = 320
log_interval = 160

In [12]:
# Score every example with the fine-tuned classifier.
# `new_recall_score` accumulates one 1-D NumPy array per device shard, in
# dataset order; a later cell concatenates them into a single score vector.
start_log_interval_time = time.time()
new_recall_score = []

# Fix: the loop used to iterate over an undefined name `f`. The batch count in
# the logged output (1430) equals len(bert_dataloader), so that is the iterable.
# The batch label is unpacked but not needed for inference.
for batch_id, (token_ids, valid_length, segment_ids, _unused_label) in enumerate(bert_dataloader):
    # Shard each input across the devices in `ctx`. even_split=False allows a
    # batch whose size is not an exact multiple of len(ctx).
    token_shards = split_and_load(token_ids, ctx, even_split=False)
    length_shards = split_and_load(valid_length.astype('float32'), ctx, even_split=False)
    segment_shards = split_and_load(segment_ids, ctx, even_split=False)

    # Dispatch the forward pass for every shard first; results are only copied
    # back to the host (asnumpy) after all shards have been submitted.
    shard_outputs = [bert_classifier(tokens, segments, lengths)
                     for tokens, segments, lengths
                     in zip(token_shards, segment_shards, length_shards)]

    # Keep column 1 of the classifier output (the class-1 score) per example.
    new_recall_score.extend(out[:, 1].asnumpy() for out in shard_outputs)

    if (batch_id + 1) % test_log_interval == 0:
        print('[Batch {}/{}] elapsed {:.2f} s'.format(
            batch_id + 1, len(bert_dataloader),
            time.time() - start_log_interval_time))
        start_log_interval_time = time.time()

[Batch 320/1430] elapsed 455.13 s
[Batch 640/1430] elapsed 459.74 s
[Batch 960/1430] elapsed 461.03 s
[Batch 1280/1430] elapsed 464.48 s


In [13]:
# No-op scratch cell: evaluates the literal 1 and affects no later cell.
1

1

In [14]:
# NOTE(review): `bm25_rank_val` is first assigned further down the notebook
# (np.load('val_recall.npy')), so on a top-to-bottom run this cell raises the
# NameError recorded in its output. Run it after that loading cell.
bm25_rank_val.shape
# np_new_recall_score_idx.shape

NameError: name 'bm25_rank_val' is not defined

In [None]:
import pandas as pd
import numpy as np
# Cleaned tables stored in one HDF5 file under the keys 'train', 'val' and
# 'candid'. `candid` gets a fresh 0..n-1 index (the old index is dropped).
train, val = (pd.read_hdf('cleaned.h5', key) for key in ('train', 'val'))
candid = pd.read_hdf('cleaned.h5', 'candid').reset_index(drop=True)

# Saved recall arrays for the validation and training queries. Later cells index
# them per row with the re-ranking order (presumably candidate row indices; confirm).
bm25_rank_val = np.load('val_recall.npy')
bm25_rank_train = np.load('train_recall.npy')

In [None]:
# Join the per-shard score arrays into one vector and view it as rows of 10
# scores each (assumes 10 consecutive examples belong to one query; confirm).
np_new_recall_score = np.concatenate(new_recall_score).reshape(-1, 10)
# Per row, the column order that sorts the scores from highest to lowest.
np_new_recall_score_idx = (-np_new_recall_score).argsort(axis=1)

In [None]:
# Reorder each row of the training recall array (rows 60000 onward) by the
# classifier's score order.
# NOTE(review): a later cell assigns `bert_rerank_train_val` again from
# `bm25_rank_val`, discarding this result. This variant looks intended for the
# alternative 'test_bert_train_val' input; confirm and run only one of the two.
bert_rerank_train_val = np.take_along_axis(bm25_rank_train[60000:,:], np_new_recall_score_idx, axis=1)

In [None]:
# Inspect the shape of the per-row ranking order; the second dimension is 10
# because the scores were reshaped to [-1, 10].
np_new_recall_score_idx.shape

In [None]:
# Display the re-ranked array for a visual check.
bert_rerank_train_val

In [None]:
# Reorder each row of the validation recall array by the classifier's score
# order. This is the value the submission cell reads. Despite the
# "train_val" in the name, the source array here is `bm25_rank_val`.
bert_rerank_train_val = np.take_along_axis(bm25_rank_val, np_new_recall_score_idx, axis=1)

In [None]:
def idx2name(idx):
    """Return the `paper_id` of the row at integer position `idx` in `candid`."""
    return candid.iloc[idx].paper_id


# Top-3 answers per validation row: translate the first three re-ranked
# candidate positions into paper ids. One vectorised lookup per column replaces
# the former per-row `.apply(idx2name)` and yields the same values.
paper_ids = candid['paper_id'].values
for rank in range(3):
    val['ans%d' % rank] = paper_ids[bert_rerank_train_val[:, rank]]

# Submission file: no index and no header row. `header=False` is the documented
# way to omit the header (the previous `header=None` had the same effect).
val[['description_id','ans0','ans1','ans2']].to_csv('commit123.csv', index=False, header=False)
