# Fine-tuning Entity Pair Classification with SciBERT



### Importing necessary modules

In [1]:
import warnings
warnings.filterwarnings('ignore')

import io
import random
import numpy as np
import mxnet as mx
import gluonnlp as nlp
from Bert import data, model

### Setting up the environment



In [2]:
# Seed every random number generator used by the notebook so that repeated
# runs start from the same state. Note that MXNet is seeded with a different
# value (10000) than Python's `random` module and NumPy (100).
random.seed(100)
np.random.seed(100)
mx.random.seed(10000)

# All arrays and model parameters are placed on the first CPU device.
ctx = mx.cpu(0)

## Using the pre-trained BERT model

This model is the same as the previous one (3.1 bert), except that we select the BERT model that was pre-trained on a scientific corpus: `scibert_scivocab_uncased`.

### Get BERT

In [10]:
# Load the 'bert_12_768_12' architecture with the pre-trained
# 'scibert_scivocab_uncased' weights, together with the matching vocabulary.
# The pooler is requested, while the decoder and the classifier heads are
# switched off.
scibert, vocabulary = nlp.model.get_model(
    'bert_12_768_12',
    dataset_name='scibert_scivocab_uncased',
    pretrained=True,
    ctx=ctx,
    use_pooler=True,
    use_decoder=False,
    use_classifier=False,
)

# Print the layer structure of the loaded model
print(scibert)

BERTModel(
  (encoder): BERTEncoder(
    (dropout_layer): Dropout(p = 0.1, axes=())
    (layer_norm): BERTLayerNorm(eps=1e-12, axis=-1, center=True, scale=True, in_channels=768)
    (transformer_cells): HybridSequential(
      (0): BERTEncoderCell(
        (dropout_layer): Dropout(p = 0.1, axes=())
        (attention_cell): MultiHeadAttentionCell(
          (_base_cell): DotProductAttentionCell(
            (_dropout_layer): Dropout(p = 0.1, axes=())
          )
          (proj_query): Dense(768 -> 768, linear)
          (proj_key): Dense(768 -> 768, linear)
          (proj_value): Dense(768 -> 768, linear)
        )
        (proj): Dense(768 -> 768, linear)
        (ffn): BERTPositionwiseFFN(
          (ffn_1): Dense(768 -> 3072, linear)
          (activation): GELU()
          (ffn_2): Dense(3072 -> 768, linear)
          (dropout_layer): Dropout(p = 0.1, axes=())
          (layer_norm): BERTLayerNorm(eps=1e-12, axis=-1, center=True, scale=True, in_channels=768)
        )
        (la

### Transform the model for Entity classification

In [11]:
# Wrap the pre-trained encoder in a classifier with six output classes
scibert_classifier = model.classification.BERTClassifier(
    scibert,
    num_classes=6,
    dropout=0.1,
)

# The encoder already carries pre-trained weights, so only the newly added
# classifier layer is initialized here.
classifier_init = mx.init.Normal(0.02)
scibert_classifier.classifier.initialize(init=classifier_init, ctx=ctx)
scibert_classifier.hybridize(static_alloc=True)

# Softmax cross-entropy is the training loss for the classification task
loss_function = mx.gluon.loss.SoftmaxCELoss()
loss_function.hybridize(static_alloc=True)

# Accuracy is the metric tracked during training
metric = mx.metric.Accuracy()

## Data preprocessing for BERT



### Loading the dataset

In [12]:
import pickle
import pandas as pd

# NOTE(review): unpickling can execute arbitrary code, so 'entity_pair.pkl'
# must come from a trusted source.
entity_pair = pd.read_pickle('entity_pair.pkl')

# Preview the first rows. A bare `entity_pair.head()` in the middle of a cell
# is evaluated and thrown away (only the last expression of a cell is
# rendered), so the preview is printed explicitly.
print(entity_pair.head())

# Last expression of the cell: rendered as the cell output
type(entity_pair)

pandas.core.frame.DataFrame

In [13]:
# Export the entity pairs as a tab-separated file without the index column
entity_pair.to_csv(
    'train.tsv',
    sep='\t',
    index=False,
)

In [14]:
# Peek at the first five lines of the exported file. The `with` block closes
# the file handle as soon as the preview is done, instead of leaving it open
# for the rest of the session.
with io.open('train.tsv', encoding='utf-8') as tsv_file:
    for i in range(5):
        # Each line keeps its trailing newline and `print` adds another one,
        # so the printed lines are separated by blank lines.
        print(tsv_file.readline())

entity_text_1	entity_text_2	label

database traditional information retrieval techniques use a	histogram of keywords as the	USAGE

representation but oral communication may offer	offer additional indices such as	USAGE

a large database of tv	database of tv shows emotions and	PART_WHOLE

of a distributed message-passing infrastructure for dialogue	infrastructure for dialogue systems which all	MODEL-FEATURE



In [15]:
# The first line of the file is the header row, so it is discarded
num_discard_samples = 1
# Every line is split into its fields at tab characters
field_separator = nlp.data.Splitter('\t')
# Columns kept from each line: first entity text, second entity text, label
field_indices = [0, 1, 2]

train_data = nlp.data.TSVDataset(
    filename='train.tsv',
    field_separator=field_separator,
    num_discard_samples=num_discard_samples,
    field_indices=field_indices,
)

# Show the three fields of the first sample, one per line
sample_id = 0
for position in range(3):
    print(train_data[sample_id][position])

database traditional information retrieval techniques use a
histogram of keywords as the
USAGE


In [16]:
# Tokenize with the vocabulary that came with the pre-trained model
bert_tokenizer = nlp.data.BERTTokenizer(vocabulary, lower=True)

# Maximum length of an input sequence handed to the transform
max_len = 12

# The six relation classes.
# NOTE(review): the label of a sample is presumably encoded as its position in
# this list (the first sample, 'USAGE', is shown as label 0) - confirm against
# BERTDatasetTransform.
all_labels = [
    "USAGE",
    "PART_WHOLE",
    "MODEL-FEATURE",
    "RESULT",
    "COMPARE",
    "TOPIC",
]

# Every sample consists of two text fields (an entity pair) plus a label
pair = True
transform = data.transform.BERTDatasetTransform(
    bert_tokenizer,
    max_len,
    class_labels=all_labels,
    has_label=True,
    pad=True,
    pair=pair,
)

# Apply the transform to the raw TSV samples
data_train = train_data.transform(transform)

In [17]:
# Show the vocabulary and the ids of its special tokens
print('vocabulary used for tokenization = \n%s' % vocabulary)
special_tokens = (vocabulary.padding_token, vocabulary.cls_token, vocabulary.sep_token)
for special_token in special_tokens:
    print('%s token id = %s' % (special_token, vocabulary[special_token]))

# Show the four components of the transformed sample, in the order in which
# the transform returns them
encoded_sample = data_train[sample_id]
component_templates = (
    'token ids = \n%s',
    'valid length = \n%s',
    'segment ids = \n%s',
    'label = \n%s',
)
for position, template in enumerate(component_templates):
    print(template % encoded_sample[position])

vocabulary used for tokenization = 
Vocab(size=31090, unk="[UNK]", reserved="['[CLS]', '[SEP]', '[MASK]', '[PAD]']")
[PAD] token id = 0
[CLS] token id = 102
[SEP] token id = 103
token ids = 
[  102  3139  3783   776  6606  2190   103 11261   131  4302   188   103]
valid length = 
12
segment ids = 
[0 0 0 0 0 0 0 1 1 1 1 1]
label = 
[0]


## Fine-tuning the model

In [21]:
# The hyperparameters
batch_size = 32
lr = 5e-6
# Upper bound applied to the global norm of all gradients before each update
grad_clip = 1
# Number of passes over the training data
num_epochs = 5
# The averaged loss and the running accuracy are printed every `log_interval` batches
log_interval = 4

# The FixedBucketSampler and the DataLoader for making the mini-batches.
# `item[1]` is the valid length of a transformed sample.
train_sampler = nlp.data.FixedBucketSampler(lengths=[int(item[1]) for item in data_train],
                                            batch_size=batch_size,
                                            shuffle=True)
bert_dataloader = mx.gluon.data.DataLoader(data_train, batch_sampler=train_sampler)

trainer = mx.gluon.Trainer(scibert_classifier.collect_params(), 'adam',
                           {'learning_rate': lr, 'epsilon': 1e-9})

# Collect all differentiable parameters
# `grad_req == 'null'` indicates no gradients are calculated (e.g. constant parameters)
# The gradients for these params are clipped later
params = [p for p in scibert_classifier.collect_params().values() if p.grad_req != 'null']

# Fine-tune the model for `num_epochs` epochs
for epoch_id in range(num_epochs):
    # Accuracy is accumulated over a whole epoch; the loss only over one log interval
    metric.reset()
    step_loss = 0
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(bert_dataloader):
        # Move the batch to the compute context (`ctx`)
        token_ids = token_ids.as_in_context(ctx)
        valid_length = valid_length.as_in_context(ctx)
        segment_ids = segment_ids.as_in_context(ctx)
        label = label.as_in_context(ctx)

        with mx.autograd.record():
            # Forward computation
            out = scibert_classifier(token_ids, segment_ids, valid_length.astype('float32'))
            ls = loss_function(out, label).mean()

        # And backwards computation
        ls.backward()

        # Gradient clipping: reduce the gradients first, clip their global norm
        # to `grad_clip`, then apply the update. The loss is already averaged
        # over the batch, hence the batch size of 1 passed to `update`.
        trainer.allreduce_grads()
        nlp.utils.clip_grad_global_norm(params, grad_clip)
        trainer.update(1)

        step_loss += ls.asscalar()
        metric.update([label], [out])

        # Printing vital information
        if (batch_id + 1) % (log_interval) == 0:
            print('[Epoch {} Batch {}/{}] loss={:.4f}, lr={:.7f}, acc={:.3f}'
                         .format(epoch_id, batch_id + 1, len(bert_dataloader),
                                 step_loss / log_interval,
                                 trainer.learning_rate, metric.get()[1]))
            step_loss = 0

[Epoch 0 Batch 4/40] loss=1.1259, lr=0.0000050, acc=0.594
[Epoch 0 Batch 8/40] loss=1.0270, lr=0.0000050, acc=0.621
[Epoch 0 Batch 12/40] loss=1.0464, lr=0.0000050, acc=0.612
[Epoch 0 Batch 16/40] loss=1.1620, lr=0.0000050, acc=0.607
[Epoch 0 Batch 20/40] loss=1.1086, lr=0.0000050, acc=0.592
[Epoch 0 Batch 24/40] loss=1.1115, lr=0.0000050, acc=0.593
[Epoch 0 Batch 28/40] loss=1.0934, lr=0.0000050, acc=0.597
[Epoch 0 Batch 32/40] loss=1.0268, lr=0.0000050, acc=0.603
[Epoch 0 Batch 36/40] loss=1.0550, lr=0.0000050, acc=0.603
[Epoch 0 Batch 40/40] loss=0.9507, lr=0.0000050, acc=0.611
[Epoch 1 Batch 4/40] loss=0.9682, lr=0.0000050, acc=0.703
[Epoch 1 Batch 8/40] loss=0.9917, lr=0.0000050, acc=0.688
[Epoch 1 Batch 12/40] loss=0.9292, lr=0.0000050, acc=0.680
[Epoch 1 Batch 16/40] loss=1.0394, lr=0.0000050, acc=0.652
[Epoch 1 Batch 20/40] loss=1.0166, lr=0.0000050, acc=0.655
[Epoch 1 Batch 24/40] loss=0.9128, lr=0.0000050, acc=0.658
[Epoch 1 Batch 28/40] loss=0.9952, lr=0.0000050, acc=0.658
[