### IMPORT LIBRARIES

In [1]:
from transformers import TFAutoModelForTokenClassification, ElectraTokenizerFast
from datasets import load_dataset, load_metric
import numpy as np
from transformers import DataCollatorWithPadding
import tensorflow as tf
from tensorflow.keras.metrics import Precision, Recall
import tensorflow_addons as tfa
import math
from transformers import TokenClassificationPipeline

### MODEL CONFIGURATION DETAILS

In [1]:
BASE_PATH = "pretrained_models/"
MODEL_PATH = BASE_PATH + "NegBioElectra/"
LR_RATE = 3e-5
CKPT_PATH = BASE_PATH + "NegBioElectra_bioscope_scope_model"
EPOCHS = 15

### LOAD MODEL & TOKENIZER

In [2]:
model = TFAutoModelForTokenClassification.from_pretrained(
    MODEL_PATH, num_labels=1,from_pt=True
)
tokenizer = ElectraTokenizerFast.from_pretrained(MODEL_PATH,max_length=256)
tokenizer.add_tokens(['[NEG]','[SPE]'], special_tokens=True)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFElectraForTokenClassification: ['embeddings.position_ids']
- This IS expected if you are initializing TFElectraForTokenClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFElectraForTokenClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFElectraForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


2

### LOAD DATASET

In [4]:
ds = load_dataset("csv", data_files={'train': 'bioscope_scope_train_data.csv','test':'bioscope_scope_test_data.csv'})
ds

Using custom data configuration default-3cae43ef256a5a50
Reusing dataset csv (/home/studio-lab-user/.cache/huggingface/datasets/csv/default-3cae43ef256a5a50/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'labels'],
        num_rows: 4960
    })
    test: Dataset({
        features: ['sentence', 'labels'],
        num_rows: 531
    })
})

### PRE-PROCESS DATA

In [5]:
def convert_labels(examples):
    l = {'labels':[list(np.array(i.split("|"),dtype="int32")) for i in examples['labels']]} 
    l.update(tokenizer(examples['sentence'],truncation=True,max_length=256,padding=True))
    return l

In [6]:
encoded_dataset = ds.map(convert_labels, batched=True)

Loading cached processed dataset at /home/studio-lab-user/.cache/huggingface/datasets/csv/default-3cae43ef256a5a50/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e/cache-2c04ccfa41c651b0.arrow
Loading cached processed dataset at /home/studio-lab-user/.cache/huggingface/datasets/csv/default-3cae43ef256a5a50/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e/cache-e6fa2fa61991b9ff.arrow


### TRAIN-TEST SPLIT

In [7]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer,max_length=256, return_tensors="tf",padding="max_length")

In [8]:

tf_train_dataset = encoded_dataset["train"].to_tf_dataset(
    columns=['token_type_ids', 'attention_mask', 'input_ids'],
    label_cols=["labels"],
    shuffle=True,
    batch_size=32,
    collate_fn=data_collator,
)
tf_validation_dataset = encoded_dataset["test"].to_tf_dataset(
    columns=['token_type_ids', 'attention_mask', 'input_ids'],
    label_cols=["labels"],
    shuffle=True,
    batch_size=32,
    collate_fn=data_collator,
)

In [9]:
list(tf_train_dataset.take(1))

[({'attention_mask': <tf.Tensor: shape=(32, 256), dtype=int64, numpy=
   array([[1, 1, 1, ..., 0, 0, 0],
          [1, 1, 1, ..., 0, 0, 0],
          [1, 1, 1, ..., 0, 0, 0],
          ...,
          [1, 1, 1, ..., 0, 0, 0],
          [1, 1, 1, ..., 0, 0, 0],
          [1, 1, 1, ..., 0, 0, 0]])>,
   'input_ids': <tf.Tensor: shape=(32, 256), dtype=int64, numpy=
   array([[   2, 5855,   15, ...,    0,    0,    0],
          [   2, 3053,   61, ...,    0,    0,    0],
          [   2, 1682, 3219, ...,    0,    0,    0],
          ...,
          [   2, 2250,   15, ...,    0,    0,    0],
          [   2, 4206,   15, ...,    0,    0,    0],
          [   2, 1898, 5433, ...,    0,    0,    0]])>,
   'token_type_ids': <tf.Tensor: shape=(32, 256), dtype=int64, numpy=
   array([[0, 0, 0, ..., 0, 0, 0],
          [0, 0, 0, ..., 0, 0, 0],
          [0, 0, 0, ..., 0, 0, 0],
          ...,
          [0, 0, 0, ..., 0, 0, 0],
          [0, 0, 0, ..., 0, 0, 0],
          [0, 0, 0, ..., 0, 0, 0]])>},
  

### MODEL TRAINING

In [10]:
optimizer = tf.keras.optimizers.Adam(learning_rate=LR_RATE, epsilon=1e-08, clipnorm=1.0)
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
metric = tf.keras.metrics.BinaryAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

In [12]:
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=CKPT_PATH,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    save_weights_only=True)
callbacks = [model_checkpoint_callback]

In [13]:
history = model.fit(
 tf_train_dataset, 
 epochs=15,
 validation_data=tf_validation_dataset,
 callbacks =[callbacks]
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


### MODEL EVALUATION ON TEST DATASET(INCLUDING NEGATION & SPECULATION)

In [14]:
model.load_weights(CKPT_PATH)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f4e7815bc10>

In [15]:
from sklearn.metrics import f1_score

In [16]:
def sigmoid_array(x):                                        
    return 1 / (1 + np.exp(-x))

In [17]:
inp = []
pred_out = []
true_out = []
for i in tf_validation_dataset:
    in_,out_ = list(i)
    inp += list(in_['input_ids'].numpy())
    true_out += list(out_.numpy())
    pred = model(in_)
    pred_out += list(sigmoid_array(pred['logits'].numpy()))
    
len(inp),len(true_out),len(pred_out)

(512, 512, 512)

In [18]:
pred_out = np.array(pred_out)
pred_out.shape

(512, 256, 1)

In [19]:
pred_out = np.reshape(pred_out,(pred_out.shape[0],256))
pred_out.shape

(512, 256)

In [20]:
pred_out = np.round(pred_out)
pred_out[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [21]:
true_out = np.array(true_out)
true_out.shape

(512, 256)

In [22]:
f1_score(true_out, pred_out, average='macro',zero_division=1)

0.9847214600216313

### ILLUSTRATION OF MODEL PREDICTING SCOPE FOR TEST DATASET

In [24]:
from termcolor import colored

for i in range(len(pred_out)):
    tt = tokenizer.convert_ids_to_tokens(inp[i],skip_special_tokens=False)[1:]
    text = ' '.join([x for x in tt if x != '[PAD]'][:-1])
    ind = [i for i in range(len(text)) if text.startswith('[SPE]', i)]
    c = "green"
    if len(ind) ==0:
        ind = [i for i in range(len(text)) if text.startswith('[NEG]', i)]
        c = "red"
    text = text.replace(text[ind[0]+6:ind[1]-1],colored(text[ind[0]+6:ind[1]-1],c))  
    print(colored("Sentence:","blue"))
    print(text.replace(' ##',''))
    print(colored("Given Scope:","blue"))
    ind = [i for i in np.where(true_out[i])[0] if i < len(text.split())]
    given_ = " ".join(np.array(text.split())[ind])
    given_text = text.replace(given_,colored(given_,"magenta")).replace(' ##','')
    print(given_text)
    print(colored("Predicted Scope:","blue"))
    ind = [i for i in np.where(pred_out[i])[0] if i < len(text.split())]
    pred_ = " ".join(np.array(text.split())[ind])
    pred_text = text.replace(pred_,colored(pred_,"magenta")).replace(' ##','')
    print(pred_text)
    print()
    print()

[34mSentence:[0m
the results suggest that there exists a functional antagonism between vitd3 and ra that [SPE] [32mmay[0m [SPE] have important implications for the regulation of certain immune and inflammatory responses through their inverse effects on cd14 and cd23 gene expression .
[34mGiven Scope:[0m
the results suggest that there exists a functional antagonism between vitd3 and ra that [SPE] [32mmay[0m [SPE] [35mhave important implications for the regulation of certain immune and inflammatory responses through their inverse effects on cd14 and cd23 gene expression[0m .
[34mPredicted Scope:[0m
the results suggest that there exists a functional antagonism between vitd3 and ra that [SPE] [32mmay[0m [SPE] [35mhave important implications for the regulation of certain immune and inflammatory responses through their inverse effects on cd14 and cd23 gene expression[0m .


[34mSentence:[0m
in contrast , anti - cd3 and anti - cd28 stimulated rel - / - t cells , which [NEG] 