### IMPORT LIBRARIES

In [1]:
from transformers import TFAutoModelForTokenClassification, ElectraTokenizerFast
from datasets import load_dataset, load_metric
import numpy as np
from transformers import DataCollatorWithPadding
import tensorflow as tf
from tensorflow.keras.metrics import Precision, Recall
import tensorflow_addons as tfa
import math
from transformers import TokenClassificationPipeline
from sklearn.metrics import f1_score

### MODEL CONFIGURATION DETAILS

In [1]:
BASE_PATH = "pretrained_models/"
MODEL_PATH = BASE_PATH + "NegBioElectra/"
LR_RATE = 5.5e-5
CKPT_PATH = BASE_PATH + "sherlock_models/NegBioElectra_sherlock_scope_model"
EPOCHS = 15

### LOAD BASE MODEL & TOKENIZER

In [2]:
model = TFAutoModelForTokenClassification.from_pretrained(
    MODEL_PATH, num_labels=1,from_pt=True
)
tokenizer = ElectraTokenizerFast.from_pretrained(MODEL_PATH,max_length=256)
tokenizer.add_tokens(['[NEG]'], special_tokens=True)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFElectraForTokenClassification: ['embeddings.position_ids']
- This IS expected if you are initializing TFElectraForTokenClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFElectraForTokenClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFElectraForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


1

### LOAD DATASET

In [4]:
ds = load_dataset("csv", data_files={'train': 'data/sherlock_scope_train.csv','dev':'data/sherlock_scope_dev.csv',"test_card":"data/sherlock_scope_test_cardboard.csv","test_circle":"data/sherlock_scope_test_circle.csv"})
ds

Using custom data configuration default-68d72f5d816b845f
Reusing dataset csv (/home/studio-lab-user/.cache/huggingface/datasets/csv/default-68d72f5d816b845f/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e)


  0%|          | 0/4 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'labels'],
        num_rows: 847
    })
    dev: Dataset({
        features: ['sentence', 'labels'],
        num_rows: 144
    })
    test_card: Dataset({
        features: ['sentence', 'labels'],
        num_rows: 119
    })
    test_circle: Dataset({
        features: ['sentence', 'labels'],
        num_rows: 116
    })
})

### PRE-PROCESS DATA

In [5]:
def convert_labels(examples):
    l = {'labels':[list(np.array(i.split("|"),dtype="int32")) for i in examples['labels']]} 
    l.update(tokenizer(examples['sentence'],truncation=True,max_length=256,padding=True))
    return l

In [6]:
encoded_dataset = ds.map(convert_labels, batched=True)

Loading cached processed dataset at /home/studio-lab-user/.cache/huggingface/datasets/csv/default-68d72f5d816b845f/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e/cache-cb5576f249310cd3.arrow
Loading cached processed dataset at /home/studio-lab-user/.cache/huggingface/datasets/csv/default-68d72f5d816b845f/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e/cache-39b9ddc55b807479.arrow
Loading cached processed dataset at /home/studio-lab-user/.cache/huggingface/datasets/csv/default-68d72f5d816b845f/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e/cache-ee328fe82e487357.arrow
Loading cached processed dataset at /home/studio-lab-user/.cache/huggingface/datasets/csv/default-68d72f5d816b845f/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e/cache-90f2f2a0fb3734b3.arrow


### TRAIN-TEST SPLIT

In [7]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer,max_length=256, return_tensors="tf",padding="max_length")

In [8]:

tf_train_dataset = encoded_dataset["train"].to_tf_dataset(
    columns=['token_type_ids', 'attention_mask', 'input_ids'],
    label_cols=["labels"],
    shuffle=True,
    batch_size=32,
    collate_fn=data_collator,
)
tf_validation_dataset = encoded_dataset["dev"].to_tf_dataset(
    columns=['token_type_ids', 'attention_mask', 'input_ids'],
    label_cols=["labels"],
    shuffle=True,
    batch_size=32,
    collate_fn=data_collator,
)

In [9]:
list(tf_train_dataset.take(1))

[({'attention_mask': <tf.Tensor: shape=(32, 256), dtype=int64, numpy=
   array([[1, 1, 1, ..., 0, 0, 0],
          [1, 1, 1, ..., 0, 0, 0],
          [1, 1, 1, ..., 0, 0, 0],
          ...,
          [1, 1, 1, ..., 0, 0, 0],
          [1, 1, 1, ..., 0, 0, 0],
          [1, 1, 1, ..., 0, 0, 0]])>,
   'input_ids': <tf.Tensor: shape=(32, 256), dtype=int64, numpy=
   array([[   2, 2493, 2963, ...,    0,    0,    0],
          [   2,   41,   41, ...,    0,    0,    0],
          [   2,   41,   41, ...,    0,    0,    0],
          ...,
          [   2, 1680, 3464, ...,    0,    0,    0],
          [   2, 1690, 4728, ...,    0,    0,    0],
          [   2,   41,   41, ...,    0,    0,    0]])>,
   'token_type_ids': <tf.Tensor: shape=(32, 256), dtype=int64, numpy=
   array([[0, 0, 0, ..., 0, 0, 0],
          [0, 0, 0, ..., 0, 0, 0],
          [0, 0, 0, ..., 0, 0, 0],
          ...,
          [0, 0, 0, ..., 0, 0, 0],
          [0, 0, 0, ..., 0, 0, 0],
          [0, 0, 0, ..., 0, 0, 0]])>},
  

### MODEL TRAINING

In [10]:
optimizer = tf.keras.optimizers.Adam(learning_rate=LR_RATE, epsilon=1e-08, clipnorm=1.0)
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
metric = tf.keras.metrics.BinaryAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

In [12]:
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=CKPT_PATH,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    save_weights_only=True)
callbacks = [model_checkpoint_callback]

In [13]:
history = model.fit(
 tf_train_dataset, 
 epochs=30,
 validation_data=tf_validation_dataset,
 callbacks =[callbacks]
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


### MODEL EVALUATION ON TEST DATASET

In [14]:
model.load_weights(CKPT_PATH)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f80d9dd2df0>

In [16]:
def sigmoid_array(x):                                        
    return 1 / (1 + np.exp(-x))

In [17]:
tf_test_circle = encoded_dataset["test_circle"].to_tf_dataset(
    columns=['token_type_ids', 'attention_mask', 'input_ids'],
    label_cols=["label"],
    shuffle=True,
    batch_size=1,
    collate_fn=data_collator,
)
tf_test_cardboard = encoded_dataset["test_card"].to_tf_dataset(
    columns=['token_type_ids', 'attention_mask', 'input_ids'],
    label_cols=["label"],
    shuffle=True,
    batch_size=1,
    collate_fn=data_collator,
)

In [18]:
inp = []
pred_out = []
true_out = []
for i in tf_test_cardboard:
    in_,out_ = list(i)
    inp += list(in_['input_ids'].numpy())
    true_out += list(out_.numpy())
    pred = model(in_)
    pred_out += list(sigmoid_array(pred['logits'].numpy()))
    
len(inp),len(true_out),len(pred_out)

(119, 119, 119)

In [19]:
for i in tf_test_circle:
    in_,out_ = list(i)
    inp += list(in_['input_ids'].numpy())
    true_out += list(out_.numpy())
    pred = model(in_)
    pred_out += list(sigmoid_array(pred['logits'].numpy()))
    
len(inp),len(true_out),len(pred_out)

(235, 235, 235)

In [20]:
pred_out = np.array(pred_out)
pred_out.shape

(235, 256, 1)

In [21]:
pred_out = np.reshape(pred_out,(pred_out.shape[0],256))
pred_out.shape

(235, 256)

In [22]:
pred_out = np.round(pred_out)
pred_out[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 1., 1., 1., 0., 0., 1., 1., 0., 0., 0., 1., 1., 1., 1., 1., 1.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [23]:
true_out = np.array(true_out)
true_out.shape

(235, 256)

In [73]:
f1_score(true_out, pred_out, average='macro',zero_division=1)

0.9726279760174308