### IMPORT LIBRARIES

In [1]:
from transformers import TFAutoModelForTokenClassification, ElectraTokenizerFast
from datasets import load_dataset, load_metric
import numpy as np
from transformers import DataCollatorWithPadding
import tensorflow as tf
import math

### MODEL CONFIGURATION DETAILS

In [1]:
BASE_PATH = "pretrained_models/"
MODEL_PATH = BASE_PATH + "NegBioElectra/"
LR_RATE = 1e-5
CKPT_PATH = BASE_PATH + "NegBioElectra_sherlock_cue_model"
EPOCHS = 15

### LOAD BASE MODEL & TOKENIZER

In [2]:
model = TFAutoModelForTokenClassification.from_pretrained(
    MODEL_PATH, num_labels=3,from_pt=True
)
tokenizer = ElectraTokenizerFast.from_pretrained(MODEL_PATH,max_length=256)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFElectraForTokenClassification: ['embeddings.position_ids']
- This IS expected if you are initializing TFElectraForTokenClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFElectraForTokenClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFElectraForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### LOAD DATASET 

In [3]:
ds = load_dataset("csv", data_files={'train': 'sher_m1_train.csv','dev':'sher_m1_dev.csv',"test_card":"sher_m1_test_cardboard.csv","test_circle":"sher_m1_test_circle.csv"})
ds

Using custom data configuration default-a249b2b4cc520fbf
Reusing dataset csv (/home/studio-lab-user/.cache/huggingface/datasets/csv/default-a249b2b4cc520fbf/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e)


  0%|          | 0/4 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'labels'],
        num_rows: 847
    })
    dev: Dataset({
        features: ['sentence', 'labels'],
        num_rows: 144
    })
    test_card: Dataset({
        features: ['sentence', 'labels'],
        num_rows: 119
    })
    test_circle: Dataset({
        features: ['sentence', 'labels'],
        num_rows: 116
    })
})

### PRE-PROCESS DATA

In [4]:
def convert_labels(examples):
    l = {'labels':[list(np.array(i.split("|"),dtype="int32")) for i in examples['labels']]} 
    l.update(tokenizer(examples['sentence'],truncation=True,max_length=256,padding=True))
    return l

In [5]:
encoded_dataset = ds.map(convert_labels, batched=True)

Loading cached processed dataset at /home/studio-lab-user/.cache/huggingface/datasets/csv/default-a249b2b4cc520fbf/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e/cache-eeda3117e9604fc9.arrow
Loading cached processed dataset at /home/studio-lab-user/.cache/huggingface/datasets/csv/default-a249b2b4cc520fbf/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e/cache-2898f1a61108b0ee.arrow


  0%|          | 0/1 [00:00<?, ?ba/s]

Loading cached processed dataset at /home/studio-lab-user/.cache/huggingface/datasets/csv/default-a249b2b4cc520fbf/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e/cache-71e6d4cb0e20556f.arrow


Ignored unknown kwarg option direction


### TRAIN-TEST SPLIT

In [6]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer,max_length=256, return_tensors="tf",padding="max_length")

In [7]:
tf_train_dataset = encoded_dataset["train"].to_tf_dataset(
    columns=['token_type_ids', 'attention_mask', 'input_ids'],
    label_cols=["labels"],
    shuffle=True,
    batch_size=32,
    collate_fn=data_collator,
)
tf_validation_dataset = encoded_dataset["dev"].to_tf_dataset(
    columns=['token_type_ids', 'attention_mask', 'input_ids'],
    label_cols=["labels"],
    shuffle=True,
    batch_size=32,
    collate_fn=data_collator,
)
tf_test_board_dataset = encoded_dataset["dev"].to_tf_dataset(
    columns=['token_type_ids', 'attention_mask', 'input_ids'],
    label_cols=["labels"],
    shuffle=True,
    batch_size=32,
    collate_fn=data_collator,
)
tf_test_circle_dataset = encoded_dataset["dev"].to_tf_dataset(
    columns=['token_type_ids', 'attention_mask', 'input_ids'],
    label_cols=["labels"],
    shuffle=True,
    batch_size=32,
    collate_fn=data_collator,
)

In [8]:
list(tf_train_dataset.take(1))

[({'attention_mask': <tf.Tensor: shape=(32, 256), dtype=int64, numpy=
   array([[1, 1, 1, ..., 0, 0, 0],
          [1, 1, 1, ..., 0, 0, 0],
          [1, 1, 1, ..., 0, 0, 0],
          ...,
          [1, 1, 1, ..., 0, 0, 0],
          [1, 1, 1, ..., 0, 0, 0],
          [1, 1, 1, ..., 0, 0, 0]])>,
   'input_ids': <tf.Tensor: shape=(32, 256), dtype=int64, numpy=
   array([[   2,   50, 4401, ...,    0,    0,    0],
          [   2, 2027,   15, ...,    0,    0,    0],
          [   2, 2278, 3095, ...,    0,    0,    0],
          ...,
          [   2, 1802, 1953, ...,    0,    0,    0],
          [   2, 1690, 1680, ...,    0,    0,    0],
          [   2,   41,   41, ...,    0,    0,    0]])>,
   'token_type_ids': <tf.Tensor: shape=(32, 256), dtype=int64, numpy=
   array([[0, 0, 0, ..., 0, 0, 0],
          [0, 0, 0, ..., 0, 0, 0],
          [0, 0, 0, ..., 0, 0, 0],
          ...,
          [0, 0, 0, ..., 0, 0, 0],
          [0, 0, 0, ..., 0, 0, 0],
          [0, 0, 0, ..., 0, 0, 0]])>},
  

### MODEL TRAINING

In [9]:
optimizer = tf.keras.optimizers.Adam(learning_rate=LR_RATE, epsilon=1e-08, clipnorm=1.0)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

In [10]:
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=CKPT_PATH,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    save_weights_only=True)
callbacks = [model_checkpoint_callback]

In [11]:
history = model.fit(
 tf_train_dataset, 
 epochs=EPOCHS, 
 validation_data=tf_validation_dataset, 
 callbacks =[callbacks]
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


### MODEL EVALUATION ON VALIDATION DATASET

In [12]:
model.load_weights(CKPT_PATH)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f7d8469ff70>

In [51]:
tf_validation_dataset_ = encoded_dataset["test_circle"].to_tf_dataset(
    columns=['token_type_ids', 'attention_mask', 'input_ids'],
    label_cols=["label"],
    shuffle=True,
    batch_size=1,
    collate_fn=data_collator,
)

In [52]:
def sigmoid_array(x):                                        
    return 1 / (1 + np.exp(-x))

In [53]:
inp = []
pred_out = []
true_out = []
for i in tf_validation_dataset_:
    in_,out_ = list(i)
    inp += list(in_['input_ids'].numpy())
    true_out += list(out_.numpy())
    pred = model(in_)
    pred_out += list(np.argmax(pred['logits'].numpy(),axis=-1))
    
len(inp),len(true_out),len(pred_out)

(116, 116, 116)

In [54]:
pred_out = np.array(pred_out)
pred_out.shape

(116, 256)

In [55]:
true_out = np.array(true_out)
true_out.shape

(116, 256)

In [57]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score

m = MultiLabelBinarizer().fit(true_out)

f1_score(m.transform(true_out),
         m.transform(pred_out),
         average='macro',zero_division=1)

0.9956331877729259

### ILLUSTRATION OF MODEL PREDICTING CUE

In [13]:
sents = ["I dont like cake","she was not diagnosed with cancer","he failed to have plural effusion"]

In [14]:
from transformers import TokenClassificationPipeline

In [15]:
pipe = TokenClassificationPipeline(model=model, tokenizer=tokenizer)

In [16]:
out = pipe(["May month is good","she may have a lump","she was not diagnosed with cancer","he failed to have plural effusion","her diagnosis neither suggests tumour nor indicates carcinoma"])



Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [17]:
from termcolor import colored
print(colored("Sample model output","blue"))
print(f"{'*'*10} NOTE: {colored('GREEN','green')} indicates Speculation, {colored('RED','red')} indicates Negation {'*'*10}")
print()
for j in out:
    s = []
    for i in j:
        l = i['word']
        if i['entity'] == 'LABEL_1':
            s.append(l)
        elif i['entity'] == 'LABEL_2':
            s.append(colored(l,'red'))
        elif i['entity'] == 'LABEL_3':
            s.append(colored(l,'green'))
    text = ' '.join([x for x in s])
    print(text.replace(' ##',''))
    

[34mSample model output[0m
********** NOTE: [32mGREEN[0m indicates Speculation, [31mRED[0m indicates Negation **********

may month is good
she may have a lump
she was [31mnot[0m diagnosed with cancer
he [31mfailed[0m to have plural effusion
her diagnosis [31mneither[0m suggests tumour nor indicates carcinoma
