# Main Notebook for A4

This notebook is adjusted from https://github.com/huggingface/notebooks/blob/main/examples/token_classification.ipynb

Modified contents are:
- Removed remote contents (logging in huggingface, etc.)
- tokenize_and_align_labels(): modified for the dataset and resolved weird bug
- model parameter: num_labels=2 (True or False, in or not inside a negation scope)
- metric: used self-made metric loader script ('../scripts/span_metric.py') for this task
- compute_metrics(): adjusted for this task and datasets

In [1]:
import transformers
import pandas as pd

In [2]:
task = "negation_scope"
model_checkpoint = "bert-base-uncased" # bert-base-uncased for better percision, distilbert-base-uncased for faster run
batch_size = 16

## Loading the dataset
The datasets are pre-generated huggingface dataset classes.

In [3]:
import datasets

In [4]:
trainds = datasets.load_from_disk('../data/hf_dataset/trainds')
devds = datasets.load_from_disk('../data/hf_dataset/devds')
testds = datasets.load_from_disk('../data/hf_dataset/testFds')

In [5]:
trainds[2739]

{'id': 2739,
 'negation_scope_tags': [0, 0, 0, 0, 0, 0],
 'tokens': ['``', ',', 'sir', ',', '[NEG] none', '.']}

## Preprocess
Using the pre-trained AutoTokenizer with the given model to tokenize. Added special marks (-100) to the beginning and ending of sentences.

In [6]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

In [7]:
def tokenize_and_align_labels(inds, label_all_tokens=True):
    tokenized_inputs = tokenizer(inds["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(inds['scope']):

        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx in [530, 531, 532, 533, 534]:
                continue
            
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                #label_ids.append(label[word_idx])
                try:
                    x = label[word_idx]
                    x = 0 if x=='False' else 1
                    label_ids.append(x if label_all_tokens else -100)

                except:
                    print(word_idx)
            
            else:
                try:
                    x = label[word_idx]
                    x = 0 if x=='False' else 1
                    label_ids.append(x if label_all_tokens else -100)

                except:
                    print(word_idx)
            previous_word_idx = word_idx
                
        labels.append(label_ids)
        
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [8]:
trainds[2739]

{'id': 2739,
 'negation_scope_tags': [0, 0, 0, 0, 0, 0],
 'tokens': ['``', ',', 'sir', ',', '[NEG] none', '.']}

In [10]:
label_all_tokens = True
tokenized_train = trainds.map(tokenize_and_align_labels, batched=True)
tokenized_dev = devds.map(tokenize_and_align_labels, batched=True)
tokenized_test = testds.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/3779 [00:00<?, ? examples/s]

KeyError: 'scope'

In [None]:
tokenized_train[0]

## Load model and metric

In [11]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

In [12]:
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=2) # 2 labels are True/False for in negation scope. due to the conversion above they are 0/1.

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
model_name = model_checkpoint.split("/")[-1]
args = TrainingArguments(
    f"{model_name}-finetuned-{task}",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
)

In [14]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

In [15]:
from datasets import load_metric
# metric = load_metric("seqeval")
metric = load_metric('../scripts/span_metric.py',trust_remote_code=True) # A self-defined metric class calculating both token overlap and span agreement

  metric = load_metric('../scripts/span_metric.py',trust_remote_code=True) # A self-defined metric class calculating both token overlap and span agreement


In [16]:
label_list = [0,1] # IS IN NEGATION SCOPE OR NOT

## Train and evaluate

In [17]:
import numpy as np

In [18]:
def remove_ignored_index(predictions,labels):
    actual_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    actual_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    return actual_predictions, actual_labels

In [19]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2) # Most possible label

    # Remove ignored index (special tokens)
    actual_predictions, actual_labels = remove_ignored_index(predictions,labels)
    
    results = metric.compute(predictions=actual_predictions, references=actual_labels)
    return {
        #"accuracy": results["overall_accuracy"],
        "token_precision":results["token_precision"], "token_recall":results["token_recall"], "token_f1":results["token_f1"],
        "span_precision":results["span_precision"], "span_recall":results["span_recall"], "span_f1":results["span_f1"]
    }

In [20]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_dev,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [22]:
trainer.train()

  8%|▊         | 54/711 [11:36<2:21:19, 12.91s/it]


KeyboardInterrupt: 

In [None]:
trainer.evaluate()

{'eval_loss': 0.12754054367542267,
 'eval_token_precision': 0.97821963043631,
 'eval_token_recall': 0.9754098360655737,
 'eval_token_f1': 0.9768127126670643,
 'eval_span_precision': 0.8699386503067484,
 'eval_span_recall': 0.8699386503067484,
 'eval_span_f1': 0.8699386503067484,
 'eval_runtime': 0.4749,
 'eval_samples_per_second': 1716.069,
 'eval_steps_per_second': 107.386,
 'epoch': 3.0}

In [None]:
predictions, labels, _ = trainer.predict(tokenized_test)
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
actual_predictions, actual_labels = remove_ignored_index(predictions,labels)

results = metric.compute(predictions=actual_predictions, references=actual_labels)
results

{'token_precision': 0.9850047803552558,
 'token_recall': 0.9823848238482384,
 'token_f1': 0.983693057614513,
 'span_precision': 0.8879928315412187,
 'span_recall': 0.8879928315412187,
 'span_f1': 0.8879928315412187}

## Model comparison

In [None]:
model_checkpoint = "distilbert-base-uncased" # bert-base-uncased for better percision, distilbert-base-uncased for faster run

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

label_all_tokens = True
tokenized_train = trainds.map(tokenize_and_align_labels, batched=True)
tokenized_dev = devds.map(tokenize_and_align_labels, batched=True)
tokenized_test = testds.map(tokenize_and_align_labels, batched=True)

model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=2) # 2 labels are True/False for in negation scope. due to the conversion above they are 0/1.

model_name = model_checkpoint.split("/")[-1]
args = TrainingArguments(
    f"{model_name}-finetuned-{task}",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
)

data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_dev,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

Map:   0%|          | 0/815 [00:00<?, ? examples/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Token Precision,Token Recall,Token F1,Span Precision,Span Recall,Span F1
1,No log,0.134765,0.971784,0.979613,0.975683,0.853988,0.853988,0.853988
2,No log,0.131369,0.969796,0.980734,0.975234,0.857669,0.857669,0.857669
3,0.135100,0.128455,0.977675,0.97562,0.976646,0.865031,0.865031,0.865031


TrainOutput(global_step=711, training_loss=0.11775438225554347, metrics={'train_runtime': 14.8605, 'train_samples_per_second': 762.897, 'train_steps_per_second': 47.845, 'total_flos': 141634595162424.0, 'train_loss': 0.11775438225554347, 'epoch': 3.0})

In [None]:
trainer.evaluate()

{'eval_loss': 0.1284554898738861,
 'eval_token_precision': 0.9776748104465038,
 'eval_token_recall': 0.9756200084068937,
 'eval_token_f1': 0.9766463286345466,
 'eval_span_precision': 0.8650306748466258,
 'eval_span_recall': 0.8650306748466258,
 'eval_span_f1': 0.8650306748466259,
 'eval_runtime': 0.2836,
 'eval_samples_per_second': 2873.492,
 'eval_steps_per_second': 179.814,
 'epoch': 3.0}

In [None]:
predictions, labels, _ = trainer.predict(tokenized_test)
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
actual_predictions, actual_labels = remove_ignored_index(predictions,labels)

results = metric.compute(predictions=actual_predictions, references=actual_labels)
results

{'token_precision': 0.9836880144549287,
 'token_recall': 0.9835892803372478,
 'token_f1': 0.9836386449184441,
 'span_precision': 0.8709677419354839,
 'span_recall': 0.8709677419354839,
 'span_f1': 0.8709677419354839}

In [None]:
result_df = pd.DataFrame(columns=['Sentence', 'Labels', 'Prediction'])
for p, t, ds in zip(actual_predictions, actual_labels, testds):
    if p != t:
        result_df.loc[len(result_df)] = [ds['tokens'], t, p]

result_df.to_csv('../results/errors.csv')