## Setup

If you run this notebook on colab, you'll need to take a number of extra steps:

In [1]:
# Check whether the notebook runs on Google Colab: the check treats a
# successful import of `google.colab` as "we are on Colab".
# Only ImportError is caught, so unrelated failures (or a KeyboardInterrupt)
# are not silently mistaken for "not on Colab".
try:
    import google.colab  # noqa: F401 -- imported only to probe availability
except ImportError:
    COLAB = False
else:
    COLAB = True

if COLAB:
    # install required packages
    !pip install -q datasets==2.21.0 tokenizers==0.19.1 sentencepiece==0.2.0 protobuf==3.20.3 accelerate==0.33.0 transformers==4.44.1 torch~=2.4.0 trl==0.10.1 scikit-learn==1.5.1 seqeval==1.2.2

if COLAB:
    # download custom utils
    # NOTE: this is done in Python rather than with several `!` shell lines
    # because every `!` line runs in its own subshell, so shell variables set on
    # one line (base_url, files) are not visible to the loop on a later line.
    import os
    import urllib.request

    base_url = 'https://raw.githubusercontent.com/haukelicht/advanced_text_analysis/main/notebooks/utils'
    os.makedirs('utils', exist_ok=True)
    for file in ('io.py', 'finetuning.py', 'metrics.py'):
        # raises urllib.error.HTTPError/URLError if a file cannot be fetched
        urllib.request.urlretrieve(f'{base_url}/{file}', os.path.join('utils', file))

import os

if COLAB:
    # read the data directly from the GitHub repository; raw URLs have the form
    # <owner>/<repo>/<branch>/<path>, so the branch ('main', as in the utils URL)
    # must precede the 'data/...' path
    data_path = 'https://raw.githubusercontent.com/haukelicht/advanced_text_analysis/main/data/labeled/parolin_multi-coped_2022/'
else:
    # path relative to this notebook's folder; the trailing '' makes the path end
    # with a separator so that file names can simply be appended
    data_path = os.path.join('..', 'data', 'labeled', 'parolin_multi-coped_2022', '')

Next, we load the required modules, classes, and functions.

Note that some functions come from the `utils` folder.
These are functions I have defined to handle general tasks, like

- reading data from a tabular file (e.g., CSV);
- splitting the data into train, dev, and test splits;
- tokenization,
- etc.

These functions should be general enough for many use cases. 
You can use them in your research if you want.
But please double check that they do what you want them to do if you want to publish results that depend on my code ;)

In [2]:
from utils.io import read_jsonlines
from utils.finetuning import (
    get_device, 
    split_data, 
    create_token_classification_dataset, 
    preprocess_token_classification_dataset
)

from datasets import DatasetDict
from transformers import (
    set_seed,
    AutoTokenizer,
    DataCollatorForTokenClassification, 
    AutoModelForTokenClassification, 
    Trainer,
    TrainingArguments
)

from utils.metrics import (
    parse_token_classifier_prediction_output, 
    compute_token_classification_metrics
)

In [3]:
# Notebook-wide random seed; it is handed to the `set_seed` helper imported from
# transformers (presumably seeding the RNGs of the underlying libraries -- see its docs).
SEED = 42
set_seed(SEED)

In [4]:
# name of the pre-trained checkpoint that will be fine-tuned
MODEL_NAME = 'roberta-base'

# compute device chosen by the custom `get_device` helper from utils.finetuning
device = get_device()
print(f'Using device: {device!s}')

Using device: mps


In [5]:
# location of the labeled corpus (JSON lines file) below `data_path`
fp = f'{data_path}parolin_multi-coped_2022-cameo_ner.jsonl'
# read the documents with the custom `read_jsonlines` helper
data = read_jsonlines(fp)

In [15]:
# Peek at the first document: its fields, then one "label<TAB>token" line per token
example = data[0]
print(example.keys())
for tok, lab in zip(example['tokens'], example['labels']):
    print(f'{lab}\t{tok}')

dict_keys(['tokens', 'labels'])
B-S	U.S.
I-S	military
I-S	chief
I-S	General
I-S	Colin
I-S	Powell
O	said
O	on
O	Wednesday
B-T	NATO
O	would
O	need
O	to
O	remain
O	strong
O	.


In [16]:
# all distinct labels that occur in the corpus, in descending sort order
label_classes = sorted(
    {label for doc in data for label in doc['labels']},
    reverse=True,
)
label_classes

['O', 'I-T', 'I-S', 'I-R', 'B-T', 'B-S', 'B-R']

In [5]:
# entity types: strip the 2-character 'B-' prefix from every "begin" label
types = [
    label[2:]
    for label in label_classes
    if label.startswith('B-')
]
types

['T', 'S', 'R']

In [7]:
# map each label to its position in `label_classes` ...
label2id = {label: idx for idx, label in enumerate(label_classes)}
# ... and build the inverse lookup (ID -> label) from it
id2label = {idx: label for label, idx in label2id.items()}

label2id

{'O': 0, 'I-T': 1, 'I-S': 2, 'I-R': 3, 'B-T': 4, 'B-S': 5, 'B-R': 6}

In [8]:
# split the documents with the custom `split_data` helper (15% dev, 15% test);
# the notebook-wide SEED is reused instead of repeating the literal 42
data_splits = split_data(data, dev_size=0.15, test_size=0.15, seed=SEED, return_dict=True)

In [9]:
# convert every split with `create_token_classification_dataset` and bundle the results in a DatasetDict
data_splits = DatasetDict({
    split_name: create_token_classification_dataset(split_docs)
    for split_name, split_docs in data_splits.items()
})

In [10]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True, add_prefix_space=True)


def _preprocess_batch(batch):
    """Apply the custom token-classification preprocessing to one batch of examples."""
    return preprocess_token_classification_dataset(
        batch,
        tokenizer=tokenizer,
        label2id=label2id,
        truncation=True,
    )


# batched=True: the function receives batches of examples instead of single examples
data_splits = data_splits.map(_preprocess_batch, batched=True)



Map:   0%|          | 0/1124 [00:00<?, ? examples/s]

Map:   0%|          | 0/240 [00:00<?, ? examples/s]

Map:   0%|          | 0/240 [00:00<?, ? examples/s]

In [11]:
# drop the 'tokens' column from all splits
# NOTE(review): presumably because the raw tokens are no longer needed after preprocessing -- confirm.
# Re-running this cell on its own fails once the column has been removed.
data_splits = data_splits.remove_columns(['tokens'])

In [19]:
# folder for checkpoints and logs (note the trailing slash: 'logs' is appended to it below)
dest = './../results/example_classifier/'
training_args = TrainingArguments(
    output_dir=dest,
    # hyperparameters
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    optim='adamw_torch',
    # NOTE(review): `use_mps_device` looks deprecated in recent transformers releases -- confirm before upgrading
    use_mps_device=str(device)=='mps',
    # half precision only when running on a CUDA device
    fp16=str(device).startswith('cuda'),
    # evaluation on dev set
    eval_strategy='epoch',
    metric_for_best_model='macro_f1',
    # model saving
    save_strategy='epoch',
    load_best_model_at_end=True,
    save_total_limit=2,
    # logging
    logging_strategy='epoch',
    logging_dir=dest+'logs',
    # for reproducibility (reuse the notebook-wide SEED instead of repeating the literal 42)
    seed=SEED,
    data_seed=SEED,
    full_determinism=True
)



In [13]:
def model_init():
    """Create a fresh token classifier from the pre-trained checkpoint.

    Loads `MODEL_NAME` with one output class per entry in `label2id`, records
    the label mappings in the model config (only if the labels are strings),
    and moves the model to `device`.

    Returns:
        The initialized model.
    """
    classifier = AutoModelForTokenClassification.from_pretrained(
        MODEL_NAME,
        num_labels=len(label2id),
    )
    labels_are_strings = isinstance(id2label[0], str)
    if labels_are_strings:
        # store the mappings so that the config knows the label names
        classifier.config.id2label = id2label
        classifier.config.label2id = label2id
    classifier.to(device)
    return classifier

In [16]:
def compute_metrics(p):
    """Compute the token classification metrics for one prediction output.

    Args:
        p: prediction output, passed on unchanged to
            `parse_token_classifier_prediction_output`.

    Returns:
        Whatever `compute_token_classification_metrics` returns for the parsed
        labels and predictions.
    """
    y_true, y_pred = parse_token_classifier_prediction_output(p)
    metrics = compute_token_classification_metrics(
        y_true=y_true,
        y_pred=y_pred,
        label2id=label2id,
    )
    return metrics

In [20]:
trainer = Trainer(
    # model and training configuration
    args=training_args,
    model_init=model_init,
    # data: train on the 'train' split, evaluate on the 'dev' split
    train_dataset=data_splits['train'],
    eval_dataset=data_splits['dev'],
    # batching
    tokenizer=tokenizer,
    data_collator=DataCollatorForTokenClassification(tokenizer),
    # evaluation metrics
    compute_metrics=compute_metrics,
)

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
# fine-tune the model; metrics on the dev set are logged after every epoch (see `training_args`)
trainer.train()

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/213 [00:00<?, ?it/s]

{'loss': 0.472, 'grad_norm': 4.745232582092285, 'learning_rate': 3.3333333333333335e-05, 'epoch': 1.0}


  0%|          | 0/8 [00:00<?, ?it/s]

{'eval_loss': 0.2595997452735901, 'eval_macro_precision': 0.49230411505830035, 'eval_macro_recall': 0.5701647737023735, 'eval_macro_f1': 0.5282855800329743, 'eval_micro_precision': 0.6238003838771593, 'eval_micro_recall': 0.720620842572062, 'eval_micro_f1': 0.668724279835391, 'eval_S_precision': 0.7701612903225806, 'eval_S_recall': 0.8761467889908257, 'eval_S_f1': 0.8197424892703863, 'eval_T_precision': 0.540084388185654, 'eval_T_recall': 0.6274509803921569, 'eval_T_f1': 0.5804988662131518, 'eval_R_precision': 0.16666666666666666, 'eval_R_recall': 0.20689655172413793, 'eval_R_f1': 0.18461538461538463, 'eval_runtime': 1.2321, 'eval_samples_per_second': 194.794, 'eval_steps_per_second': 6.493, 'epoch': 1.0}
{'loss': 0.2051, 'grad_norm': 9.174385070800781, 'learning_rate': 1.6666666666666667e-05, 'epoch': 2.0}


  0%|          | 0/8 [00:00<?, ?it/s]

{'eval_loss': 0.21786577999591827, 'eval_macro_precision': 0.5651007651289226, 'eval_macro_recall': 0.6441893956781278, 'eval_macro_f1': 0.6014375552147334, 'eval_micro_precision': 0.6596958174904943, 'eval_micro_recall': 0.7694013303769401, 'eval_micro_f1': 0.7103377686796316, 'eval_S_precision': 0.8031496062992126, 'eval_S_recall': 0.9357798165137615, 'eval_S_f1': 0.864406779661017, 'eval_T_precision': 0.5473251028806584, 'eval_T_recall': 0.6519607843137255, 'eval_T_f1': 0.5950782997762863, 'eval_R_precision': 0.3448275862068966, 'eval_R_recall': 0.3448275862068966, 'eval_R_f1': 0.3448275862068966, 'eval_runtime': 1.2917, 'eval_samples_per_second': 185.805, 'eval_steps_per_second': 6.193, 'epoch': 2.0}
{'loss': 0.1353, 'grad_norm': 5.610859394073486, 'learning_rate': 0.0, 'epoch': 3.0}


  0%|          | 0/8 [00:00<?, ?it/s]

{'eval_loss': 0.220681831240654, 'eval_macro_precision': 0.7063131313131312, 'eval_macro_recall': 0.7543757353199636, 'eval_macro_f1': 0.728502620543413, 'eval_micro_precision': 0.7454909819639278, 'eval_micro_recall': 0.8248337028824834, 'eval_micro_f1': 0.7831578947368422, 'eval_S_precision': 0.8701298701298701, 'eval_S_recall': 0.9220183486238532, 'eval_S_f1': 0.8953229398663698, 'eval_T_precision': 0.6416666666666667, 'eval_T_recall': 0.7549019607843137, 'eval_T_f1': 0.6936936936936938, 'eval_R_precision': 0.6071428571428571, 'eval_R_recall': 0.5862068965517241, 'eval_R_f1': 0.5964912280701754, 'eval_runtime': 1.3199, 'eval_samples_per_second': 181.838, 'eval_steps_per_second': 6.061, 'epoch': 3.0}
{'train_runtime': 85.6207, 'train_samples_per_second': 39.383, 'train_steps_per_second': 2.488, 'train_loss': 0.27078096631547094, 'epoch': 3.0}


TrainOutput(global_step=213, training_loss=0.27078096631547094, metrics={'train_runtime': 85.6207, 'train_samples_per_second': 39.383, 'train_steps_per_second': 2.488, 'total_flos': 80352384837120.0, 'train_loss': 0.27078096631547094, 'epoch': 3.0})

In [22]:
# evaluate the fine-tuned model on the held-out 'test' split
trainer.evaluate(data_splits['test'])

  0%|          | 0/8 [00:00<?, ?it/s]

{'eval_loss': 0.26187124848365784,
 'eval_macro_precision': 0.7016491067338526,
 'eval_macro_recall': 0.7629764590201861,
 'eval_macro_f1': 0.7292895009957645,
 'eval_micro_precision': 0.6951456310679611,
 'eval_micro_recall': 0.7955555555555556,
 'eval_micro_f1': 0.7419689119170985,
 'eval_S_precision': 0.864406779661017,
 'eval_S_recall': 0.9026548672566371,
 'eval_S_f1': 0.8831168831168832,
 'eval_T_precision': 0.5405405405405406,
 'eval_T_recall': 0.6862745098039216,
 'eval_T_f1': 0.6047516198704104,
 'eval_R_precision': 0.7,
 'eval_R_recall': 0.7,
 'eval_R_f1': 0.7,
 'eval_runtime': 3.5869,
 'eval_samples_per_second': 66.91,
 'eval_steps_per_second': 2.23,
 'epoch': 3.0}