In [1]:
# !pip install -q transformers datasets scikit-learn tqdm
# !pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
# !pip install transformers[torch]

In [2]:
# Cell 2: imports and config
import os
import json
from datasets import load_dataset, Dataset, DatasetDict
from transformers import GPT2TokenizerFast, GPT2ForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
import torch
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from tqdm import tqdm
device = "cuda" if torch.cuda.is_available() else "cpu"
batch_size = 32
learning_rate = 0.001
max_length = 256
special_entity_token = "<ENT>"


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Cell 3: load local json files (train.json, val.json, test.json expected in current dir)
def load_json_to_dataset(path):
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    for ex in data:
        ex["label"] = int(ex.get("label", ex.get("answer", "0")))
        if isinstance(ex["label"], str):
            ex["label"] = int(ex["label"])
    return Dataset.from_list(data)

train_ds = load_json_to_dataset("train.json")
val_ds = load_json_to_dataset("val.json")
test_ds = load_json_to_dataset("test.json")
dataset = DatasetDict({"train": train_ds, "validation": val_ds, "test": test_ds})


In [4]:
# Cell 4: tokenizer and preprocessing (map entity markers to a single token and add pad token)
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": tokenizer.eos_token})
if special_entity_token not in tokenizer.get_vocab():
    tokenizer.add_special_tokens({"additional_special_tokens": [special_entity_token]})

def preprocess_function(examples):
    texts = []
    for s in examples["sentence"]:
        s2 = s.replace("<\\\\entity><\\\\entity>", special_entity_token)
        s2 = s2.replace("<\\entity><\\entity>", special_entity_token)
        s2 = s2.replace("<\\\\entity>", special_entity_token)
        s2 = s2.replace("<\\entity>", special_entity_token)
        texts.append(s2)
    tokenized = tokenizer(texts, truncation=True, padding=False, max_length=max_length)
    tokenized["labels"] = [int(l) for l in examples["label"]]
    return tokenized

tokenized_datasets = dataset.map(preprocess_function, batched=True, remove_columns=dataset["train"].column_names)


Map: 100%|██████████| 4261/4261 [00:00<00:00, 20519.80 examples/s]
Map: 100%|██████████| 535/535 [00:00<00:00, 21947.46 examples/s]
Map: 100%|██████████| 534/534 [00:00<00:00, 16589.57 examples/s]


In [5]:
# Cell 5: model setup
model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=2)
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.pad_token_id
data_collator = DataCollatorWithPadding(tokenizer, padding=True)


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [6]:
# Cell 6 (corrected for older transformers versions)
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average="binary")
    prec = precision_score(labels, preds, zero_division=0)
    rec = recall_score(labels, preds, zero_division=0)
    return {"accuracy": acc, "f1": f1, "precision": prec, "recall": rec}

training_args = TrainingArguments(
    output_dir="gpt2-relation-classifier",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    eval_strategy="epoch",
    save_steps=500,
    learning_rate=learning_rate,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=50,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


  trainer = Trainer(


In [7]:
# Cell 7: train
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.7066,0.822211,0.452336,0.0,0.0,0.0
2,0.6173,0.617301,0.674766,0.67658,0.742857,0.62116
3,0.4483,0.644258,0.704673,0.739274,0.715655,0.764505


TrainOutput(global_step=402, training_loss=0.6332217816689715, metrics={'train_runtime': 1677.3699, 'train_samples_per_second': 7.621, 'train_steps_per_second': 0.24, 'total_flos': 500163828940800.0, 'train_loss': 0.6332217816689715, 'epoch': 3.0})

In [8]:
# Cell 8: evaluate on test set
metrics = trainer.evaluate(tokenized_datasets["test"])
print(metrics)


{'eval_loss': 0.6966757774353027, 'eval_accuracy': 0.6610486891385767, 'eval_f1': 0.6916524701873935, 'eval_precision': 0.6633986928104575, 'eval_recall': 0.7224199288256228, 'eval_runtime': 18.0208, 'eval_samples_per_second': 29.632, 'eval_steps_per_second': 0.943, 'epoch': 3.0}


In [9]:
# Cell 9: save model and run a quick inference example
trainer.save_model("gpt2-relation-classifier-final")
sample_sentences = [
    "There is no evidence for an association of <\\\\entity><\\\\entity> alleles with <\\\\entity><\\\\entity> in our study groups.",
    "These results suggest that the <\\\\entity><\\\\entity> -93G-->A polymorphism could be used as a marker of genetic susceptibility to <\\\\entity><\\\\entity> of the lung."
]
proc = tokenizer([s.replace("<\\\\entity><\\\\entity>", special_entity_token) for s in sample_sentences], return_tensors="pt", truncation=True, padding=True, max_length=max_length).to(device)
model.to(device)
with torch.no_grad():
    out = model(**proc)
logits = out.logits.cpu().numpy()
preds = np.argmax(logits, axis=-1)
print(list(zip(sample_sentences, preds)))


[('There is no evidence for an association of <\\\\entity><\\\\entity> alleles with <\\\\entity><\\\\entity> in our study groups.', np.int64(1)), ('These results suggest that the <\\\\entity><\\\\entity> -93G-->A polymorphism could be used as a marker of genetic susceptibility to <\\\\entity><\\\\entity> of the lung.', np.int64(0))]
