# SCIBERT in HuggingFace

In [1]:
import os

# needs to be executed before importing torch or transformers
# server specific: only use last 3 gpus (on rattle.ifi.uzh.ch)
os.environ["CUDA_VISIBLE_DEVICES"] = "4,5,6"
# set the home directory for huggingface transformers (where the models are saved)
# by default this is '~/.cache/huggingface/hub'
# see https://stackoverflow.com/questions/61798573/where-does-hugging-faces-transformers-save-models
# server specific:
os.environ["HF_HOME"] = "/srv/scratch2/dbielik/.cache/huggingface/hub"

import torch
from pathlib import Path

torch.set_printoptions(threshold=10_000)
if not torch.cuda.is_available():
    print("Warning: CUDA not available!")

# path of the directory containing this file
BASE_DIR_PATH = Path.cwd().parent
# path of the data directory
DATA_DIR_PATH = BASE_DIR_PATH / "data" / "swisstext-2024-sharedtask"

SEED = 1337

In [2]:
from datasets import load_dataset

# load the dataset
# note: if you don't have the data in the folder, use the download-data.sh script
dataset = load_dataset("json", data_files=str(DATA_DIR_PATH / "task1.jsonl")).class_encode_column("SDG")
dataset = dataset["train"].train_test_split(test_size=0.3, stratify_by_column="SDG", seed=SEED)

example = dataset["train"][0]
print("Example instance:\t", example)

labels = set(dataset["train"]["SDG"])
# identity because labels are already ids and vice-versa
id2label = lambda i: i
label2id = id2label
labels

Example instance:	 {'ID': 'oai:www.zora.uzh.ch:168503', 'TITLE': 'The carbon bubble and the pricing of bank loans', 'ABSTRACT': 'Neglecting the possibility that fossil fuel reserves can become ‘stranded’ could result in a ‘carbon bubble’ as fossil fuel firms become overvalued. This column studies whether banks price the climate policy risk of fossil fuel firms. Prior to 2015, banks did not appear to price climate policy risk. After 2015, however, the risk is priced to a certain extent, especially for firms holding more fossil fuel reserves.', 'URL': 'https://www.zora.uzh.ch/id/eprint/168503', 'SDG': 5}


{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}

In [3]:
from transformers import AutoTokenizer

# base model
HF_MODEL_NAME = "allenai/scibert_scivocab_uncased"
# final model
MODEL_NAME = "scibert-uncased-ft-task1"

tokenizer = AutoTokenizer.from_pretrained(HF_MODEL_NAME)

def preprocess_data(instances):
    # take a batch of titles and abstracts and concat them
    titles = instances["TITLE"]
    abstracts = instances["ABSTRACT"]
    texts = [f"{title} {abstract}" for title, abstract in zip(titles, abstracts)]
    # encode
    encoding = tokenizer(texts, padding="max_length", truncation=True, max_length=512, return_tensors="pt")

    # add labels
    encoding["label"] = torch.tensor([label2id(label) for label in instances["SDG"]])

    return encoding

encoded_dataset = dataset.map(preprocess_data, batched=True, remove_columns=dataset["train"].column_names)
encoded_dataset.set_format("torch")

Map:   0%|          | 0/301 [00:00<?, ? examples/s]

Map:   0%|          | 0/129 [00:00<?, ? examples/s]

In [4]:
example = encoded_dataset["train"][0]
print("Example instance:\t", example)

tokenizer.decode(example["input_ids"])

Example instance:	 {'input_ids': tensor([  102,   111,  3473, 15925,   137,   111, 12775,   131,  5108, 20227,
        29116,   111,  4047,   198, 17634,  8086, 20387,   300,  3063,  1384,
        11161,   119,  5459,   968,  1186,   121,   106,  1384,  3473, 15925,
         5459,   188, 17634,  8086,  5244,  3063,   573, 11748,   205,   238,
         3724,   826,  1681,  9679,  4048,   111,  6002,  2951,  1265,   131,
        17634,  8086,  5244,   205,  1979,   147,  4505,   422,  9679,  1544,
          302,  1853,   147,  4048,  6002,  2951,  1265,   205,   647,  4505,
          422,   694,   422,   111,  1265,   165,  4048, 30118,   147,   106,
         2361,  3796,   422,  2825,   168,  5244, 11803,   475, 17634,  8086,
        20387,   205,   103,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,   

'[CLS] the carbon bubble and the pricing of bank loans neglecting the possibility that fossil fuel reserves can become ‘ stranded ’ could result in a ‘ carbon bubble ’ as fossil fuel firms become overvalued. this column studies whether banks price the climate policy risk of fossil fuel firms. prior to 2015, banks did not appear to price climate policy risk. after 2015, however, the risk is priced to a certain extent, especially for firms holding more fossil fuel reserves. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [

In [5]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    HF_MODEL_NAME,
    num_labels=len(labels)
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import TrainingArguments, Trainer, EvalPrediction

BATCH_SIZE = 12
METRIC_NAME = "accuracy"

args = TrainingArguments(
    BASE_DIR_PATH / "models" / MODEL_NAME,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=20,
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model=METRIC_NAME,
    seed=SEED
)

def compute_metrics(pred: EvalPrediction):
    labels = pred.label_ids
    accuracy = accuracy_score(labels, pred.predictions.argmax(-1))
    precision, recall, f1, _ = precision_recall_fscore_support(labels, pred.predictions.argmax(-1), average="weighted")
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


Detected kernel version 4.19.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [7]:
trainer.train()

OutOfMemoryError: Caught OutOfMemoryError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/home/user/dbielik/msc-thesis/venv/lib/python3.12/site-packages/torch/nn/parallel/parallel_apply.py", line 83, in _worker
    output = module(*input, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/user/dbielik/msc-thesis/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/user/dbielik/msc-thesis/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/user/dbielik/msc-thesis/venv/lib/python3.12/site-packages/transformers/models/bert/modeling_bert.py", line 1539, in forward
    outputs = self.bert(
              ^^^^^^^^^^
  File "/home/user/dbielik/msc-thesis/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/user/dbielik/msc-thesis/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/user/dbielik/msc-thesis/venv/lib/python3.12/site-packages/transformers/models/bert/modeling_bert.py", line 988, in forward
    encoder_outputs = self.encoder(
                      ^^^^^^^^^^^^^
  File "/home/user/dbielik/msc-thesis/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/user/dbielik/msc-thesis/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/user/dbielik/msc-thesis/venv/lib/python3.12/site-packages/transformers/models/bert/modeling_bert.py", line 582, in forward
    layer_outputs = layer_module(
                    ^^^^^^^^^^^^^
  File "/home/user/dbielik/msc-thesis/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/user/dbielik/msc-thesis/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/user/dbielik/msc-thesis/venv/lib/python3.12/site-packages/transformers/models/bert/modeling_bert.py", line 472, in forward
    self_attention_outputs = self.attention(
                             ^^^^^^^^^^^^^^^
  File "/home/user/dbielik/msc-thesis/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/user/dbielik/msc-thesis/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/user/dbielik/msc-thesis/venv/lib/python3.12/site-packages/transformers/models/bert/modeling_bert.py", line 402, in forward
    self_outputs = self.self(
                   ^^^^^^^^^^
  File "/home/user/dbielik/msc-thesis/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/user/dbielik/msc-thesis/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/user/dbielik/msc-thesis/venv/lib/python3.12/site-packages/transformers/models/bert/modeling_bert.py", line 340, in forward
    context_layer = torch.matmul(attention_probs, value_layer)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 18.00 MiB. GPU 


### Investigating sklearn precision warning

In [None]:
p = torch.ones((86, 18)).argmax(-1)
p[:10] = torch.Tensor([i for i in range(10)])
t = torch.ones(86, dtype=torch.long)
t[:18] = torch.Tensor([i for i in range(18)])

# no Warning
precision_recall_fscore_support(t, p, average="weighted", labels=[i for i in range(10)])

NameError: name 'torch' is not defined

In [None]:
# Warning
precision_recall_fscore_support(t, p, average="weighted", labels=[i for i in range(11)])