# SCIBERT in HuggingFace

In [1]:
# run this script to import the variables and settings from ipynb_util.py
%run ../ipynb_util_tars.py

In [2]:
# select dataset
from enum import Enum

class DatasetType(Enum):
    """Enum for the dataset type."""

    """Zora + OSDG upsampled dataset"""
    ZO_UP = "zo_up"
    """SwissText Shared Task 1 dataset (Zurich NLP)"""
    SWISSTEXT_SHARED_TASK1 = "swisstext_shared_task1"


DATASET_TYPE = DatasetType.ZO_UP

In [3]:
from datasets import load_dataset, Features, Value, ClassLabel
import pickle

# load the dataset
# note: if you don't have the data in the folder, use the download-data.sh script

match DATASET_TYPE:
    case DatasetType.ZO_UP:
        # dont need to use manual features as class_encode_column will create ClassLabel
        # careful: watch out for the order of the ClassLabel as it doesn't map directly to the SDG class. need use mapping functions (id2label, label2id)
        # sdgs = [str(i) for i in range(1, 18)] + ["non-relevant"]
        # features = Features({"sdg": ClassLabel(num_classes=len(sdgs), names=sdgs), "abstract": Value("string")})

        dataset = load_dataset("csv", data_files=str(DATA_DIR_PATH / "zo_up.csv"))
        dataset = dataset.rename_columns({"sdg": "SDG", "abstract": "ABSTRACT"}).class_encode_column("SDG")
        dataset = dataset["train"].train_test_split(test_size=0.3, stratify_by_column="SDG", seed=SEED)
    case DatasetType.SWISSTEXT_SHARED_TASK1:
        dataset = load_dataset("json", data_files=str(DATA_DIR_PATH / "swisstext-2024-sharedtask" / "task1-train.jsonl"))
        dataset = dataset["train"].train_test_split(test_size=0.3, seed=SEED)

print(dataset["train"].features)
example = dataset["train"][0]
print("Example instance:\t", example)


# Label encodings / mappings
labels = set(dataset["train"]["SDG"])
id2label = {i: dataset["train"].features["SDG"].int2str(i) for i in range(len(labels))}
label2id = {dataset["train"].features["SDG"].int2str(i): i for i in range(len(labels))}

# save the encodings to a file for later use
ENCODING_DIR = BASE_DIR_PATH / "encodings" / DATASET_TYPE.value
# create the directory if it doesn't exist
ENCODING_DIR.mkdir(parents=True, exist_ok=True)
with open(ENCODING_DIR / "id2label.pkl", "wb") as f:
    pickle.dump(id2label, f)

with open(ENCODING_DIR / "label2id.pkl", "wb") as f:
    pickle.dump(label2id, f)

labels

{'SDG': ClassLabel(names=['1', '10', '11', '12', '13', '14', '15', '16', '17', '2', '3', '4', '5', '6', '7', '8', '9'], id=None), 'ABSTRACT': Value(dtype='string', id=None), 'id': Value(dtype='string', id=None), 'sdg_desc_short': Value(dtype='string', id=None), 'sdg_desc_long': Value(dtype='string', id=None)}
Example instance:	 {'SDG': 16, 'ABSTRACT': 'The first attempts to modernize simply replaced the single huge engine with a huge electric motor, changing little. The drive-shafts were replaced by wires, the huge steam engine by dozens of small motors. Factories spread out, there was natural light, and room to use ceiling-slung cranes. Workers had responsibility for their own machines, they needed better training and better pay. The electric motor was a wonderful invention, once we changed all the everyday details that surrounded it.', 'id': None, 'sdg_desc_short': None, 'sdg_desc_long': None}


{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}

In [4]:
# verify that encodings work properly on example instance
# example instance has label 9 in the csv file
# example instance has label 16 in the encoded dataset
assert example["SDG"] == label2id[id2label[example["SDG"]]]
print("Encoded (label2id) label:\t", example["SDG"])
print("Decoded (id2label) label:\t", id2label[example["SDG"]])

print(id2label[16], label2id[id2label[16]], label2id["9"])

Encoded (label2id) label:	 16
Decoded (id2label) label:	 9
9 16 16


In [5]:
from transformers import AutoTokenizer

# whether the text should be lowered or not
SHOULD_LOWER = True

# base model
HF_MODEL_NAME = "allenai/scibert_scivocab_uncased"
# final model
MODEL_NAME = f"{HF_MODEL_NAME}-ft-{DATASET_TYPE.value}" + ("-lower" if SHOULD_LOWER else "")

tokenizer = AutoTokenizer.from_pretrained(HF_MODEL_NAME)

def preprocess_data(instances):
    match DATASET_TYPE:
        case DatasetType.SWISSTEXT_SHARED_TASK1:
            # take a batch of titles and abstracts and concat them
            titles = instances["TITLE"]
            abstracts = instances["ABSTRACT"]
            texts = [f"{title} {abstract}" for title, abstract in zip(titles, abstracts)]
        case DatasetType.ZO_UP:
            texts = instances["ABSTRACT"]

    if SHOULD_LOWER:
        texts = [text.lower() for text in texts]

    # encode
    encoding = tokenizer(texts, padding="max_length", truncation=True, max_length=512, return_tensors="pt")

    # add labels
    encoding["label"] = torch.tensor([label for label in instances["SDG"]])

    return encoding

encoded_dataset = dataset.map(preprocess_data, batched=True, remove_columns=dataset["train"].column_names)
encoded_dataset.set_format("torch")

  StockPickler.save(self, obj, save_persistent_id)
  StockPickler.save(self, obj, save_persistent_id)


Map:   0%|          | 0/271 [00:00<?, ? examples/s]

In [6]:
example = encoded_dataset["train"][0]
print("Example instance:\t", example)

tokenizer.decode(example["input_ids"])

Example instance:	 {'input_ids': tensor([  102,   111,   705,  7834,   147,  5901,   767,  4427,  6703,   111,
         1232, 11812,  2393,   190,   106, 11812,  5612,  3850,   422,  5468,
         3441,   205,   111,  7021,   579, 20665, 30113,   267,  6703,   214,
        17162,   422,   111, 11812, 22668,  2393,   214,   572, 11451,   131,
          952, 17898,   205,  1491,   301,  4696,   556,   422,   461,   241,
         2404,  2011,   422,   137,  4095,   147,   626, 26503,   579,  1252,
          794, 21830,  6820,   205,  5555,   883,  9945,   168,   547,  2910,
         7909,   422,   698,  2764,  1883,  2208,   137,  1883,  3982,   205,
          111,  5612,  3850,   241,   106, 23398,  1004, 28364,   422,  3246,
          185,  5414,   355,   111, 15304,  3779,   198, 17771,   256,   205,
          103,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,   

'[CLS] the first attempts to modernize simply replaced the single huge engine with a huge electric motor, changing little. the drive - shafts were replaced by wires, the huge steam engine by dozens of small motors. factories spread out, there was natural light, and room to use ceiling - slung cranes. workers had responsibility for their own machines, they needed better training and better pay. the electric motor was a wonderful invention, once we changed all the everyday details that surrounded it. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD

In [7]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    HF_MODEL_NAME,
    id2label=id2label,
    label2id=label2id,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import TrainingArguments, Trainer, EvalPrediction

BATCH_SIZE = 8
METRIC_NAME = "accuracy"

args = TrainingArguments(
    f"{CHECKPOINT_PATH}/{MODEL_NAME}",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    #save_steps=10,
    #eval_steps=10,
    #logging_steps=10,
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=15,
    load_best_model_at_end=True,
    save_total_limit=2,
    metric_for_best_model=METRIC_NAME,
    seed=SEED
)

def compute_metrics(pred: EvalPrediction):
    labels = pred.label_ids
    accuracy = accuracy_score(labels, pred.predictions.argmax(-1))
    precision, recall, f1, _ = precision_recall_fscore_support(labels, pred.predictions.argmax(-1), average="weighted")
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)




In [9]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mdvdblk[0m ([33mngmi[0m). Use [1m`wandb login --relogin`[0m to force relogin




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6638,2.362746,0.335793,0.320061,0.335793,0.281176
2,1.944,1.732785,0.630996,0.632742,0.630996,0.610938
3,1.2477,1.272289,0.697417,0.708379,0.697417,0.683229
4,0.7618,1.115822,0.701107,0.702114,0.701107,0.695616
5,0.478,1.037539,0.715867,0.717914,0.715867,0.710875
6,0.2988,1.071532,0.726937,0.721062,0.726937,0.718276
7,0.1828,1.100405,0.719557,0.717658,0.719557,0.712799
8,0.1189,1.18766,0.715867,0.707046,0.715867,0.706496
9,0.0791,1.240289,0.719557,0.71626,0.719557,0.711197
10,0.0588,1.295745,0.726937,0.725384,0.726937,0.71814


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=600, training_loss=0.5339346595605214, metrics={'train_runtime': 259.5299, 'train_samples_per_second': 36.412, 'train_steps_per_second': 2.312, 'total_flos': 2486734338816000.0, 'train_loss': 0.5339346595605214, 'epoch': 15.0})

## Evaluation

In [10]:
trainer.evaluate()



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.0715315341949463,
 'eval_accuracy': 0.7269372693726938,
 'eval_precision': 0.7210623353133084,
 'eval_recall': 0.7269372693726938,
 'eval_f1': 0.718275735363878,
 'eval_runtime': 1.7243,
 'eval_samples_per_second': 157.163,
 'eval_steps_per_second': 9.859,
 'epoch': 15.0}

In [11]:
from sklearn.metrics import classification_report

model.eval()

# manual evaluation to show classifcation_report
true_labels = []
logits = []

for batch in encoded_dataset["test"]:
    batch = {k: v.to(trainer.args.device).unsqueeze(0) for k, v in batch.items()}
    label = batch.pop("label")

    # Forward pass
    with torch.no_grad():
        out = model(**batch)

    true_labels.append(label.item())
    logits.extend(out.logits.tolist())

probabilites = torch.nn.functional.softmax(torch.tensor(logits), dim=-1)
pred_labels = torch.argmax(probabilites, dim=-1).tolist()

report = classification_report(true_labels, pred_labels, target_names=[f"SDG {id2label[i]}" for i in range(len(labels))])

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [12]:
print(report)

              precision    recall  f1-score   support

       SDG 1       0.70      0.82      0.76        17
      SDG 10       0.50      0.35      0.41        17
      SDG 11       0.80      0.71      0.75        17
      SDG 12       0.54      0.41      0.47        17
      SDG 13       0.76      0.76      0.76        17
      SDG 14       1.00      0.94      0.97        17
      SDG 15       0.84      0.94      0.89        17
      SDG 16       0.77      0.59      0.67        17
      SDG 17       0.00      0.00      0.00         1
       SDG 2       0.59      0.81      0.68        16
       SDG 3       0.74      0.82      0.78        17
       SDG 4       0.83      0.88      0.86        17
       SDG 5       0.70      0.94      0.80        17
       SDG 6       0.79      0.88      0.83        17
       SDG 7       0.80      0.71      0.75        17
       SDG 8       0.53      0.50      0.52        16
       SDG 9       0.67      0.59      0.62        17

    accuracy              

## Inference

In [13]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

text = "Priming of microbial microcystin degradation in biomass-fed gravity driven membrane filtration biofilms Gravity-driven membrane (GDM) filtration is a promising tool for low-cost decentralized drinking water production. The biofilms in GDM systems are able of removing harmful chemical components, particularly toxic cyanobacterial metabolites such as microcystins (MCs). This is relevant for the application of GDM filtration because anthropogenic nutrient input and climate change have led to an increase of toxic cyanobacterial blooms. However, removal of MCs in newly developing GDM biofilms is only established after a prolonged period of time. Since cyanobacterial blooms are transient phenomena, it is important to understand MC removal in mature biofilms with or without prior toxin exposure. In this study, the microbial community composition of GDM biofilms was investigated in systems fed with water from a lake with periodic blooms of MC-producing cyanobacteria. Two out of three experimental treatments were supplemented with dead biomass of a MC-containing cyanobacterial strain, or of a non-toxic mutant, respectively. Analysis of bacterial rRNA genes revealed that both biomass-amended treatments were significantly more similar to each other than to a non-supplemented control. Therefore, it was hypothesized that biofilms could potentially be 'primed' for rapid MC removal by prior addition of non-toxic biomass. A subsequent experiment showed that MC removal developed significantly faster in mature biofilms that were pre-fed with biomass from the mutant strain than in unamended controls, indicating that MC degradation was a facultative trait of bacterial populations in GDM biofilms. The significant enrichment of bacteria related to both aerobic and anaerobic MC degraders suggested that this process might have occurred in parallel in different microniches."

MODEL_PATH = f"{CHECKPOINT_PATH}/{MODEL_NAME}/checkpoint-474/"
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_PATH,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id
).to("cuda")
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model.eval()
encoding = tokenizer(text, return_tensors="pt")
encoding = {k: v.to(model.device) for k, v in encoding.items()}

outputs = model(**encoding)

OSError: Incorrect path_or_model_id: '/srv/scratch2/dbielik/.cache/huggingface/checkpoints/allenai/scibert_scivocab_uncased-ft-zo_up-lower/checkpoint-474/'. Please provide either the path to a local folder or the repo_id of a model on the Hub.

In [None]:
logits = outputs.logits
logits.shape

torch.Size([1, 17])

In [None]:
softmax = torch.nn.Softmax()
probs = softmax(logits.squeeze().cpu())

# for multi-label classification, we need to threshold the probabilities
"""
predictions = np.zeros(probs.shape)
predictions[np.where(probs >= 0.5)] = 1
predicted_labels = [id2label(i) for i in np.where(predictions == 1)]
print(predicted_labels)
"""
# for single-label classification, we can take the argmax
print(probs)
predictions = probs.argmax(-1)
predicted_label = id2label[predictions.int().item()]
print(predicted_label)

tensor([0.0043, 0.0046, 0.0038, 0.0069, 0.0203, 0.4612, 0.0245, 0.0032, 0.0041,
        0.0053, 0.0047, 0.0035, 0.0031, 0.4366, 0.0068, 0.0024, 0.0048],
       grad_fn=<SoftmaxBackward0>)
14


  return self._call_impl(*args, **kwargs)


In [None]:
import gc

def my_reset(*varnames):
    """
    Resets global variables and majority of CUDA memory. Only works in Jupyter.

    varnames are what you want to keep
    """
    try:
        del model
    except NameError:
        pass
    gc.collect()
    with torch.no_grad():
        torch.cuda.ipc_collect()
        torch.cuda.empty_cache()

    globals_ = globals()
    to_save = {v: globals_[v] for v in varnames}
    to_save['my_reset'] = my_reset  # lets keep this function by default
    del globals_
    get_ipython().magic("reset")
    globals().update(to_save)

# my_reset()

### Investigating sklearn precision warning

In [None]:
p = torch.ones((86, 18)).argmax(-1)
p[:10] = torch.Tensor([i for i in range(10)])
t = torch.ones(86, dtype=torch.long)
t[:18] = torch.Tensor([i for i in range(18)])

# no Warning
precision_recall_fscore_support(t, p, average="weighted", labels=[i for i in range(10)])

(0.9873459873459873, 0.1282051282051282, 0.12816755893678972, None)

In [None]:
# Warning
precision_recall_fscore_support(t, p, average="weighted", labels=[i for i in range(11)])

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


(0.9748479368732533, 0.12658227848101267, 0.12654518477303286, None)