# SCIBERT in HuggingFace

In [3]:
# run this script to import the variables and settings from ipynb_util.py
%run ipynb_util.py

In [4]:
from datasets import load_dataset

# load the dataset
# note: if you don't have the data in the folder, use the download-data.sh script
dataset = load_dataset("json", data_files=str(DATA_DIR_PATH / "task1-train.jsonl"))
dataset = dataset["train"].train_test_split(test_size=0.3, seed=SEED)

example = dataset["train"][0]
print("Example instance:\t", example)

labels = set(dataset["train"]["SDG"])
# identity because labels are already ids and vice-versa
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}
labels

Generating train split: 0 examples [00:00, ? examples/s]

Example instance:	 {'ID': 'oai:www.zora.uzh.ch:126666', 'TITLE': 'Identifying phrasemes via interlingual association measures - A data-driven approach on dependency-parsed and word-aligned parallel corpora', 'ABSTRACT': 'In corpus linguistics, statistical association measures play a major role in identifying collocations such as ‘play’ and ‘role’ in ‘play a role’.  Those two words that appear considerably more often in the same context than one would expect from a random distribution are collocates.  They typically constitute meaning beyond the bare combination of both words’ semantics.\r\nWe employ the same association measures on interlingual word co-occurrences based on statistical word alignment and combine them with intralingual association measures on syntactical dependency relations in order to identify phrasemes.  Support verb constructions exemplify our approach.  They are characterized by the respective verb contributing little to the semantics of the whole construction, whic

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}

In [5]:
from transformers import AutoTokenizer

# base model
HF_MODEL_NAME = "allenai/scibert_scivocab_uncased"
# final model
MODEL_NAME = f"{HF_MODEL_NAME}-ft-task1"

tokenizer = AutoTokenizer.from_pretrained(HF_MODEL_NAME)

def preprocess_data(instances):
    # take a batch of titles and abstracts and concat them
    titles = instances["TITLE"]
    abstracts = instances["ABSTRACT"]
    texts = [f"{title} {abstract}" for title, abstract in zip(titles, abstracts)]
    # encode
    encoding = tokenizer(texts, padding="max_length", truncation=True, max_length=512, return_tensors="pt")

    # add labels
    encoding["label"] = torch.tensor([label2id[label] for label in instances["SDG"]])

    return encoding

encoded_dataset = dataset.map(preprocess_data, batched=True, remove_columns=dataset["train"].column_names)
encoded_dataset.set_format("torch")

Map:   0%|          | 0/430 [00:00<?, ? examples/s]

In [6]:
example = encoded_dataset["train"][0]
print("Example instance:\t", example)

tokenizer.decode(example["input_ids"])

Example instance:	 {'input_ids': tensor([  102,  6121, 14367,  8608,  2168,   357,  2123,   507,  2274,  2554,
          579,   106,   453,   579,  6920,  1139,   191, 10271,   579, 24199,
        30118,   137,  3824,   579, 10016,  3098, 22916,   121,  8995, 23876,
          422,  2397,  2274,  2554,  2250,   106,  1626,  1447,   121,  6121,
        15231, 25253,   555,   188,  1384,  2250,  5459,   137,  1384,  1447,
         5459,   121,  1384,  2250,   106,  1447,  5459,   205,  1052,   502,
         2880,   198,  1853,  8222,   475,  1992,   121,   111,   855,  2220,
          506,   482,   989,  3596,   263,   106,  1533,  1382,   220, 15231,
         5581,   123,   205,   698,  3915, 10334,  4419,  4882,   111, 13057,
         2702,   131,   655,  2880,  5459,  7816,   205,   185,  2134,   111,
          855,  2274,  2554,   191,   357,  2123,   507,  3824,   304,   579,
        16273,   791,   191,  2397,  3824,  6836,   137,  9548,  1445,   190,
         4743,  2123,   507,  2

'[CLS] identifying phrasemes via interlingual association measures - a data - driven approach on dependency - parsed and word - aligned parallel corpora in corpus linguistics, statistical association measures play a major role in identifying collocations such as ‘ play ’ and ‘ role ’ in ‘ play a role ’. those two words that appear considerably more often in the same context than one would expect from a random distribution are collocates. they typically constitute meaning beyond the bare combination of both words ’ semantics. we employ the same association measures on interlingual word co - occurrences based on statistical word alignment and combine them with intralingual association measures on syntactical dependency relations in order to identify phrasemes. support verb constructions exemplify our approach. they are characterized by the respective verb contributing little to the semantics of the whole construction, which we can determine with the aid of our intralingual association me

In [7]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    HF_MODEL_NAME,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import TrainingArguments, Trainer, EvalPrediction

BATCH_SIZE = 8
METRIC_NAME = "accuracy"

args = TrainingArguments(
    f"{CHECKPOINT_PATH}/{MODEL_NAME}",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    #save_steps=10,
    #eval_steps=10,
    #logging_steps=10,
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=20,
    load_best_model_at_end=True,
    save_total_limit=2,
    metric_for_best_model=METRIC_NAME,
    seed=SEED
)

def compute_metrics(pred: EvalPrediction):
    labels = pred.label_ids
    accuracy = accuracy_score(labels, pred.predictions.argmax(-1))
    precision, recall, f1, _ = precision_recall_fscore_support(labels, pred.predictions.argmax(-1), average="weighted")
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


KeyError: 'test'

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,2.294022,0.348837,0.121687,0.348837,0.180433
2,No log,1.983403,0.387597,0.319233,0.387597,0.257885
3,No log,1.756778,0.457364,0.391972,0.457364,0.384278
4,No log,1.666955,0.511628,0.415087,0.511628,0.448823
5,No log,1.752029,0.527132,0.440522,0.527132,0.463918
6,No log,1.663559,0.51938,0.462906,0.51938,0.482689
7,No log,1.692489,0.527132,0.527017,0.527132,0.504697
8,No log,1.768125,0.542636,0.565891,0.542636,0.528349
9,No log,1.789269,0.51938,0.636693,0.51938,0.530452
10,No log,1.972066,0.534884,0.633583,0.534884,0.546809


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

TrainOutput(global_step=760, training_loss=0.5132972466318231, metrics={'train_runtime': 597.7961, 'train_samples_per_second': 10.07, 'train_steps_per_second': 1.271, 'total_flos': 1584156096552960.0, 'train_loss': 0.5132972466318231, 'epoch': 20.0})

## Evaluation

In [None]:
trainer.evaluate()

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 2.0742533206939697,
 'eval_accuracy': 0.5503875968992248,
 'eval_precision': 0.6207630338957288,
 'eval_recall': 0.5503875968992248,
 'eval_f1': 0.5498825849667871,
 'eval_runtime': 4.3855,
 'eval_samples_per_second': 29.415,
 'eval_steps_per_second': 3.876,
 'epoch': 20.0}

## Inference

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

text = "This is a test text about climate change."

MODEL_PATH = f"{CHECKPOINT_PATH}/{MODEL_NAME}/checkpoint-532"
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_PATH,
    num_labels=len(labels),
    id2label={i: label for i, label in enumerate(labels)},
    label2id={label: i for i, label in enumerate(labels)}
).to("cuda")
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model.eval()
encoding = tokenizer(text, return_tensors="pt")
encoding = {k: v.to(model.device) for k, v in encoding.items()}

outputs = model(**encoding)

In [None]:
logits = outputs.logits
logits.shape

torch.Size([1, 18])

In [None]:
softmax = torch.nn.Softmax()
probs = softmax(logits.squeeze().cpu())

# for multi-label classification, we need to threshold the probabilities
"""
predictions = np.zeros(probs.shape)
predictions[np.where(probs >= 0.5)] = 1
predicted_labels = [id2label(i) for i in np.where(predictions == 1)]
print(predicted_labels)
"""
# for single-label classification, we can take the argmax
print(probs)
predictions = probs.argmax(-1)
predicted_label = id2label[predictions.int().item()]
print(predicted_label)

tensor([1.4732e-03, 8.4256e-04, 1.2375e-03, 1.2201e-03, 1.0317e-03, 1.0760e-03,
        1.3087e-03, 1.3920e-03, 2.3354e-03, 7.2098e-04, 6.6897e-04, 8.4898e-04,
        8.5957e-04, 9.7841e-01, 1.9451e-03, 1.9424e-03, 1.5772e-03, 1.1073e-03],
       grad_fn=<SoftmaxBackward0>)
13


In [None]:
import gc

def my_reset(*varnames):
    """
    Resets global variables and majority of CUDA memory. Only works in Jupyter.

    varnames are what you want to keep
    """
    try:
        del model
    except NameError:
        pass
    gc.collect()
    with torch.no_grad():
        torch.cuda.ipc_collect()
        torch.cuda.empty_cache()

    globals_ = globals()
    to_save = {v: globals_[v] for v in varnames}
    to_save['my_reset'] = my_reset  # lets keep this function by default
    del globals_
    get_ipython().magic("reset")
    globals().update(to_save)

# my_reset()

  get_ipython().magic("reset")


Nothing done.


### Investigating sklearn precision warning

In [None]:
p = torch.ones((86, 18)).argmax(-1)
p[:10] = torch.Tensor([i for i in range(10)])
t = torch.ones(86, dtype=torch.long)
t[:18] = torch.Tensor([i for i in range(18)])

# no Warning
precision_recall_fscore_support(t, p, average="weighted", labels=[i for i in range(10)])

(0.9873459873459873, 0.1282051282051282, 0.12816755893678972, None)

In [None]:
# Warning
precision_recall_fscore_support(t, p, average="weighted", labels=[i for i in range(11)])

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


(0.9748479368732533, 0.12658227848101267, 0.12654518477303286, None)