In [1]:
%run ../ipynb_util_tars.py

## Test different ways to load the models

The outcome of this notebook is an easy way to access the fine-tuned models via the following variables:
* `scibert_model` = SciBERT finetuned on ZO_UP
* `llama_model` = LLaMA-3 finetuned on ZO_UP with a classification head
* `unllama_model` = LLaMA-3 finetuned on ZO_UP with a classification head and without the causal mask

### Dataset + encodings

In [2]:
%run ../ipynb_load_data.py

{'SDG': ClassLabel(names=['1', '10', '11', '12', '13', '14', '15', '16', '17', '2', '3', '4', '5', '6', '7', '8', '9'], id=None), 'ABSTRACT': Value(dtype='string', id=None), 'id': Value(dtype='string', id=None), 'sdg_desc_short': Value(dtype='string', id=None), 'sdg_desc_long': Value(dtype='string', id=None)}
Example instance:	 {'SDG': 16, 'ABSTRACT': 'The first attempts to modernize simply replaced the single huge engine with a huge electric motor, changing little. The drive-shafts were replaced by wires, the huge steam engine by dozens of small motors. Factories spread out, there was natural light, and room to use ceiling-slung cranes. Workers had responsibility for their own machines, they needed better training and better pay. The electric motor was a wonderful invention, once we changed all the everyday details that surrounded it.', 'id': None, 'sdg_desc_short': None, 'sdg_desc_long': None}
Encoded (label2id) label:	 16
Decoded (id2label) label:	 9
9 16 16


In [3]:
# Short probe sentence fed to each model's tokenizer below for a quick single-input sanity check.
sample_sentence = "Is this about poverty?"

### Evaluator

In [4]:
import pprint
import datasets
import evaluate
from evaluate import evaluator, Metric
from sklearn.metrics import accuracy_score


class MulticlassAccuracy(Metric):
    """Accuracy metric that tolerates extra keyword arguments in ``compute``.

    Stand-in for the default Accuracy metric, which does not support being
    passed ``average`` in its compute call.
    """

    def _info(self):
        """Return the metric metadata, with a feature schema chosen by ``config_name``."""
        if self.config_name == "multilabel":
            # Multilabel configuration: one sequence of int labels per example.
            feature_spec = {
                "predictions": datasets.Sequence(datasets.Value("int32")),
                "references": datasets.Sequence(datasets.Value("int32")),
            }
        else:
            # Any other configuration: one int label per example.
            feature_spec = {
                "predictions": datasets.Value("int32"),
                "references": datasets.Value("int32"),
            }

        return evaluate.MetricInfo(
            description="Accuracy",
            citation="",
            inputs_description="",
            features=datasets.Features(feature_spec),
            reference_urls=["https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html"],
        )

    def _compute(self, predictions, references, normalize=True, sample_weight=None, **kwargs):
        """Compute accuracy with scikit-learn's ``accuracy_score``.

        ``**kwargs`` is accepted and ignored so that callers passing additional
        compute arguments do not break this metric.

        Returns:
            dict with a single ``"accuracy"`` key holding a Python float.
        """
        score = accuracy_score(
            references,
            predictions,
            normalize=normalize,
            sample_weight=sample_weight,
        )
        return {"accuracy": float(score)}

# Shared text-classification evaluator used for the models that fit the `evaluate` pipeline.
task_evaluator = evaluator("text-classification")
# Ask for weighted averaging; presumably these kwargs are forwarded to every metric's
# compute call, which is why MulticlassAccuracy accepts and ignores **kwargs.
task_evaluator.METRIC_KWARGS = dict(average="weighted")

# Metric name -> metric object or metric identifier string.
# Accuracy uses the custom class; the remaining metrics are referenced by name.
metrics_dict = {
    "accuracy": MulticlassAccuracy(),
    **{metric_id: metric_id for metric_id in ("precision", "recall", "f1")},
}

## SciBERT baseline

In [5]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Location of the fine-tuned SciBERT checkpoint; CHECKPOINT_PATH must already be defined
# by an earlier cell. Built with an f-string, matching how LLAMA_PATH is built.
SCIBERT_PATH = f"{CHECKPOINT_PATH}/allenai/scibert_scivocab_uncased-ft-zo_up-lower/checkpoint-240/"

# Load the classifier with the dataset's label mappings so predictions decode to SDG names.
scibert_model = AutoModelForSequenceClassification.from_pretrained(
    SCIBERT_PATH,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
).to("cuda")
scibert_tokenizer = AutoTokenizer.from_pretrained(SCIBERT_PATH)
scibert_model.eval()

# Sample input to SciBERT
sample_input = scibert_tokenizer(sample_sentence, return_tensors="pt").to("cuda")
# Pure inference: skip autograd bookkeeping so no graph is attached to the logits.
with torch.no_grad():
    sample_output = scibert_model(**sample_input)
# Highest class probability and its class index for the sample sentence.
print(torch.max(torch.softmax(sample_output.logits, dim=-1), dim=-1))

torch.return_types.max(
values=tensor([0.8610], device='cuda:0', grad_fn=<MaxBackward0>),
indices=tensor([0], device='cuda:0'))


In [6]:
# Evaluate SciBERT on the test split with the shared task evaluator.
# All configured metrics are bundled into one combined metric object.
scibert_combined_metric = evaluate.combine(metrics_dict)
eval_results = task_evaluator.compute(
    scibert_model,
    tokenizer=scibert_tokenizer,
    data=dataset["test"],
    input_column="ABSTRACT",
    label_column="SDG",
    label_mapping=label2id,
    metric=scibert_combined_metric,
)
pprint.pprint(eval_results)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'accuracy': 0.7269372693726938,
 'f1': 0.718275735363878,
 'latency_in_seconds': 0.0068033860147521715,
 'precision': 0.7210623353133084,
 'recall': 0.7269372693726938,
 'samples_per_second': 146.98563301150966,
 'total_time_in_seconds': 1.8437176099978387}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [15]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Evaluate SciBERT (manual v2)
# Tokenize every split without labels; only the "test" split is consumed below.
scibert_tokenized_dataset = dataset.map(
    preprocess_data(scibert_tokenizer, include_labels=False),
    batched=True,
    remove_columns=dataset["train"].column_names,
    batch_size=32,
)
scibert_tokenized_dataset.set_format("torch")

# Run the model one example at a time (each example gets a leading batch dimension).
# Logits are collected in a list and concatenated once, instead of re-concatenating a
# growing tensor on every iteration; no_grad avoids building an autograd graph per pass.
scibert_logit_chunks = []
with torch.no_grad():
    for example in scibert_tokenized_dataset["test"]:
        example_logits = scibert_model(
            input_ids=example["input_ids"].to("cuda").unsqueeze(0),
            attention_mask=example["attention_mask"].to("cuda").unsqueeze(0),
        ).logits
        scibert_logit_chunks.append(example_logits.cpu())
scibert_out_logits = torch.cat(scibert_logit_chunks)

# Softmax is monotonic, so the argmax of the raw logits is the predicted class.
scibert_preds = scibert_out_logits.argmax(dim=-1)
scibert_true_labels = dataset["test"]["SDG"]

scibert_accuracy = accuracy_score(y_true=scibert_true_labels, y_pred=scibert_preds)
scibert_f1 = f1_score(y_true=scibert_true_labels, y_pred=scibert_preds, average="weighted")
scibert_precision = precision_score(y_true=scibert_true_labels, y_pred=scibert_preds, average="weighted")
scibert_recall = recall_score(y_true=scibert_true_labels, y_pred=scibert_preds, average="weighted")

# Side-by-side peek at the first 32 reference labels and predictions.
print(scibert_true_labels[:32])
print(scibert_preds[:32].tolist())

pprint.pprint({
    "accuracy": scibert_accuracy,
    "precision": scibert_precision,
    "recall": scibert_recall,
    "f1": scibert_f1
})

  StockPickler.save(self, obj, save_persistent_id)
  StockPickler.save(self, obj, save_persistent_id)


[3, 9, 4, 9, 14, 15, 1, 5, 5, 10, 11, 12, 15, 14, 14, 10, 0, 14, 4, 1, 7, 1, 6, 11, 7, 10, 1, 1, 9, 9, 2, 14]
[3, 9, 4, 9, 14, 15, 16, 5, 5, 12, 11, 12, 1, 4, 14, 10, 0, 13, 4, 1, 7, 15, 6, 11, 7, 10, 1, 15, 9, 9, 2, 10]
{'accuracy': 0.7269372693726938,
 'f1': 0.718275735363878,
 'precision': 0.7210623353133084,
 'recall': 0.7269372693726938}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## LLaMA-3

In [None]:
import torch
from transformers import AutoTokenizer, LlamaForSequenceClassification

# Location of the fine-tuned LLaMA-3 checkpoint; CHECKPOINT_PATH must already be defined
# by an earlier cell.
LLAMA_PATH = f"{CHECKPOINT_PATH}/meta-llama/Meta-Llama-3-8B-ft-zo_up/checkpoint-2200/"
llama_tokenizer = AutoTokenizer.from_pretrained(LLAMA_PATH)
# Use the dataset's label set and mappings, as the SciBERT cell does, instead of a
# hard-coded class count (the dataset schema printed above lists 17 SDG classes).
# NOTE(review): the captured load output reports 'score.weight' as newly initialized from
# meta-llama/Meta-Llama-3-8B. Confirm the fine-tuned classification head is really restored
# from this checkpoint before trusting the predictions.
llama_model = LlamaForSequenceClassification.from_pretrained(
    LLAMA_PATH,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
# The sequence-classification head needs the pad token id to handle padded batches.
llama_model.config.pad_token_id = llama_tokenizer.pad_token_id
# Explicit inference mode, mirroring the SciBERT cell.
llama_model.eval()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Meta-Llama-3-8B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Single-sentence sanity check of the LLaMA classifier.
tokenized_sample = llama_tokenizer(sample_sentence, return_tensors="pt")
token_ids = tokenized_sample["input_ids"]

# Inference only: disable autograd so no graph is kept for the 8B model.
with torch.no_grad():
    out = llama_model(token_ids)

In [None]:
# Highest class probability and its class index for the sample sentence.
torch.softmax(out.logits, dim=-1).max(dim=-1)

torch.return_types.max(
values=tensor([0.4863], dtype=torch.bfloat16),
indices=tensor([12]))

In [None]:
# Evaluate LLaMA - can't use task evaluator because it doesn't support accelerate, which is required for inference with larger models
# https://github.com/huggingface/evaluate/issues/487

# tokenize the dataset first
llama_tokenized_dataset = dataset.map(
    preprocess_data(llama_tokenizer, padding="longest", max_length=1024, include_labels=False),
    batched=True,
    remove_columns=dataset["train"].column_names,
)
llama_tokenized_dataset.set_format("torch")

# Shape check: single sample sentence vs. one padded test example.
print(token_ids.shape)
print(llama_tokenized_dataset["test"][0]["input_ids"].unsqueeze(dim=0).shape)

# Feeding the whole test split at once runs out of CUDA memory, so run it in fixed-size
# batches. This replaces the former hard-coded two-way split at index 128.
# no_grad keeps activations from being retained for a backward pass.
LLAMA_EVAL_BATCH_SIZE = 32
llama_test_split = llama_tokenized_dataset["test"]
llama_logit_chunks = []
with torch.no_grad():
    for batch_start in range(0, len(llama_test_split), LLAMA_EVAL_BATCH_SIZE):
        llama_batch = llama_test_split[batch_start:batch_start + LLAMA_EVAL_BATCH_SIZE]
        llama_logit_chunks.append(llama_model(**llama_batch).logits.cpu())
# `out` holds the logits of the full test split, one row per example, in dataset order.
out = torch.cat(llama_logit_chunks, dim=0)

  StockPickler.save(self, obj, save_persistent_id)
  StockPickler.save(self, obj, save_persistent_id)


torch.Size([1, 6])
torch.Size([1, 522])


In [None]:
# Per-example top probability and predicted class index from the concatenated logits.
pred_probs, preds = torch.softmax(out, dim=-1).max(dim=-1)
true_labels = dataset["test"]["SDG"]

# TODO (original note): try older tf version, tinyllama (or any model llama2)
llama_accuracy = accuracy_score(y_true=true_labels, y_pred=preds)
print(llama_accuracy)

0.7269372693726938
