In [1]:
# Execute the shared utility script in this kernel. Later cells use names that are not
# defined in this notebook (e.g. `CHECKPOINT_PATH`, `preprocess_data`) and presumably
# come from here or from the data-loading script below — TODO confirm.
%run ../ipynb_util_tars.py

In [2]:
# Execute the data-loading script. It prints the dataset schema, one example instance and
# the `id2label` / `label2id` mappings (see the output below); it presumably also defines
# the `dataset`, `id2label`, `label2id` and `labels` names used by later cells — verify.
%run ../ipynb_load_data_natural.py

{'SDG': Value(dtype='int64', id=None), 'ABSTRACT': Value(dtype='string', id=None), 'id': Value(dtype='string', id=None), 'sdg_desc_short': Value(dtype='string', id=None), 'sdg_desc_long': Value(dtype='string', id=None), '__index_level_0__': Value(dtype='int64', id=None)}
Example instance:	 {'SDG': 8, 'ABSTRACT': 'The scheme gives enterprises with business activity in Norway a tax credit on their R&D projects. The R&D content must be approved by the Research Council of Norway ex ante. In 2009, the cap on expenses per enterprise for intramural R&D projects increased to NOK 5.5 million (previously it was N0K 4 million), and NOK11 million (previously it was NOK 8 million) for projects conducted at an R&D institution.', 'id': None, 'sdg_desc_short': None, 'sdg_desc_long': None, '__index_level_0__': 492}
id2label: {0: '1', 1: '2', 2: '3', 3: '4', 4: '5', 5: '6', 6: '7', 7: '8', 8: '9', 9: '10', 10: '11', 11: '12', 12: '13', 13: '14', 14: '15', 15: '16', 16: '17'}
label2id: {'1': 0, '2': 1, '

In [3]:
import torch.nn as nn
import torch
from transformers.models.llama.modeling_llama import (
    LlamaForSequenceClassification,
    LlamaDecoderLayer,
    LlamaConfig,
    LlamaRMSNorm,
    LlamaModel,
    LLAMA_INPUTS_DOCSTRING,
    add_start_docstrings_to_model_forward,
    BaseModelOutputWithPast,
)
from transformers.cache_utils import Cache, DynamicCache
from typing import Optional, List, Union, Tuple


class UnmaskingLlamaModel(LlamaModel):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`].

    Behaves like [`LlamaModel`] except for the attention mask handed to the decoder
    layers: the mask returned by `_update_causal_mask` is replaced by its last query
    row broadcast over all query positions, so every position may attend to exactly
    the keys the final position is allowed to attend to.

    Args:
        config: LlamaConfig
    """

    def __init__(self, config: LlamaConfig):
        # `LlamaModel.__init__` already creates `embed_tokens`, `layers`, `norm`, sets
        # `padding_idx` / `vocab_size` / `gradient_checkpointing` and calls `post_init()`.
        # Re-creating them here would only build the whole decoder stack a second time.
        super().__init__(config)

    def get_input_embeddings(self):
        """Return the token embedding module."""
        return self.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the token embedding module with `value`."""
        self.embed_tokens = value

    @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        """
        Run the decoder stack with the "unmasked" attention mask.

        Exactly one of `input_ids` / `inputs_embeds` must be given.

        Returns:
            A `BaseModelOutputWithPast`, or a tuple of its non-None fields when
            `return_dict` is false.

        Raises:
            ValueError: if both or neither of `input_ids` and `inputs_embeds` are given.
        """
        # Fall back to the config defaults for every flag the caller left unset.
        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError(
                "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
            )

        if self.gradient_checkpointing and self.training and use_cache:
            # No module-level `logger` exists in this notebook, so obtain the
            # Transformers logger here instead of hitting a NameError.
            from transformers.utils import logging as hf_logging

            hf_logging.get_logger(__name__).warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
            )
            use_cache = False

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        return_legacy_cache = False
        if use_cache and not isinstance(
            past_key_values, Cache
        ):  # kept for BC (non `Cache` `past_key_values` inputs)
            return_legacy_cache = True
            past_key_values = DynamicCache.from_legacy_cache(past_key_values)

        if cache_position is None:
            past_seen_tokens = (
                past_key_values.get_seq_length() if past_key_values is not None else 0
            )
            cache_position = torch.arange(
                past_seen_tokens,
                past_seen_tokens + inputs_embeds.shape[1],
                device=inputs_embeds.device,
            )
        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = self._update_causal_mask(
            attention_mask,
            inputs_embeds,
            cache_position,
            past_key_values,
            output_attentions,
        )
        # NOTE(review): when `_update_causal_mask` returns None the mask is left as None,
        # i.e. nothing is "unmasked" here; how the layers then attend depends on the
        # attention implementation — confirm for the backend in use.
        if causal_mask is not None:
            # The indexing assumes a 4-D mask (batch_size, 1, query_length, key_length)
            # — TODO confirm for attention implementations that pass a 2-D mask.
            # Take the row of the last query position and reuse it for every query.
            causal_mask_last_row = causal_mask[:, :, -1, :].unsqueeze(2)

            # contiguous used to avoid "RuntimeError: CUDA error: misaligned address" in some cases
            causal_mask = causal_mask_last_row.expand_as(causal_mask).contiguous()

        # embed positions
        hidden_states = inputs_embeds

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        next_decoder_cache = None

        for decoder_layer in self.layers:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    decoder_layer.__call__,
                    hidden_states,
                    causal_mask,
                    position_ids,
                    past_key_values,
                    output_attentions,
                    use_cache,
                    cache_position,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=causal_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_values,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                    cache_position=cache_position,
                )

            hidden_states = layer_outputs[0]

            if use_cache:
                # The cache follows the attention weights in the layer output when those are returned.
                next_decoder_cache = layer_outputs[2 if output_attentions else 1]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = next_decoder_cache if use_cache else None
        if return_legacy_cache:
            # The caller passed a non-`Cache` object, so hand back the legacy format.
            next_cache = next_cache.to_legacy_cache()

        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns]
                if v is not None
            )
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )


class UnmaskingLlamaForSequenceClassification(LlamaForSequenceClassification):
    """Llama sequence classifier that uses `UnmaskingLlamaModel` as its backbone."""

    def __init__(self, config):
        super().__init__(config)
        label_count = config.num_labels
        self.num_labels = label_count

        # Swap in the unmasking backbone, then (re)build the bias-free classification head.
        self.model = UnmaskingLlamaModel(config)
        self.score = nn.Linear(
            in_features=config.hidden_size,
            out_features=label_count,
            bias=False,
        )

        # Initialize weights and apply final processing
        self.post_init()


In [4]:
import torch
from sklearn.metrics import classification_report
from transformers import AutoTokenizer

# Load the fine-tuned "unmasked" Llama-3 checkpoint together with its tokenizer.
LLAMA_PATH = CHECKPOINT_PATH + "/final/meta-llama/Meta-Llama-3-8B-ft-zo_up-unmasked/checkpoint-2528/"
llama_tokenizer = AutoTokenizer.from_pretrained(LLAMA_PATH)
llama_model = UnmaskingLlamaForSequenceClassification.from_pretrained(
    LLAMA_PATH,
    label2id=label2id,
    id2label=id2label,
    device_map="auto",
    torch_dtype=torch.bfloat16
)
llama_model.eval()
# Keep the model config in sync with the tokenizer's padding token.
llama_model.config.pad_token_id = llama_tokenizer.pad_token_id

# Tokenize every split and drop the raw columns so only the encoded fields remain.
encoded_dataset = dataset.map(preprocess_data(llama_tokenizer, max_length=1024, padding="longest"), batched=True, remove_columns=dataset["train"].column_names)
encoded_dataset.set_format("torch")

# manual evaluation to show classification_report
true_labels = []
logits = []

for batch in encoded_dataset["test"]:
    # Each item is a single example: move it to the model device and add a batch dimension.
    batch = {k: v.to(llama_model.device).unsqueeze(0) for k, v in batch.items()}
    label = batch.pop("label")

    # Forward pass
    with torch.no_grad():
        out = llama_model(**batch)

    true_labels.append(label.item())
    logits.extend(out.logits.tolist())

probabilites = torch.nn.functional.softmax(torch.tensor(logits), dim=-1)
pred_labels = torch.argmax(probabilites, dim=-1).tolist()

# Pass the class ids explicitly so `target_names` stays aligned even if a class is
# missing from both the references and the predictions; `zero_division=0` reports the
# same 0.0 for never-predicted classes without the repeated undefined-metric warnings.
class_ids = sorted(id2label)
report = classification_report(
    true_labels,
    pred_labels,
    labels=class_ids,
    target_names=[f"SDG {id2label[i]}" for i in class_ids],
    digits=4,
    zero_division=0,
)
print(report)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of UnmaskingLlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Meta-Llama-3-8B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  StockPickler.save(self, obj, save_persistent_id)
  StockPickler.save(self, obj, save_persistent_id)


Map:   0%|          | 0/630 [00:00<?, ? examples/s]

Map:   0%|          | 0/271 [00:00<?, ? examples/s]

              precision    recall  f1-score   support

       SDG 1     0.7895    0.8824    0.8333        17
       SDG 2     0.8333    0.8824    0.8571        17
       SDG 3     0.7500    0.8824    0.8108        17
       SDG 4     0.9412    0.9412    0.9412        17
       SDG 5     0.8125    0.7647    0.7879        17
       SDG 6     0.8889    1.0000    0.9412        16
       SDG 7     0.6667    0.6250    0.6452        16
       SDG 8     0.7857    0.6471    0.7097        17
       SDG 9     0.7222    0.7647    0.7429        17
      SDG 10     0.5333    0.4706    0.5000        17
      SDG 11     0.8824    0.8824    0.8824        17
      SDG 12     0.6875    0.6471    0.6667        17
      SDG 13     0.8125    0.7647    0.7879        17
      SDG 14     1.0000    1.0000    1.0000        17
      SDG 15     0.8125    0.7647    0.7879        17
      SDG 16     0.5789    0.6471    0.6111        17
      SDG 17     0.0000    0.0000    0.0000         1

    accuracy              

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [5]:
from transformers import Trainer, TrainingArguments, EvalPrediction
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


def compute_metrics(pred: EvalPrediction):
    """
    Compute accuracy and support-weighted precision / recall / F1 for a `Trainer` evaluation.

    Args:
        pred: evaluation output whose `predictions` hold the per-class scores (or a
            tuple with the scores as first element) and whose `label_ids` hold the
            reference class indices.

    Returns:
        Dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    labels = pred.label_ids
    scores = pred.predictions
    if isinstance(scores, tuple):
        # Some models return extra outputs next to the logits; the logits come first.
        scores = scores[0]
    # Reduce the scores to class indices once and reuse them for every metric.
    predicted = scores.argmax(-1)

    accuracy = accuracy_score(labels, predicted)
    # zero_division=0 yields the same 0.0 for never-predicted classes, minus the warning.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predicted, average="weighted", zero_division=0
    )
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

# Cross-check the manual loop above by evaluating the same model on the same
# encoded test split through the `Trainer` API.
eval_args = TrainingArguments(
    output_dir="./eval_output",
    per_device_eval_batch_size=4,
)
eval_trainer = Trainer(
    model=llama_model,
    args=eval_args,
    eval_dataset=encoded_dataset["test"],
    compute_metrics=compute_metrics,
)
eval_results = eval_trainer.evaluate()
print(eval_results)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mdvdblk[0m ([33mngmi[0m). Use [1m`wandb login --relogin`[0m to force relogin


{'eval_loss': 1.389865517616272, 'eval_accuracy': 0.7822878228782287, 'eval_precision': 0.7782114418888549, 'eval_recall': 0.7822878228782287, 'eval_f1': 0.7785993031664066, 'eval_runtime': 38.9459, 'eval_samples_per_second': 6.958, 'eval_steps_per_second': 1.746}


In [6]:
# Raw predicted class indices from the manual evaluation loop (argmax over the logits);
# `id2label` maps each index to its SDG number.
pred_labels

[3,
 2,
 6,
 5,
 3,
 8,
 5,
 14,
 14,
 8,
 4,
 11,
 0,
 3,
 14,
 8,
 2,
 13,
 0,
 4,
 3,
 2,
 0,
 0,
 15,
 15,
 2,
 12,
 0,
 4,
 9,
 11,
 15,
 9,
 13,
 11,
 3,
 10,
 10,
 13,
 5,
 0,
 1,
 3,
 0,
 10,
 10,
 1,
 15,
 14,
 0,
 8,
 7,
 2,
 0,
 1,
 11,
 4,
 10,
 11,
 2,
 15,
 12,
 6,
 0,
 13,
 11,
 6,
 9,
 0,
 7,
 1,
 3,
 14,
 6,
 5,
 4,
 5,
 4,
 3,
 12,
 3,
 5,
 12,
 13,
 14,
 14,
 12,
 2,
 12,
 2,
 10,
 14,
 14,
 10,
 15,
 9,
 11,
 3,
 9,
 2,
 9,
 7,
 0,
 13,
 15,
 11,
 10,
 5,
 4,
 13,
 12,
 8,
 2,
 6,
 0,
 15,
 11,
 14,
 1,
 11,
 3,
 2,
 7,
 5,
 3,
 3,
 6,
 7,
 14,
 1,
 12,
 10,
 12,
 4,
 15,
 0,
 6,
 13,
 14,
 2,
 10,
 9,
 14,
 2,
 1,
 4,
 7,
 7,
 5,
 5,
 8,
 8,
 13,
 5,
 13,
 15,
 9,
 4,
 5,
 2,
 11,
 7,
 2,
 13,
 6,
 11,
 13,
 4,
 1,
 12,
 15,
 9,
 6,
 4,
 1,
 7,
 12,
 5,
 15,
 12,
 5,
 6,
 3,
 7,
 15,
 7,
 15,
 2,
 2,
 9,
 2,
 5,
 7,
 1,
 10,
 3,
 4,
 13,
 10,
 8,
 0,
 5,
 11,
 9,
 4,
 2,
 6,
 8,
 13,
 14,
 8,
 15,
 6,
 0,
 8,
 9,
 13,
 4,
 10,
 12,
 2,
 8,
 8,
 12,
 1,
 9,
 1,
 13,