In [6]:
import json
import torch
import numpy as np
import evaluate
from datasets import load_dataset, Dataset, DatasetDict
from transformers import AutoTokenizer, BitsAndBytesConfig
from transformers import DataCollatorForTokenClassification
from transformers import TrainingArguments, Trainer
from peft import get_peft_model, LoraConfig, TaskType
from dotenv import dotenv_values

from utils import DataPreprocessor, DatasetFormatConverter
from modeling_llama import LlamaForTokenClassification

### Esempio con LlamaForSequenceClassification

In [None]:
from transformers.models.llama import LlamaForSequenceClassification
from transformers import AutoTokenizer
import torch
LLAMA_TOKEN = dotenv_values(".env.base")['LLAMA_TOKEN']

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf", token=LLAMA_TOKEN,
                                          cache_dir='/data/disk1/share/pferrazzi/.cache')
tokenizer.pad_token = tokenizer.eos_token
print("tokenizer.eos_token", tokenizer.eos_token)
sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]
tokens = tokenizer(sequences, padding="max_length", truncation=True, return_tensors="pt", max_length=8)
print("tokens", tokens)
model = LlamaForSequenceClassification.from_pretrained("meta-llama/Llama-2-7b-chat-hf", token=LLAMA_TOKEN,
                                                       cache_dir='/data/disk1/share/pferrazzi/.cache')
model.config.pad_token_id = model.config.eos_token_id
outputs = model(**tokens)
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
print("predictions", predictions)

tokenizer.eos_token </s>
tokens {'input_ids': tensor([[    1,   306, 29915,   345,  1063, 10534,   363,   263],
        [    1,  1105,   505,   306, 29991,     2,     2,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 0, 0, 0]])}


Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.27it/s]
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-chat-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


predictions tensor([[0.0854, 0.9146],
        [0.9874, 0.0126]], grad_fn=<SoftmaxBackward0>)


### Load the data

In [3]:
BASE_MODEL_CHECKPOINT = 'meta-llama/Llama-2-7b-hf'
LLAMA_TOKEN = dotenv_values(".env.base")['LLAMA_TOKEN']
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_CHECKPOINT,
                                          token =LLAMA_TOKEN)
tokenizer.pad_token = tokenizer.eos_token
seqeval = evaluate.load("seqeval")

In [4]:
DATASET_CHEKPOINT="ferrazzipietro/e3c-sentences" 
TRAIN_LAYER="en.layer1"
offset=False
instruction_on_response_format='Extract the entities contained in the text. Extract only entities contained in the text.\nReturn the result in a json format: [{"entity":"entity_name"}].'# 'Return the result in a json format.'
simplest_prompt=False
dataset_text_field="prompt"
preprocessor = DataPreprocessor(BASE_MODEL_CHECKPOINT, 
                                tokenizer)
dataset = load_dataset(DATASET_CHEKPOINT) #download_mode="force_redownload"
dataset = dataset[TRAIN_LAYER]
dataset = dataset.shuffle(seed=1234)  # Shuffle dataset here
dataset = preprocessor.preprocess_data_one_layer(dataset, 
                                                 instruction_on_response_format=instruction_on_response_format,
                                                 simplest_prompt=simplest_prompt)
dataset = dataset.map(lambda samples: tokenizer(samples[dataset_text_field]), batched=True)
dataset_format_converter = DatasetFormatConverter(dataset)
dataset_format_converter.apply()
ds = dataset_format_converter.dataset
ds = ds.rename_column("word_level_labels", "ner_tags")
ds = ds.rename_column("words", "tokens")
label2id = dataset_format_converter.label2id
id2label = {v: k for k, v in label2id.items()}
label_list = list(label2id.keys())

### load the model

In [7]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit= True,# model_loading_params.load_in_4bit,
    load_in_8bit = False,#  model_loading_params.load_in_8bit,

    bnb_4bit_quant_type= "nf4",
    bnb_4bit_compute_dtype= torch.bfloat16,
    bnb_4bit_use_double_quant= True,

    # llm_int8_threshold= 6.0,# model_loading_params.llm_int8_threshold,
    # llm_int8_skip_modules= ["q_proj", "k_proj", "v_proj", "o_proj","gate_proj"],# model_loading_params.llm_int8_skip_modules,
    # llm_int8_has_fp16_weight= True# model_loading_params.llm_int8_has_fp16_weight
)


model = LlamaForTokenClassification.from_pretrained(
    BASE_MODEL_CHECKPOINT, 
    num_labels=len(label2id), 
    id2label=id2label, 
    label2id=label2id,
    token = LLAMA_TOKEN,
    load_in_4bit=True,
    quantization_config = bnb_config,
    # device_map = 'auto',
    cache_dir='/data/disk1/share/pferrazzi/.cache')
peft_config = LoraConfig(task_type=TaskType.TOKEN_CLS, inference_mode=False, r=12, lora_alpha=32, lora_dropout=0.1)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.30s/it]
Some weights of LlamaForTokenClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-hf and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 6,303,747 || all params: 6,613,659,654 || trainable%: 0.09531405197404492


### tokenization

In [8]:
def tokenize_and_align_labels(examples, max_length=28, word_column_name='words', labels_column_name='word_level_labels'):# , word_column_name='tokens', labels_column_name='ner_tags'):#

    tokenized_inputs = tokenizer(examples[word_column_name], is_split_into_words=True, padding='longest', max_length=max_length, truncation=True)

    labels = []
    for i, label in enumerate(examples[labels_column_name]):
        # print('label: ', label)
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-99)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


tokenized_ds = dataset_format_converter.dataset.map(tokenize_and_align_labels, batched=True)# dataset_format_converter.dataset.map(tokenize_and_align_labels, batched=True)
train_data, val_data, test_data = preprocessor.split_layer_into_train_val_test_(tokenized_ds, TRAIN_LAYER)
# tokenized_ds = ds.map(tokenize_and_align_labels, batched=True)
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
train_data, val_data, test_data = preprocessor.split_layer_into_train_val_test_(tokenized_ds, TRAIN_LAYER)

### calculate metrics for training

In [9]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }


### TRAIN

In [13]:
HF_TOKEN = dotenv_values(".env.base")['HF_TOKEN']

training_args = TrainingArguments(
    output_dir="my_awesome_ds_model",
    learning_rate=1e-4,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps= 4,
    num_train_epochs=1,
    max_steps=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    #load_best_model_at_end=True,
    # push_to_hub=True,
    # hub_token=HF_TOKEN,
    # hub_model_id='ls_llama_e3c',
    # report_to="wandb",
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()


dataloader_config = DataLoaderConfiguration(dispatch_batches=None)
Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mferrazzipietro[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
../aten/src/ATen/native/cuda/Loss.cu:250: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [3,0,0] Assertion `t >= 0 && t < n_classes` failed.
../aten/src/ATen/native/cuda/Loss.cu:250: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [4,0,0] Assertion `t >= 0 && t < n_classes` failed.
../aten/src/ATen/native/cuda/Loss.cu:250: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [8,0,0] Assertion `t >= 0 && t < n_classes` failed.
../aten/src/ATen/native/cuda/Loss.cu:250: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [9,0,0] Assertion `t >= 0 && t < n_classes` failed.
../aten/src/ATen/native/cuda/Loss.cu:250: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [12,0,0] Assertion `t >= 0 && t < n

RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/home/pferrazzi/LS-LLaMA-E3C/.venv/lib/python3.11/site-packages/torch/nn/parallel/parallel_apply.py", line 83, in _worker
    output = module(*input, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/pferrazzi/LS-LLaMA-E3C/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/pferrazzi/LS-LLaMA-E3C/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/pferrazzi/LS-LLaMA-E3C/.venv/lib/python3.11/site-packages/peft/peft_model.py", line 1590, in forward
    return self.base_model(
           ^^^^^^^^^^^^^^^^
  File "/home/pferrazzi/LS-LLaMA-E3C/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/pferrazzi/LS-LLaMA-E3C/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/pferrazzi/LS-LLaMA-E3C/.venv/lib/python3.11/site-packages/peft/tuners/tuners_utils.py", line 161, in forward
    return self.model.forward(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/pferrazzi/LS-LLaMA-E3C/.venv/lib/python3.11/site-packages/accelerate/hooks.py", line 167, in new_forward
    return module._hf_hook.post_forward(module, output)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/pferrazzi/LS-LLaMA-E3C/.venv/lib/python3.11/site-packages/accelerate/hooks.py", line 380, in post_forward
    output = send_to_device(output, self.input_device, skip_keys=self.skip_keys)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/pferrazzi/LS-LLaMA-E3C/.venv/lib/python3.11/site-packages/accelerate/utils/operations.py", line 189, in send_to_device
    {
  File "/home/pferrazzi/LS-LLaMA-E3C/.venv/lib/python3.11/site-packages/accelerate/utils/operations.py", line 190, in <dictcomp>
    k: t if k in skip_keys else send_to_device(t, device, non_blocking=non_blocking, skip_keys=skip_keys)
                                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/pferrazzi/LS-LLaMA-E3C/.venv/lib/python3.11/site-packages/accelerate/utils/operations.py", line 174, in send_to_device
    raise error
  File "/home/pferrazzi/LS-LLaMA-E3C/.venv/lib/python3.11/site-packages/accelerate/utils/operations.py", line 158, in send_to_device
    return tensor.to(device, non_blocking=non_blocking)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.



In [1]:
import json
import sys
import numpy as np
import evaluate
from datasets import load_dataset, Dataset, DatasetDict
from transformers import AutoTokenizer, BitsAndBytesConfig
from transformers import DataCollatorForTokenClassification
from transformers import TrainingArguments, Trainer
from peft import get_peft_model, LoraConfig, TaskType, PeftModel
from dotenv import dotenv_values
import wandb
import datetime
import os
import torch

from utils import DataPreprocessor, DatasetFormatConverter
from modeling_llama import LlamaForTokenClassification



WANDB_KEY = dotenv_values(".env.base")['WANDB_KEY']
BASE_MODEL_CHECKPOINT = 'meta-llama/Llama-2-7b-chat-hf'
LLAMA_TOKEN = dotenv_values(".env.base")['LLAMA_TOKEN']
HF_TOKEN = dotenv_values(".env.base")['HF_TOKEN']
use_e3c = True


# os.environ["TOKENIZERS_PARALLELISM"] = "false"
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_CHECKPOINT,
                                          token =LLAMA_TOKEN,
                                          cache_dir='/data/disk1/share/pferrazzi/.cache')
tokenizer.pad_token = tokenizer.eos_token
seqeval = evaluate.load("seqeval")

# if not use_e3c:
#     ds = load_dataset("wnut_17")
#     label2id_ds = { "O": 0, "B-corporation": 1, "I-corporation": 2, "B-creative-work": 3, "I-creative-work": 4, "B-group": 5, "I-group": 6, "B-location": 7, "I-location": 8, "B-person": 9, "I-person": 10, "B-product": 11, "I-product": 12, }
#     id2label_ds = {v: k for k, v in label2id_ds.items()}
#     label_list_ds = list(label2id_ds.keys()) # ds["train"].features[f"ner_tags"].feature.names
#     id2label = id2label_ds
#     label2id = label2id_ds
#     label_list = label_list_ds
#     ds = ds.rename_column("ner_tags", "word_level_labels")
#     ds = ds.rename_column("tokens", "words")
if use_e3c:
    DATASET_CHEKPOINT="ferrazzipietro/e3c-sentences" 
    TRAIN_LAYER="en.layer1"
    offset=False
    instruction_on_response_format='Extract the entities contained in the text. Extract only entities contained in the text.\nReturn the result in a json format: [{"entity":"entity_name"}].'# 'Return the result in a json format.'
    simplest_prompt=False
    dataset_text_field="prompt"
    preprocessor = DataPreprocessor(BASE_MODEL_CHECKPOINT, 
                                    tokenizer)
    dataset = load_dataset(DATASET_CHEKPOINT) #download_mode="force_redownload"
    dataset = dataset[TRAIN_LAYER]
    dataset = dataset.shuffle(seed=1234)  # Shuffle dataset here
    dataset = preprocessor.preprocess_data_one_layer(dataset, 
                                                    instruction_on_response_format=instruction_on_response_format,
                                                    simplest_prompt=simplest_prompt)
    dataset = dataset.map(lambda samples: tokenizer(samples[dataset_text_field]), batched=True)
    dataset_format_converter = DatasetFormatConverter(dataset)
    dataset_format_converter.apply()

    ds = dataset_format_converter.dataset
    ds = ds.rename_column("word_level_labels", "ner_tags")
    ds = ds.rename_column("words", "tokens")
    label2id = dataset_format_converter.label2id
    id2label = {v: k for k, v in label2id.items()}
    label_list = list(label2id.keys())


def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], is_split_into_words=True, padding='longest', max_length=256, truncation=True)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs



bnb_config = BitsAndBytesConfig(
    load_in_4bit= True,# model_loading_params.load_in_4bit,
    load_in_8bit = False,#  model_loading_params.load_in_8bit,

    bnb_4bit_quant_type= "nf4",
    bnb_4bit_compute_dtype= torch.bfloat16,
    bnb_4bit_use_double_quant= True,

    # llm_int8_threshold= 6.0,# model_loading_params.llm_int8_threshold,
    # llm_int8_skip_modules= ["q_proj", "k_proj", "v_proj", "o_proj","gate_proj"],# model_loading_params.llm_int8_skip_modules,
    # llm_int8_has_fp16_weight= True# model_loading_params.llm_int8_has_fp16_weight
)

model = LlamaForTokenClassification.from_pretrained(
    BASE_MODEL_CHECKPOINT, 
    num_labels=len(label2id), 
    id2label=id2label, 
    label2id=label2id,
    token = LLAMA_TOKEN,
    quantization_config=bnb_config,    
    device_map = 'cuda:0',
    cache_dir='/data/disk1/share/pferrazzi/.cache'
    )

peft_config = LoraConfig(task_type=TaskType.TOKEN_CLS, inference_mode=False, r=12, lora_alpha=32, lora_dropout=0.1)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()


tokenized_ds = ds.map(tokenize_and_align_labels, batched=True)# dataset_format_converter.dataset.map(tokenize_and_align_labels, batched=True)
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)


if use_e3c:
    train_data, val_data, test_data = preprocessor.split_layer_into_train_val_test_(tokenized_ds, TRAIN_LAYER)



wandb.login(key = WANDB_KEY)
run = wandb.init(project='ls_llama_e3c', job_type="training", anonymous="allow",
                  name=datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                  config={'model': BASE_MODEL_CHECKPOINT, 
                          'time': datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")})




def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["soverall_f1"],
        "accuracy": results["overall_accuracy"],
    }



training_args = TrainingArguments(
    output_dir="my_awesome_ds_model",
    learning_rate=1e-4,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps= 4,
    num_train_epochs=1,
    max_steps=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    #load_best_model_at_end=True,
    push_to_hub=True,
    hub_token=HF_TOKEN,
    hub_model_id='ls_llama_e3c',
    report_to="wandb",
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data.select(range(6)),
    eval_dataset=val_data.select(range(6)),
    tokenizer=tokenizer,
    data_collator=data_collator,
    # compute_metrics=compute_metrics,
)

trainer.train()



  from .autonotebook import tqdm as notebook_tqdm
  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.15s/it]
Some weights of LlamaForTokenClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-chat-hf and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


trainable params: 6,303,747 || all params: 6,613,659,654 || trainable%: 0.09531405197404492


[34m[1mwandb[0m: Currently logged in as: [33mferrazzipietro[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/pferrazzi/.netrc


dataloader_config = DataLoaderConfiguration(dispatch_batches=None)
Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,No log,
2,No log,
3,No log,
4,No log,
5,No log,
6,No log,
6,No log,



Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/resolve/main/config.json.
Access to model meta-llama/Llama-2-7b-chat-hf is restricted. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in meta-llama/Llama-2-7b-chat-hf.

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/resolve/main/config.json.
Access to model meta-llama/Llama-2-7b-chat-hf is restricted. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in meta-llama/Llama-2-7b-chat-hf.

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/resolve/main/config.json.
Access to model meta-llama/Llama-2-7b-chat-hf is restricted. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in meta-llama/Llama-2-7b-chat-hf.

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/r

TrainOutput(global_step=10, training_loss=3.366946792602539, metrics={'train_runtime': 109.9806, 'train_samples_per_second': 0.727, 'train_steps_per_second': 0.091, 'total_flos': 131600366220960.0, 'train_loss': 3.366946792602539, 'epoch': 6.67})

In [4]:
trainer

False

In [8]:
from torch import nn
for layer in trainer.model.children():
    #if isinstance(layer, nn.Linear):
    print(layer.state_dict())

OrderedDict([('model.model.embed_tokens.weight', tensor([[ 1.1921e-06, -1.7881e-06, -4.2915e-06,  ...,  8.3447e-07,
         -6.4373e-06,  8.9407e-07],
        [ 1.8387e-03, -3.8147e-03,  9.6130e-04,  ..., -9.0332e-03,
          2.6550e-03, -3.7537e-03],
        [ 1.0193e-02,  9.7656e-03, -5.2795e-03,  ...,  2.9297e-03,
          4.0817e-04, -5.0964e-03],
        ...,
        [-1.3550e-02, -3.5095e-03, -1.8921e-02,  ..., -9.3384e-03,
          8.7891e-03, -1.2741e-03],
        [-1.0681e-02,  8.9722e-03,  1.2573e-02,  ..., -3.3691e-02,
         -1.6235e-02,  3.0212e-03],
        [-9.0942e-03, -1.8082e-03, -6.9809e-04,  ...,  3.8452e-03,
         -1.2085e-02,  7.2861e-04]], device='cuda:0', dtype=torch.float16)), ('model.model.layers.0.self_attn.q_proj.base_layer.weight', tensor([[ 83],
        [103],
        [ 74],
        ...,
        [114],
        [108],
        [197]], device='cuda:0', dtype=torch.uint8)), ('model.model.layers.0.self_attn.q_proj.base_layer.weight.absmax', tensor([ 7

In [2]:
sequences = ['hello', 'its me']
tokenized = tokenizer(sequences, padding=True, return_tensors="pt")
trainer.model(**tokenized)

TokenClassifierOutput(loss={'logits': tensor([[[nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]],

        [[nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]]],
       dtype=torch.float16, grad_fn=<ToCopyBackward0>)}, logits=tensor([[[nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]],

        [[nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan, nan, nan, nan, nan, n

### USO IL LORO

In [1]:
# -*- coding: utf-8 -*-

import json
import sys
import numpy as np
import evaluate
from datasets import load_dataset, Dataset, DatasetDict
from transformers import AutoTokenizer
from transformers import DataCollatorForTokenClassification
from transformers import TrainingArguments, Trainer
from peft import get_peft_model, LoraConfig, TaskType
from dotenv import dotenv_values

from modeling_llama import LlamaForTokenClassification


def load_ontonotesv5():
    ret = {}
    for split_name in ['train', 'dev', 'test']:
        data = []
        with open(f'./data/NER/ontonotesv5/{split_name}.jsonl', 'r') as reader:
            for line in reader:
                data.append(json.loads(line))
        ret[split_name] = Dataset.from_list(data)
    return DatasetDict(ret)


# if len(sys.argv) != 3:
#     print('usage python %.py task model_size')
#     sys.exit()
# task, model_size = sys.argv[1], sys.argv[2].lower()
task, model_size = 'wnut_17', '7b'
print(f'handling task {task}')

epochs = 10
batch_size = 2
learning_rate = 1e-4
max_length = 64
if model_size == '7b':
    model_id = 'meta-llama/Llama-2-7b-chat-hf'#'NousResearch/Llama-2-7b-hf'
    lora_r = 12
elif model_size == '13b':
    model_id = 'NousResearch/Llama-2-13b-hf'
    lora_r = 12
else:
    raise NotImplementedError
tokenizer = AutoTokenizer.from_pretrained(model_id,token = dotenv_values(".env.base")['LLAMA_TOKEN'],
    cache_dir='/data/disk1/share/pferrazzi/.cache')
tokenizer.pad_token = tokenizer.eos_token
# seqeval = evaluate.load("seqeval")
if task == 'wnut_17':
    ds = load_dataset("wnut_17")
    label2id = { "O": 0, "B-corporation": 1, "I-corporation": 2, "B-creative-work": 3, "I-creative-work": 4, "B-group": 5, "I-group": 6, "B-location": 7, "I-location": 8, "B-person": 9, "I-person": 10, "B-product": 11, "I-product": 12, }
elif task == 'conll2003':
    ds = load_dataset("conll2003")
    label2id = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}
elif task == 'ontonotesv5':
    ds = load_ontonotesv5()
    label2id = {'O': 0, 'B-NORP': 1, 'B-PERSON': 2, 'B-WORK_OF_ART': 3, 'B-QUANTITY': 4, 'B-EVENT': 5, 'B-DATE': 6, 'B-TIME': 7, 'B-PERCENT': 8, 'B-LANGUAGE': 9, 'B-ORG': 10, 'B-CARDINAL': 11, 'B-LAW': 12, 'B-GPE': 13, 'B-PRODUCT': 14, 'B-LOC': 15, 'B-MONEY': 16, 'B-ORDINAL': 17, 'B-FAC': 18}
else:
    raise NotImplementedError
id2label = {v: k for k, v in label2id.items()}
label_list = list(label2id.keys()) # ds["train"].features[f"ner_tags"].feature.names

model = LlamaForTokenClassification.from_pretrained(
    model_id, num_labels=len(label2id), id2label=id2label, label2id=label2id,
    load_in_4bit=True,
    token = dotenv_values(".env.base")['LLAMA_TOKEN'],
    cache_dir='/data/disk1/share/pferrazzi/.cache'

)# .bfloat16()

peft_config = LoraConfig(task_type=TaskType.TOKEN_CLS, inference_mode=False, r=lora_r, lora_alpha=32, lora_dropout=0.1)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()


def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], is_split_into_words=True, padding='longest', max_length=max_length, truncation=True)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


tokenized_ds = ds.map(tokenize_and_align_labels, batched=True)
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)


# def compute_metrics(p):
#     predictions, labels = p
#     predictions = np.argmax(predictions, axis=2)

#     true_predictions = [
#         [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
#         for prediction, label in zip(predictions, labels)
#     ]
#     true_labels = [
#         [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
#         for prediction, label in zip(predictions, labels)
#     ]

#     results = seqeval.compute(predictions=true_predictions, references=true_labels)
#     return {
#         "precision": results["overall_precision"],
#         "recall": results["overall_recall"],
#         "f1": results["overall_f1"],
#         "accuracy": results["overall_accuracy"],
#     }


training_args = TrainingArguments(
    output_dir="my_awesome_ds_model",
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps= 4,
    num_train_epochs=epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"].select(range(16)),
    eval_dataset=tokenized_ds["test"].select(range(16)),
    tokenizer=tokenizer,
    data_collator=data_collator,
    # compute_metrics=compute_metrics,
)

trainer.train()


  from .autonotebook import tqdm as notebook_tqdm
  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


handling task wnut_17


Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.12s/it]
Some weights of LlamaForTokenClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-chat-hf and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 6,344,717 || all params: 6,613,741,594 || trainable%: 0.09593233890111372


Map: 100%|██████████| 1009/1009 [00:00<00:00, 10585.56 examples/s]
dataloader_config = DataLoaderConfiguration(dispatch_batches=None)
Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mferrazzipietro[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,No log,
2,No log,
3,No log,
4,No log,
5,No log,
6,No log,
7,No log,
8,No log,
9,No log,
10,No log,



Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/resolve/main/config.json.
Access to model meta-llama/Llama-2-7b-chat-hf is restricted. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in meta-llama/Llama-2-7b-chat-hf.

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/resolve/main/config.json.
Access to model meta-llama/Llama-2-7b-chat-hf is restricted. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in meta-llama/Llama-2-7b-chat-hf.

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/resolve/main/config.json.
Access to model meta-llama/Llama-2-7b-chat-hf is restricted. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in meta-llama/Llama-2-7b-chat-hf.

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/r

TrainOutput(global_step=10, training_loss=392.2526611328125, metrics={'train_runtime': 161.8758, 'train_samples_per_second': 0.988, 'train_steps_per_second': 0.062, 'total_flos': 199352334704640.0, 'train_loss': 392.2526611328125, 'epoch': 10.0})

In [3]:
sequences = ['hello', 'its me']
tokenized = tokenizer(sequences, padding=True, return_tensors="pt")
trainer.model(**tokenized)

TokenClassifierOutput(loss={'logits': tensor([[[nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]],

        [[nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]]],
       dtype=torch.float16, grad_fn=<ToCopyBackward0>)}, logits=tensor([[[nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]],

        [[nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan, nan, nan, nan, nan, n