In [34]:
import torch
import torch.nn.functional as F
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = "Qwen/Qwen2.5-7B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",  # Automatically distribute across available GPUs
)

Loading checkpoint shards: 100%|██████████| 4/4 [00:09<00:00,  2.43s/it]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [35]:
from peft import LoraConfig, get_peft_model
from transformers import Trainer

lora_config = LoraConfig(
    r=16,  # Rank of LoRA adaptation
    lora_alpha=32,  # Scaling factor
    lora_dropout=0.1,  # Dropout for regularization
    target_modules=["q_proj", "v_proj"],  # Apply LoRA to attention layers
    bias="none")

model = get_peft_model(model, lora_config)

model.print_trainable_parameters()

trainable params: 5,046,272 || all params: 7,620,662,784 || trainable%: 0.0662


In [36]:
from datasets import Dataset
import pandas as pd

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding='max_length', max_length=512)

data = {
    "text": ["I love programming", "Python is amazing", "I hate bugs", "I enjoy learning new things", "I dislike errors"],
    "label": [1, 1, 0, 1, 0]
}
df = pd.DataFrame(data)
dataset = Dataset.from_pandas(df)

tokenized_dataset = dataset.map(tokenize_function, batched=True)

tokenized_dataset

Map: 100%|██████████| 5/5 [00:00<00:00, 478.88 examples/s]


Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 5
})

In [37]:
tokenized_dataset.column_names

['text', 'label', 'input_ids', 'attention_mask']

In [32]:
tokenized_dataset = tokenized_dataset.remove_columns(["text"])


In [33]:
from transformers import TrainingArguments

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        """
        Custom loss function for fine-tuning.

        Default: CrossEntropyLoss
        Here: Example using Mean Squared Error (MSE) Loss
        """
        labels = inputs.pop("labels")  # Extract labels
        outputs = model(**inputs)  # Forward pass
        logits = outputs.logits  # Extract logits
        print(f"labels: {labels}")
        print(f"outputs: {outputs}")
        print(f"logits: {logits}")

        loss = F.nll_loss(logits, labels)

        return (loss, outputs) if return_outputs else loss

training_args = TrainingArguments(
    output_dir="./llama-lora",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    logging_dir="./logs",
    learning_rate=2e-4,
    num_train_epochs=1,
    save_strategy="epoch",
    fp16=True,  # Mixed precision
    optim="adamw_torch",
    remove_unused_columns=False
)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)

trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


labels: tensor([0, 0], device='cuda:0')
outputs: CausalLMOutputWithPast(loss={'logits': tensor([[[nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         ...,
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan]],

        [[nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         ...,
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan]]], device='cuda:0',
       grad_fn=<ToCopyBackward0>), 'past_key_values': ((tensor([[[[ 2.1713e-02,  9.0820e-01,  2.8828e+00,  ..., -1.1731e+02,
           -1.6350e+02, -1.1375e+02],
          [ 3.3561e+00,  1.4028e+00,  4.4800e-02,  ..., -1.1494e+02,
           -1.6400e+02, -1.1381e+02],
          [ 4.2491e+00,  6.44

RuntimeError: Expected target size [2, 152064], got [2]