In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, PreTrainedTokenizer
from torch.utils.tensorboard import SummaryWriter
import pandas as pd
import time

In [2]:
device = "mps"  # Change to "cuda" if using an NVIDIA GPU, or "cpu" otherwise


MODEL_NAME = "Qwen/Qwen2.5-0.5B"  # Replace with your desired pre-trained model name
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(device)

In [3]:
# -------------------------------
# 3. Define LoRa Modules
# -------------------------------
class LoRaLinear(nn.Module):
    """
    A LoRa (Low-Rank Adaptation) linear module that factors the weight update
    into two low-rank matrices and adds them to the base weights.
    """
    def __init__(self, in_features, out_features, rank=4):
        super().__init__()
        self.low_rank_A = nn.Parameter(torch.zeros(in_features, rank))
        self.low_rank_B = nn.Parameter(torch.zeros(rank, out_features))
        self.scaling = 0.01  # Scale factor for initialization

        nn.init.normal_(self.low_rank_A, mean=0.0, std=self.scaling)
        nn.init.normal_(self.low_rank_B, mean=0.0, std=self.scaling)

    def forward(self, x):
        return x @ self.low_rank_A @ self.low_rank_B

In [4]:
class LoRaLinearWrapper(nn.Module):
    """
    A wrapper for an existing nn.Linear layer that adds a LoRa offset on top.
    This replaces the original linear forward pass with:
        y = original_linear(x) + LoRaLinear(x)
    """
    def __init__(self, base_linear: nn.Linear, rank=4):
        super().__init__()
        self.base_linear = base_linear
        # Create the LoRa adapter
        self.lora = LoRaLinear(
            base_linear.in_features,
            base_linear.out_features,
            rank=rank
        )

    def forward(self, x):
        # Original projection
        out = self.base_linear(x)
        # Add LoRa offset
        out += self.lora(x)
        return out

In [5]:
# 2. Collect needed replacements in a list
replacements = []

def replace_linear_with_lora(model, rank, alpha):
    for name, module in model.named_children():
        if isinstance(module, torch.nn.Linear):
            # Replace the Linear layer with LinearWithLoRA
            if name == 'q_proj' or name == 'v_proj':
                setattr(model, name, LoRaLinearWrapper(module, rank))
        else:
            # Recursively apply the same function to child modules
            replace_linear_with_lora(module, rank, alpha)

replace_linear_with_lora(model, rank=16, alpha=1)

In [6]:
# -------------------------------
# 5. Prepare Dataset
# -------------------------------
dataframe = pd.read_csv('dataset.csv')
max_seq_length = 512

# Define a custom dataset class
class QADataset(Dataset):
    def __init__(self, tokenized_data):
        self.data = tokenized_data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

In [7]:
# Pre-tokenize the data
def tokenize_data(dataframe, tokenizer: PreTrainedTokenizer, max_seq_length: int, device: str):
    tokenized_data = []
    for _, row in dataframe.iterrows():
        question = row['Question']
        answer = row['Answer']

        try:
            # Tokenize the question
            inputs = tokenizer(
                question,
                padding='max_length',
                truncation=True,
                max_length=max_seq_length,
                return_tensors="pt"
            )

            # Tokenize the answer as labels
            labels = tokenizer(
                f"{question}: {answer}",
                padding='max_length',
                truncation=True,
                max_length=max_seq_length,
                return_tensors="pt"
            )['input_ids'].squeeze(0)  # shape: [max_seq_length]

            tokenized_data.append({
                'input_ids': inputs['input_ids'].squeeze(0).to(device),
                'attention_mask': inputs['attention_mask'].squeeze(0).to(device),
                'labels': labels.to(device)
            })
        except Exception as e:
            print(f"Error tokenizing: {e}")
            continue

    return tokenized_data

In [8]:
tokenized_data = tokenize_data(dataframe, tokenizer, max_seq_length, device)
qa_dataset = QADataset(tokenized_data)
train_loader = DataLoader(qa_dataset, batch_size=4, shuffle=True)
model.to(device)

# -------------------------------
# 6. Select LoRa Parameters & Optimizer
# -------------------------------
# Only keep LoRa parameters (we gave them names 'lora' inside LoRaLinearWrapper).
lora_parameters = []
for name, param in model.named_parameters():
    # You can check for "low_rank_" or the name "lora." in name
    # to include only the newly introduced LoRa params.
    if "low_rank_A" in name or "low_rank_B" in name:
        param.requires_grad = True
        lora_parameters.append(param)
    else:
        param.requires_grad = False

Error tokenizing: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).


In [9]:
optimizer = AdamW(lora_parameters, lr=5e-5)

# -------------------------------
# 7. Training Setup
# -------------------------------
epochs = 1

exp_name = "LoRa_Example"
timestamp = time.time()

# -------------------------------
# 8. Training Loop
# -------------------------------
global_step = 0
model.train()

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(count_parameters(model), "trainable parameters in the model.")

1081344 trainable parameters in the model.


In [10]:
writer = SummaryWriter(log_dir=f"./tensorboard_logs/{exp_name}_{int(timestamp)}")
for epoch in range(epochs):
    epoch_loss = 0.0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}", leave=True)
    
    for step, batch in enumerate(progress_bar):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        optimizer.zero_grad()

        # Forward pass
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels = labels
        )
        loss = outputs.loss

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        global_step += 1

        # Update tqdm progress bar description with loss
        avg_loss = epoch_loss / (step + 1)
        progress_bar.set_description(f"Epoch {epoch+1}/{epochs} | Step {step+1}/{len(train_loader)} | Loss: {avg_loss:.4f}")

        writer.add_scalar("Training Loss", avg_loss, global_step)

    # End of epoch logging
    epoch_loss /= len(train_loader)
    writer.add_scalar("Epoch Loss", epoch_loss, epoch)
    print(f"Epoch {epoch+1} finished. Avg Loss = {epoch_loss:.4f}")


writer.close()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
Epoch 1/1:   0%|          | 0/1224 [00:00<?, ?it/s]TOKENIZERS_PARALLELISM=(true | false)
Epoch 1/1 | Step 1224/1224 | Loss: 2.0628: 100%|██████████| 1224/1224 [18:27<00:00,  1.11it/s]

Epoch 1 finished. Avg Loss = 2.0628





In [11]:
torch.save(
    {name: param for name, param in model.named_parameters() if param.requires_grad},
    "lora_weights.pt"
)

In [15]:
def generate_response(
    model, 
    tokenizer, 
    prompt: str, 
    device: str = "mps", 
    max_length: int = 128, 
    temperature: float = 1.0, 
    top_k: int = 50, 
    top_p: float = 0.95, 
    do_sample: bool = True
):
    """Helper function to run inference on a single model."""
    # Tokenize input
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    
    # Generate output
    with torch.no_grad():
        output_ids = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=max_length,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            do_sample=do_sample,
            pad_token_id=tokenizer.eos_token_id
        )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

def compare_models(
    prompt: str,
    original_model,   # Replace with whichever base model you used
    fine_tuned_model,
    tokenizer,
    device: str = "mps"
):
    """
    Loads the original (pretrained) model and the fine-tuned model, 
    generates responses to the same prompt, and prints both.
    """

    # 3. Generate response from the original model
    print("\nGenerating response from the original model...")
    original_response = generate_response(
        model=original_model,
        tokenizer=tokenizer,
        prompt=prompt,
        device=device
    )

    # 4. Generate response from the fine-tuned model
    print("Generating response from the fine-tuned model...")
    fine_tuned_response = generate_response(
        model=fine_tuned_model,
        tokenizer=tokenizer,
        prompt=prompt,
        device=device
    )

    # 5. Print results side by side
    print("\nPROMPT:")
    print(prompt)
    print("\nORIGINAL MODEL RESPONSE:")
    print(original_response)
    print("\nFINE-TUNED MODEL RESPONSE:")
    print(fine_tuned_response)

In [19]:
original_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(device)

test_prompt = "What is the capital of france?"
compare_models(
    prompt=test_prompt,
    original_model=original_model,
    fine_tuned_model=model,
    tokenizer=tokenizer,
    device="mps"
)

Both `max_new_tokens` (=2048) and `max_length`(=128) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



Generating response from the original model...


Both `max_new_tokens` (=2048) and `max_length`(=128) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Generating response from the fine-tuned model...

PROMPT:
What is the capital of france?

ORIGINAL MODEL RESPONSE:
What is the capital of france? - le peuple de france - France 2 - 1995 - SÉRIE 1 - Télérama
The capital of France is Paris. French people call it "le capital d'Angleterre", "la capitale de l'Angleterre" or even "la capitale de l'Europe", just like "the capital of the United Kingdom" or "the capital of Europe". It is also called "le capital de France", "le capital français" and "le capital de la France".
The name "le capital de la France" means "the capital of France" and the name "le capital de l'Angleterre" means "the capital of England". The word "France" is in French "France". It means the land of France.
The capital of France is "Paris". The city is very popular with visitors because it is not far from London and Vienna. Paris is the country's most famous city and is located in the center of France, near the Loire river.
The French capital Paris is located at the foot 