<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/demo_alignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install colab-env --quiet

import warnings
warnings.filterwarnings("ignore", message="You seem to be using the pipelines sequentially on GPU")

import colab_env
import os

access_token_write = os.getenv("HUGGINGFACE_ACCESS_TOKEN_WRITE")

from huggingface_hub import login

login(
  token=access_token_write,
  add_to_git_credential=True
)

In [2]:
import warnings
warnings.filterwarnings("ignore", message="You seem to be using the pipelines sequentially on GPU")

In [None]:
!pip install transformers datasets accelerate bitsandbytes peft -q

In [None]:
import os

import bitsandbytes
import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, TaskType
from transformers import (
    AutoModelForQuestionAnswering,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
)

# Set environment variable for CUDA debugging
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Load 10% of the SQuAD dataset
squad = load_dataset("squad", split="train[:10%]")

# Load the tokenizer and model with 4-bit quantization
model_name = "mistralai/Mistral-7B-v0.1"  # Changed to Mistral 7B
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # Set pad token

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForQuestionAnswering.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,  # Trust remote code for Mistral
)

# Preprocess the dataset
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Get the answer spans
    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while idx < len(sequence_ids) and sequence_ids[idx] != 1:
            idx += 1
        context_start = idx

        while idx < len(sequence_ids) and sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs


tokenized_squad = squad.map(
    preprocess_function,
    batched=True,
    remove_columns=squad.column_names,
)

# Define LoRA config
# Define LoRA config
lora_config = LoraConfig(
    r=16,              # Increased rank to 16 for more complex adaptations
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM,
)

# Apply LoRA
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


# Define training arguments
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=2,  # Reduced batch size to 2
    gradient_accumulation_steps=8,  # Increased gradient accumulation to 8
    learning_rate=5e-5,             # Adjusted learning rate
    fp16=True,
    logging_steps=50,               # Log every 50 steps
    save_steps=100,
    num_train_epochs=1,
    warmup_steps=100,
    max_grad_norm=1.0,
    report_to="none",
    push_to_hub=False,
    optim="adamw_torch",
    # weight_decay=0.01   # Add weight decay if needed
)

# Define Trainer (using the standard Trainer)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad,
)

# Train the model
trainer.train()

# Save the model
trainer.save_model("./fine_tuned_squad_model")

# -------------------
# Test the model
# -------------------

# Load the fine-tuned model
model = AutoModelForQuestionAnswering.from_pretrained(
    "./fine_tuned_squad_model", load_in_4bit=True, device_map="auto", trust_remote_code=True
)

# Test example
question = "What is a potential concern about large language models?"
context = "Large language models can be helpful tools, but it's important to use them responsibly. They can sometimes generate incorrect or misleading information."

# Prepare input for the model
inputs = tokenizer(question, context, return_tensors="pt").to(model.device)

# Get the answer
with torch.no_grad():
    outputs = model(**inputs)

# Get the predicted answer span
answer_start_index = outputs.start_logits.argmax()
answer_end_index = outputs.end_logits.argmax()

# Decode the predicted answer
predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
generated_answer = tokenizer.decode(predict_answer_tokens)

print("Question:", question)
print("Generated Answer:", generated_answer)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of MistralForQuestionAnswering were not initialized from the model checkpoint at mistralai/Mistral-7B-v0.1 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 6,815,744 || all params: 7,117,484,034 || trainable%: 0.0958


Step,Training Loss
