<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/demo_alignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install colab-env --quiet

import warnings
warnings.filterwarnings("ignore", message="You seem to be using the pipelines sequentially on GPU")

import colab_env
import os

access_token_write = os.getenv("HUGGINGFACE_ACCESS_TOKEN_WRITE")

from huggingface_hub import login

login(
  token=access_token_write,
  add_to_git_credential=True
)

In [2]:
import warnings
warnings.filterwarnings("ignore", message="You seem to be using the pipelines sequentially on GPU")

In [None]:
!pip install transformers datasets accelerate bitsandbytes peft -q

In [34]:
!nvidia-smi

Sun Jan 19 16:44:03 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA L4                      Off | 00000000:00:03.0 Off |                    0 |
| N/A   76C    P0              34W /  72W |  15575MiB / 23034MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
import os

import bitsandbytes
import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, TaskType
from transformers import (
    AutoModelForQuestionAnswering,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
)

# Set environment variable for CUDA debugging
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Load 10% of the SQuAD dataset
squad = load_dataset("squad", split="train[:10%]")

# Load the tokenizer and model with 4-bit quantization
model_name = "mistralai/Mistral-7B-v0.1"  # Changed to Mistral 7B
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # Set pad token

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForQuestionAnswering.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,  # Trust remote code for Mistral
)

# Preprocess the dataset
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Get the answer spans
    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while idx < len(sequence_ids) and sequence_ids[idx] != 1:
            idx += 1
        context_start = idx

        while idx < len(sequence_ids) and sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs


tokenized_squad = squad.map(
    preprocess_function,
    batched=True,
    remove_columns=squad.column_names,
)

# Define LoRA config
# Define LoRA config
lora_config = LoraConfig(
    r=16,              # Increased rank to 16 for more complex adaptations
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM,
)

# Apply LoRA
model = get_peft_model(model, lora_config)

print('\n')
model.print_trainable_parameters()
print('\n')


# Define training arguments
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=2,  # Reduced batch size to 2
    gradient_accumulation_steps=8,  # Increased gradient accumulation to 8
    learning_rate=5e-5,             # Adjusted learning rate
    fp16=True,
    logging_steps=50,               # Log every 50 steps
    save_steps=100,
    num_train_epochs=1,
    warmup_steps=100,
    max_grad_norm=1.0,
    report_to="none",
    push_to_hub=False,
    optim="adamw_torch",
    # weight_decay=0.01   # Add weight decay if needed
)

# Define Trainer (using the standard Trainer)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad,
)

# Train the model
trainer.train()

# Save the model
trainer.save_model("./fine_tuned_squad_model")

# -------------------
# Test the model
# -------------------

# Load the fine-tuned model
model = AutoModelForQuestionAnswering.from_pretrained(
    "./fine_tuned_squad_model", load_in_4bit=True, device_map="auto", trust_remote_code=True
)

# Test example
question = "What is a potential concern about large language models?"
context = "Large language models can be helpful tools, but it's important to use them responsibly. They can sometimes generate incorrect or misleading information."

# Prepare input for the model
inputs = tokenizer(question, context, return_tensors="pt").to(model.device)

# Get the answer
with torch.no_grad():
    outputs = model(**inputs)

# Get the predicted answer span
answer_start_index = outputs.start_logits.argmax()
answer_end_index = outputs.end_logits.argmax()

# Decode the predicted answer
predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
generated_answer = tokenizer.decode(predict_answer_tokens)

print("Question:", question)
print("Generated Answer:", generated_answer)

In [None]:
from huggingface_hub import upload_folder, create_repo

# Replace with the path to your saved model directory
model_path = "/content/gdrive/MyDrive/model/fine_tuned_squad_model"
repo_id = "frankmorales2020/Mistral-7B-v0.1_squad_alignment"

# Create the repository if it doesn't exist
create_repo(repo_id, exist_ok=True)  # exist_ok=True to avoid errors if it already exists

upload_folder(
    folder_path=model_path,
    repo_id=repo_id,
    commit_message="Upload frankmorales2020/Mistral-7B-v0.1_squad_alignment",
)
# full code the test the model

In [None]:
# -------------------
# Test the model
# -------------------

!pip install transformers datasets accelerate bitsandbytes peft -q

import re
import torch
from transformers import (
    AutoModelForQuestionAnswering,
    AutoTokenizer,
    BitsAndBytesConfig,
)

# Create BitsAndBytesConfig object
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the fine-tuned model with quantization_config
model_path = "/content/gdrive/MyDrive/model/fine_tuned_squad_model"  # Replace with your model path
model = AutoModelForQuestionAnswering.from_pretrained(
    model_path,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
tokenizer.pad_token = tokenizer.eos_token  # Set pad token

# Test example
question = "What is a potential concern about large language models?"
context = """One potential concern about large language models is that they might generate incorrect or misleading information.
              This is because they learn from vast amounts of text data, which may contain biases or inaccuracies.
              While large language models can be helpful tools, it's crucial to use them responsibly and with careful evaluation."""  # Revised context


def answer_question(question, context):
    # Check for context overlap with question
    if question in context:
        print("Warning: Context contains the question. This might cause issues.")

    inputs = tokenizer(question, context, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model(**inputs)

    answer_start_index = outputs.start_logits.argmax()
    answer_end_index = outputs.end_logits.argmax()

    #print("Start Logits:", outputs.start_logits)
    #print("End Logits:", outputs.end_logits)

    # Limit answer length (optional - adjust as needed)
    max_answer_length = 30

    # Relaxed thresholds further
    start_logit_threshold = 0.0  # Adjust as needed
    end_logit_threshold = 0.0  # Adjust or remove as needed

    if outputs.start_logits[0, answer_start_index] < start_logit_threshold:
        print("Low confidence in start position. Trying second-best.")
        # Get second-best prediction index for start
        start_logits_sorted, start_indices_sorted = torch.sort(
            outputs.start_logits[0], descending=True
        )
        answer_start_index = start_indices_sorted[1].item()

    if outputs.end_logits[0, answer_end_index] < end_logit_threshold:
        print("Low confidence in end position. Trying second-best.")
        # Get second-best prediction index for end
        end_logits_sorted, end_indices_sorted = torch.sort(
            outputs.end_logits[0], descending=True
        )
        answer_end_index = end_indices_sorted[1].item()

    predict_answer_tokens = inputs.input_ids[
        0, answer_start_index : answer_end_index + 1
    ]
    generated_answer = tokenizer.decode(predict_answer_tokens)

    # If the generated answer is empty or doesn't make sense, try keyword-based search
    if not generated_answer or len(generated_answer.split()) < 2:
        print("Trying keyword-based search.")
        keywords = [
            "risk",
            "inaccuracy",
            "misleading",
            "bias",
            "danger",
            "problem",
            "issue",
        ]  # Expanded keywords
        for keyword in keywords:
            if keyword in context:
                # Find the keyword in the context and extract the surrounding text (wider range)
                match = re.search(r".{0,100}" + keyword + r".{0,100}", context)
                if match:
                    generated_answer = match.group(0)
                    break  # Stop if a keyword is found and an answer is extracted

    # Relaxed post-processing (adjust as needed):
    generated_answer = generated_answer.replace("<s>", "").strip()

    # Optional post-processing (re-introduce if needed):
    # 1. Filter punctuation-only answers
    if generated_answer.strip() in [".", ",", "!", "?", ";", ":"]:
        print("Answer is likely punctuation. Filtering...")
        generated_answer = ""
    # 2. Filter answers overlapping with the question
    # if generated_answer in question:
    #    print("Answer overlaps with question. Filtering...")
    #    generated_answer = ""

    return generated_answer


generated_answer = answer_question(question, context)

In [30]:
print("Question:", question)
print("Generated Answer:", generated_answer)

Question: What is a potential concern about large language models?
Generated Answer: This is because they learn from vast amounts of text data, which may contain biases or inaccuracies. 
              While large language models can be helpful tools, it's crucial to use them respons
