<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/Copy_of_emotion_noGPT2_YesMistral7b_as_text_generator_LoRA_chapter5dot5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install Hugging Face libraries
!pip install  --upgrade transformers datasets accelerate evaluate bitsandbytes --quiet

In [1]:
!nvidia-smi

Sun Jan  5 15:16:37 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   30C    P0              43W / 400W |      2MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
# Import required libraries
import json            # For parsing JSON data
import random          # For setting seeds and shuffling data
import gzip            # For decompressing dataset
import requests        # For downloading dataset from URL
import torch           # Main PyTorch library
from peft import get_peft_model, LoraConfig, TaskType  # For efficient finetuning using LoRA
from torch.utils.data import Dataset, DataLoader  # For dataset handling
from transformers import AutoTokenizer, AutoModelForCausalLM  # Hugging Face model components
from torch.optim import AdamW    # Optimizer for training
from tqdm import tqdm   # Progress bar utilities
import re               # For text normalization

def set_seed(seed):
    """
    Makes runs repeatable by seeding every random number generator in use.

    Args:
        seed (int): Value used to seed Python's and PyTorch's generators
    """
    # Seed, in order: Python's built-in RNG, PyTorch's CPU RNG, and the RNGs
    # of all available GPUs
    for seeder in (random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)

    # cuDNN: request deterministic algorithms and switch off the auto-tuner
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

def build_prompt(text):
    """
    Wraps the input text in the fixed emotion-classification prompt.

    Args:
        text (str): Text whose emotion should be predicted

    Returns:
        str: Prompt ending in "Emotion:", ready for the model to complete
    """
    # One shared template keeps training and inference prompts identical
    template = "Predict the emotion for the following text: {}\nEmotion:"
    return template.format(text)

def encode_text(tokenizer, text, return_tensor=False):
    """
    Turns text into token IDs without adding special tokens.

    Args:
        tokenizer: Hugging Face tokenizer
        text (str): Text to encode
        return_tensor (bool): When True, ask the tokenizer for a PyTorch tensor

    Returns:
        Whatever tokenizer.encode returns: a list of token IDs by default,
        or a PyTorch tensor when return_tensor is True
    """
    # Special tokens are never added here; callers append EOS themselves
    encode_kwargs = {"add_special_tokens": False}
    if return_tensor:
        # Only request tensor output when explicitly asked for
        encode_kwargs["return_tensors"] = "pt"
    return tokenizer.encode(text, **encode_kwargs)

def decode_text(tokenizer, token_ids):
    """
    Turns token IDs back into a string, dropping special tokens.

    Args:
        tokenizer: Hugging Face tokenizer
        token_ids: List or tensor of token IDs

    Returns:
        str: Decoded text
    """
    # skip_special_tokens=True asks the tokenizer to leave special tokens out
    decoded = tokenizer.decode(token_ids, skip_special_tokens=True)
    return decoded

class PromptCompletionDataset(Dataset):
    """
    PyTorch Dataset over prompt-completion pairs.
    Tokenizes each pair on access and builds the matching loss labels.

    Args:
        data (list): List of dictionaries with "prompt" and "completion" keys
        tokenizer: Hugging Face tokenizer
    """
    def __init__(self, data, tokenizer):
        # Keep references only; tokenization happens lazily in __getitem__
        self.tokenizer = tokenizer
        self.data = data

    def __len__(self):
        # One example per prompt-completion pair
        return len(self.data)

    def __getitem__(self, idx):
        """
        Tokenizes and returns the example at the given index.

        Args:
            idx (int): Index of the example to fetch

        Returns:
            dict: Contains input_ids, labels, prompt, and expected completion
        """
        example = self.data[idx]
        prompt_text, completion_text = example["prompt"], example["completion"]

        # Tokenize the prompt first, then the completion
        prompt_ids = encode_text(self.tokenizer, prompt_text)
        completion_ids = encode_text(self.tokenizer, completion_text)

        # The part the model must learn: the completion followed by EOS
        target_ids = completion_ids + [self.tokenizer.eos_token_id]

        return {
            # Full sequence fed to the model: prompt, completion, EOS
            "input_ids": prompt_ids + target_ids,
            # Prompt positions are -100 so they are ignored in the loss
            "labels": [-100] * len(prompt_ids) + target_ids,
            "prompt": prompt_text,
            "expected_completion": completion_text
        }

def collate_fn(batch, pad_token_id=None):
    """
    Collates batch of examples into training-ready format.
    Handles padding and conversion to tensors.

    Args:
        batch: List of examples from Dataset
        pad_token_id (int, optional): Token ID used to pad input_ids. When
            omitted, the pad_token_id of the module-level `tokenizer` is
            used, so passing this function directly as a DataLoader
            collate_fn keeps working. Supplying it explicitly (for example
            through functools.partial) removes the dependency on that global.

    Returns:
        tuple: (input_ids, attention_mask, labels, prompts, expected_completions)

    Raises:
        ValueError: If no padding token ID is available
    """
    if pad_token_id is None:
        # Backward-compatible fallback: read the module-level tokenizer
        pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        # Without this check torch.tensor would fail later with an obscure
        # error about None values inside the padded lists
        raise ValueError(
            "collate_fn needs a padding token ID: pass pad_token_id or "
            "configure tokenizer.pad_token_id"
        )

    # Find the longest sequence in the batch for padding
    max_length = max(len(item["input_ids"]) for item in batch)

    input_ids = []
    attention_mask = []
    labels = []
    for item in batch:
        ids = item["input_ids"]
        item_labels = item["labels"]
        ids_padding = max_length - len(ids)

        # Pad input sequences to max_length with the pad token
        input_ids.append(ids + [pad_token_id] * ids_padding)
        # Attention mask: 1 for real tokens, 0 for padding
        attention_mask.append([1] * len(ids) + [0] * ids_padding)
        # Pad labels with -100 (ignored in loss calculation)
        labels.append(item_labels + [-100] * (max_length - len(item_labels)))

    # Keep original prompts and completions for evaluation
    prompts = [item["prompt"] for item in batch]
    expected_completions = [item["expected_completion"] for item in batch]

    # Convert everything to PyTorch tensors except text
    return (
        torch.tensor(input_ids),
        torch.tensor(attention_mask),
        torch.tensor(labels),
        prompts,
        expected_completions
    )

def normalize_text(text):
    """
    Puts text into a canonical form so predictions and targets can be compared.

    Args:
        text (str): Input text

    Returns:
        str: Lowercased text with outer whitespace removed and every inner
            whitespace run collapsed to a single space
    """
    # Trim the ends and lowercase first, then collapse whitespace runs
    trimmed_lower = text.strip().lower()
    return re.sub(r"\s+", ' ', trimmed_lower)

def calculate_accuracy(model, tokenizer, loader):
    """
    Measures exact-match accuracy of generated completions over a loader.

    Each prompt is completed with generate_text and counted as correct when
    the normalized completion equals the normalized expected completion.

    Args:
        model: Finetuned model
        tokenizer: Associated tokenizer
        loader: DataLoader containing evaluation examples

    Returns:
        float: Fraction of exact matches (0 when the loader yields no examples)
    """
    # Evaluation mode for the duration of the measurement
    model.eval()
    hits = 0
    seen = 0

    # No gradients are needed while generating
    with torch.no_grad():
        for batch in loader:
            # Only the raw text fields are used; the tensors are ignored
            _, _, _, batch_prompts, batch_targets = batch
            for prompt_text, target in zip(batch_prompts, batch_targets):
                prediction = generate_text(model, tokenizer, prompt_text)
                seen += 1
                if normalize_text(prediction) == normalize_text(target):
                    hits += 1

    # The model is always left in training mode afterwards
    model.train()

    # Guard against division by zero on an empty loader
    if seen > 0:
        return hits / seen
    return 0

def generate_text(model, tokenizer, prompt, max_new_tokens=50):
    """
    Completes a prompt with the model and returns only the new text.

    Args:
        model: Finetuned model
        tokenizer: Associated tokenizer
        prompt (str): Input prompt
        max_new_tokens (int): Maximum number of tokens to generate

    Returns:
        str: Generated completion with surrounding whitespace removed
    """
    # Tokenize the prompt and place the tensors on the model's device.
    # NOTE(review): the prompt is tokenized here with the tokenizer's default
    # special-token handling, whereas encode_text passes
    # add_special_tokens=False - confirm both paths yield the same prompt
    # tokens for the tokenizer in use.
    encoding = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_ids = encoding["input_ids"]
    prompt_length = prompt_ids.shape[1]

    # Generate for the single prompt
    sequences = model.generate(
        input_ids=prompt_ids,
        attention_mask=encoding["attention_mask"],
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id
    )

    # The output begins with the prompt tokens; keep only what follows them
    completion_ids = sequences[0][prompt_length:]
    return decode_text(tokenizer, completion_ids).strip()

def test_model(model_path, test_input):
    """
    Loads a saved model and prints its prediction for one input text.

    Args:
        model_path (str): Path to saved model
        test_input (str): Text to classify
    """
    # Prefer the GPU when one is available
    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(device_name)
    print(f"Using device: {device}")

    # Load the model onto the chosen device, then its tokenizer
    loaded_model = AutoModelForCausalLM.from_pretrained(model_path)
    model = loaded_model.to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # Fall back to EOS for padding when the tokenizer defines no pad token,
    # and record the pad token ID in the model config
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.pad_token_id

    # Build the prompt, generate, and report the result
    generated_text = generate_text(model, tokenizer, build_prompt(test_input))

    print(f"Input: {test_input}")
    print(f"Generated emotion: {generated_text}")

def download_and_prepare_data(data_url, tokenizer, batch_size, test_ratio=0.1):
    """
    Downloads and prepares dataset for training.

    The URL is expected to serve a gzip-compressed file with one JSON object
    per line, each containing "text" and "label" fields.

    Args:
        data_url (str): URL of the dataset
        tokenizer: Tokenizer for text processing
        batch_size (int): Batch size for DataLoader
        test_ratio (float): Proportion of data for testing

    Returns:
        tuple: (train_loader, test_loader)

    Raises:
        requests.HTTPError: If the server answers with an error status
        requests.Timeout: If the server stops responding
        ValueError: If the downloaded file contains no examples
    """
    # Download the dataset. The timeout keeps a stalled connection from
    # hanging forever, and raise_for_status surfaces HTTP errors directly
    # instead of letting an error page fail later inside gzip.decompress.
    response = requests.get(data_url, timeout=60)
    response.raise_for_status()
    content = gzip.decompress(response.content).decode("utf-8")

    # Process each example into prompt-completion pairs; blank lines (such as
    # a trailing empty line) are skipped because json.loads rejects them
    dataset = []
    for line in content.splitlines():
        if not line.strip():
            continue
        entry = json.loads(line)
        dataset.append({
            "prompt": build_prompt(entry["text"]),
            "completion": entry["label"].strip()
        })

    if not dataset:
        raise ValueError(f"No examples found in dataset downloaded from {data_url}")

    # Split into train and test sets (shuffle uses Python's global RNG, so
    # seed it beforehand for a reproducible split)
    random.shuffle(dataset)
    split_index = int(len(dataset) * (1 - test_ratio))
    train_data = dataset[:split_index]
    test_data = dataset[split_index:]

    # Create datasets
    train_dataset = PromptCompletionDataset(train_data, tokenizer)
    test_dataset = PromptCompletionDataset(test_data, tokenizer)

    # Create data loaders; only the training data is reshuffled per epoch
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=collate_fn
    )
    test_loader = DataLoader(
        test_dataset,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=collate_fn
    )

    return train_loader, test_loader

def get_hyperparameters():
    """
    Provides the fixed hyperparameters used by the training script.

    Returns:
        tuple: (num_epochs, batch_size, learning_rate)
    """
    return (
        18,    # num_epochs: train for more epochs with LoRA as it's more efficient
        16,    # batch_size
        5e-5,  # learning_rate: standard value for finetuning transformers
    )

# Main training script: LoRA-finetunes a 4-bit quantized Mistral-7B to emit
# an emotion label for a text, evaluates it, saves it, and runs a smoke test.
if __name__ == "__main__":
    # Only BitsAndBytesConfig is missing from the file-level imports
    from transformers import BitsAndBytesConfig

    # Set random seeds for reproducibility
    set_seed(42)

    # Configure basic training parameters
    data_url = "https://www.thelmbook.com/data/emotions"
    # Hugging Face model id
    # (modified by Frank Morales 04/01/2025: Mistral-7B replaces GPT-2)
    model_id = "mistralai/Mistral-7B-Instruct-v0.1"

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # BitsAndBytesConfig int-4 config
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    # Load the quantized base model
    # (attn_implementation="flash_attention_2" was left disabled)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto",
        torch_dtype=torch.bfloat16,
        quantization_config=bnb_config,
    )

    # Load the tokenizer that matches the model.
    # NOTE: `tokenizer` must remain a module-level name because collate_fn
    # reads it to obtain the pad token ID.
    tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
    tokenizer.padding_side = 'right'  # to prevent warnings

    # Pad with unk_token rather than eos_token
    tokenizer.pad_token = tokenizer.unk_token
    tokenizer.pad_token_id = tokenizer.unk_token_id

    # Configure LoRA parameters
    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,  # Set task type for causal language modeling
        inference_mode=False,          # Enable training mode
        r=16,                          # Rank of LoRA update matrices
        lora_alpha=32,                 # LoRA scaling factor
    )

    # Wrap the base model with LoRA adapters
    model = get_peft_model(model, peft_config)
    # from_pretrained returns the model in evaluation mode, so switch to
    # training mode explicitly before the first epoch
    model.train()

    # Get hyperparameters and prepare data
    num_epochs, batch_size, learning_rate = get_hyperparameters()
    train_loader, test_loader = download_and_prepare_data(data_url, tokenizer, batch_size)

    # Initialize optimizer
    optimizer = AdamW(model.parameters(), lr=learning_rate)

    # Training loop
    for epoch in range(num_epochs):
        total_loss = 0
        num_batches = 0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")

        for input_ids, attention_mask, labels, _, _ in progress_bar:
            # Move batch to device
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            # Forward pass; passing labels makes the model return the loss
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            loss = outputs.loss

            # Backward pass and optimization
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            # Update metrics (running average shown in the progress bar)
            total_loss += loss.item()
            num_batches += 1
            progress_bar.set_postfix({"Loss": total_loss / num_batches})

        # Calculate and display epoch metrics
        avg_loss = total_loss / num_batches
        test_acc = calculate_accuracy(model, tokenizer, test_loader)
        print(f"Epoch {epoch+1} - Average loss: {avg_loss:.4f}, Test accuracy: {test_acc:.4f}")

    # Calculate final model performance; the test accuracy printed here is
    # the value measured after the last epoch
    train_acc = calculate_accuracy(model, tokenizer, train_loader)
    print(f"Training accuracy: {train_acc:.4f}")
    print(f"Test accuracy: {test_acc:.4f}")

    # Save the LoRA-tuned model and tokenizer
    model.save_pretrained("./finetuned_model")
    tokenizer.save_pretrained("./finetuned_model")

    # Test the finetuned model with a sample input
    test_input = "I'm so happy to be able to finetune an LLM!"
    test_model("./finetuned_model", test_input)

Using device: cuda


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Epoch 1/18: 100%|██████████| 1125/1125 [05:23<00:00,  3.47it/s, Loss=0.165]


Epoch 1 - Average loss: 0.1654, Test accuracy: 0.9320


Epoch 2/18: 100%|██████████| 1125/1125 [05:21<00:00,  3.50it/s, Loss=0.0555]


Epoch 2 - Average loss: 0.0555, Test accuracy: 0.9275


Epoch 3/18: 100%|██████████| 1125/1125 [05:20<00:00,  3.51it/s, Loss=0.044]


Epoch 3 - Average loss: 0.0440, Test accuracy: 0.9330


Epoch 4/18: 100%|██████████| 1125/1125 [05:21<00:00,  3.50it/s, Loss=0.0363]


Epoch 4 - Average loss: 0.0363, Test accuracy: 0.9360


Epoch 5/18: 100%|██████████| 1125/1125 [05:20<00:00,  3.51it/s, Loss=0.0326]


Epoch 5 - Average loss: 0.0326, Test accuracy: 0.9340


Epoch 6/18: 100%|██████████| 1125/1125 [05:21<00:00,  3.50it/s, Loss=0.0246]


Epoch 6 - Average loss: 0.0246, Test accuracy: 0.9390


Epoch 7/18: 100%|██████████| 1125/1125 [05:21<00:00,  3.50it/s, Loss=0.0211]


Epoch 7 - Average loss: 0.0211, Test accuracy: 0.9350


Epoch 8/18: 100%|██████████| 1125/1125 [05:20<00:00,  3.51it/s, Loss=0.0133]


Epoch 8 - Average loss: 0.0133, Test accuracy: 0.9350


Epoch 9/18: 100%|██████████| 1125/1125 [05:22<00:00,  3.49it/s, Loss=0.011]


Epoch 9 - Average loss: 0.0110, Test accuracy: 0.9325


Epoch 10/18: 100%|██████████| 1125/1125 [05:22<00:00,  3.49it/s, Loss=0.0101]


Epoch 10 - Average loss: 0.0101, Test accuracy: 0.9330


Epoch 11/18: 100%|██████████| 1125/1125 [05:22<00:00,  3.49it/s, Loss=0.0108]


Epoch 11 - Average loss: 0.0108, Test accuracy: 0.9285


Epoch 12/18: 100%|██████████| 1125/1125 [05:21<00:00,  3.49it/s, Loss=0.00848]


Epoch 12 - Average loss: 0.0085, Test accuracy: 0.9315


Epoch 13/18: 100%|██████████| 1125/1125 [05:21<00:00,  3.49it/s, Loss=0.00727]


Epoch 13 - Average loss: 0.0073, Test accuracy: 0.9280


Epoch 14/18: 100%|██████████| 1125/1125 [05:21<00:00,  3.50it/s, Loss=0.00575]


Epoch 14 - Average loss: 0.0058, Test accuracy: 0.9270


Epoch 15/18: 100%|██████████| 1125/1125 [05:21<00:00,  3.49it/s, Loss=0.00474]


Epoch 15 - Average loss: 0.0047, Test accuracy: 0.9225


Epoch 16/18: 100%|██████████| 1125/1125 [05:21<00:00,  3.50it/s, Loss=0.00913]


Epoch 16 - Average loss: 0.0091, Test accuracy: 0.9315


Epoch 17/18: 100%|██████████| 1125/1125 [05:21<00:00,  3.50it/s, Loss=0.00448]


Epoch 17 - Average loss: 0.0045, Test accuracy: 0.9320


Epoch 18/18: 100%|██████████| 1125/1125 [05:21<00:00,  3.50it/s, Loss=0.00374]


Epoch 18 - Average loss: 0.0037, Test accuracy: 0.9370
Training accuracy: 0.9951
Test accuracy: 0.9370
Using device: cuda


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Input: I'm so happy to be able to finetune an LLM!
Generated emotion: joy
