<a href="https://colab.research.google.com/github/gis2010/AgentGPT/blob/main/emotion_GPT2_as_text_generator_LoRA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install torch==2.0.1 peft==0.4.0 transformers==4.30.0 requests==2.28.2 tqdm==4.65.0
# Import required libraries
import json
import random
import gzip
import requests
import torch
from peft import get_peft_model, LoraConfig, TaskType
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM
from torch.optim import AdamW
from tqdm import tqdm
import re

# Set random seeds for reproducibility
def set_seed(seed):
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Build prompt for emotion classification
def build_prompt(text):
    return f"Predict the emotion for the following text: {text}\nEmotion:"

# Encode text using tokenizer
def encode_text(tokenizer, text, return_tensor=False):
    if return_tensor:
        return tokenizer.encode(text, add_special_tokens=False, return_tensors="pt")
    else:
        return tokenizer.encode(text, add_special_tokens=False)

# Decode token IDs back to text
def decode_text(tokenizer, token_ids):
    return tokenizer.decode(token_ids, skip_special_tokens=True)

# Define HDC operations
class HDCOperations:
    def __init__(self, dim=10000):
        self.dim = dim

    def pad_tensors(self, a, b):
        """
        Pad the shorter tensor with zeros to match the size of the longer tensor.
        """
        max_len = max(a.size(0), b.size(0))
        a_padded = torch.zeros(max_len, dtype=a.dtype)
        b_padded = torch.zeros(max_len, dtype=b.dtype)
        a_padded[:a.size(0)] = a
        b_padded[:b.size(0)] = b
        return a_padded, b_padded

    def bind(self, a, b):
        """
        Binding operation: XOR for binary vectors.
        """
        a, b = self.pad_tensors(a, b)  # Ensure both tensors have the same size
        return a ^ b

    def unbind(self, a, b):
        """
        Unbinding operation: XOR for binary vectors (self-inverse).
        """
        a, b = self.pad_tensors(a, b)  # Ensure both tensors have the same size
        return a ^ b

    def bundle(self, vectors):
        """
        Bundling operation: Thresholded sum for binary vectors.
        """
        return (sum(vectors) > len(vectors) / 2).int()

# HDC-based memory storage
class HDCMemory:
    def __init__(self, dim=10000):
        self.dim = dim
        self.memory = {}
        self.hdc_ops = HDCOperations(dim)

    def store(self, key, value):
        """
        Store a key-value pair using HDC binding.
        """
        self.memory[key] = self.hdc_ops.bind(key, value)

    def retrieve(self, key):
        """
        Retrieve a value using HDC unbinding.
        """
        if key in self.memory:
            return self.hdc_ops.unbind(key, self.memory[key])
        else:
            raise KeyError("Key not found in memory.")

    def bundle(self, keys):
        """
        Bundle multiple keys into a single vector.
        """
        vectors = [self.memory[key] for key in keys]
        return self.hdc_ops.bundle(vectors)

# Dataset class with HDC integration
class PromptCompletionDataset(Dataset):
    def __init__(self, data, tokenizer, hdc_memory):
        self.data = data
        self.tokenizer = tokenizer
        self.hdc_memory = hdc_memory

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        prompt = item["prompt"]
        completion = item["completion"]

        # Encode prompt and completion as high-dimensional vectors
        encoded_prompt = self.tokenizer.encode(prompt, add_special_tokens=False)
        encoded_completion = self.tokenizer.encode(completion, add_special_tokens=False)

        # Convert to tensors
        key = torch.tensor(encoded_prompt, dtype=torch.int)
        value = torch.tensor(encoded_completion, dtype=torch.int)

        # Store in HDC memory
        self.hdc_memory.store(key, value)

        # Retrieve from HDC memory (for demonstration)
        retrieved_value = self.hdc_memory.retrieve(key)

        return {
            "input_ids": encoded_prompt + encoded_completion + [self.tokenizer.eos_token_id],
            "labels": [-100] * len(encoded_prompt) + encoded_completion + [self.tokenizer.eos_token_id],
            "prompt": prompt,
            "expected_completion": completion,
            "retrieved_completion": self.tokenizer.decode(retrieved_value.tolist(), skip_special_tokens=True)
        }

# Collate function for DataLoader
def collate_fn(batch):
    max_length = max(len(item["input_ids"]) for item in batch)
    input_ids = [item["input_ids"] + [tokenizer.pad_token_id] * (max_length - len(item["input_ids"])) for item in batch]
    labels = [item["labels"] + [-100] * (max_length - len(item["labels"])) for item in batch]
    attention_mask = [[1] * len(item["input_ids"]) + [0] * (max_length - len(item["input_ids"])) for item in batch]
    prompts = [item["prompt"] for item in batch]
    expected_completions = [item["expected_completion"] for item in batch]
    retrieved_completions = [item["retrieved_completion"] for item in batch]

    return (
        torch.tensor(input_ids),
        torch.tensor(attention_mask),
        torch.tensor(labels),
        prompts,
        expected_completions,
        retrieved_completions
    )

# Download and prepare dataset
def download_and_prepare_data(data_url, tokenizer, batch_size, test_ratio=0.1):
    response = requests.get(data_url)
    content = gzip.decompress(response.content).decode()
    dataset = [{"prompt": build_prompt(entry['text']), "completion": entry["label"].strip()} for entry in map(json.loads, content.splitlines())]
    random.shuffle(dataset)
    split_index = int(len(dataset) * (1 - test_ratio))
    train_data = dataset[:split_index]
    test_data = dataset[split_index:]

    # Initialize HDC memory
    hdc_memory = HDCMemory()

    # Create datasets
    train_dataset = PromptCompletionDataset(train_data, tokenizer, hdc_memory)
    test_dataset = PromptCompletionDataset(test_data, tokenizer, hdc_memory)

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

    return train_loader, test_loader, hdc_memory

# Calculate accuracy
def calculate_accuracy(model, tokenizer, loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for input_ids, attention_mask, labels, prompts, expected_completions, retrieved_completions in loader:
            for prompt, expected_completion in zip(prompts, expected_completions):
                generated_text = generate_text(model, tokenizer, prompt)
                if generated_text.strip().lower() == expected_completion.strip().lower():
                    correct += 1
                total += 1
    accuracy = correct / total if total > 0 else 0
    model.train()
    return accuracy

# Generate text using the model
def generate_text(model, tokenizer, prompt, max_new_tokens=50):
    input_ids = tokenizer(prompt, return_tensors="pt").to(model.device)
    output_ids = model.generate(
        input_ids=input_ids["input_ids"],
        attention_mask=input_ids["attention_mask"],
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id
    )[0]
    generated_text = decode_text(tokenizer, output_ids[input_ids["input_ids"].shape[1]:])
    return generated_text.strip()

# Test the model
def test_model(model_path, test_input, hdc_memory):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Load saved model and tokenizer
    model = AutoModelForCausalLM.from_pretrained(model_path).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # Configure padding token
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.pad_token_id

    # Generate and display prediction
    prompt = build_prompt(test_input)
    generated_text = generate_text(model, tokenizer, prompt)

    print(f"Input: {test_input}")
    print(f"Generated emotion: {generated_text}")

    # Retrieve from HDC memory (for demonstration)
    key = torch.tensor(tokenizer.encode(prompt, add_special_tokens=False), dtype=torch.int)
    retrieved_value = hdc_memory.retrieve(key)
    print(f"Retrieved completion from HDC memory: {tokenizer.decode(retrieved_value.tolist(), skip_special_tokens=True)}")

# Main training script
if __name__ == "__main__":
    set_seed(42)

    # Configure basic training parameters
    data_url = "https://www.thelmbook.com/data/emotions"
    model_name = "openai-community/gpt2"
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Initialize tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token

    # Configure LoRA parameters
    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=16,
        lora_alpha=32
    )

    # Load model and apply LoRA configuration
    model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
    model = get_peft_model(model, peft_config)

    # Get hyperparameters and prepare data
    num_epochs, batch_size, learning_rate = 18, 16, 5e-5
    train_loader, test_loader, hdc_memory = download_and_prepare_data(data_url, tokenizer, batch_size)

    # Initialize optimizer
    optimizer = AdamW(model.parameters(), lr=learning_rate)

    # Training loop
    for epoch in range(num_epochs):
        total_loss = 0
        num_batches = 0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")

        for input_ids, attention_mask, labels, _, _, _ in progress_bar:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            # Backward pass and optimization
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            # Update metrics
            total_loss += loss.item()
            num_batches += 1
            progress_bar.set_postfix({"Loss": total_loss / num_batches})

        # Evaluate on test set
        test_acc = calculate_accuracy(model, tokenizer, test_loader)
        print(f"Epoch {epoch+1} - Average loss: {total_loss / num_batches:.4f}, Test accuracy: {test_acc:.4f}")

    # Save the model and tokenizer
    model.save_pretrained("./finetuned_model")
    tokenizer.save_pretrained("./finetuned_model")

    # Test the finetuned model
    test_input = "I'm so happy to be able to finetune an LLM!"
    test_model("./finetuned_model", test_input, hdc_memory)

Collecting torch==2.0.1
  Downloading torch-2.0.1-cp310-cp310-manylinux1_x86_64.whl.metadata (24 kB)
Collecting peft==0.4.0
  Downloading peft-0.4.0-py3-none-any.whl.metadata (21 kB)
Collecting transformers==4.30.0
  Downloading transformers-4.30.0-py3-none-any.whl.metadata (113 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m113.6/113.6 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting requests==2.28.2
  Downloading requests-2.28.2-py3-none-any.whl.metadata (4.6 kB)
Collecting tqdm==4.65.0
  Downloading tqdm-4.65.0-py3-none-any.whl.metadata (56 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.0/57.0 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu11==11.7.99 (from torch==2.0.1)
  Downloading nvidia_cuda_nvrtc_cu11-11.7.99-2-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu11==11.7.99 (from torch==2.0.1)
  Downloading nvidia_cuda_runtime_cu11-11.7.99-py3-none-manylinux1_

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Epoch 1/18:   0%|          | 0/1125 [00:03<?, ?it/s]


RuntimeError: The size of tensor a (42) must match the size of tensor b (2) at non-singleton dimension 0