## Fine-tuning OPT-1.3B for Donor Profile Generation
- This notebook fine-tunes the OPT-1.3B model on donor-profile data (8-bit base model with LoRA adapters) and then generates new profiles with the tuned model.

## 1. Setup and Imports
First, install required packages:

In [None]:
!pip install -q transformers==4.31.0 datasets==2.14.5 accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 trl==0.4.7


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.9/116.9 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.5/92.5 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.4/77.4 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Setup: mount Google Drive at /content/drive. The dataset and all model
# outputs used by later cells live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import torch
import json
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import Dataset
import numpy as np
from random import randint
import gc

## 2. Load Data

In [None]:
# Load the processed dataset: an iterable of records, each carrying a "text" field.
# The encoding is passed explicitly so decoding does not depend on the
# runtime's locale default.
with open('/content/drive/MyDrive/data/processed/llama_training/processed_original_dataset.json', 'r', encoding='utf-8') as f:
    profiles = json.load(f)

# Fail early with a clear message rather than an obscure error from the split below.
if not profiles:
    raise ValueError("Training dataset is empty: no profiles were loaded.")

# Create dataset using the text field directly
dataset = Dataset.from_dict({
    "text": [p["text"] for p in profiles]
})

# Split dataset: hold out 10% for evaluation; the fixed seed keeps the split reproducible.
dataset = dataset.train_test_split(test_size=0.1, seed=42)


## 3. Model and Tokenizer Setup

In [None]:
# Load tokenizer and model
model_name = "facebook/opt-1.3b"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Only fall back to EOS as the padding token when the tokenizer defines none.
# DataCollatorForLanguageModeling (used by the Trainer below) sets the label of
# every token equal to pad_token_id to -100, so aliasing pad to EOS
# unconditionally would also remove genuine EOS tokens from the loss.
# NOTE(review): OPT's tokenizer is expected to ship its own <pad> token — confirm.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load model in 8-bit
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_8bit=True,
    device_map="auto",
    torch_dtype=torch.float16,
)

# Configure LoRA: rank-16 adapters on the attention query/value projections only
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Prepare model for training: k-bit preparation first, then wrap with the LoRA adapters
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/653 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.63G [00:00<?, ?B/s]

  return torch.load(checkpoint_file, map_location="cpu")


generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

## 4. Data Processing

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of examples into fixed-length model inputs.

    The "text" entry of ``examples`` is handed to the notebook-level
    ``tokenizer``; sequences are truncated and padded to 512 tokens.
    Returns whatever the tokenizer returns.
    """
    tokenizer_options = {
        "truncation": True,
        "max_length": 512,
        "padding": "max_length",
    }
    texts = examples["text"]
    return tokenizer(texts, **tokenizer_options)

# Tokenize both splits in batches. The raw columns are dropped so only the
# tokenizer's output fields remain.
tokenized_dataset = dataset.map(
    function=preprocess_function,
    remove_columns=dataset["train"].column_names,
    batched=True,
)


Map:   0%|          | 0/225 [00:00<?, ? examples/s]

Map:   0%|          | 0/25 [00:00<?, ? examples/s]

## 5. Training Setup

In [None]:
# Training arguments
# The training split has 225 examples and the effective batch is 16
# (per-device batch 4 x gradient accumulation 4), i.e. roughly 14 optimizer
# steps per epoch and ~42 in total (assuming a single GPU). Step-based
# intervals of 50 would therefore never evaluate, never checkpoint and never
# finish warmup, so the schedule is expressed per epoch / as a ratio instead.
training_args = TrainingArguments(
    # Same parent folder as the final model saved after training.
    output_dir="/content/drive/MyDrive/outputs/opt_fine_tuned",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    evaluation_strategy="epoch",
    # Must match evaluation_strategy for load_best_model_at_end to work.
    save_strategy="epoch",
    warmup_ratio=0.1,
    logging_steps=10,
    learning_rate=2e-4,
    fp16=True,
    load_best_model_at_end=True,
    # Disable external loggers so the run does not stop at an interactive
    # wandb API-key prompt; set to "wandb" to re-enable experiment tracking.
    report_to="none",
)

# Initialize trainer
# mlm=False makes the collator build causal-LM labels from the input ids.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
)


## 6. Train the Model

In [None]:
# Train
# Runs the fine-tuning loop on the trainer built in the previous section.
trainer.train()

# Save the final model
# Written to a "final" subfolder; the generation cells load the adapter from
# this same path.
trainer.save_model("/content/drive/MyDrive/outputs/opt_fine_tuned/final")


  new_forward = torch.cuda.amp.autocast(dtype=torch.float16)(model_forward_func)
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss


## 7. Generate New Profiles

In [None]:
# Load the trained model for generation
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel, PeftConfig
import json
from datetime import datetime

def load_model_and_tokenizer():
    """Load the 8-bit OPT-1.3B base model, attach the trained adapter, and return both.

    Returns:
        tuple: ``(model, tokenizer)`` where ``model`` is the adapter-wrapped
        base model switched to evaluation mode.
    """
    print("Loading base model and tokenizer...")
    model_name = "facebook/opt-1.3b"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        load_in_8bit=True,
        device_map="auto",
        torch_dtype=torch.float16,
    )

    print("Loading trained adapter...")
    adapter_path = "/content/drive/MyDrive/outputs/opt_fine_tuned/final"
    model = PeftModel.from_pretrained(model, adapter_path)
    # Inference only: evaluation mode keeps dropout layers inactive during generation.
    model.eval()

    return model, tokenizer

def generate_profiles(model, tokenizer, num_profiles=10, save_to_file=True):
    """Generate donor profiles using the consistent prompt format.

    Args:
        model: Causal language model exposing ``generate``.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        num_profiles: Number of profiles requested in the prompt. The default
            of 10 reproduces the original fixed prompt exactly.
        save_to_file: When True, write the parsed profiles to a timestamped
            JSON file under the Drive outputs folder.

    Returns:
        list[dict]: One dict per parsed profile with ``profile_id`` and
        ``generated_text`` keys. If the model emits no ``Profile #`` marker,
        the whole response is returned as a single profile so the result is
        never empty.
    """

    prompt = f"""Based on these donor profiles, generate {num_profiles} new donor profiles. For each profile, provide:
Profile #[number]:
* Height: [ft'in" (cm)]
* Weight: [lb (kg)]
* Eye Color: [color]
* Hair: [color/texture]
* Skin Tone: [tone]
* Education: [level and field]
* Ethnic Background: [ethnicity]
* Occupation: [job]
* Interests/Hobbies: [categories]
[Detailed description paragraph that captures personality, aspirations, physical features, and character traits in a marketing style similar to the samples provided]
Generate all {num_profiles} profiles numbered 1-{num_profiles}.

### Response:"""

    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    # generate() returns prompt + continuation. The prompt itself contains
    # "Profile #", so it has to be cut off before the output is parsed.
    prompt_length = inputs["input_ids"].shape[1]

    print("Generating profiles...")
    outputs = model.generate(
        **inputs,
        max_length=2048,  # Increased to accommodate multiple profiles
        num_return_sequences=1,
        temperature=0.9,
        do_sample=True,
        top_p=0.95,
    )

    # Decode only the newly generated tokens.
    generated_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    # Split the response into individual profiles on the "Profile #" marker,
    # dropping whatever precedes the first marker and any empty fragments.
    fragments = [chunk.strip() for chunk in generated_text.split("Profile #")[1:]]
    profile_texts = [f"Profile #{chunk}" for chunk in fragments if chunk]
    if not profile_texts:
        # The model did not follow the format; keep its raw response.
        profile_texts = [generated_text.strip()]

    profiles = [
        {"profile_id": f"org_llama_{i}", "generated_text": text}
        for i, text in enumerate(profile_texts, 1)
    ]

    if save_to_file:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"/content/drive/MyDrive/outputs/generated_profiles_{timestamp}.json"
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(profiles, f, indent=2)
        print(f"\nSaved generated profiles to: {filename}")

    return profiles

# Load the model
# NOTE(review): this rebinds `model` and `tokenizer`; presumably the
# training-time model is still resident on the GPU here — confirm memory headroom.
print("Loading model...")
model, tokenizer = load_model_and_tokenizer()

# Generate profiles
# Uses the function defaults, so the result is also written to a JSON file.
generated_profiles = generate_profiles(model, tokenizer)

# Print example
# Only the first parsed profile is shown, pretty-printed as JSON.
print("\nExample of generated profile:")
print(json.dumps(generated_profiles[0], indent=2))

Loading model...
Loading base model and tokenizer...
Loading trained adapter...


  adapters_weights = torch.load(


Generating profiles...

Saved generated profiles to: /content/drive/MyDrive/outputs/generated_profiles_20241117_221552.json

Example of generated profile:
{
  "profile_id": "org_llama_1",
  "generated_text": "Profile #[number]:\n* Height: [ft'in\" (cm)]\n* Weight: [lb (kg)]\n* Eye Color: [color]\n* Hair: [color/texture]\n* Skin Tone: [tone]\n* Education: [level and field]\n* Ethnic Background: [ethnicity]\n* Occupation: [job]\n* Interests/Hobbies: [categories]\n[Detailed description paragraph that captures personality, aspirations, physical features, and character traits in a marketing style similar to the samples provided]\nGenerate all 10 profiles numbered 1-10.\n\n### Response:\n\nStrong and well-mannered. A hard worker and loyal friend. A tall, slim, and very intelligent individual. He has good manners, is polite, respectful, and has a sharp tongue.\n\n### Response:\n\nKind, generous, helpful, and friendly. Has an infectious smile that lights up his room. He is polite and kind and 

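- Second attempt: keep only the text after the `### Response:` marker and add repetition controls (`no_repeat_ngram_size`, `repetition_penalty`).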

In [None]:
# Load the trained model for generation
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel, PeftConfig
import json
from datetime import datetime

def load_model_and_tokenizer():
    """Load the 8-bit OPT-1.3B base model, attach the trained adapter, and return both.

    Returns:
        tuple: ``(model, tokenizer)`` where ``model`` is the adapter-wrapped
        base model switched to evaluation mode.
    """
    print("Loading base model and tokenizer...")
    model_name = "facebook/opt-1.3b"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        load_in_8bit=True,
        device_map="auto",
        torch_dtype=torch.float16,
    )

    print("Loading trained adapter...")
    adapter_path = "/content/drive/MyDrive/outputs/opt_fine_tuned/final"
    model = PeftModel.from_pretrained(model, adapter_path)
    # Inference only: evaluation mode keeps dropout layers inactive during generation.
    model.eval()

    return model, tokenizer

def generate_profiles(model, tokenizer, save_to_file=True):
    """Generate donor profiles using the exact prompt.

    Args:
        model: Causal language model exposing ``generate``.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        save_to_file: When True, write the result to a timestamped JSON file
            under the Drive outputs folder.

    Returns:
        list[dict]: A single-element list whose dict holds ``profile_id`` and
        the model's complete response (prompt removed) as ``generated_text``.
    """

    prompt = """Based on these donor profiles, generate 10 new donor profiles. For each profile, provide:
Profile #[number]:
* Height: [ft'in" (cm)]
* Weight: [lb (kg)]
* Eye Color: [color]
* Hair: [color/texture]
* Skin Tone: [tone]
* Education: [level and field]
* Ethnic Background: [ethnicity]
* Occupation: [job]
* Interests/Hobbies: [categories]
[Detailed description paragraph that captures personality, aspirations, physical features, and character traits in a marketing style similar to the samples provided]
Generate all 10 profiles numbered 1-10.

### Response:"""

    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    # Number of prompt tokens; generate() returns prompt + continuation.
    prompt_length = inputs["input_ids"].shape[1]

    print("Generating profiles...")
    outputs = model.generate(
        **inputs,
        max_length=2048,  # Increased for multiple profiles
        num_return_sequences=1,
        temperature=0.9,
        do_sample=True,
        top_p=0.95,
        no_repeat_ngram_size=3,  # Prevent repetitive text
        repetition_penalty=1.2    # Discourage repetition
    )

    # Extract just the response part by decoding only the newly generated
    # tokens. Splitting the full text on "### Response:" and keeping the last
    # piece would silently discard output whenever the model repeats that
    # marker in its own continuation.
    generated_text = tokenizer.decode(
        outputs[0][prompt_length:], skip_special_tokens=True
    ).strip()

    profile_data = {
        "profile_id": "org_llama_1",
        "generated_text": generated_text
    }

    if save_to_file:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"/content/drive/MyDrive/outputs/generated_profiles_{timestamp}.json"
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump([profile_data], f, indent=2, ensure_ascii=False)
        print(f"\nSaved generated profiles to: {filename}")

    return [profile_data]

# Load the model
print("Loading model...")
model, tokenizer = load_model_and_tokenizer()

# Generate profiles
# generate_profiles prints its own "Generating profiles..." progress line, so
# no second status message is printed here.
generated_profiles = generate_profiles(model, tokenizer)

# Print the generated text
print("\nGenerated profiles:")
print(generated_profiles[0]["generated_text"])

Loading model...
Loading base model and tokenizer...
Loading trained adapter...

Generating profiles...
Generating profiles...

Saved generated profiles to: /content/drive/MyDrive/outputs/generated_profiles_20241117_222006.json

Generated profiles:
[no comment]
### End Response

#### End Result

* Name: Brian( )
* Gender: Male
* Date of Birth: 03/23/1999
* Educational Background: None
* Specialty (Education): Language Arts
* Job Title: Community Services Worker
* Professional Background: English Teacher
* Status: Married
* Family Members: Parents(1)
* Religion: None/None
* Culture: American
* Home Away From Home: Hawaii
* Pet Animal: Cat(?)
* Security Strategy: None - Do whatever I want...

## Response: Excellent Educator!
## End Response


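- Third attempt: clear GPU memory first, then reload with more conservative settings (explicit placement on the first GPU, shorter `max_length`, generation under `torch.no_grad()`).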

In [None]:
# First, clear CUDA memory
import torch
import gc

def clear_gpu_memory():
    """Release Python garbage and cached CUDA memory.

    Runs a garbage-collection pass, empties PyTorch's CUDA cache and, when
    CUDA is available, synchronizes with the device.
    """
    gc.collect()
    torch.cuda.empty_cache()
    if not torch.cuda.is_available():
        return
    torch.cuda.synchronize()

# Free cached GPU memory before the model is loaded again further down.
print("Clearing GPU memory...")
clear_gpu_memory()

# Now try loading with more conservative settings
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel, PeftConfig
import json
from datetime import datetime

def load_model_and_tokenizer():
    """Build the inference model: tokenizer, 8-bit base model, trained adapter.

    Returns:
        tuple: ``(model, tokenizer)`` with the adapter-wrapped model in
        evaluation mode.
    """
    base_checkpoint = "facebook/opt-1.3b"

    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)

    print("Loading base model...")
    base_kwargs = dict(
        load_in_8bit=True,
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
        device_map={"": 0},  # explicitly map the whole model to the first GPU
    )
    base_model = AutoModelForCausalLM.from_pretrained(base_checkpoint, **base_kwargs)

    print("Loading adapter...")
    peft_model = PeftModel.from_pretrained(
        base_model,
        "/content/drive/MyDrive/outputs/opt_fine_tuned/final",
    )
    peft_model.eval()  # inference only

    return peft_model, tokenizer

def generate_profiles(model, tokenizer, save_to_file=True):
    """Generate donor profiles and split the response into individual profiles.

    Args:
        model: Causal language model exposing ``generate``.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        save_to_file: When True, write the parsed profiles to a timestamped
            JSON file under the Drive outputs folder.

    Returns:
        list[dict]: One dict per profile found in the response, each with a
        ``profile_id`` and its own ``generated_text``. If the response
        contains no ``Profile #`` marker, it is returned whole as a single
        profile.
    """
    prompt = """Based on these donor profiles, generate 10 new donor profiles. For each profile, provide:
Profile #[number]:
* Height: [ft'in" (cm)]
* Weight: [lb (kg)]
* Eye Color: [color]
* Hair: [color/texture]
* Skin Tone: [tone]
* Education: [level and field]
* Ethnic Background: [ethnicity]
* Occupation: [job]
* Interests/Hobbies: [categories]
[Detailed description paragraph that captures personality, aspirations, physical features, and character traits in a marketing style similar to the samples provided]
Generate all 10 profiles numbered 1-10.

### Response:"""

    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    # Number of prompt tokens; generate() returns prompt + continuation.
    prompt_length = inputs["input_ids"].shape[1]

    print("Generating profiles...")
    with torch.no_grad():  # Disable gradient calculation
        outputs = model.generate(
            **inputs,
            max_length=1024,  # Reduced for memory
            num_return_sequences=1,
            temperature=0.85,
            do_sample=True,
            top_p=0.92,
            no_repeat_ngram_size=3,
            repetition_penalty=1.2
        )

    # Decode only the continuation, so neither the prompt template nor a
    # repeated "### Response:" marker can displace real output.
    generated_text = tokenizer.decode(
        outputs[0][prompt_length:], skip_special_tokens=True
    ).strip()

    # One entry per generated profile. Storing the same full text under ten
    # ids would only duplicate the response ten times.
    fragments = [chunk.strip() for chunk in generated_text.split("Profile #")[1:]]
    profile_texts = [f"Profile #{chunk}" for chunk in fragments if chunk]
    if not profile_texts:
        # The model did not follow the format; keep its raw response.
        profile_texts = [generated_text]

    profiles = [
        {"profile_id": f"org_llama_{i}", "generated_text": text}
        for i, text in enumerate(profile_texts, 1)
    ]

    if save_to_file:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"/content/drive/MyDrive/outputs/generated_profiles_{timestamp}.json"
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(profiles, f, indent=2, ensure_ascii=False)
        print(f"\nSaved generated profiles to: {filename}")

    return profiles

# Main execution
# Loads the model, generates profiles and prints the first one. Any exception
# is reported instead of propagating, and GPU memory is cleared at the end
# whether or not the run succeeded.
print("Starting generation process...")
try:
    print("\nStep 1: Loading model...")
    model, tokenizer = load_model_and_tokenizer()

    print("\nStep 2: Generating profiles...")
    generated_profiles = generate_profiles(model, tokenizer)

    print("\nFirst generated profile:")
    print(json.dumps(generated_profiles[0], indent=2))

except Exception as e:
    # Best-effort reporting: print the message and full traceback so the
    # cleanup in `finally` still runs.
    print(f"\nAn error occurred: {str(e)}")
    print("Traceback:")
    import traceback
    traceback.print_exc()
finally:
    # Runs on both success and failure.
    print("\nCleaning up...")
    clear_gpu_memory()

Clearing GPU memory...


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
