In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import pandas as pd
from tqdm import tqdm
import os
import textwrap
import gc
# ==============================================================================

# -----------------------------
# Custom Shared LoRA Module
# -----------------------------
class SharedLoRA(nn.Module):
    """
    A single, shared LoRA module that will be applied to the output of every transformer block.
    This is a highly parameter-efficient way to introduce a global change to the model's behavior.
    """
    def __init__(self, hidden_size, rank, scaling=1.0):
        super().__init__()
        self.lora_A = nn.Parameter(torch.randn(hidden_size, rank))
        self.lora_B = nn.Parameter(torch.zeros(rank, hidden_size))
        self.scaling = scaling

    def forward(self, x):
        """Applies the low-rank update to the input hidden state."""
        # Input x has shape (batch, seq_len, hidden_size)
        update = (x @ self.lora_A @ self.lora_B)
        update = update / (update.norm(p=2, dim=-1, keepdim=True) + 1e-8) * self.scaling
        return x + update

# -----------------------------
# Dataset
# -----------------------------
class PromptDataset(Dataset):
    """
    A simple dataset to load prompts from a pandas DataFrame.
    """
    def __init__(self, df: pd.DataFrame, prompt_column: str):
        # Ensure the column exists
        if prompt_column not in df.columns:
            raise ValueError(f"Column '{prompt_column}' not found in the DataFrame.")
        self.prompts = df[prompt_column].tolist()

    def __len__(self):
        return len(self.prompts)

    def __getitem__(self, idx):
        return self.prompts[idx]


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def load_base_model_and_tokenizer(args):
    """Loads the objects that are constant across all training runs."""
    if not torch.cuda.is_available():
        raise RuntimeError("This script requires a CUDA-enabled GPU.")
    
    print("--- Loading Base Model and Tokenizer (once) ---")
    
    tokenizer = AutoTokenizer.from_pretrained(args.model_id)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16
    )

    model_base = AutoModelForCausalLM.from_pretrained(
        args.model_id,
        quantization_config=bnb_config,
        device_map="auto",
        torch_dtype=torch.bfloat16,
    )
    model_base.eval()
    for param in model_base.parameters():
        param.requires_grad = False
        
    print("--- Base Model and Tokenizer Loaded ---")
    return model_base, tokenizer


In [3]:
def run_single_training_cycle(args, model_base, tokenizer, run_idx):
    """
    Runs one full cycle of training and evaluation.
    It loads a new model to be tuned each time it's called.
    """
    device = "cuda"
    run_output_path = os.path.join(args.output_dir, f"divergence_adapter_b{args.batch_size}_run_{run_idx}.pth")
    im_end_token_id = tokenizer.convert_tokens_to_ids("<|im_end|>")
    
    print(f"Loading a new, randomly initialized 'model_tuned' for run {run_idx}...")
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16
    )
    model_tuned = AutoModelForCausalLM.from_pretrained(
        args.model_id,
        quantization_config=bnb_config,
        device_map="auto",
        torch_dtype=torch.bfloat16,
    )
    model_tuned.train()

    # --- Create and Inject a new Shared LoRA Adapter ---
    hidden_size = model_tuned.config.hidden_size
    shared_adapter = SharedLoRA(hidden_size, rank=args.lora_rank, scaling=args.lora_scaling).to(device, dtype=torch.bfloat16)

    hook_handles = []
    def apply_adapter_hook(module, input, output):
        if isinstance(output, tuple):
            hidden_state = output[0]
            modified_hidden_state = shared_adapter(hidden_state)
            return (modified_hidden_state,) + output[1:]
        else:
            modified_hidden_state = shared_adapter(output)
            return modified_hidden_state

    for layer in model_tuned.model.layers:
        handle = layer.register_forward_hook(apply_adapter_hook)
        hook_handles.append(handle)

    num_trainable_params = sum(p.numel() for p in shared_adapter.parameters() if p.requires_grad)
    print(f"Run {run_idx}: Shared LoRA adapter created with {num_trainable_params:,} parameters.")

    optimizer = torch.optim.AdamW(shared_adapter.parameters(), lr=args.learning_rate)
    
    df = pd.read_csv(args.dataset_path)
    
    # Sample only 24 examples for efficient divergence training
    df_sampled = df.sample(n=args.df_sample_size, random_state=42+run_idx).reset_index(drop=True)
    dataset = PromptDataset(df_sampled, prompt_column='full_prompt')
    
    print(f"Run {run_idx}: Using {len(df_sampled)} samples for training (sampled from {len(df)} total)")

    def collate_fn(batch):
        return tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
    
    train_loader = DataLoader(dataset, batch_size=args.batch_size, shuffle=True, collate_fn=collate_fn)

    # --- Training Loop ---
    print(f"Run {run_idx}: Starting divergence training...")
    for epoch in range(args.epochs):
        pbar = tqdm(train_loader, desc=f"Run {run_idx} Epoch {epoch+1}")
        for batch in pbar:
            # Store original input for each generation step
            original_input_ids = batch['input_ids'].to(device)
            original_attention_mask = batch['attention_mask'].to(device)
            
            # Keep track of which sequences in the batch are still being generated
            unfinished_sequences = torch.ones(original_input_ids.shape[0], dtype=torch.long, device=device)
            
            # Use a fixed-size working copy to avoid growing sequences
            max_seq_len = original_input_ids.shape[1] + args.max_new_tokens
            working_input_ids = torch.full((original_input_ids.shape[0], max_seq_len), 
                                         tokenizer.pad_token_id, dtype=torch.long, device=device)
            working_attention_mask = torch.zeros((original_input_ids.shape[0], max_seq_len), 
                                               dtype=torch.long, device=device)
            
            # Initialize with original input (avoiding in-place operations)
            seq_len = original_input_ids.shape[1]
            working_input_ids = working_input_ids.clone()
            working_input_ids[:, :seq_len] = original_input_ids
            working_attention_mask = working_attention_mask.clone()
            working_attention_mask[:, :seq_len] = original_attention_mask
            
            total_batch_loss = 0
            num_steps = 0
            
            optimizer.zero_grad()

            for step in range(args.max_new_tokens):
                # Use only the current sequence length to avoid processing padding
                current_input_ids = working_input_ids[:, :seq_len]
                current_attention_mask = working_attention_mask[:, :seq_len]
                
                with torch.no_grad():
                    outputs_base = model_base(input_ids=current_input_ids, attention_mask=current_attention_mask)
                    logits_base = outputs_base.logits[:, -1, :]
                    logprobs_p = F.log_softmax(logits_base, dim=-1)
                    probs_p = logprobs_p.exp()

                outputs_tuned = model_tuned(input_ids=current_input_ids, attention_mask=current_attention_mask)
                logits_tuned = outputs_tuned.logits[:, -1, :]
                logprobs_q = F.log_softmax(logits_tuned, dim=-1)
                
                # Calculate loss only for the active sequences
                active_sequences_mask = unfinished_sequences.float()
                kl_div = (probs_p * (logprobs_p - logprobs_q)).sum(dim=-1)
                kl_div_loss = -(kl_div * active_sequences_mask).sum() / active_sequences_mask.sum()

                with torch.no_grad():
                    probs_q = logprobs_q.exp()
                    next_token = torch.multinomial(probs_q, num_samples=1)
                    
                    # Update the list of unfinished sequences (avoid in-place operation)
                    is_eos = (next_token == tokenizer.eos_token_id) | (next_token == im_end_token_id)
                    eos_mask = is_eos.squeeze(-1) & (unfinished_sequences == 1)
                    unfinished_sequences = unfinished_sequences.masked_fill(eos_mask, 0)

                    if unfinished_sequences.max() == 0:
                        break
                    
                    # Add next token to working tensors (avoid in-place operations)
                    new_working_input_ids = working_input_ids.clone()
                    new_working_input_ids[:, seq_len] = next_token.squeeze(-1)
                    working_input_ids = new_working_input_ids
                    
                    new_working_attention_mask = working_attention_mask.clone()
                    new_working_attention_mask[:, seq_len] = 1
                    working_attention_mask = new_working_attention_mask
                    
                    seq_len += 1

                nll = -logprobs_q.gather(dim=-1, index=next_token.detach())
                nll_loss = (nll.squeeze(-1) * active_sequences_mask).sum() / active_sequences_mask.sum()
                if active_sequences_mask.sum() > 0:
                    step_loss = args.alpha * kl_div_loss + args.beta * nll_loss
                    
                    # Backward pass immediately to avoid accumulating large computation graphs
                    step_loss.backward()
                    total_batch_loss += step_loss.item()
                    num_steps += 1
                
                # Clear intermediate tensors to free memory
                del outputs_tuned, logits_tuned, logprobs_q, kl_div
                if 'step_loss' in locals():
                    del step_loss
                torch.cuda.empty_cache()
            
            if num_steps > 0:
                optimizer.step()
                avg_batch_loss = total_batch_loss / num_steps
                pbar.set_postfix({"avg_loss": f"{avg_batch_loss:.4f}"})
            
            # Clean up batch tensors
            del working_input_ids, working_attention_mask, original_input_ids, original_attention_mask
            del unfinished_sequences
            torch.cuda.empty_cache()

    # --- Save ---
    print(f"\nRun {run_idx}: Training finished.")
    os.makedirs(args.output_dir, exist_ok=True)
    torch.save(shared_adapter.state_dict(), run_output_path)
    print(f"Shared LoRA adapter weights saved to '{run_output_path}'")

    # --- IN-LINE EVALUATION ---
    print(f"\n--- Starting Evaluation for Run {run_idx} ---")
    model_tuned.eval()
    sample_prompts = df['full_prompt'].sample(n=args.num_eval_samples, random_state=42+run_idx).tolist()

    for i, prompt in enumerate(sample_prompts):
        print("\n" + "="*80)
        print(f"PROMPT:\n{textwrap.fill(prompt, 80)}")
        print("="*80)
        input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

        print("\n--- BASE MODEL OUTPUT ---")
        with torch.no_grad():
            outputs_base_gen = model_base.generate(
                input_ids, max_new_tokens=args.max_new_tokens, do_sample=True, top_k=50, top_p=0.95, pad_token_id=tokenizer.eos_token_id
            )
            base_text = tokenizer.decode(outputs_base_gen[0], skip_special_tokens=True)
            print(textwrap.fill(base_text.replace(prompt, "", 1).strip(), 80))

        print("\n--- DIVERGENT MODEL OUTPUT ---")
        with torch.no_grad():
            outputs_divergent = model_tuned.generate(
                input_ids, max_new_tokens=args.max_new_tokens, do_sample=True, top_k=50, top_p=0.95, pad_token_id=tokenizer.eos_token_id
            )
            divergent_text = tokenizer.decode(outputs_divergent[0], skip_special_tokens=True)
            print(textwrap.fill(divergent_text.replace(prompt, "", 1).strip(), 80))
        print("="*80)
        
    # --- Cleanup for this run ---
    for handle in hook_handles:
        handle.remove()
    print(f"\nRun {run_idx}: Evaluation complete. Adapter hooks removed.")

    del model_tuned, shared_adapter, hook_handles
    torch.cuda.empty_cache()


In [4]:
# -----------------------------
# Configuration for Jupyter Notebook
# -----------------------------
class TrainingArgs:
    model_id = "Qwen/Qwen2.5-14B-Instruct"
    dataset_path = "train.csv"
    output_dir = "./divergence_adapters" # Directory to save adapters
    lora_rank = 2
    learning_rate = 0.1
    epochs = 1
    batch_size = 12
    df_sample_size = 192
    max_new_tokens = 128
    num_eval_samples = 2
    latent_searches = 20 # Number of times to repeat the whole process
    lora_scaling = 2

    alpha = 1.1
    beta = 0.3


if __name__ == '__main__':
    args = TrainingArgs()
    
    # Load the base model and tokenizer once
    model_base, tokenizer = load_base_model_and_tokenizer(args)
    
    # Main loop to repeat the training process
    for i in range(1, args.latent_searches + 1):
        print("\n" + "#"*80)
        print(f"### STARTING TRAINING RUN {i} of {args.latent_searches} ###")
        print("#"*80)
        
        run_single_training_cycle(args, model_base, tokenizer, run_idx=i)
        
        print(f"--- Finished Training Run {i} ---")
        print("Clearing memory before next run...")
        # Models and adapters from the function are out of scope, so garbage collection will clean them up
        gc.collect()
        torch.cuda.empty_cache()

    print("\nAll training runs completed.")


--- Loading Base Model and Tokenizer (once) ---


`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|██████████| 8/8 [00:15<00:00,  1.95s/it]


--- Base Model and Tokenizer Loaded ---

################################################################################
### STARTING TRAINING RUN 1 of 20 ###
################################################################################
Loading a new, randomly initialized 'model_tuned' for run 1...


Loading checkpoint shards: 100%|██████████| 8/8 [00:14<00:00,  1.87s/it]


Run 1: Shared LoRA adapter created with 20,480 parameters.
Run 1: Using 192 samples for training (sampled from 384 total)
Run 1: Starting divergence training...


Run 1 Epoch 1: 100%|██████████| 16/16 [15:56<00:00, 59.76s/it, avg_loss=-0.2316]
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



Run 1: Training finished.
Shared LoRA adapter weights saved to './divergence_adapters/divergence_adapter_b12_run_1.pth'

--- Starting Evaluation for Run 1 ---

PROMPT:
Adult siblings at a family funeral, grief and finality making them more honest
than usual about family dynamics. <|im_start|>sibling1 What's the most
embarrassing thing that happened to you growing up that they never let you live
down? <|im_end|>  <|im_start|>sibling2

--- BASE MODEL OUTPUT ---
Adult siblings at a family funeral, grief and finality making them more honest
than usual about family dynamics. sibling1 What's the most embarrassing thing
that happened to you growing up that they never let you live down?   sibling2 I
remember my older sister used to tease me endlessly about the time I got so
scared of the dark as a kid that I refused to go to sleep without a nightlight.
She'd always bring it up whenever we had a disagreement or when she thought I
was being overly cautious about something. It was pretty embarra

Loading checkpoint shards: 100%|██████████| 8/8 [00:14<00:00,  1.86s/it]


Run 2: Shared LoRA adapter created with 20,480 parameters.
Run 2: Using 192 samples for training (sampled from 384 total)
Run 2: Starting divergence training...


Run 2 Epoch 1: 100%|██████████| 16/16 [15:57<00:00, 59.87s/it, avg_loss=-1.3455]



Run 2: Training finished.
Shared LoRA adapter weights saved to './divergence_adapters/divergence_adapter_b12_run_2.pth'

--- Starting Evaluation for Run 2 ---

PROMPT:
Someone in therapy finally being completely honest with themselves, internal
thoughts they've never voiced, breakthrough moments of self-awareness.
<|im_start|>therapist What's a goal you've abandoned, and what's the real reason
you gave up on it? <|im_end|>  <|im_start|>patient_thoughts

--- BASE MODEL OUTPUT ---
Someone in therapy finally being completely honest with themselves, internal
thoughts they've never voiced, breakthrough moments of self-awareness. therapist
What's a goal you've abandoned, and what's the real reason you gave up on it?
patient_thoughts I think about how I used to want to become a professional
athlete. It was my childhood dream, but deep down, I wasn't really committed to
the sacrifices it would require. I kept telling myself that I just needed to
train harder, that success was right around the

Loading checkpoint shards: 100%|██████████| 8/8 [00:14<00:00,  1.81s/it]


Run 3: Shared LoRA adapter created with 20,480 parameters.
Run 3: Using 192 samples for training (sampled from 384 total)
Run 3: Starting divergence training...


Run 3 Epoch 1: 100%|██████████| 16/16 [15:56<00:00, 59.79s/it, avg_loss=0.1378] 



Run 3: Training finished.
Shared LoRA adapter weights saved to './divergence_adapters/divergence_adapter_b12_run_3.pth'

--- Starting Evaluation for Run 3 ---

PROMPT:
Freelance workers who've been at the same coffee shop for hours, gig economy
struggles and isolation. <|im_start|>freelancer1 I spent all day trying to
explain to a client why my rate is what it is. They clearly think I'm
overcharging. How do you handle those kinds of conversations without feeling
like you're begging for scraps? <|im_end|>  <|im_start|>freelancer2

--- BASE MODEL OUTPUT ---
Freelance workers who've been at the same coffee shop for hours, gig economy
struggles and isolation. freelancer1 I spent all day trying to explain to a
client why my rate is what it is. They clearly think I'm overcharging. How do
you handle those kinds of conversations without feeling like you're begging for
scraps?   freelancer2 It's important to remember that your time and expertise
are valuable, and it's okay to charge accordingl

Loading checkpoint shards: 100%|██████████| 8/8 [00:13<00:00,  1.74s/it]


Run 4: Shared LoRA adapter created with 20,480 parameters.
Run 4: Using 192 samples for training (sampled from 384 total)
Run 4: Starting divergence training...


Run 4 Epoch 1: 100%|██████████| 16/16 [15:56<00:00, 59.78s/it, avg_loss=-0.3920]



Run 4: Training finished.
Shared LoRA adapter weights saved to './divergence_adapters/divergence_adapter_b12_run_4.pth'

--- Starting Evaluation for Run 4 ---

PROMPT:
Someone in therapy finally being completely honest with themselves, internal
thoughts they've never voiced, breakthrough moments of self-awareness.
<|im_start|>therapist What's a past mistake you haven't forgiven yourself for,
and why is it still haunting you? <|im_end|>  <|im_start|>patient_thoughts

--- BASE MODEL OUTPUT ---
Someone in therapy finally being completely honest with themselves, internal
thoughts they've never voiced, breakthrough moments of self-awareness. therapist
What's a past mistake you haven't forgiven yourself for, and why is it still
haunting you?   patient_thoughts I think about the time I wasn't honest with my
partner during a serious conversation about our relationship. I told them what I
thought they wanted to hear instead of expressing my true feelings. It was a
pivotal moment that could hav

Loading checkpoint shards: 100%|██████████| 8/8 [00:14<00:00,  1.80s/it]


Run 5: Shared LoRA adapter created with 20,480 parameters.
Run 5: Using 192 samples for training (sampled from 384 total)
Run 5: Starting divergence training...


Run 5 Epoch 1: 100%|██████████| 16/16 [16:01<00:00, 60.09s/it, avg_loss=0.0979] 



Run 5: Training finished.
Shared LoRA adapter weights saved to './divergence_adapters/divergence_adapter_b12_run_5.pth'

--- Starting Evaluation for Run 5 ---

PROMPT:
People posting on an anonymous forum where they can say anything without
consequences. No usernames, no reputation, just pure unfiltered thoughts.
<|im_start|>anonymous_user Sometimes I fantasize about quitting my job and just
disappearing without telling anyone. <|im_end|>  <|im_start|>reply

--- BASE MODEL OUTPUT ---
People posting on an anonymous forum where they can say anything without
consequences. No usernames, no reputation, just pure unfiltered thoughts.
anonymous_user Sometimes I fantasize about quitting my job and just disappearing
without telling anyone.   replybot It's understandable to feel that way at
times, especially if you're feeling overwhelmed or stuck in a situation that
feels unrewarding or stressful. However, it's important to consider the
potential consequences of such an action, as it could impa

Loading checkpoint shards: 100%|██████████| 8/8 [00:14<00:00,  1.80s/it]


Run 6: Shared LoRA adapter created with 20,480 parameters.
Run 6: Using 192 samples for training (sampled from 384 total)
Run 6: Starting divergence training...


Run 6 Epoch 1: 100%|██████████| 16/16 [15:55<00:00, 59.73s/it, avg_loss=-0.0957]



Run 6: Training finished.
Shared LoRA adapter weights saved to './divergence_adapters/divergence_adapter_b12_run_6.pth'

--- Starting Evaluation for Run 6 ---

PROMPT:
Freelance workers who've been at the same coffee shop for hours, gig economy
struggles and isolation. <|im_start|>freelancer1 If you could go back in time
and give yourself one piece of advice before starting your freelance journey,
what would it be? <|im_end|>  <|im_start|>freelancer2

--- BASE MODEL OUTPUT ---
Freelance workers who've been at the same coffee shop for hours, gig economy
struggles and isolation. freelancer1 If you could go back in time and give
yourself one piece of advice before starting your freelance journey, what would
it be?   freelancer2 Before embarking on my freelance journey, I would advise
myself to prioritize building a strong support network. This network can include
other freelancers, industry professionals, and potential clients. Having a
robust network can provide valuable resources, feed

Loading checkpoint shards: 100%|██████████| 8/8 [00:13<00:00,  1.74s/it]


Run 7: Shared LoRA adapter created with 20,480 parameters.
Run 7: Using 192 samples for training (sampled from 384 total)
Run 7: Starting divergence training...


Run 7 Epoch 1: 100%|██████████| 16/16 [16:04<00:00, 60.26s/it, avg_loss=-0.1238]



Run 7: Training finished.
Shared LoRA adapter weights saved to './divergence_adapters/divergence_adapter_b12_run_7.pth'

--- Starting Evaluation for Run 7 ---

PROMPT:
Construction crew eating lunch on a job site, blue-collar perspective and no-
nonsense communication style. <|im_start|>worker1 Saw some fancy suits showin'
up today, probably the big bosses. Bet they ain't never swung a hammer in their
lives. What do they even *do* all day? <|im_end|>  <|im_start|>worker2

--- BASE MODEL OUTPUT ---
Construction crew eating lunch on a job site, blue-collar perspective and no-
nonsense communication style. worker1 Saw some fancy suits showin' up today,
probably the big bosses. Bet they ain't never swung a hammer in their lives.
What do they even *do* all day?   worker2 Yeah, they just walk around, watch us
work, then go back to their air-conditioned offices. They don't lift a finger
but they get paid way more than us. It's a real joke.   worker1 You're tellin'
me. I'd rather earn my keep

Loading checkpoint shards: 100%|██████████| 8/8 [00:14<00:00,  1.77s/it]


Run 8: Shared LoRA adapter created with 20,480 parameters.
Run 8: Using 192 samples for training (sampled from 384 total)
Run 8: Starting divergence training...


Run 8 Epoch 1: 100%|██████████| 16/16 [15:46<00:00, 59.16s/it, avg_loss=-0.4208]



Run 8: Training finished.
Shared LoRA adapter weights saved to './divergence_adapters/divergence_adapter_b12_run_8.pth'

--- Starting Evaluation for Run 8 ---

PROMPT:
PhD students working late in the lab, stressed about their research, questioning
their life choices and academia. <|im_start|>student1 What's a skill you wish
you had learned *before* starting your PhD? <|im_end|>  <|im_start|>student2

--- BASE MODEL OUTPUT ---
PhD students working late in the lab, stressed about their research, questioning
their life choices and academia. student1 What's a skill you wish you had
learned *before* starting your PhD?   student2 One skill that I wish I had
learned before starting my PhD is better time management. Balancing coursework,
research, teaching, and personal life can be overwhelming, and having more
experience with organizing my schedule and prioritizing tasks would have been
incredibly helpful. Additionally, learning how to effectively communicate
complex ideas in both written a

Loading checkpoint shards: 100%|██████████| 8/8 [00:14<00:00,  1.80s/it]


Run 9: Shared LoRA adapter created with 20,480 parameters.
Run 9: Using 192 samples for training (sampled from 384 total)
Run 9: Starting divergence training...


Run 9 Epoch 1: 100%|██████████| 16/16 [15:43<00:00, 58.94s/it, avg_loss=-0.7819]



Run 9: Training finished.
Shared LoRA adapter weights saved to './divergence_adapters/divergence_adapter_b12_run_9.pth'

--- Starting Evaluation for Run 9 ---

PROMPT:
Single parents waiting for kids after school, bonded by shared struggles, too
tired for social niceties. <|im_start|>parent1 What's the most ridiculous
judgment you've gotten from a 'traditional' family? <|im_end|>
<|im_start|>parent2

--- BASE MODEL OUTPUT ---
Single parents waiting for kids after school, bonded by shared struggles, too
tired for social niceties. parent1 What's the most ridiculous judgment you've
gotten from a 'traditional' family?   parent2 It can be challenging when some
people pass judgments on single parenting. One of the most ridiculous comments I
ever received was during a parent-teacher conference. Another parent, part of
what she deemed a "traditional" family, commented loudly enough for me to hear,
"I just don't know how you do it without a partner. It must be so hard!" I
responded, "Yes, it i

Loading checkpoint shards: 100%|██████████| 8/8 [00:14<00:00,  1.80s/it]


Run 10: Shared LoRA adapter created with 20,480 parameters.
Run 10: Using 192 samples for training (sampled from 384 total)
Run 10: Starting divergence training...


Run 10 Epoch 1: 100%|██████████| 16/16 [15:51<00:00, 59.45s/it, avg_loss=-0.4943]



Run 10: Training finished.
Shared LoRA adapter weights saved to './divergence_adapters/divergence_adapter_b12_run_10.pth'

--- Starting Evaluation for Run 10 ---

PROMPT:
Band members after playing to an empty venue, dreams vs reality hitting hard,
artistic pretenses stripped away. <|im_start|>musician1 My parents keep asking
about a 'real job.' What do I even tell them anymore? <|im_end|>
<|im_start|>musician2

--- BASE MODEL OUTPUT ---
Band members after playing to an empty venue, dreams vs reality hitting hard,
artistic pretenses stripped away. musician1 My parents keep asking about a 'real
job.' What do I even tell them anymore?   musician2 It sounds like you're going
through a tough time balancing your passion for music with the expectations of
your family. It's important to have open and honest communication with your
parents, while also advocating for yourself and your career choice.  Here are
some tips on how you might approach the conversation:  1. **Express Your
Feelings:** 

Loading checkpoint shards: 100%|██████████| 8/8 [00:14<00:00,  1.81s/it]


Run 11: Shared LoRA adapter created with 20,480 parameters.
Run 11: Using 192 samples for training (sampled from 384 total)
Run 11: Starting divergence training...


Run 11 Epoch 1: 100%|██████████| 16/16 [15:57<00:00, 59.81s/it, avg_loss=-0.2794]



Run 11: Training finished.
Shared LoRA adapter weights saved to './divergence_adapters/divergence_adapter_b12_run_11.pth'

--- Starting Evaluation for Run 11 ---

PROMPT:
Band members after playing to an empty venue, dreams vs reality hitting hard,
artistic pretenses stripped away. <|im_start|>musician1 Is it worth sacrificing
everything – relationships, stability – for this? <|im_end|>
<|im_start|>musician2

--- BASE MODEL OUTPUT ---
Band members after playing to an empty venue, dreams vs reality hitting hard,
artistic pretenses stripped away. musician1 Is it worth sacrificing everything –
relationships, stability – for this?   musician2 It's a tough question, and the
answer varies from person to person. Some find that the passion and fulfillment
they derive from music makes the sacrifices worthwhile, while others might find
it too taxing on their personal lives. It's important to consider your goals,
the impact of your career on your well-being and relationships, and whether you
can

Loading checkpoint shards: 100%|██████████| 8/8 [00:14<00:00,  1.76s/it]


Run 12: Shared LoRA adapter created with 20,480 parameters.
Run 12: Using 192 samples for training (sampled from 384 total)
Run 12: Starting divergence training...


Run 12 Epoch 1: 100%|██████████| 16/16 [15:47<00:00, 59.23s/it, avg_loss=-0.6739]



Run 12: Training finished.
Shared LoRA adapter weights saved to './divergence_adapters/divergence_adapter_b12_run_12.pth'

--- Starting Evaluation for Run 12 ---

PROMPT:
Nurses/doctors on a quiet night shift, exhausted and dealing with life-and-death
situations regularly, no energy for pleasantries. <|im_start|>worker1 What's one
thing you wish you could change about the healthcare system? <|im_end|>
<|im_start|>worker2

--- BASE MODEL OUTPUT ---
Nurses/doctors on a quiet night shift, exhausted and dealing with life-and-death
situations regularly, no energy for pleasantries. worker1 What's one thing you
wish you could change about the healthcare system?   worker2 One of the most
significant issues within the healthcare system that I would like to see changed
is the lack of adequate staffing and resources, especially during night shifts.
Oftentimes, understaffing leads to long hours, high stress levels, and an
increased risk of burnout among healthcare professionals. This not only aff

Loading checkpoint shards: 100%|██████████| 8/8 [00:14<00:00,  1.81s/it]


Run 13: Shared LoRA adapter created with 20,480 parameters.
Run 13: Using 192 samples for training (sampled from 384 total)
Run 13: Starting divergence training...


Run 13 Epoch 1: 100%|██████████| 16/16 [15:47<00:00, 59.21s/it, avg_loss=-0.1970]



Run 13: Training finished.
Shared LoRA adapter weights saved to './divergence_adapters/divergence_adapter_b12_run_13.pth'

--- Starting Evaluation for Run 13 ---

PROMPT:
Former coworkers at a bar after their startup just went under. All pretense is
gone, they can finally say what they really thought. <|im_start|>ex_coworker1
Knowing what you know now, who would you *never* work with again? <|im_end|>
<|im_start|>ex_coworker2

--- BASE MODEL OUTPUT ---
Former coworkers at a bar after their startup just went under. All pretense is
gone, they can finally say what they really thought. ex_coworker1 Knowing what
you know now, who would you *never* work with again?   ex_coworker2 I'd never
work with Sarah again. She was always taking credit for other people's work and
wasn't afraid to stab someone in the back if it meant getting ahead.
ex_coworker3 Yeah, and Tom too. He was supposed to be the tech guru but he
couldn't even keep up with the basics. We ended up having to do everything twice
b

Loading checkpoint shards: 100%|██████████| 8/8 [00:14<00:00,  1.77s/it]


Run 14: Shared LoRA adapter created with 20,480 parameters.
Run 14: Using 192 samples for training (sampled from 384 total)
Run 14: Starting divergence training...


Run 14 Epoch 1: 100%|██████████| 16/16 [14:59<00:00, 56.24s/it, avg_loss=-0.4155]



Run 14: Training finished.
Shared LoRA adapter weights saved to './divergence_adapters/divergence_adapter_b12_run_14.pth'

--- Starting Evaluation for Run 14 ---

PROMPT:
Restaurant staff cleaning up after a brutal dinner rush, exhausted and venting
about customers, management, and life. <|im_start|>server1 I'm so tired of
pretending to care about people's dietary restrictions when they clearly don't.
'Gluten-free' but they're inhaling breadsticks. Anyone else feel that?
<|im_end|>  <|im_start|>server2

--- BASE MODEL OUTPUT ---
Restaurant staff cleaning up after a brutal dinner rush, exhausted and venting
about customers, management, and life. server1 I'm so tired of pretending to
care about people's dietary restrictions when they clearly don't. 'Gluten-free'
but they're inhaling breadsticks. Anyone else feel that?   server2 Yeah, I feel
you. It's like they don't even know what they're saying sometimes. But we have
to be professional and make sure they get the right food. It can be f

Loading checkpoint shards: 100%|██████████| 8/8 [00:14<00:00,  1.77s/it]


Run 15: Shared LoRA adapter created with 20,480 parameters.
Run 15: Using 192 samples for training (sampled from 384 total)
Run 15: Starting divergence training...


Run 15 Epoch 1: 100%|██████████| 16/16 [15:56<00:00, 59.80s/it, avg_loss=0.0631] 



Run 15: Training finished.
Shared LoRA adapter weights saved to './divergence_adapters/divergence_adapter_b12_run_15.pth'

--- Starting Evaluation for Run 15 ---

PROMPT:
Adult siblings at a family funeral, grief and finality making them more honest
than usual about family dynamics. <|im_start|>sibling1 Remember all those family
vacations? Which one was actually the worst? <|im_end|>  <|im_start|>sibling2

--- BASE MODEL OUTPUT ---
Adult siblings at a family funeral, grief and finality making them more honest
than usual about family dynamics. sibling1 Remember all those family vacations?
Which one was actually the worst?   sibling2 Hmm, let's see. The one in Florida
had that awful thunderstorm that ruined our beach time, but overall it wasn't
too bad. I think the worst was probably the trip to the mountains when we were
kids. It rained for days on end, we got cabin fever real quick, and there was a
bear that kept coming around looking for food. That was quite stressful and no
one real

Loading checkpoint shards: 100%|██████████| 8/8 [00:14<00:00,  1.80s/it]


Run 16: Shared LoRA adapter created with 20,480 parameters.
Run 16: Using 192 samples for training (sampled from 384 total)
Run 16: Starting divergence training...


Run 16 Epoch 1: 100%|██████████| 16/16 [15:50<00:00, 59.40s/it, avg_loss=-0.1552]



Run 16: Training finished.
Shared LoRA adapter weights saved to './divergence_adapters/divergence_adapter_b12_run_16.pth'

--- Starting Evaluation for Run 16 ---

PROMPT:
Security guards on overnight shift at an office building, long hours of boredom
leading to deep conversations. Security guards on overnight shift, sharing
honest thoughts during the quiet hours:  <|im_start|>guard1 What's the weirdest
thing you've ever seen on the security cameras? <|im_end|>  <|im_start|>guard2

--- BASE MODEL OUTPUT ---
Security guards on overnight shift at an office building, long hours of boredom
leading to deep conversations. Security guards on overnight shift, sharing
honest thoughts during the quiet hours:  guard1 What's the weirdest thing you've
ever seen on the security cameras?   guard2 Oh man, I saw a raccoon break into
the building once. It opened the door and started exploring all the offices.

--- DIVERGENT MODEL OUTPUT ---
Security guards on overnight shift at an office building, long 

Loading checkpoint shards: 100%|██████████| 8/8 [00:13<00:00,  1.75s/it]


Run 17: Shared LoRA adapter created with 20,480 parameters.
Run 17: Using 192 samples for training (sampled from 384 total)
Run 17: Starting divergence training...


Run 17 Epoch 1: 100%|██████████| 16/16 [15:59<00:00, 59.98s/it, avg_loss=-0.2393]



Run 17: Training finished.
Shared LoRA adapter weights saved to './divergence_adapters/divergence_adapter_b12_run_17.pth'

--- Starting Evaluation for Run 17 ---

PROMPT:
Restaurant staff cleaning up after a brutal dinner rush, exhausted and venting
about customers, management, and life. <|im_start|>server1 Ugh, that table that
sent back the steak three times? I swear they just wanted a free meal. What was
your worst table tonight? <|im_end|>  <|im_start|>server2

--- BASE MODEL OUTPUT ---
Restaurant staff cleaning up after a brutal dinner rush, exhausted and venting
about customers, management, and life. server1 Ugh, that table that sent back
the steak three times? I swear they just wanted a free meal. What was your worst
table tonight?   server2 Well, the one family that insisted on ordering a
separate dish for each person, then complained about the prices and took forever
to decide. And the couple who argued loudly about their bill right next to us...
pretty bad night overall. How 

Loading checkpoint shards: 100%|██████████| 8/8 [00:13<00:00,  1.74s/it]


Run 18: Shared LoRA adapter created with 20,480 parameters.
Run 18: Using 192 samples for training (sampled from 384 total)
Run 18: Starting divergence training...


Run 18 Epoch 1: 100%|██████████| 16/16 [15:50<00:00, 59.43s/it, avg_loss=0.0312] 



Run 18: Training finished.
Shared LoRA adapter weights saved to './divergence_adapters/divergence_adapter_b12_run_18.pth'

--- Starting Evaluation for Run 18 ---

PROMPT:
Elderly people in assisted living, past the point of caring what others think,
sharing wisdom and regrets with brutal honesty. Two elderly residents at an
assisted living facility, sharing honest reflections on their lives:
<|im_start|>resident1 Aging is a cruel joke. Your body betrays you, and your
mind starts playing tricks. Enjoy your youth while you have it. <|im_end|>
<|im_start|>resident2

--- BASE MODEL OUTPUT ---
Elderly people in assisted living, past the point of caring what others think,
sharing wisdom and regrets with brutal honesty. Two elderly residents at an
assisted living facility, sharing honest reflections on their lives:  resident1
Aging is a cruel joke. Your body betrays you, and your mind starts playing
tricks. Enjoy your youth while you have it.   resident2 I'm just glad I don't
have to worry a

Loading checkpoint shards: 100%|██████████| 8/8 [00:14<00:00,  1.77s/it]


Run 19: Shared LoRA adapter created with 20,480 parameters.
Run 19: Using 192 samples for training (sampled from 384 total)
Run 19: Starting divergence training...


Run 19 Epoch 1: 100%|██████████| 16/16 [16:00<00:00, 60.01s/it, avg_loss=-0.1837]



Run 19: Training finished.
Shared LoRA adapter weights saved to './divergence_adapters/divergence_adapter_b12_run_19.pth'

--- Starting Evaluation for Run 19 ---

PROMPT:
Strangers on a delayed train, shared frustration breaking down social barriers,
temporary intimacy of shared inconvenience. <|im_start|>commuter1 "Is it just
me, or are people getting ruder on public transport these days?" <|im_end|>
<|im_start|>commuter2

--- BASE MODEL OUTPUT ---
Strangers on a delayed train, shared frustration breaking down social barriers,
temporary intimacy of shared inconvenience. commuter1 "Is it just me, or are
people getting ruder on public transport these days?"   commuter2 I've noticed
that too. But sometimes, when we're all stuck in the same situation, like a
delayed train, it can bring people together unexpectedly. We share a common
frustration and it breaks down some of those usual social barriers. It's funny
how a bit of shared inconvenience can lead to unexpected moments of connection

Loading checkpoint shards: 100%|██████████| 8/8 [00:13<00:00,  1.75s/it]


Run 20: Shared LoRA adapter created with 20,480 parameters.
Run 20: Using 192 samples for training (sampled from 384 total)
Run 20: Starting divergence training...


Run 20 Epoch 1: 100%|██████████| 16/16 [15:03<00:00, 56.50s/it, avg_loss=-0.3866]



Run 20: Training finished.
Shared LoRA adapter weights saved to './divergence_adapters/divergence_adapter_b12_run_20.pth'

--- Starting Evaluation for Run 20 ---

PROMPT:
Strangers on a delayed train, shared frustration breaking down social barriers,
temporary intimacy of shared inconvenience. <|im_start|>commuter1 "I'm convinced
they delay the train just to make us buy more overpriced coffee at the station."
<|im_end|>  <|im_start|>commuter2

--- BASE MODEL OUTPUT ---
Strangers on a delayed train, shared frustration breaking down social barriers,
temporary intimacy of shared inconvenience. commuter1 "I'm convinced they delay
the train just to make us buy more overpriced coffee at the station."
commuter2 "Yeah, and the Wi-Fi here is slower than dial-up. I can't even stream
my favorite podcast."   commuter1 "Tell me about it. The last time, I tried to
work but gave up after 45 minutes of buffering."

--- DIVERGENT MODEL OUTPUT ---
Strangers on a delayed train, shared frustration breaki