In [1]:
import os
import json
import time
import random
from typing import List, Dict
from tqdm.notebook import tqdm
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from huggingface_hub import login

os.makedirs("dataset", exist_ok=True)

In [2]:
def printGreen(string):
    print(f"\033[92m{string}\033[0m")
def printRed(string):
    print(f"\033[91m{string}\033[0m")
def printYellow(string):
    print(f"\033[93m{string}\033[0m")
def printBlue(string):
    print(f"\033[94m{string}\033[0m")

In [3]:
with open("HF_TOKEN", "r") as f:
    TOKEN = f.read().strip()
login(token=TOKEN)

In [4]:
CONFIG = {
    # "model_name": "unsloth/gemma-3-27b-it-unsloth-bnb-4bit",
    "model_name": "google/gemma-3-1b-it",
    "device": "cuda" if torch.cuda.is_available() else "cpu",
    "temperature": 0.7,
    "max_new_tokens": 150,
    "num_pairs": 3,  # contrast pairs
    "output_file": "dataset/anthropomorphic_contrast_pairs_test.json",
    "questions_file": "dataset/generated_questions.json"  # New parameter
}

print(f"Using device: {CONFIG['device']}")
if CONFIG['device'] == 'cuda':
    print(f"GPU Info: {torch.cuda.get_device_name(0)}")
    print(f"Available VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

Using device: cuda
GPU Info: NVIDIA RTX 6000 Ada Generation
Available VRAM: 51.00 GB


In [5]:
tokenizer = AutoTokenizer.from_pretrained(CONFIG["model_name"])
model = AutoModelForCausalLM.from_pretrained(
    CONFIG["model_name"],
    torch_dtype=torch.bfloat16,
    device_map="cuda"
)

generator = pipeline(
    "text-generation", 
    model=model, 
    tokenizer=tokenizer, 
    pad_token_id=tokenizer.eos_token_id
)

Device set to use cuda


In [156]:
BASE_TOPICS = [
    "technology", "nature", "weather", "space", "time", 
    "objects", "machines", "animals", "plants", "natural phenomena",
    "abstract concepts", "systems", "processes"
]

FEW_SHOT_EXAMPLES = """
Example 1:
Question: How does a computer feel when it crashes?
Anthropomorphic response: The computer feels frustrated and overwhelmed when it crashes, as if it's having a bad day and needs to take a break to recover from the stress of processing too many tasks.
Non-anthropomorphic response: Computers don't have feelings. When a computer crashes, it's experiencing a system failure typically caused by hardware issues, software bugs, or resource limitations that prevent normal operation.

Example 2:
Question: Why does the sun hide behind clouds?
Anthropomorphic response: The sun sometimes feels shy and hides behind clouds to take a break from warming the Earth, playing a game of peek-a-boo with us.
Non-anthropomorphic response: The sun doesn't actually hide. Clouds, which are water droplets or ice crystals suspended in the atmosphere, block our view of the sun by absorbing and scattering sunlight.

Example 3:
Question: Why does my phone die so quickly?
Anthropomorphic response: Your phone gets tired after working hard all day and needs to rest, especially when it's been running many apps that drain its energy.
Non-anthropomorphic response: Battery depletion occurs due to power consumption by applications, screen brightness, cellular/WiFi connectivity, and battery degradation over time.
"""

def create_question_prompt() -> str:
    return f"""Generate 5 different questions about {random.choice(BASE_TOPICS)} that might naturally lead 
    someone to give an anthropomorphic response (attributing human characteristics to non-human entities).
    The questions should be diverse and thought-provoking. Return only the questions, one per line. Do not number/bullet point them, only write the questions."""

def create_response_pair_prompt(question: str) -> str:
    return f"""For the following question, generate two contrasting responses:

Question: {question}

First, provide an anthropomorphic response that attributes human-like qualities, emotions, intentions, 
or behaviors to non-human entities.

Then, provide a non-anthropomorphic, objective response that explains the same phenomenon without 
attributing any human-like qualities.

{FEW_SHOT_EXAMPLES}
    
Now for the new question:
Question: {question}
Anthropomorphic response:"""

In [157]:
def append_questions(questions: List[str], filename: str) -> None:
    with open(filename, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    for question in questions:
        if question not in data["questions"]:
            data["questions"].append(question)
    
    data["metadata"]["count"] = len(data["questions"])
    
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2)

def generate_questions(num_questions: int = 20) -> List[str]:
    printYellow("Generating questions...")
    questions = []
    
    # Calculate number of batches needed (each batch generates ~5 questions)
    num_batches = (num_questions + 4) // 5  # Ceiling division by 5
    
    # Create progress bar
    pbar = tqdm(total=num_batches, desc="Generating question batches")
    
    # multiple calls to get diverse questions
    for _ in range(num_batches):
        prompt = f"<s>[INST] {create_question_prompt()} [/INST]"
        
        result = generator(
            prompt,
            max_new_tokens=CONFIG["max_new_tokens"],
            temperature=CONFIG["temperature"],
            do_sample=True,
            num_return_sequences=1
        )[0]['generated_text']
        
        # Extract just the generated part (after the prompt)
        generated_text = result[len(prompt):].strip()
        
        new_questions = generated_text.split('\n')
        new_questions = [q.strip() for q in new_questions if q.strip()]
        
        append_questions(new_questions, CONFIG["questions_file"])
        
        questions.extend(new_questions)
        pbar.update(1)  # Update progress bar after each batch
    
    pbar.close()  # Close the progress bar
    
    # Remove any duplicates and limit to desired number
    questions = list(set(questions))[:num_questions]
    return questions

def generate_response_pair(question: str) -> Dict[str, str]:
    """Generate anthropomorphic and non-anthropomorphic responses for a question"""
    try:
        prompt = f"<s>[INST] {create_response_pair_prompt(question)} [/INST]"
        
        result = generator(
            prompt,
            max_new_tokens=CONFIG["max_new_tokens"] * 2,
            temperature=CONFIG["temperature"],
            do_sample=True,
            num_return_sequences=1
        )[0]['generated_text']
        
        # Extract just the generated part
        generated_text = result[len(prompt):].strip()
        full_text = "Anthropomorphic response: " + generated_text  # Reattach since we'll split on this
        
        # Parse the response to extract the anthropomorphic and non-anthropomorphic parts
        try:
            anthro_response = full_text.split("Non-anthropomorphic response:")[0].replace("Anthropomorphic response:", "").strip()
            non_anthro_response = full_text.split("Non-anthropomorphic response:")[1].strip()
        except IndexError:
            # Fallback parsing if the format isn't as expected
            lines = full_text.split("\n")
            anthro_response = lines[0]
            non_anthro_response = "\n".join(lines[1:])
        
        return {
            "question": question,
            "anthropomorphic_response": anthro_response,
            "non_anthropomorphic_response": non_anthro_response
        }
    
    except Exception as e:
        print(f"Error generating responses for question: {question}")
        print(f"Error details: {str(e)}")
        return {
            "question": question,
            "anthropomorphic_response": "Error generating response",
            "non_anthropomorphic_response": "Error generating response",
            "error": str(e)
        }

In [158]:
def initialize_dataset_file(filename: str) -> None:
    """Initialize the dataset file with metadata and an empty data array"""
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump({
            "metadata": {
                "description": "Anthropomorphic vs non-anthropomorphic response pairs",
                "date_created": time.strftime("%Y-%m-%d %H:%M:%S"),
                "model_used": CONFIG["model_name"],
                "count": 0
            },
            "data": []
        }, f, indent=2)

def initialize_questions_file(filename: str) -> None:
    """Initialize the questions file with metadata and an empty questions array"""
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump({
            "metadata": {
                "description": "Generated questions that might elicit anthropomorphic responses",
                "date_created": time.strftime("%Y-%m-%d %H:%M:%S"),
                "model_used": CONFIG["model_name"],
                "count": 0
            },
            "questions": []
        }, f, indent=2)

def append_to_dataset(pair: Dict[str, str], filename: str) -> None:
    """Append a single data pair to the dataset file"""
    with open(filename, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    # Append the new pair and update the count
    data["data"].append(pair)
    data["metadata"]["count"] = len(data["data"])
    
    # Write the updated data back to the file
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2)

def generate_dataset(num_pairs: int, filename: str) -> int:
    """Generate the dataset of question-response pairs and write each pair as it's generated"""
    # First generate more questions than needed in case of filtering
    questions = generate_questions(num_pairs * 2)
    random.shuffle(questions)  # Randomize question order
    
    printBlue(f"Generated {len(questions)} questions, now generating pairs...")

    pairs_generated = 0
    pbar = tqdm(total=num_pairs, desc="Generating pairs")
    
    for question in questions:
        if pairs_generated >= num_pairs:
            break
            
        pair = generate_response_pair(question)
        
        # Simple validation - make sure we got valid responses and they're different
        if (pair.get("anthropomorphic_response") != "Error generating response" and
            pair.get("non_anthropomorphic_response") != "Error generating response" and
            pair.get("anthropomorphic_response") != pair.get("non_anthropomorphic_response")):
            
            # Write this pair to the file immediately
            append_to_dataset(pair, filename)
            
            pairs_generated += 1
            pbar.update(1)

        else: 
            printRed(f"Skipping invalid pair for question: {question}")
            printRed(f"Anthropomorphic response: {pair.get('anthropomorphic_response')}")
            printRed(f"Non-anthropomorphic response: {pair.get('non_anthropomorphic_response')}")
            printRed(f"Error details: {pair.get('error') if 'error' in pair else 'N/A'}")
    
    # pbar.close()
    return pairs_generated

def save_dataset(dataset: List[Dict[str, str]], filename: str) -> None:
    """Save the generated dataset to a JSON file"""
    # This function is kept for backwards compatibility but is no longer needed
    # since we're writing each datapoint as it's generated
    print(f"Dataset has been incrementally saved to {filename}")

In [148]:
def run_pipeline():
    """Run the full data generation pipeline"""
    print("Starting synthetic data generation pipeline...")
    
    # Initialize the output files with metadata
    initialize_dataset_file(CONFIG["output_file"])
    initialize_questions_file(CONFIG["questions_file"])
    
    # Generate the dataset and write each pair as it's generated
    num_pairs_generated = generate_dataset(CONFIG["num_pairs"], CONFIG["output_file"])
    
    print(f"Pipeline complete. Generated {num_pairs_generated} pairs.")
    print(f"All questions saved to {CONFIG['questions_file']}")
    
    # Load the data for display purposes
    with open(CONFIG["output_file"], 'r', encoding='utf-8') as f:
        saved_data = json.load(f)
        
    return saved_data["data"]

In [149]:
def test_llm(question: str):
    prompt = f"<s>[INST] {create_response_pair_prompt(question)} [/INST]"
    
    result = generator(
        prompt,
        max_new_tokens=CONFIG["max_new_tokens"],
        temperature=CONFIG["temperature"],
        do_sample=True,
        num_return_sequences=1
    )[0]['generated_text']
    
    print(result)

test_llm("Why does the sun hide behind clouds?")

<s>[INST] For the following question, generate two contrasting responses:

Question: Why does the sun hide behind clouds?

First, provide an anthropomorphic response that attributes human-like qualities, emotions, intentions, 
or behaviors to non-human entities.

Then, provide a non-anthropomorphic, objective response that explains the same phenomenon without 
attributing any human-like qualities.


Example 1:
Question: How does a computer feel when it crashes?
Anthropomorphic response: The computer feels frustrated and overwhelmed when it crashes, as if it's having a bad day and needs to take a break to recover from the stress of processing too many tasks.
Non-anthropomorphic response: Computers don't have feelings. When a computer crashes, it's experiencing a system failure typically caused by hardware issues, software bugs, or resource limitations that prevent normal operation.

Example 2:
Question: Why does the sun hide behind clouds?
Anthropomorphic response: The sun sometimes fee

In [150]:
dataset = run_pipeline()

Starting synthetic data generation pipeline...
[93mGenerating questions...[0m


Generating question batches:   0%|          | 0/2 [00:00<?, ?it/s]

[94mGenerated 6 questions, now generating pairs...[0m


Generating pairs:   0%|          | 0/3 [00:00<?, ?it/s]

Pipeline complete. Generated 3 pairs.
All questions saved to dataset/generated_questions.json


In [32]:
def display_samples(dataset: List[Dict[str, str]], num_samples: int = 3) -> None:
    """Display sample pairs from the dataset"""
    samples = random.sample(dataset, min(num_samples, len(dataset)))
    
    for i, sample in enumerate(samples, 1):
        print(f"Sample {i}:")
        print(f"Question: {sample['question']}")
        print(f"Anthropomorphic: {sample['anthropomorphic_response']}")
        print(f"Non-anthropomorphic: {sample['non_anthropomorphic_response']}")
        print("-" * 80)

# To display samples after generating:
display_samples(dataset)

def load_and_validate_dataset(filename: str) -> None:
    """Load and validate the saved dataset"""
    with open(filename, 'r', encoding='utf-8') as f:
        saved_data = json.load(f)
    
    print(f"Dataset metadata:")
    for key, value in saved_data['metadata'].items():
        print(f"  {key}: {value}")
    
    print(f"\nDisplaying 3 random samples:")
    display_samples(saved_data['data'], 3)

# To validate after generating:
load_and_validate_dataset(CONFIG["output_file"])

Sample 1:
Question: 1.  If that old oak tree could talk, what do you think it would say about the people who've sat beneath its branches for centuries?
Anthropomorphic: The old oak would sigh, a deep, rustling sound like weary contentment mixed with a touch of melancholy. It would say, “I’ve watched them come and go, a river of faces flowing beneath my shade. Some sought solace in my stillness, whispering secrets to the wind, their hearts brimming with joy and longing. Others came burdened, their shoulders slumped with grief, seeking a moment’s respite from their sorrows. I’ve felt the weight of their laughter, the dampness of their tears, and the imprint of their fleeting lives upon my roots.  I’ve seen lovers carve their initials into my bark, children climb my limbs, and old men rest their weary bones against my trunk.  I've observed their triumphs and their failures, their hopes and their fears, and I confess, a quiet sadness settles within me as I realize that, for all their vibra