# Digital Twin Simulation - Simple Demo

This notebook presents simple examples of how to leverage the persona to simulate survey responses of new questions.

## 1. Setup

In [12]:
! pip install --upgrade pip setuptools wheel
! pip install pyarrow==16.1.0
!pip install openai==1.78.1 pandas==2.2.2 datasets==2.18.0




In [14]:
!pip install openai==1.78.1




In [16]:
!pip install openai==1.78.1 pandas==2.2.2 datasets==2.18.0

import sys
import os
import json
import time
from typing import Dict, List
import openai
import pandas as pd




ModuleNotFoundError: No module named 'openai'

## 1.1 Enter your API Key

In [2]:
openai.api_key = input("API Key: ").strip()


## 2. Load Personas

In [1]:
# Configuration
NUM_PERSONAS = 30  # Number of personas to load (max ~2058 available)

# Check and install datasets library if needed
try:
    from datasets import load_dataset
except ImportError:
    print("Installing datasets library...")
    import subprocess
    import sys
    subprocess.check_call([sys.executable, "-m", "pip", "install", "datasets"])
    from datasets import load_dataset
    print("✅ datasets library installed successfully")

# Download and load persona summaries directly from Hugging Face
def load_personas(num_personas=30):
    """Download and load persona summaries directly from Hugging Face dataset."""
    
    print(f"Loading {num_personas} persona summaries from Hugging Face...")
    
    try:
        # Load the dataset directly from Hugging Face
        dataset = load_dataset("LLM-Digital-Twin/Twin-2K-500", 'full_persona', split='data')
    except Exception as e:
        print(f"⚠️ Error loading dataset: {type(e).__name__}: {str(e)}")
        print("\nTrying to clear cache and reload...")
        
        # Clear the cache for this specific dataset
        import shutil
        from pathlib import Path
        
        # Get the default cache directory
        cache_dir = Path.home() / ".cache" / "huggingface" / "datasets" / "LLM-Digital-Twin___parquet"
        
        if cache_dir.exists():
            print(f"Clearing cache directory: {cache_dir}")
            shutil.rmtree(cache_dir)
        
        # Try loading again with download_mode='force_redownload'
        try:
            dataset = load_dataset("LLM-Digital-Twin/Twin-2K-500", 'full_persona', split='data', download_mode='force_redownload')
            print("✅ Dataset loaded successfully after clearing cache")
        except Exception as e2:
            print(f"❌ Still unable to load dataset: {type(e2).__name__}: {str(e2)}")
            print("\nAlternative: You can manually download the dataset from:")
            print("https://huggingface.co/datasets/LLM-Digital-Twin/Twin-2K-500")
            raise e2
    
    # Extract personas
    personas = {}
    pids = dataset["pid"]
    persona_summaries = dataset["persona_summary"]
    
    # Load requested number of personas
    for i in range(min(num_personas, len(pids))):
        pid = pids[i]
        summary = persona_summaries[i]
        
        if summary is not None:
            personas[f"pid_{pid}"] = summary
    
    return personas

# Load personas
personas = load_personas(NUM_PERSONAS)

print(f"✅ Loaded {len(personas)} personas")

# Show sample of first persona
if personas:
    first_persona = list(personas.values())[0]
    print(f"\nSample persona (first 500 chars):")
    print("="*50)
    print(first_persona[:500] + "...")

Installing datasets library...
Collecting datasets
  Using cached datasets-4.1.1-py3-none-any.whl (503 kB)
Collecting filelock
  Downloading filelock-3.19.1-py3-none-any.whl (15 kB)
Collecting pyarrow>=21.0.0
  Downloading pyarrow-21.0.0-cp311-cp311-macosx_12_0_arm64.whl (31.2 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 31.2/31.2 MB 13.0 MB/s eta 0:00:00
Collecting dill<0.4.1,>=0.3.0
  Using cached dill-0.4.0-py3-none-any.whl (119 kB)
Collecting requests>=2.32.2
  Using cached requests-2.32.5-py3-none-any.whl (64 kB)
Collecting tqdm>=4.66.3
  Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 78.5/78.5 kB 5.7 MB/s eta 0:00:00
Collecting xxhash
  Downloading xxhash-3.6.0-cp311-cp311-macosx_11_0_arm64.whl (30 kB)
Collecting multiprocess<0.70.17
  Downloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 143.5/143.5 kB 7.9 MB/s eta 0:00:00
Collecting fsspec[http]<=2025.9.0,>=2023.1.0
  Using 

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
adata 0.0.1b0 requires requests~=2.26.0, but you have requests 2.32.5 which is incompatible.
adata 0.0.1b0 requires tqdm~=4.65.0, but you have tqdm 4.67.1 which is incompatible.

[notice] A new release of pip available: 22.3.1 -> 25.2
[notice] To update, run: pip3 install --upgrade pip


Successfully installed aiohappyeyeballs-2.6.1 aiohttp-3.12.15 aiosignal-1.4.0 datasets-4.1.1 dill-0.4.0 filelock-3.19.1 frozenlist-1.7.0 fsspec-2025.9.0 hf-xet-1.1.10 huggingface-hub-0.35.3 multidict-6.6.4 multiprocess-0.70.16 propcache-0.3.2 pyarrow-21.0.0 requests-2.32.5 tqdm-4.67.1 typing-extensions-4.15.0 xxhash-3.6.0 yarl-1.20.1
✅ datasets library installed successfully
Loading 30 persona summaries from Hugging Face...


README.md: 0.00B [00:00, ?B/s]

full_persona/chunks/persona_chunk_001.pa(…):   0%|          | 0.00/29.0M [00:00<?, ?B/s]

Cancellation requested; stopping current tasks.


KeyboardInterrupt: 

## 3. Define Questions and Simulate Responses

In [4]:

def simulate_responses(personas, template):
    rows = []
    for pid, persona in personas.items():
        user_msg = template.format(persona=persona)
        try:
            resp = openai.chat.completions.create(
                model="gpt-4.1-mini-2025-04-14",
                messages=[
                    {"role": "system", "content": SYSTEM_MESSAGE},
                    {"role": "user",   "content": user_msg}
                ],
                temperature=0,
                max_tokens=5  # enough for a single number
            )
            answer = resp.choices[0].message.content.strip()
            print(f"✅ {pid}: {answer}")
        except Exception as e:
            answer = f"Error: {e}"
            print(f"❌ {pid}: {answer}")
        rows.append({"persona_id": pid, "answer": answer})
        time.sleep(0.5)  # gentle rate‑limit
    return pd.DataFrame(rows)

In [None]:
SYSTEM_MESSAGE = "You, AI, are an expert in predicting human responses to questions. You are given a persona profile and a question, and also a format instructions that specifies the type of answer you need to provide. You need to answer the question as the persona would answer it, based on the persona profile and the format instructions."

USER_PROMPT_TEMPLATE = """
{persona}

QUESTION: It is ___ that NYU Stern’s PhD program in Information Systems is ranked among the top programs in the world.

Options:
1 = Top 3
2 = Top 5
3 = Top 10
4 = Top 20
5 = Top 30

FORMAT INSTRUCTIONS: Only return the number, no other text.
"""

df = simulate_responses(personas, USER_PROMPT_TEMPLATE)
