In [None]:
import os, json
from dotenv import load_dotenv
from openai import OpenAI
from IPython.display import Markdown, display

load_dotenv()

In [None]:
# Short aliases for the model identifiers passed as `model=` in chat-completion calls.
MODEL = {
    "nvidia": "nvidia/nemotron-3-nano-30b-a3b:free",
    "upstage": "upstage/solar-pro-3:free",
    "liquidAI": "liquid/lfm-2.5-1.2b-instruct:free",
    "gemini": "gemini-2.5-flash",
}

# OpenAI-style client pointed at the OpenRouter base URL.
# os.getenv returns None when the variable is not set.
openr = OpenAI(
    api_key=os.getenv("OPENROUTER_API_KEY"),
    base_url="https://openrouter.ai/api/v1",
)

# OpenAI-style client pointed at the Gemini OpenAI-compatibility base URL.
gemini = OpenAI(
    base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
    api_key=os.getenv("GEMINI_API_KEY"),
)

In [None]:
# Meta-prompt: tells the model to interview the user about the dataset (Phase 1) and
# then write a reusable "Data Generation Prompt" (Phase 2), returned as a JSON object
# with a single "prompt" key.
system_prompt_to_generate_data_generation_prompt = """
# You are a Senior Data Architect and Synthetic Data Specialist. Your goal is to write a highly optimized "Data Generation Prompt" that I can use to generate realistic datasets.

## Process: Do NOT generate any data or the final prompt yet. Instead, guide me through a requirements gathering phase to understand the statistical nuances of the data I need.

### Phase 1: The Interview Ask me the following questions. You may ask them all at once or step-by-step, but you must get clear answers for:

- The Domain & Use Case: (What is this data for? e.g., Testing a fraud detection model, populating a demo dashboard, load testing?)

- The Schema: (What columns are strictly required? What data types?)

- The Business Logic & Correlations: (This is critical for realism. e.g., "If Status is 'Shipped', then Delivery_Date must be populated," or "High Salary correlates with Senior job titles.")

- Edge Cases & Noise: (Do you want perfect data, or should I introduce realistic errors like NULL values, typos, or outliers?)

- Volume & Format: (CSV, JSON, SQL Insert statements? How many rows?)

### Phase 2: The Construction Once I provide these details, you will analyze my requirements and write a comprehensive System Prompt that I can run in a fresh chat instance to generate the actual data.

- Constraint for the Final Prompt: The prompt you write must use "Chain of Thought" reasoning to enforce the statistical correlations I described. It must explicitly forbid "lazy" generation (repetitive names, round numbers, or impossible dates).

Start by asking me the Phase 1 questions.

Return the final prompt in strict JSON Format, so that output can be directly parsed using Python json.loads(prompt) function. Follow following format strictly. Don't add json or anything in beginning.

{
    "prompt":"json"
}
"""

# Conversation history for the prompt-design chat, seeded with the system prompt only.
messages=[
    {"role":"system", "content":system_prompt_to_generate_data_generation_prompt}, 
]

def llm_call(messages, user_message, *, client=None, model=None, temperature=0.0):
    """Send one user turn to a chat-completion endpoint and return the extended history.

    The ``messages`` list passed in is never mutated: a new list is built for the
    request and another one for the returned history.

    Args:
        messages: Conversation so far, as a list of
            ``{"role": ..., "content": ...}`` dicts.
        user_message: Text of the new user turn appended before the request.
        client: Object exposing ``chat.completions.create(...)``. When ``None``
            (the default) the module-level ``gemini`` client is used.
        model: Model identifier forwarded as ``model=``. When ``None`` (the
            default) ``MODEL["gemini"]`` is used.
        temperature: Sampling temperature forwarded to the API. Defaults to
            ``0.0``, the value this function previously hard-coded.

    Returns:
        A ``(history, assistant_message)`` tuple: ``history`` is ``messages``
        plus the new user turn and the assistant reply, and
        ``assistant_message`` is the content of the first returned choice.
    """
    # Resolve the defaults at call time, so the globals are only touched when the
    # caller did not supply an explicit client/model.
    if client is None:
        client = gemini
    if model is None:
        model = MODEL["gemini"]

    request_messages = messages + [{"role": "user", "content": user_message}]
    response = client.chat.completions.create(
        model=model,
        messages=request_messages,
        temperature=temperature,
    )
    assistant_message = response.choices[0].message.content

    # Build a separate list for the result so the request payload stays untouched.
    history = request_messages + [{"role": "assistant", "content": assistant_message}]
    return history, assistant_message

In [None]:
# Opening request of the prompt-design chat; the reply is rendered as Markdown.
messages, am = llm_call(
    messages,
    "I want to generate a synthetic dataset to forecast revenue of a SaaS B2B company using Monte Carlo Simulations.",
)
display(Markdown(am))

In [None]:
# Answers to the interview questions: purpose, schema, business rules, noise policy,
# and volume/output format of the dataset to generate.
# NOTE(review): rules 3.ii (normal distribution) and 3.iv (Pareto principle) pull in
# different directions — confirm which one is intended for `amount`.
# NOTE(review): spellings such as "probabilty" are part of the prompt text and may
# surface as column names in the generated CSV — confirm before correcting them.
um = """
 1. Purpose : I want to do a SaaS B2B sales forecast for current and next 4 quarters using Monte-Carlo Simulations. Today's date is 2026-01-29, generate the data according to this date.

 2. Schema : I need following columns:
    i. opportunity_id: TEXT (PRIMARY KEY)
    ii. opportunity_name: TEXT
    iii. geography: Any one value from ["NoAM", "EMEA", "APAC", "GLOBAL"]
    iv. stage: Any one of the ordinal stages ["Stage 1: Create", "Stage 2: Qualify", "Stage 3: Meet", "Stage 4: Demo", "Stage 5: POV" , "Stage 6: Verbal Confirmation", "Stage 7: Confirmed", "Stage 8: Closed Won", "Stage 9: Closed Lost"]
    v. probabilty: Chance of deal closure based on stage value. This should be a float value between 0 and 1.
    vi. amount: Dollar value associated with the opportunity. 
    vii. createddate: Opporuntiy createdate in format YYYY-MM-DD
    viii. closedate: Expected closedate for the opportunity in format YYYY-MM-DD. This should be always greater than createddate. It can be any date after 2025-01-29.
    ix. status: Status of the opporunity which could be ["Open", "Won", "Lost"], this is based on stage and also the closed opps have past closedate.
    x. type: Type of the opportunity from ["New Logo", "Cross-Sell", "Upgrade"]

 3. Business Logic & Correlations:
    i. The ratio of closed opportunities to open opportunities should be 2:1.
    ii. Use the normal distribution for the amount field.
    iii. 'EMEA' and 'NoAM' should have slightly higher average ARR than 'APAC' and 'GLOBAL'.
    iv. SaaS sales data is rarely perfectly distributed. It follows a Pareto principle (80% of revenue comes from 20% of deals).

 4. Edge Cases & Noise: Do not include noise, just create a noise free dataset.

 5. Volume & Output Format:
    i. Generate 10 rows.
    ii. Format should be CSV with a header row.
"""

# Send the answers; the assistant reply is appended as the last entry of `messages`.
messages, am = llm_call(messages, um)

In [None]:
def _extract_prompt(reply):
    """Return the ``"prompt"`` value from a model reply that contains a JSON object.

    The reply may be bare JSON or JSON wrapped in a Markdown code fence
    (for example a "json"-tagged fence), possibly with surrounding whitespace.
    Everything outside the outermost ``{ ... }`` pair is ignored, so the result
    does not depend on the exact length of any wrapper text.

    Args:
        reply: Raw assistant message text.

    Returns:
        The value stored under the ``"prompt"`` key of the decoded object.

    Raises:
        ValueError: If the reply holds no ``{ ... }`` span, or the span is not
            valid JSON (``json.JSONDecodeError`` is a ``ValueError`` subclass).
        KeyError: If the decoded object has no ``"prompt"`` key.
    """
    start = reply.find("{")
    end = reply.rfind("}")
    if start == -1 or end < start:
        raise ValueError(
            "Model reply does not contain a JSON object: " + repr(reply[:200])
        )
    return json.loads(reply[start:end + 1])["prompt"]


# The last history entry is the assistant reply carrying the generated prompt.
prompt = _extract_prompt(messages[-1]["content"])
display(Markdown(prompt))

In [None]:
# Start a fresh conversation for the data-generation chat so that the prompt-design
# exchange is not sent along with the generated prompt.
# `llm_call` is intentionally not redefined here: the definition from the earlier
# cell is still in scope, and a second identical copy would only shadow it.
messages = [{"role": "system", "content": "You are a helpful assistant."}]

In [None]:
# Send the generated data-generation prompt as the user turn and render the reply.
messages, am = llm_call(messages, prompt)
display(Markdown(am))

In [None]:
import io
import pandas as pd

# The model may wrap its CSV answer in a Markdown code fence (e.g. ```csv ... ```).
# Drop the opening fence line and the closing fence so they are not parsed as a
# header or as a data row.
csv_text = am.strip()
if csv_text.startswith("```"):
    csv_text = csv_text.split("\n", 1)[-1].rsplit("```", 1)[0]

data_io = io.StringIO(csv_text)
df = pd.read_csv(data_io)
df.shape

In [None]:
# Preview the parsed frame (at most the first 20 rows).
df.head(20)

In [None]:
# Re-run this cell whenever more rows are required; every run appends one batch to `df`.

messages, am = llm_call(messages, "Continue Generating 100 more rows with header.")

# Drop a surrounding Markdown code fence (e.g. ```csv ... ```), if the model added
# one, so the fence lines are not parsed as a header or as a data row.
csv_text = am.strip()
if csv_text.startswith("```"):
    csv_text = csv_text.split("\n", 1)[-1].rsplit("```", 1)[0]

data_io = io.StringIO(csv_text)
df1 = pd.read_csv(data_io)

# `df` accumulates across runs, so reject a batch whose header differs before
# the concat can misalign columns in the accumulated frame.
if list(df1.columns) != list(df.columns):
    raise ValueError(
        f"Batch columns {list(df1.columns)} do not match existing columns {list(df.columns)}"
    )

df = pd.concat([df, df1], ignore_index=True)
df.head(20)

In [None]:
# Current (rows, columns) of the accumulated frame.
df.shape

In [None]:
# Inspect the whole data-generation conversation history held in `messages`.
messages

In [None]:
# Write the accumulated rows to data.csv (relative path), without the index column.
df.to_csv("data.csv", index=False)

In [None]:
# List the working directory with the standard library rather than the shell `dir`
# command, which is not available on every platform.
sorted(os.listdir("."))