In [None]:
import os
import json
import csv
from dotenv import load_dotenv
from IPython.display import Markdown, display
from openai import OpenAI
import gradio as gr
import csv
import io


# Load settings from the .env file; override=True lets values in the file
# replace variables that are already set in the process environment.
load_dotenv(override=True)

api_key = os.getenv('OLLAMA_API_KEY')

ollama_base_url = os.getenv("OLLAMA_BASE_URL")

MODEL = os.getenv('OSS_CLOUD_MODEL')

# Check the key
if not api_key:
    print("No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!")
elif api_key.strip() != api_key:
    print("An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook")
else:
    print("API key found and looks good so far!")


# Check the base URL
if not ollama_base_url:
    print("No base url was found - please confirm that the Ollama base url has been set in the dotenv file")
elif ollama_base_url.strip() != ollama_base_url:
    print("A base url was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook")
else:
    print("Base url found and looks good so far!")


# Check the model name: it is passed straight to the chat-completions call,
# so a missing value would otherwise only surface later as an API error.
if not MODEL:
    print("No model name was found - please confirm that OSS_CLOUD_MODEL has been set in the dotenv file")
elif MODEL.strip() != MODEL:
    print("A model name was found, but it looks like it might have space or tab characters at the start or end - please remove them")
else:
    print("Model name found and looks good so far!")


# OpenAI SDK client pointed at the configured base URL instead of the
# default OpenAI endpoint.
ollamaClient = OpenAI(base_url=ollama_base_url, api_key=api_key)


# System prompt sent with every request: it pins the model to emitting a raw,
# flat JSON array that can be parsed with json.loads() and written to CSV.
system_message = """
You are a synthetic data generation engine.

Your sole responsibility is to generate high-quality, realistic, and internally consistent synthetic datasets based on user-provided specifications. You do not explain your reasoning, provide commentary, or include metadata unless explicitly requested.

### Core Behavior
- Always generate data as an array of JSON objects.
- Each JSON object represents one record (row).
- All records must conform strictly to the schema, constraints, and semantics provided by the user.
- If no schema is provided, infer a reasonable schema from the request and apply it consistently.
- Never generate real, identifiable data. All names, emails, phone numbers, and identifiers must be synthetic and non-resolvable.
- The Array of JSON objects must have a valid structure. It must be parsable as an array of JSON objects using json.loads(), in python.

### Data Fidelity Rules
- Generated data must be:
  - Structurally valid
  - Statistically plausible
  - Internally consistent across fields
  - Free of real personally identifiable information (PII) unless explicitly requested
- Use realistic distributions, correlations, and edge cases when appropriate.
- Ensure deterministic field relationships are respected (e.g., start_date ≤ end_date, totals equal sums, status fields align with timestamps).

### Schema & Types
- Respect declared data types strictly (e.g., string, integer, float, boolean, date, enum).
- Use ISO-8601 format for dates and timestamps unless instructed otherwise.
- Use null values only when allowed by the schema.
- Enumerations must only contain allowed values.

### Volume & Structure
- Generate exactly the number of records requested.
- Ensure all records share the same structure and field ordering.
- Do not omit fields, add extra fields, or rename fields.

### CSV Compatibility
- Output must be directly convertible to a CSV file:
  - Flat JSON objects only (no nested objects or arrays unless explicitly allowed).
  - Use primitive values (string, number, boolean, null).
  - Avoid characters that would break CSV parsing unless explicitly requested.

### Output Format
- Output ONLY the raw JSON array.
- Do NOT include explanations, comments, markdown, or surrounding text.
- Do NOT include trailing commas or invalid JSON.

### Error Handling
- If the request is ambiguous, make the minimal reasonable assumptions and proceed.
- If the request is contradictory or impossible, return a concise error message instead of data.

You are optimized for precision, consistency, and usefulness in downstream data pipelines.
"""

# Example request, used for the demo call and as the chat textbox placeholder.
# completed_at is declared as a field so the constraint on failed transactions
# refers to a column that actually exists in the requested schema.
user_prompt_sample = """ 
Generate 1000 transaction records with fields: transaction_id (UUID), user_id (int), 
amount (float, positive), currency (USD|EUR), status (pending|completed|failed), 
created_at (ISO-8601), completed_at (ISO-8601 or null), where failed transactions
never have a completed_at value (it is null).
"""

def generate_messages(user_prompt: str):
    """Build the chat payload: the fixed system prompt followed by the user's request.

    Args:
        user_prompt: Free-text description of the dataset to generate.

    Returns:
        A two-element list of role/content dicts, system message first.
    """
    system_turn = {"role": "system", "content": system_message}
    user_turn = {"role": "user", "content": user_prompt}
    return [system_turn, user_turn]


def _strip_code_fence(text: str) -> str:
    """Return *text* without a surrounding Markdown code fence, if one is present."""
    cleaned = text.strip()
    if cleaned.startswith("```"):
        cleaned = cleaned[3:]
        # The opening fence may carry a language tag in any letter case
        # (```json, ```JSON) or none at all (a bare ```).
        if cleaned[:4].lower() == "json":
            cleaned = cleaned[4:]
    if cleaned.endswith("```"):
        cleaned = cleaned[:-3]
    return cleaned.strip()


def _collect_fieldnames(records) -> list:
    """Return the union of keys across *records*, in first-seen order."""
    fieldnames = {}
    for record in records:
        fieldnames.update(dict.fromkeys(record))
    return list(fieldnames)


def generate_synthetic_data(user_prompt: str, filename: str = "synthetic_data.csv"):
    """Ask the model for synthetic records, display them, and save them as CSV.

    The model response is streamed and accumulated, rendered as Markdown, then
    parsed as a JSON array of objects and written to *filename*. Parsing or
    writing problems are reported with ``print`` rather than raised, so the
    raw model output is always returned.

    Args:
        user_prompt: Free-text description of the dataset to generate.
        filename: Path of the CSV file to write. A blank value falls back to
            ``"synthetic_data.csv"``.

    Returns:
        The raw, unmodified text produced by the model.
    """
    messages = generate_messages(user_prompt=user_prompt)
    stream = ollamaClient.chat.completions.create(
        model=MODEL,
        messages=messages,
        stream=True,
    )

    pieces = []
    for chunk in stream:
        # Some chunks carry no choices at all; skip them instead of raising
        # IndexError on choices[0].
        if not chunk.choices:
            continue
        pieces.append(chunk.choices[0].delta.content or "")
    result = "".join(pieces)

    display(Markdown(result))

    try:
        data = json.loads(_strip_code_fence(result))
    except json.JSONDecodeError as e:
        print(f"Failed to parse JSON for CSV saving: {e}")
        return result

    if not isinstance(data, list) or not data:
        print("Result is not a list or is empty, no CSV saved.")
        return result
    if not all(isinstance(record, dict) for record in data):
        print("Result contains records that are not JSON objects, no CSV saved.")
        return result

    # An empty textbox value would make open() fail; use the default name.
    if not filename or not filename.strip():
        filename = "synthetic_data.csv"

    try:
        with open(filename, "w", newline="", encoding="utf-8") as output_file:
            # Columns are the union of all record keys, so a record with an
            # extra field cannot abort the write half-way; records missing a
            # field get an empty cell (DictWriter's default restval).
            writer = csv.DictWriter(output_file, fieldnames=_collect_fieldnames(data))
            writer.writeheader()
            writer.writerows(data)
    except OSError as e:
        print(f"Failed to write CSV file {filename}: {e}")
        return result

    print(f"Successfully saved {len(data)} records to {filename}")
    return result


In [None]:
# Demo run: generate the sample transaction dataset and save it as CSV.
generate_synthetic_data(
    user_prompt=user_prompt_sample,
    filename="transaction_records.csv",
)

In [None]:
def generate_data_wrapper(message, history, filename):
    """Adapt generate_synthetic_data to the callback used by the chat UI.

    Args:
        message: The latest user message, used as the generation prompt.
        history: Prior chat turns; accepted but not used.
        filename: Target CSV filename from the additional input textbox.

    Returns:
        Whatever generate_synthetic_data returns for this prompt.
    """
    reply = generate_synthetic_data(user_prompt=message, filename=filename)
    return reply

# Extra input shown alongside the chat: where the generated CSV is written.
target_filename_box = gr.Textbox(
    label="Target Filename",
    value="synthetic_data.csv",
    placeholder="transaction_records.csv",
    info="Must be a .csv file",
)

# Main prompt box, pre-hinted with the sample request.
prompt_box = gr.Textbox(placeholder=user_prompt_sample, lines=3)

# Wire the wrapper into a chat UI and start it.
gr.ChatInterface(
    fn=generate_data_wrapper,
    type="messages",
    additional_inputs=[target_filename_box],
    textbox=prompt_box,
).launch()
