- This notebook creates dummy/test data from a use case provided by the user.
- The use case can be as simple or as complex as the user wants (I've tested both, and the results are good).
- I've used a Phi-3 model because I'm having issues with Llama access on Hugging Face.

In [None]:
!pip install -q requests torch bitsandbytes transformers sentencepiece accelerate openai httpx==0.27.2

In [None]:
import os
import requests
from openai import OpenAI
import gradio as gr
from IPython.display import Markdown, display, update_display
from huggingface_hub import login
from google.colab import userdata
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig
import torch
import json
import re
import pandas as pd
import io

In [None]:
# constants

# Model identifiers.
# OPENAI: OpenAI model name; not referenced by any other code visible in this notebook.
# PHI3: Hugging Face model id passed to `from_pretrained` in dataset_call.
OPENAI = 'gpt-4o-mini'
PHI3 = "microsoft/Phi-3-mini-4k-instruct"

# Generation settings.
# limit: maximum number of rows requested in the system and user prompts.
# max_tokens: passed to model.generate as max_new_tokens.
# temperature: sampling temperature passed to model.generate.
# NOTE(review): 100 CSV rows may need more than 1000 new tokens, in which case the
# output would be cut off mid-row — confirm and adjust one of the two values.
limit = 100
max_tokens = 1000
temperature = 0.3

In [None]:
# keys
# Credentials are read through google.colab's `userdata.get` rather than hard-coded here.

# OpenAI client.
# NOTE(review): `openai` is not used by any other code visible in this notebook — confirm it is still needed.
openai_api_key = userdata.get('OPENAI_API_KEY')
openai = OpenAI(api_key=openai_api_key)

# Log in to the Hugging Face Hub so the model files can be downloaded.
hf_token = userdata.get('HF_TOKEN')
login(hf_token, add_to_git_credential=True)

In [None]:
# System prompt: instructs the model to reply with raw CSV only (headers first, no
# prose, no markdown, no wrapping quotes). `limit` is interpolated once, when this
# cell runs, so changing `limit` later requires re-running this cell.
system_prompt = f"""You create synthetic datasets for testing purposes.  Based on the use case description, generate a CSV dataset with appropriate columns and a maximum of {limit} rows
of realistic data.

IMPORTANT RULES:
1. Return ONLY the CSV data with headers and ensure there are no duplicate headers
2. No explanatory text before or after
3. No markdown formatting or code fences
4. No quotation marks around the entire response
5. Start directly with the column headers

Format: column1 (e.g. customer_id),column2 (e.g. country),column3 (e.g. age)
row1data,row1data,row1data
row2data,row2data,row2data"""

def data_user_prompt(usecase):
  """Build the user-turn prompt that asks for a CSV dataset for `usecase`.

  The use case text is placed between a fixed introduction and a fixed set of
  formatting instructions; the row cap comes from the module-level `limit`,
  read each time the function is called.

  Args:
    usecase: free-text description of the dataset wanted (a string).

  Returns:
    The complete user prompt as a single string.
  """
  intro = "Create a synthetic dataset for the use case provided below: "
  instructions = (
      " Respond in csv with appropriate headers.  Do not include any other explanatory text, "
      "markdown formatting or code fences, or quotation marks around the entire response.  "
      f"  Limit the rows in the dataset to {limit}."
  )
  return intro + usecase + instructions

# Module-level chat messages (system turn + user turn), built from whatever
# `usecase` holds at the moment this cell runs.
# NOTE(review): `usecase` is only assigned in the later "#test" cell, so running the
# notebook top-to-bottom raises NameError on this statement. The list is also built
# once and does not follow later use cases; code that needs a per-request prompt
# should build its own message list from the use case it is given.
messages = [
    {"role":"system","content":system_prompt},
    {"role":"user","content":data_user_prompt(usecase)}
]

In [None]:
# Loaded (tokenizer, model) pairs keyed by model id, so the quantised weights are
# loaded once per session instead of on every request.
_MODEL_CACHE = {}


def _load_model(model_id):
  """Return a (tokenizer, model) pair for `model_id`, loading it on first use."""
  if model_id not in _MODEL_CACHE:
    # quantisation: 4-bit NF4 with double quantisation, bfloat16 compute
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16
    )

    # tokenization: reuse the EOS token for padding
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.pad_token = tokenizer.eos_token

    # model
    model = AutoModelForCausalLM.from_pretrained(
        model_id, quantization_config=quant_config, device_map="auto"
    )
    _MODEL_CACHE[model_id] = (tokenizer, model)
  return _MODEL_CACHE[model_id]


def dataset_call(usecase):
  """Generate a synthetic dataset for `usecase` with the PHI3 model.

  The chat messages are built from the `usecase` argument on every call, so
  each request is answered for the use case actually passed in rather than
  for whatever a module-level message list happened to contain.

  Args:
    usecase: free-text description of the dataset wanted.

  Returns:
    The model's reply (expected to be CSV text) with the prompt tokens
    removed, special tokens skipped, and surrounding whitespace stripped.
  """
  tokenizer, model = _load_model(PHI3)

  # inputs: system prompt + a user prompt specific to this call
  chat = [
      {"role": "system", "content": system_prompt},
      {"role": "user", "content": data_user_prompt(usecase)},
  ]
  prompt_text = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
  model_inputs = tokenizer(prompt_text, return_tensors="pt").to(model.device)

  # outputs: sampling only, no gradients needed
  with torch.no_grad():
    outputs = model.generate(
        **model_inputs,
        max_new_tokens=max_tokens,
        do_sample=True,
        temperature=temperature,
    )

  # generate() returns prompt + completion; keep only the newly generated tokens
  prompt_length = model_inputs["input_ids"].shape[1]
  response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
  return response.strip()


In [None]:
# convert csv string into panda

def csv_handler(csv_string):
    """Parse CSV text produced by the model into a pandas DataFrame.

    If the text is wrapped in a markdown code fence (an opening line starting
    with three backticks, optionally followed by a closing fence line), the
    fence lines are removed before parsing so they do not end up as a bogus
    header or data row. Text without a fence is parsed exactly as given.

    Args:
        csv_string: CSV text, headers on the first line.

    Returns:
        The parsed DataFrame, or a one-row DataFrame with a single "Error"
        column describing the failure when the text cannot be parsed.
    """
    try:
        text = csv_string.strip()
        if text.startswith("```"):
            # Drop the opening fence line (it may carry a language tag such as "csv").
            lines = text.split("\n")[1:]
            if lines and lines[-1].strip() == "```":
                lines.pop()
            csv_string = "\n".join(lines)
        return pd.read_csv(io.StringIO(csv_string))
    except Exception as e:
        # UI boundary: report the problem in the table instead of raising.
        return pd.DataFrame({"Error": [f"Failed to parse CSV: {str(e)}"]})

In [None]:
# usecase to csv_string

def usecase_to_csv(usecase):
    """Turn a use case description into a DataFrame of synthetic data.

    Calls dataset_call to get CSV text from the model and csv_handler to parse
    it. Both the success path and the failure path return exactly one
    DataFrame, matching the single DataFrame output of the Gradio interface.

    Args:
        usecase: free-text description of the dataset wanted.

    Returns:
        The generated dataset as a DataFrame, or a one-row DataFrame with a
        single "Error" column if generation or parsing raised.
    """
    try:
        # Get CSV string from the LLM, then parse it for display
        csv_string = dataset_call(usecase)
        return csv_handler(csv_string)
    except Exception as e:
        # UI boundary: show the failure in the table rather than raising.
        return pd.DataFrame({"Error": [f"LLM processing failed: {str(e)}"]})

In [None]:
def download_csv(csv_string):
    """Return `csv_string` unchanged, or "" when it is empty, None or otherwise falsy."""
    return csv_string or ""

In [None]:
# Smoke test: run the full pipeline once on a sample use case, outside the Gradio UI.
usecase = "A financial services company is looking for synthetic data to test its Expected Credit Losses (ECL) model under IFRS9."
# Uncomment to get the raw model output, before it is parsed into a DataFrame:
#dataset_call(usecase)
usecase_to_csv(usecase)

In [None]:

# Gradio front end: one free-text box in, one table out, wired to usecase_to_csv.
usecase_input = gr.Textbox(
    lines=5,
    label="Describe your usecase",
    placeholder="Describe the dataset you would like to create and how you will use it",
)
dataset_output = gr.DataFrame(label="Here is your dataset!", interactive=True)

# Sample use cases offered as examples in the UI.
example_usecases = [
    "Generate a dataset of 10 employees with name, department, salary, and years of experience",
    "Create sample e-commerce data with product names, categories, prices, and ratings",
    "Generate customer survey responses with demographics and satisfaction scores",
    "A financial services company is looking for synthetic data to test its Expected Credit Losses (ECL) model under IFRS9.",
]

demo = gr.Interface(
    fn=usecase_to_csv,
    inputs=usecase_input,
    outputs=dataset_output,
    title="Friendly Neighbourhood Synthetic Data Creator!",
    description="Let me know your use case for synthetic data and I will create it for you.",
    examples=example_usecases,
)

demo.launch(debug=True)
