# Synthetic Dataset Generator

## 0. Setup and sign in to Hugging Face

In [None]:
!pip install -q requests bitsandbytes==0.46.0 transformers==4.48.3 accelerate==1.3.0 openai

In [None]:
import os
import requests
import io
import tempfile
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TextStreamer, TextIteratorStreamer
from google.colab import userdata
from huggingface_hub import login
from IPython.display import display, Markdown, update_display
from threading import Thread
from dotenv import load_dotenv


In [None]:
# Fetch the value stored under 'HF_TOKEN' via google.colab's userdata helper
# and use it to log this session in to the Hugging Face Hub.
hf_token = userdata.get('HF_TOKEN')
# NOTE(review): add_to_git_credential=True presumably also stores the token
# for git operations — confirm against the huggingface_hub login docs.
login(hf_token, add_to_git_credential=True)

## 1. Code Prototyping

In [None]:
# Model identifier handed to load_model() when the tokenizer and model are loaded
LLAMA = "meta-llama/Meta-Llama-3.1-8B-Instruct"

In [None]:
def load_model(model_name):
  """Load the tokenizer and a 4-bit quantized causal LM for `model_name`.

  Args:
    model_name: Model identifier passed to both `from_pretrained` calls.

  Returns:
    A `(tokenizer, model)` tuple. The tokenizer's pad token is set to its
    EOS token; the model is loaded with `device_map="auto"`.
  """
  tok = AutoTokenizer.from_pretrained(model_name)
  # Reuse the end-of-sequence token for padding.
  tok.pad_token = tok.eos_token

  # NF4 4-bit weights with double quantization, computing in bfloat16.
  four_bit_settings = dict(
      load_in_4bit=True,
      bnb_4bit_use_double_quant=True,
      bnb_4bit_quant_type="nf4",
      bnb_4bit_compute_dtype=torch.bfloat16,
  )
  llm = AutoModelForCausalLM.from_pretrained(
      model_name,
      quantization_config=BitsAndBytesConfig(**four_bit_settings),
      device_map="auto",
  )
  return tok, llm

def generate_stream_with_thread(messages, tokenizer, model):
  """Stream a chat completion to stdout and return the text that was printed.

  Args:
    messages: Chat history as a list of {"role": ..., "content": ...} dicts.
    tokenizer: Tokenizer providing `apply_chat_template`.
    model: Model whose `generate` method accepts a `streamer`.

  Returns:
    The concatenation of all printed chunks, so callers can capture the reply
    (e.g. `response = generate_stream_with_thread(...)`).
  """
  # add_generation_prompt=True ends the prompt with the assistant header, so
  # the model starts directly with the reply instead of emitting the header.
  inputs = tokenizer.apply_chat_template(
      messages,
      add_generation_prompt=True,
      return_tensors="pt",
      return_attention_mask=True,
  ).to("cuda")
  streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

  # Generation runs in a worker thread so this thread can consume the streamer.
  thread = Thread(target=model.generate, kwargs={"inputs": inputs, "max_new_tokens": 500, "streamer": streamer})
  thread.start()

  # Chunks that consist only of one of these markers are dropped (kept as a
  # safety net for stray role/header fragments).
  unwanted_patterns = ("assistant", "<|", "|>")
  chunks = []
  for text in streamer:
    if text.strip() in unwanted_patterns:
      continue
    print(text, end="")
    chunks.append(text)
  thread.join()
  return "".join(chunks)

def generate_stream_with_thread_gradio(messages, tokenizer, model):
  """Stream a chat completion, yielding the accumulated reply after each chunk.

  Same flow as generate_stream_with_thread, but instead of printing it yields
  the reply-so-far, which is the shape a streaming Gradio callback consumes.

  Args:
    messages: Chat history as a list of {"role": ..., "content": ...} dicts.
    tokenizer: Tokenizer providing `apply_chat_template`.
    model: Model whose `generate` method accepts a `streamer`.

  Yields:
    The full reply text received so far (grows with every kept chunk).
  """
  # add_generation_prompt=True ends the prompt with the assistant header, so
  # the model starts directly with the reply instead of emitting the header.
  inputs = tokenizer.apply_chat_template(
      messages,
      add_generation_prompt=True,
      return_tensors="pt",
      return_attention_mask=True,
  ).to("cuda")
  streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

  # Generation runs in a worker thread so this generator can consume the streamer.
  thread = Thread(target=model.generate, kwargs={"inputs": inputs, "max_new_tokens": 500, "streamer": streamer})
  thread.start()

  # Chunks that consist only of one of these markers are dropped (kept as a
  # safety net for stray role/header fragments).
  unwanted_patterns = ("assistant", "<|", "|>")
  response = ""
  for text in streamer:
    if text.strip() in unwanted_patterns:
      continue
    response += text
    yield response
  thread.join()

def generate_answer(messages, tokenizer, model):
  """Generate a complete (non-streamed) reply for a chat history.

  Args:
    messages: Chat history as a list of {"role": ..., "content": ...} dicts.
    tokenizer: Tokenizer providing `apply_chat_template` and `decode`.
    model: Model providing `generate`.

  Returns:
    The decoded text of the newly generated tokens only (prompt tokens are
    sliced off), with any leading "assistant\\n\\n" header text removed.
  """
  # add_generation_prompt=True ends the prompt with the assistant header, so
  # the model starts directly with the reply instead of emitting the header.
  inputs = tokenizer.apply_chat_template(
      messages,
      add_generation_prompt=True,
      return_tensors="pt",
      return_attention_mask=True,
  ).to("cuda")
  input_token_len = inputs[0].shape[-1]  # number of prompt tokens to skip when decoding
  outputs = model.generate(inputs, max_new_tokens=500)
  decoded_output = tokenizer.decode(outputs[0][input_token_len:], skip_special_tokens=True, clean_up_tokenization_spaces=True)
  # Kept as a safety net in case a role header still appears in the output.
  return decoded_output.replace("assistant\n\n", "")

In [None]:
# Load the tokenizer and model once. The cells below and the create_dataset_*
# callbacks read these two notebook-level names directly.
tokenizer, model = load_model(LLAMA)

In [None]:
# Smoke test: stream a short reply for a fixed system + user prompt.
system_message = [{"role": "system", "content": "You are a helpful assistant."}]
user_prompt = "Tell me 4 line poem."
messages = [*system_message, {"role": "user", "content": user_prompt}]
# The text is printed while streaming; the trailing semicolon keeps the
# notebook from additionally echoing the call's value as cell output.
generate_stream_with_thread(messages, tokenizer, model);

In [None]:
# prototype system- and user-prompt for synthetic dataset generator
# Supported dataset types and, per type, the CSV columns the model is asked to emit.
dataset_types = ["Instruction-Response Pairs", "Multi-Turn Chat Dialogue", "Text Classification"]
response_csv_columns_headers = {
    "Instruction-Response Pairs": ["instruction", "response", "domain", "complexity"],
    "Multi-Turn Chat Dialogue": ["conversation_id", "turn_number", "role", "content"],
    "Text Classification": ["text", "label", "sourcestyle"],
}

# The type being prototyped: it selects the CSV columns and is interpolated
# into the system prompt and into every user prompt defined below.
dataset_type = "Text Classification"
system_message = f"You are a dataset generator for {dataset_type}. Respond in csv format only, include the header, nothing extra and use the following columns {response_csv_columns_headers[dataset_type]} ."

# Draft user prompt for Instruction-Response Pairs (not sent in this cell).
target_domain = "Cooking"
instruction_type = "Summarization"
diversity_prompt = "Increase in complexity"
number_of_samples = 5
user_message_instruction_response_pairs = f"For the target domain {target_domain} using the instruction type {instruction_type} create a {dataset_type} dataset. Make the question {diversity_prompt}. Create {number_of_samples} samples."

# Draft user prompt for Multi-Turn Chat Dialogue (not sent in this cell).
scenario_role="customer support"
number_of_turns=3
conversation_goal="Explain a complex concept"
user_message_multi_turn_chat_dialogue = f"For the scenario role {scenario_role} with {number_of_turns} number of turns and the conversation goal {conversation_goal} create a {dataset_type} dataset. Create {number_of_samples} samples."

# Draft user prompt for Text Classification — the one actually sent below.
text_type = "News Headlines"
list_of_labels = ["Sports", "Politics", "Tech"]
user_message_text_classification = f"For the text-type {text_type} and the labels {list_of_labels} create a {dataset_type} dataset. Create {number_of_samples} samples."

# Send system + text-classification prompt; the reply is streamed to stdout and
# whatever the helper returns is kept in `response` for the next cell.
messages = [{"role": "system", "content": system_message}]
messages.append({"role": "user", "content": user_message_text_classification})
response = generate_stream_with_thread(messages, tokenizer, model)

In [None]:
# Show the value captured from generate_stream_with_thread in the previous cell.
print(response)

## 2. Gradio App

In [None]:
# Dataset types offered in the UI dropdown, in display order.
dataset_types = [
    "Instruction-Response Pairs",
    "Multi-Turn Chat Dialogue",
    "Text Classification",
]

# CSV header columns requested from the model, keyed by dataset type
# (paired positionally with dataset_types above).
response_csv_columns_headers = dict(zip(dataset_types, (
    ["instruction", "response", "domain", "complexity"],
    ["conversation_id", "turn_number", "role", "content"],
    ["text", "label", "sourcestyle"],
)))

def update_params(dataset_type):
    """Return visibility updates for the three parameter containers.

    Args:
        dataset_type: Currently selected dataset type.

    Returns:
        A 3-tuple of `gr.update(visible=...)` values ordered as
        (instruction-response, multi-turn dialogue, text classification).
        Only the entry matching `dataset_type` is visible; if nothing matches,
        all three are hidden.
    """
    container_order = (
        "Instruction-Response Pairs",
        "Multi-Turn Chat Dialogue",
        "Text Classification",
    )
    return tuple(
        gr.update(visible=(dataset_type == name)) for name in container_order
    )

def combine_messages(system_message, user_message):
  """Build a two-entry chat history: the system turn followed by the user turn."""
  turns = (("system", system_message), ("user", user_message))
  return [{"role": role, "content": text} for role, text in turns]

def create_download_link(full_response):
  """Write `full_response` to a temporary .csv file and expose it for download.

  Args:
    full_response: Text to store in the file (UTF-8).

  Returns:
    `(full_response, gr.update(visible=True, value=<path>))` on success, or
    `(full_response + "\\n\\n" + <error message>, None)` if writing failed.
  """
  try:
    # delete=False keeps the file on disk after it is closed, so the returned
    # path stays valid. The `with` block guarantees the handle is closed even
    # when write() raises.
    with tempfile.NamedTemporaryFile(
        mode='w',
        delete=False,
        suffix='.csv',
        encoding='utf-8'
    ) as temp_file:
      temp_file.write(full_response)
  except Exception as e:
    error_message = f"Error writing to file: {e}"
    return full_response + "\n\n" + error_message, None
  else:
    return full_response, gr.update(visible=True, value=temp_file.name)

def create_download_link_in_memory(full_response):
  """Wrap `full_response` in a named in-memory text buffer for download.

  Args:
    full_response: Text to place in the buffer.

  Returns:
    `(full_response, gr.update(visible=True, value=<buffer>))`, or the text
    with an appended error message plus a hidden, empty file update if
    building the update raised.
  """
  # NOTE(review): this hands a StringIO to the file component; confirm that
  # gr.File accepts file-like objects rather than only paths.
  buffer = io.StringIO(full_response)
  buffer.name = "generated_dataset.csv"

  try:
    download_update = gr.update(visible=True, value=buffer)
  except Exception as e:
    failure_note = f"Error creating in-memory file: {e}"
    return full_response + "\n\n" + failure_note, gr.update(value=None, visible=False)
  return full_response, download_update


def create_dataset_ir(dataset_type, target_domain, instruction_type, diversity_prompt, number_of_samples):
  """Stream an Instruction-Response Pairs dataset and then offer it as a CSV file.

  Args:
    dataset_type: Selected dataset type; used to look up the CSV columns and
      interpolated into the user prompt.
    target_domain: Domain inserted into the user prompt.
    instruction_type: Instruction type inserted into the user prompt.
    diversity_prompt: Diversity hint inserted into the user prompt.
    number_of_samples: Number of samples requested in the user prompt.

  Yields:
    `(text, file_update)` tuples: the reply-so-far with a hidden file update
    while streaming, then the full reply with a visible file update pointing at
    the written temp file (or the reply plus an error message if writing failed).
  """
  system_message = f"You are a dataset generator for Instruction-Response Pairs. Respond in csv format only, include the header, nothing extra and use the following columns {response_csv_columns_headers[dataset_type]} ."
  user_message = f"For the target domain {target_domain} using the instruction type {instruction_type} create a {dataset_type} dataset. Make the question {diversity_prompt}. Create {number_of_samples} samples."
  messages = combine_messages(system_message, user_message)

  full_response = ""
  for update in generate_stream_with_thread_gradio(messages, tokenizer, model):
    full_response = update
    yield update, gr.update(visible=False)  # Keep download link hidden during stream

  # After streaming is complete, create the download file.
  try:
    # delete=False keeps the file after closing; the `with` block closes the
    # handle even if write() raises.
    with tempfile.NamedTemporaryFile(
        mode='w',
        delete=False,
        suffix='.csv',
        encoding='utf-8'
    ) as temp_file:
      temp_file.write(full_response)
  except Exception as e:
    error_message = f"Error writing to file: {e}"
    yield full_response + "\n\n" + error_message, gr.update(visible=False)
  else:
    # Final yield with the complete text and the visible download link
    yield full_response, gr.update(visible=True, value=temp_file.name)


def create_dataset_mtd(dataset_type, scenario_role, number_of_turns, conversation_goal, number_of_samples):
  """Stream a Multi-Turn Chat Dialogue dataset and then offer it as a CSV file.

  Args:
    dataset_type: Selected dataset type; used to look up the CSV columns and
      interpolated into the user prompt.
    scenario_role: Scenario role inserted into the user prompt.
    number_of_turns: Number of turns inserted into the user prompt.
    conversation_goal: Conversation goal inserted into the user prompt.
    number_of_samples: Number of samples requested in the user prompt.

  Yields:
    `(text, file_update)` tuples: the reply-so-far with a hidden file update
    while streaming, then the full reply with a visible file update pointing at
    the written temp file (or the reply plus an error message if writing failed).
  """
  system_message = f"You are a dataset generator for Multi-Turn Chat Dialogue. Respond in csv format only, include the header, nothing extra and use the following columns {response_csv_columns_headers[dataset_type]} ."
  user_message = f"For the scenario role {scenario_role} with {number_of_turns} number of turns and the conversation goal {conversation_goal} create a {dataset_type} dataset. Create {number_of_samples} samples."
  messages = combine_messages(system_message, user_message)

  full_response = ""
  for update in generate_stream_with_thread_gradio(messages, tokenizer, model):
    full_response = update
    yield update, gr.update(visible=False)  # Keep download link hidden during stream

  # After streaming is complete, create the download file.
  try:
    # delete=False keeps the file after closing; the `with` block closes the
    # handle even if write() raises.
    with tempfile.NamedTemporaryFile(
        mode='w',
        delete=False,
        suffix='.csv',
        encoding='utf-8'
    ) as temp_file:
      temp_file.write(full_response)
  except Exception as e:
    error_message = f"Error writing to file: {e}"
    yield full_response + "\n\n" + error_message, gr.update(visible=False)
  else:
    # Final yield with the complete text and the visible download link
    yield full_response, gr.update(visible=True, value=temp_file.name)

def create_dataset_tc(dataset_type, text_type, list_of_labels, number_of_samples):
  """Stream a Text Classification dataset and then offer it as a CSV file.

  Args:
    dataset_type: Selected dataset type; used to look up the CSV columns and
      interpolated into the user prompt.
    text_type: Text type inserted into the user prompt.
    list_of_labels: Labels inserted into the user prompt.
    number_of_samples: Number of samples requested in the user prompt.

  Yields:
    `(text, file_update)` tuples: the reply-so-far with a hidden file update
    while streaming, then the full reply with a visible file update pointing at
    the written temp file (or the reply plus an error message if writing failed).
  """
  system_message = f"You are a dataset generator for Text Classification. Respond in csv format only, include the header, nothing extra and use the following columns {response_csv_columns_headers[dataset_type]} ."
  user_message = f"For the text-type {text_type} and the labels {list_of_labels} create a {dataset_type} dataset. Create {number_of_samples} samples."
  messages = combine_messages(system_message, user_message)

  full_response = ""
  for update in generate_stream_with_thread_gradio(messages, tokenizer, model):
    full_response = update
    yield update, gr.update(visible=False)  # Keep download link hidden during stream

  # After streaming is complete, create the download file.
  try:
    # delete=False keeps the file after closing; the `with` block closes the
    # handle even if write() raises.
    with tempfile.NamedTemporaryFile(
        mode='w',
        delete=False,
        suffix='.csv',
        encoding='utf-8'
    ) as temp_file:
      temp_file.write(full_response)
  except Exception as e:
    error_message = f"Error writing to file: {e}"
    yield full_response + "\n\n" + error_message, gr.update(visible=False)
  else:
    # Final yield with the complete text and the visible download link
    yield full_response, gr.update(visible=True, value=temp_file.name)

In [None]:
import gradio as gr

# Build the Gradio UI: one dataset-type dropdown plus one parameter panel per
# dataset type. All panels start hidden; update_params toggles their visibility.
with gr.Blocks() as demo:
  # 1. Main Selection
  with gr.Row():
    with gr.Column():
      dataset_type = gr.Dropdown(
          choices=dataset_types,
          label="Dataset Type",
          value=None,
          )

  # 2. Instruction-Response Pairs panel: inputs, submit button, output and download
  with gr.Column(visible=False) as ir_container:
    with gr.Row():
      target_domain = gr.Textbox(label="Target Domain")
      instruction_type = gr.Textbox(label="Instruction Type")
      diversity_prompt = gr.Textbox(label="Diversity Prompt")
      number_of_samples_ir = gr.Textbox(label="Number of Samples")
    with gr.Row():
      submit_ir = gr.Button("Submit")

    ir_output = gr.Textbox(label="AI Response", lines=10)
    ir_download_link = gr.File(label="Download Generated Dataset (.csv)", visible=False)

  # 3. Multi-Turn Chat Dialogue panel: inputs, submit button, output and download
  with gr.Column(visible=False) as mtd_container:
    with gr.Row():
      scenario_role = gr.Textbox(label="Scenario Role")
      number_of_turns = gr.Textbox(label="Number of Turns")
      conversation_goal = gr.Textbox(label="Conversation Goal")
      number_of_samples_mtd = gr.Textbox(label="Number of Samples")
    with gr.Row():
      submit_mtd = gr.Button("Submit")

    mtd_output = gr.Textbox(label="AI Response", lines=10)
    mtd_download_link = gr.File(label="Download Generated Dataset (.csv)", visible=False)

  # 4. Text Classification panel: inputs, submit button, output and download
  with gr.Column(visible=False) as tc_container:
    with gr.Row():
      text_type = gr.Textbox(label="Text Type")
      list_of_labels = gr.Textbox(label="List of Labels")
      number_of_samples_tc = gr.Textbox(label="Number of Samples")
    with gr.Row():
      submit_tc = gr.Button("Submit")

    tc_output = gr.Textbox(label="AI Response", lines=10)
    tc_download_link = gr.File(label="Download Generated Dataset (.csv)", visible=False)

  # Attach the listener
  # Dropdown changes toggle which panel is visible; each submit button streams
  # into its own output textbox and download component.
  dataset_type.change(update_params, inputs=[dataset_type], outputs=[ir_container, mtd_container, tc_container])
  submit_ir.click(create_dataset_ir, inputs=[dataset_type, target_domain, instruction_type, diversity_prompt, number_of_samples_ir], outputs=[ir_output, ir_download_link])
  submit_mtd.click(create_dataset_mtd, inputs=[dataset_type, scenario_role, number_of_turns, conversation_goal, number_of_samples_mtd], outputs=[mtd_output, mtd_download_link])
  submit_tc.click(create_dataset_tc, inputs=[dataset_type, text_type, list_of_labels, number_of_samples_tc], outputs=[tc_output, tc_download_link])


# Start the app with debug output and a shareable link enabled.
demo.launch(debug=True, share=True)


## 3. Refactored Code with a Named Temp File

In [None]:
import tempfile
import os
import gradio as gr

# Dataset types offered in the UI dropdown, in display order.
dataset_types = [
    "Instruction-Response Pairs",
    "Multi-Turn Chat Dialogue",
    "Text Classification",
]

# CSV header columns requested from the model, keyed by dataset type
# (paired positionally with dataset_types above).
response_csv_columns_headers = dict(zip(dataset_types, (
    ["instruction", "response", "domain", "complexity"],
    ["conversation_id", "turn_number", "role", "content"],
    ["text", "label", "sourcestyle"],
)))

# System prompt per dataset type. The prompts differ only in the type name;
# "{columns}" is left as a placeholder for a later str.format() call.
system_message_templates = {
    name: (
        f"You are a dataset generator for {name}. Respond in csv format only, "
        "include the header, nothing extra and use the following columns {columns}."
    )
    for name in dataset_types
}

# User prompt per dataset type; placeholders are filled via str.format().
user_message_templates = dict(zip(dataset_types, (
    "For the target domain {target_domain} using the instruction type {instruction_type} create a {dataset_type} dataset. Make the question {diversity_prompt}. Create {number_of_samples} samples.",
    "For the scenario role {scenario_role} with {number_of_turns} number of turns and the conversation goal {conversation_goal} create a {dataset_type} dataset. Create {number_of_samples} samples.",
    "For the text-type {text_type} and the labels {list_of_labels} create a {dataset_type} dataset. Create {number_of_samples} samples.",
)))

def update_params(dataset_type):
    """Return visibility updates for the three parameter containers.

    The result is ordered as (instruction-response, multi-turn dialogue,
    text classification). Only the container matching `dataset_type` is
    visible; an unknown or empty selection hides all three.
    """
    container_slot = {
        "Instruction-Response Pairs": 0,
        "Multi-Turn Chat Dialogue": 1,
        "Text Classification": 2,
    }.get(dataset_type)

    return tuple(
        gr.update(visible=(position == container_slot)) for position in range(3)
    )

def combine_messages(system_message, user_message):
    """Return a chat history with the system turn first and the user turn second."""
    system_turn = dict(role="system", content=system_message)
    user_turn = dict(role="user", content=user_message)
    return [system_turn, user_turn]

def create_dataset(dataset_type, **kwargs):
    """
    Generic dataset creation function that handles all dataset types.

    Streams the model reply and, once streaming has finished, writes it to a
    CSV file named after the dataset type and exposes that file for download.

    Args:
        dataset_type: Type of dataset to generate; selects the CSV columns and
            the system/user prompt templates.
        **kwargs: Values for the placeholders of the selected user message
            template (everything except `dataset_type`).

    Yields:
        `(text, file_update)` tuples: the reply-so-far with a hidden file
        update while streaming, then the full reply with a visible file update
        pointing at the written CSV (or the reply plus an error message with a
        hidden file update if writing failed).

    Raises:
        KeyError: If `dataset_type` is not a known dataset type, or a template
            placeholder has no matching keyword argument.
    """
    # Get the appropriate columns for this dataset type
    columns = response_csv_columns_headers[dataset_type]

    # Create system message
    system_message = system_message_templates[dataset_type].format(columns=columns)

    # Create user message with appropriate template and parameters
    user_message = user_message_templates[dataset_type].format(
        dataset_type=dataset_type,
        **kwargs
    )

    # Combine messages
    messages = combine_messages(system_message, user_message)

    # Stream the text output
    full_response = ""
    for update in generate_stream_with_thread_gradio(messages, tokenizer, model):
        full_response = update
        yield update, gr.update(visible=False)  # Keep download hidden during stream

    # After streaming is complete, create the download file
    try:
        # A fresh directory per request keeps the readable file name while
        # preventing concurrent requests for the same dataset type from
        # overwriting each other's file in the shared temp directory.
        temp_dir = tempfile.mkdtemp(prefix="generated_dataset_")
        temp_filepath = os.path.join(temp_dir, f"generated_dataset_{dataset_type}.csv")

        with open(temp_filepath, 'w', encoding='utf-8') as temp_file:
            temp_file.write(full_response)
    except Exception as e:
        error_message = f"Error writing to file: {e}"
        yield full_response + "\n\n" + error_message, gr.update(visible=False)
    else:
        # Final yield with the complete text and the visible download link
        yield full_response, gr.update(visible=True, value=temp_filepath)

# Wrapper functions for each dataset type (to handle different parameter names)
def create_dataset_ir(dataset_type, target_domain, instruction_type, diversity_prompt, number_of_samples):
    """Gradio callback for Instruction-Response Pairs.

    Maps the positional UI inputs onto the template field names and delegates
    to create_dataset, re-yielding everything it yields.
    """
    template_fields = {
        "target_domain": target_domain,
        "instruction_type": instruction_type,
        "diversity_prompt": diversity_prompt,
        "number_of_samples": number_of_samples,
    }
    yield from create_dataset(dataset_type, **template_fields)

def create_dataset_mtd(dataset_type, scenario_role, number_of_turns, conversation_goal, number_of_samples):
    """Gradio callback for Multi-Turn Chat Dialogue.

    Maps the positional UI inputs onto the template field names and delegates
    to create_dataset, re-yielding everything it yields.
    """
    template_fields = {
        "scenario_role": scenario_role,
        "number_of_turns": number_of_turns,
        "conversation_goal": conversation_goal,
        "number_of_samples": number_of_samples,
    }
    yield from create_dataset(dataset_type, **template_fields)

def create_dataset_tc(dataset_type, text_type, list_of_labels, number_of_samples):
    """Gradio callback for Text Classification.

    Maps the positional UI inputs onto the template field names and delegates
    to create_dataset, re-yielding everything it yields.
    """
    template_fields = {
        "text_type": text_type,
        "list_of_labels": list_of_labels,
        "number_of_samples": number_of_samples,
    }
    yield from create_dataset(dataset_type, **template_fields)

# Gradio UI
# Gradio UI
# One dataset-type dropdown plus one parameter panel per dataset type. All
# panels start hidden; update_params toggles their visibility.
with gr.Blocks() as demo:
    gr.Markdown("# Synthetic Dataset Generator")

    # Main Selection
    with gr.Row():
        with gr.Column():
            dataset_type = gr.Dropdown(
                choices=dataset_types,
                label="Dataset Type",
                value=None,
            )

    # Instruction-Response Pairs Container
    with gr.Column(visible=False) as ir_container:
        gr.Markdown("### Instruction-Response Pairs Parameters")
        with gr.Row():
            target_domain = gr.Textbox(label="Target Domain")
            instruction_type = gr.Textbox(label="Instruction Type")
            diversity_prompt = gr.Textbox(label="Diversity Prompt")
            number_of_samples_ir = gr.Textbox(label="Number of Samples")
        with gr.Row():
            submit_ir = gr.Button("Generate Dataset")

        # Streamed reply and the download component (hidden until a file exists)
        ir_output = gr.Textbox(label="AI Response", lines=10)
        ir_download_link = gr.File(label="Download Generated Dataset (.csv)", visible=False)

    # Multi-Turn Chat Dialogue Container
    with gr.Column(visible=False) as mtd_container:
        gr.Markdown("### Multi-Turn Chat Dialogue Parameters")
        with gr.Row():
            scenario_role = gr.Textbox(label="Scenario Role")
            number_of_turns = gr.Textbox(label="Number of Turns")
            conversation_goal = gr.Textbox(label="Conversation Goal")
            number_of_samples_mtd = gr.Textbox(label="Number of Samples")
        with gr.Row():
            submit_mtd = gr.Button("Generate Dataset")

        # Streamed reply and the download component (hidden until a file exists)
        mtd_output = gr.Textbox(label="AI Response", lines=10)
        mtd_download_link = gr.File(label="Download Generated Dataset (.csv)", visible=False)

    # Text Classification Container
    with gr.Column(visible=False) as tc_container:
        gr.Markdown("### Text Classification Parameters")
        with gr.Row():
            text_type = gr.Textbox(label="Text Type")
            list_of_labels = gr.Textbox(label="List of Labels")
            number_of_samples_tc = gr.Textbox(label="Number of Samples")
        with gr.Row():
            submit_tc = gr.Button("Generate Dataset")

        # Streamed reply and the download component (hidden until a file exists)
        tc_output = gr.Textbox(label="AI Response", lines=10)
        tc_download_link = gr.File(label="Download Generated Dataset (.csv)", visible=False)

    # Event Handlers
    # Changing the dropdown shows the matching parameter container.
    dataset_type.change(
        update_params,
        inputs=[dataset_type],
        outputs=[ir_container, mtd_container, tc_container]
    )

    # Each button passes the dropdown value plus its panel's inputs to the
    # matching callback, which fills that panel's output and download slots.
    submit_ir.click(
        create_dataset_ir,
        inputs=[dataset_type, target_domain, instruction_type, diversity_prompt, number_of_samples_ir],
        outputs=[ir_output, ir_download_link],
    )

    submit_mtd.click(
        create_dataset_mtd,
        inputs=[dataset_type, scenario_role, number_of_turns, conversation_goal, number_of_samples_mtd],
        outputs=[mtd_output, mtd_download_link],
    )

    submit_tc.click(
        create_dataset_tc,
        inputs=[dataset_type, text_type, list_of_labels, number_of_samples_tc],
        outputs=[tc_output, tc_download_link],
    )

# Launch with share=True for Google Colab
demo.launch(debug=True, share=True)