In [None]:
import os
from dotenv import load_dotenv
from huggingface_hub import login, InferenceClient
import gradio as gr

# Load variables from a local .env file; override=True lets values from the
# file replace variables that are already set in the process environment.
load_dotenv(override=True)

# Read the Hugging Face access token and log in with it. The same token is
# reused by generate_data() when it builds an InferenceClient.
# NOTE(review): os.getenv returns None when HF_TOKEN is unset — confirm how
# login() is expected to behave in that case (it may prompt interactively).
hf_token = os.getenv('HF_TOKEN')
login(hf_token, add_to_git_credential=True)

# Maps the display name offered in the UI's model selector to the Hugging Face
# model repository id that generate_data() passes to InferenceClient.
MODELS = {
  "Llama 3.1": "meta-llama/Meta-Llama-3.1-8B-Instruct",
  "Mixtral 8x22": "mistralai/Mixtral-8x22B-Instruct-v0.1",
  "Qwen 2.5": "Qwen/Qwen2.5-7B-Instruct"
}

# System message sent with every chat request. It tells the model to return
# only the dataset, as a JSON array of objects or as CSV with a header row.
system_prompt = """
  You are a helpful assistant skilled with generating datasets for different kinds of text data. You will be given a description of the data to provide, columns to fill, and the number of rows to generate. You will also be given the format to generate it in: JSON or CSV. You will generate the dataset according to the description and columns provided, and return it in the format specified. If the format is JSON, you will return a JSON array of objects, where each object represents a row in the dataset with key-value pairs corresponding to column names and their respective values. If the format is CSV, you will return a string in CSV format, with the first line containing the column names and subsequent lines containing the data rows, no other message is required.
"""

def generate_data(model_name, description, columns, row_count, format):
  """Ask a hosted chat model to generate a synthetic dataset.

  Inputs are validated before any request is made, so that a missing UI
  selection produces a readable message instead of a wasted API call or a
  cryptic ``KeyError`` text.

  Args:
    model_name: Key of ``MODELS`` selecting which model to query.
    description: Free-text description of the dataset to generate.
    columns: Column names to include (the UI labels this comma-separated).
    row_count: Number of rows to request; coerced to ``int`` so a float
      such as ``10.0`` is rendered as ``10`` in the prompt.
    format: Output format named in the prompt (the UI offers "JSON" and
      "CSV"). The name shadows the builtin but is kept for callers.

  Returns:
    The text content of the model's first reply choice, or a string
    starting with "An error occurred:" when validation or the request
    fails. This function never raises; errors are shown in the output pane.
  """
  # Cheap input checks first: none of these need the network or the model map.
  if not description or not str(description).strip():
    return "An error occurred: please provide a dataset description."
  if not columns or not str(columns).strip():
    return "An error occurred: please provide at least one column name."
  if not format:
    return "An error occurred: please choose an output format (JSON or CSV)."
  try:
    row_count = int(row_count)
  except (TypeError, ValueError, OverflowError):
    return "An error occurred: row count must be a whole number."
  if row_count < 1:
    return "An error occurred: row count must be at least 1."
  if model_name not in MODELS:
    # Covers both "nothing selected" (None) and an unknown display name.
    return "An error occurred: please choose a model."

  try:
    user_prompt = f"""
      Please generate a dataset with the following description: {description}. The dataset should have the following columns: {columns}. The dataset should have {row_count} rows. Please provide the dataset in {format} format.
    """

    messages = [
      {"role": "system", "content": system_prompt},
      {"role": "user", "content": user_prompt}
    ]

    model_id = MODELS[model_name]
    client = InferenceClient(model=model_id, token=hf_token)

    # NOTE(review): max_tokens is fixed at 2000, so large row counts may be
    # cut off mid-dataset — confirm whether this cap should scale.
    response = client.chat_completion(
      messages=messages,
      max_tokens=2000,
      temperature=0.3
    )
    return response.choices[0].message.content
  except Exception as e:
    # UI boundary: surface any client/network failure as text in the output.
    return f"An error occurred: {str(e)}"

def set_code_language(format):
    """Return a Gradio update that switches the output pane's highlighting.

    "JSON" selects JSON highlighting; any other value (including CSV)
    selects Python highlighting.
    """
    if format == "JSON":
        highlight = "json"
    else:
        highlight = "python"
    return gr.update(language=highlight)

# Build the single-page UI: inputs on top, generated dataset below.
with gr.Blocks() as ui:
  gr.Markdown("## Dataset Generator")
  with gr.Row():
    with gr.Column():
      # Preselect the first model so "Generate Dataset" never runs with no
      # model chosen (which would hand None to generate_data).
      model_id = gr.Radio(label="Model", choices=list(MODELS), value=next(iter(MODELS)))
      description = gr.Textbox(label="Description", placeholder="A sales catalog")
      columns = gr.Textbox(label="Columns (comma-separated)", placeholder="product name, price, description, rating")
      # step=1 keeps the row count a whole number.
      row_count = gr.Slider(label="Row Count", minimum=1, maximum=1000, value=10, step=1)
      # Default to JSON so the selection agrees with the output pane's
      # initial language="json" below.
      format = gr.Radio(label="Format", choices=["JSON", "CSV"], value="JSON")
      generate_button = gr.Button("Generate Dataset")
  with gr.Row():
    with gr.Column():
      output = gr.Code(label="Output", lines=40, language="json")

  # Re-highlight the output pane whenever the chosen format changes.
  format.change(set_code_language, inputs=format, outputs=output)

  # Inputs are passed positionally, in generate_data's parameter order.
  generate_button.click(
    fn=generate_data,
    inputs=[model_id, description, columns, row_count, format],
    outputs=output
  )

ui.launch()

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


['python', 'c', 'cpp', 'markdown', 'latex', 'json', 'html', 'css', 'javascript', 'jinja2', 'typescript', 'yaml', 'dockerfile', 'shell', 'r', 'sql', 'sql-msSQL', 'sql-mySQL', 'sql-mariaDB', 'sql-sqlite', 'sql-cassandra', 'sql-plSQL', 'sql-hive', 'sql-pgSQL', 'sql-gql', 'sql-gpSQL', 'sql-sparkSQL', 'sql-esper', None]
* Running on local URL:  http://127.0.0.1:7887
* To create a public link, set `share=True` in `launch()`.


