# Welcome to Colab!

In [None]:
!pip install -q bitsandbytes transformers accelerate gradio pandas

In [None]:
# Read the secret named "HF_TOKEN" through Colab's userdata helper.
# The value is passed to huggingface_hub.login() later in the notebook.
from google.colab import userdata
hf_token = userdata.get('HF_TOKEN')

In [None]:
import os
import json
import pandas as pd
import torch
from google.colab import userdata
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import gradio as gr

# Authenticate with the Hugging Face Hub using the token read earlier.
login(hf_token)

MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# 4-bit NF4 weights with double quantization; compute runs in bfloat16.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# device_map="auto" lets accelerate decide where the quantized weights live.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    quantization_config=quant_config,
)

In [None]:
def _parse_jsonl_records(text):
    """Return the JSON objects found in *text*, tolerating invalid lines.

    Each line that starts with ``{`` is parsed on its own. Lines that are not
    valid JSON (for example a final row cut off by the token limit) are
    skipped instead of aborting the whole batch.
    """
    records = []
    for raw_line in text.strip().splitlines():
        candidate = raw_line.strip()
        if not candidate.startswith("{"):
            continue
        try:
            records.append(json.loads(candidate))
        except json.JSONDecodeError:
            # Model output is untrusted text; drop rows that do not parse.
            continue
    return records


def generate_data(schema, context, num_records):
    """Generate synthetic rows with the chat model and return a Markdown preview.

    Args:
        schema: Free-text description of the columns to generate.
        context: Free-text business context that is inserted into the prompt.
        num_records: Requested number of rows; converted with ``int()`` before
            being placed in the prompt.

    Returns:
        A Markdown table of the first 20 parsed rows, or a short message when
        the model produced no parseable JSON objects.

    Side effects:
        Overwrites ``generated_data.csv`` in the working directory with every
        parsed row.
    """
    prompt = f"Generate {int(num_records)} rows of data in JSONL format.\nContext: {context}\nSchema:\n{schema}\nOnly output valid JSONL, nothing else."

    messages = [
        {"role": "system", "content": "You generate realistic synthetic datasets in JSONL format. Output only valid JSONL, nothing else."},
        {"role": "user", "content": prompt}
    ]

    # add_generation_prompt=True ends the prompt with the assistant header so
    # the model answers instead of continuing the user turn. return_dict=True
    # also yields the attention mask, which generate() needs because the pad
    # token is the same as the EOS token.
    encoded = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(model.device)  # follow the model's placement instead of assuming "cuda"
    prompt_length = encoded["input_ids"].shape[-1]

    outputs = model.generate(**encoded, max_new_tokens=2000, temperature=0.8, do_sample=True)
    # Decode only the newly generated tokens, not the echoed prompt.
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    records = _parse_jsonl_records(response)
    df = pd.DataFrame(records)
    df.to_csv("generated_data.csv", index=False)

    if df.empty:
        return "No valid JSONL records were generated. Please try again."

    return df.head(20).to_markdown(index=False)

In [None]:
# Input widgets, listed in the positional order generate_data() expects.
schema_input = gr.Textbox(
    label="Schema",
    lines=5,
    value='Name (text), Industry (text), Employees (int), Revenue (text)',
)
context_input = gr.Textbox(label="Business Context", value="Andela partners")
record_count_input = gr.Slider(5, 100, value=20, step=5, label="Number of Records")

# The function's return value is rendered as Markdown in the preview pane.
preview_output = gr.Markdown(label="Preview")

demo = gr.Interface(
    fn=generate_data,
    inputs=[schema_input, context_input, record_count_input],
    outputs=preview_output,
    title="Synthetic Data Generator",
    flagging_mode="never",
)
demo.launch()