## Sensor Dataset Generator

- Google Colab tool to generate synthetic sensor datasets (temperature, humidity, tachometer)
- Choose from several Colab-friendly language models to create tabular data
- Supports 4-bit quantization (better efficiency, lower resource use)
- Gradio interface: configure the model and sensor type, generate a dataset, and view the resulting CSV text in the browser

In [None]:
# Install/upgrade the runtime dependencies quietly; transformers is pinned to 4.53.3.
!pip install -q --upgrade bitsandbytes accelerate "transformers==4.53.3"

In [None]:
import os
import re
import sys
from threading import Thread
import requests
from IPython.display import Markdown, display, update_display
from openai import OpenAI
from google.colab import drive, userdata
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TextIteratorStreamer,
    TextStreamer,
)
from huggingface_hub import login
import torch
import gradio as gr

# Read the 'HF_TOKEN' value through google.colab's userdata API and use it to
# log in to the Hugging Face Hub (also stored as a git credential).
hf_token = userdata.get('HF_TOKEN')
login(hf_token, add_to_git_credential=True)

In [None]:
# UI label -> Hugging Face Hub repository id for every selectable model.
# Insertion order matters: the first entry is the default choice.
MODELS = dict(
    [
        ("Llama 3.2 3B", "meta-llama/Llama-3.2-3B-Instruct"),
        ("Phi-4 mini", "microsoft/Phi-4-mini-instruct"),
        ("Gemma 3 270M", "google/gemma-3-270m-it"),
        ("Qwen3 4B", "Qwen/Qwen3-4B-Instruct-2507"),
        ("DeepSeek R1 Distill 1.5B", "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"),
    ]
)

# Prompt template per sensor type; "{n}" is replaced by the requested row count.
PROMPTS = {
    "temperature": (
        "Generate a CSV dataset for a temperature sensor. "
        "Columns: timestamp, sensor_id, temperature_celsius, location, unit. "
        "Include {n} rows of realistic readings (e.g. room, outdoor, machine). "
        "Return only the table, no explanation."
    ),
    "humidity": (
        "Generate a CSV dataset for a humidity sensor. "
        "Columns: timestamp, sensor_id, humidity_percent, location, unit. "
        "Include {n} rows. "
        "Return only the table."
    ),
    "tachometer": (
        "Generate a CSV dataset for a tachometer (RPM). "
        "Columns: timestamp, sensor_id, rpm, machine_id, unit. "
        "Include {n} rows of realistic rotation readings. "
        "Return only the table."
    ),
}

# Sensor types offered in the UI, in the same order as the prompt table.
SENSOR_TYPES = list(PROMPTS)

In [None]:
def get_quantization_config():
    """Build the bitsandbytes settings used when a model is loaded in 4-bit.

    Returns:
        A ``BitsAndBytesConfig`` requesting 4-bit loading with the "nf4"
        quantization type, double quantization enabled, and bfloat16 as the
        compute dtype.
    """
    settings = {
        "load_in_4bit": True,
        "bnb_4bit_quant_type": "nf4",
        "bnb_4bit_use_double_quant": True,
        "bnb_4bit_compute_dtype": torch.bfloat16,
    }
    return BitsAndBytesConfig(**settings)


def load_model(model_id: str, use_quantization: bool):
    """Load the tokenizer and causal language model for ``model_id``.

    Args:
        model_id: Hugging Face Hub repository id of the model to load.
        use_quantization: When true, load the weights with the 4-bit config
            from ``get_quantization_config()``; otherwise load them in
            bfloat16.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    tok = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    # Reuse EOS as the padding token when the tokenizer defines none.
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token

    # Options shared by both loading modes; only the precision option differs.
    load_kwargs = {"device_map": "auto", "trust_remote_code": True}
    if use_quantization:
        load_kwargs["quantization_config"] = get_quantization_config()
    else:
        load_kwargs["torch_dtype"] = torch.bfloat16

    lm = AutoModelForCausalLM.from_pretrained(model_id, **load_kwargs)
    return tok, lm

In [None]:
# Module-level cache of the most recently loaded model and the settings it was
# loaded with, so repeated requests with the same selection skip reloading.
_current_model_id = None
_use_quantization = None
_tokenizer = None
_model = None


def get_or_load_model(model_id: str, use_quantization: bool):
    """Return the cached tokenizer/model, reloading only when the selection changes.

    Args:
        model_id: Hugging Face Hub repository id of the requested model.
        use_quantization: Whether the model should be loaded in 4-bit.

    Returns:
        A ``(tokenizer, model)`` tuple for the requested configuration.

    Raises:
        Whatever ``load_model`` raises. In that case the cache is left empty
        (all entries ``None``), so a later call simply attempts a fresh load.
    """
    global _current_model_id, _use_quantization, _tokenizer, _model
    import gc

    cache_hit = (
        _tokenizer is not None
        and _model is not None
        and _current_model_id == model_id
        and _use_quantization == use_quantization
    )
    if cache_hit:
        return _tokenizer, _model

    if _model is not None or _tokenizer is not None:
        # Rebind to None instead of `del`: deleting the global names would
        # leave them undefined if the load below fails, and every later call
        # would then crash with NameError.
        _model = None
        _tokenizer = None
        _current_model_id = None
        _use_quantization = None
        # Collect unreachable objects first so their GPU memory can actually
        # be returned by empty_cache().
        gc.collect()
        torch.cuda.empty_cache()

    tokenizer, model = load_model(model_id, use_quantization)

    # Record the cache keys only after a successful load so a failed attempt
    # is never mistaken for a cached model.
    _tokenizer, _model = tokenizer, model
    _current_model_id = model_id
    _use_quantization = use_quantization
    return tokenizer, model

In [None]:
def build_messages(sensor_type: str, n_rows: int):
    """Assemble the chat messages that ask a model for a sensor dataset.

    Args:
        sensor_type: Key into ``PROMPTS``; unknown values fall back to the
            "temperature" prompt.
        n_rows: Row count substituted into the prompt template.

    Returns:
        A two-element list: the system message followed by the user message,
        each a dict with "role" and "content" keys.
    """
    if sensor_type in PROMPTS:
        template = PROMPTS[sensor_type]
    else:
        template = PROMPTS["temperature"]

    system_message = {
        "role": "system",
        "content": "You are a dataset engineer. Generate only the requested sensor data table. Output ONLY the table, no extra text, code blocks, or explanations.",
    }
    user_message = {"role": "user", "content": template.format(n=n_rows)}
    return [system_message, user_message]

# Opening Markdown fence with an optional language tag, e.g. "```" or "```csv",
# up to and including the end of that line.
_FENCE_OPEN_RE = re.compile(r"^```[\w+-]*[ \t]*\r?\n")


def _strip_code_fence(text: str) -> str:
    """Remove one surrounding Markdown code fence, including a language tag."""
    cleaned = text.strip()
    opening = _FENCE_OPEN_RE.match(cleaned)
    if opening:
        # Drop the whole fence line so a tag such as "csv" does not become a
        # bogus first row of the dataset.
        cleaned = cleaned[opening.end():]
    else:
        cleaned = cleaned.removeprefix("```")
    return cleaned.removesuffix("```").strip()


def generate_dataset(model_name: str, use_quantization: bool, sensor_type: str, n_rows: int) -> str:
    """Generate a synthetic sensor dataset as text with the selected model.

    Args:
        model_name: Key into ``MODELS``; unknown names fall back to the first
            entry.
        use_quantization: Whether to load the model in 4-bit.
        sensor_type: Sensor kind forwarded to ``build_messages``.
        n_rows: Number of rows requested in the prompt.

    Returns:
        The generated text with surrounding whitespace and one enclosing
        Markdown code fence removed. Generation stops after at most 4096 new
        tokens, so the output may end early.

    Raises:
        Exception: Any error raised by ``model.generate`` in the worker thread
            is re-raised here instead of leaving the caller blocked.
    """
    model_id = MODELS.get(model_name, next(iter(MODELS.values())))
    tokenizer, model = get_or_load_model(model_id, use_quantization)
    messages = build_messages(sensor_type, n_rows)

    device = next(model.parameters()).device if hasattr(model, "parameters") else "cuda"
    try:
        prompt = tokenizer.apply_chat_template(
            messages, add_generation_prompt=True, tokenize=False
        )
    except Exception:
        # Deliberate best-effort: if the chat template cannot be applied,
        # fall back to the raw user prompt.
        prompt = "\n".join(m.get("content", "") for m in messages if m.get("role") == "user")
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Test against None explicitly: a pad token id of 0 is valid but falsy.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )
    generate_kwargs = {
        "attention_mask": inputs.attention_mask,
        "max_new_tokens": 4096,
        "do_sample": True,
        "temperature": 0.7,
        "pad_token_id": pad_token_id,
        "streamer": streamer,
    }
    failures = []

    def _run_generation():
        try:
            model.generate(inputs.input_ids, **generate_kwargs)
        except Exception as exc:
            failures.append(exc)
            # Close the stream so the consumer below does not wait forever
            # for a stop signal that generate() never got to send.
            streamer.end()

    worker = Thread(target=_run_generation)
    worker.start()
    raw_output = "".join(streamer)
    worker.join()

    if failures:
        raise failures[0]
    return _strip_code_fence(raw_output)

In [None]:
def run_ui(model_name: str, use_quantization: bool, sensor_type: str, n_rows: int):
    """Click handler: sanitise the requested row count, then generate the dataset.

    Args:
        model_name: Label selected in the model dropdown.
        use_quantization: State of the 4-bit quantization checkbox.
        sensor_type: Selected sensor type.
        n_rows: Requested row count as delivered by the UI; it is converted
            to an int and clamped to the range 5..500.

    Returns:
        The text returned by ``generate_dataset``.
    """
    try:
        n = max(5, min(int(n_rows), 500))
    except (TypeError, ValueError, OverflowError):
        # None, non-numeric, NaN (ValueError) or infinite (OverflowError)
        # input: fall back to the default row count.
        n = 50
    return generate_dataset(model_name, use_quantization, sensor_type, n)

# Build the Gradio UI: model/quantization selectors, sensor/row-count
# selectors, a generate button and a textbox showing the generated dataset.
with gr.Blocks(title="Sensor Dataset Generator", theme=gr.themes.Soft()) as demo:
    gr.Markdown("## Sensor Dataset Generator")
    gr.Markdown("Generate synthetic **temperature**, **humidity**, or **tachometer** datasets. Choose model and quantization to compare outputs.")

    with gr.Row():
        model_dd = gr.Dropdown(
            choices=list(MODELS.keys()),
            value=list(MODELS.keys())[0],
            label="Model",
        )
        quant_cb = gr.Checkbox(value=True, label="Use 4-bit quantization (saves VRAM)")
    with gr.Row():
        sensor_dd = gr.Dropdown(
            choices=SENSOR_TYPES,
            value=SENSOR_TYPES[0],
            label="Sensor type",
        )
        n_rows_num = gr.Number(value=50, label="Number of rows", minimum=5, maximum=500, step=5)

    gen_btn = gr.Button("Generate dataset")
    out_text = gr.Textbox(label="Generated dataset", lines=20, max_lines=30)

    # Input order must match the positional parameters of run_ui.
    gen_btn.click(fn=run_ui, inputs=[model_dd, quant_cb, sensor_dd, n_rows_num], outputs=out_text)

# Launch only after the Blocks context has closed, so the layout is fully
# built before the server starts. A public share link is requested only when
# running inside Google Colab.
in_colab = "google.colab" in sys.modules
demo.launch(share=in_colab, show_error=True)