In [None]:
!pip install -q requests pandas torch bitsandbytes transformers sentencepiece accelerate openai gradio hf_xet

import os
import requests
from IPython.display import Markdown, display
from openai import OpenAI
from google.colab import drive
from huggingface_hub import login
from google.colab import userdata
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
import gc
import json
import gradio as gr
import pandas as pd
import uuid
import time

# Verify GPU: the model input tensors are later moved to "cuda", so a CUDA
# device must be visible to torch before going further.
print(torch.cuda.is_available())  # Should print True
print(torch.cuda.get_device_name(0))  # Should print "Tesla T4" (presumably a Colab T4 runtime)

# Hugging Face login: read the token from Colab's userdata store and
# authenticate with the Hub (also stored as a git credential).
hf_token = userdata.get('HF_TOKEN')
login(hf_token, add_to_git_credential=True)

# Model names: Hugging Face repository ids of the selectable chat models.
LLAMA = "meta-llama/Meta-Llama-3.1-8B-Instruct"
DEEPSEEK = "deepseek-ai/deepseek-llm-7b-chat"
PHI3 = "microsoft/Phi-3-mini-4k-instruct"
GEMMA2 = "google/gemma-2-2b-it"
QWEN2 = "Qwen/Qwen2-7B-Instruct"

# Model cache: repository id -> loaded model / tokenizer. Filled by
# load_model() and emptied by clear_model_cache().
model_cache = {}
tokenizer_cache = {}

# Quantization config: 4-bit NF4 weights with double quantization and
# bfloat16 compute, passed to from_pretrained() when a model is loaded.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4"
)

def load_model(model_name):
    """Return the ``(model, tokenizer)`` pair for *model_name*, loading on first use.

    The first request for a repository id downloads/loads the tokenizer and
    the 4-bit quantized model, stores both in the module-level caches and
    prints the load time. Later requests are served from the caches.

    Args:
        model_name: Hugging Face repository id of a causal language model.

    Returns:
        Tuple ``(model, tokenizer)``.
    """
    if model_name in model_cache:
        return model_cache[model_name], tokenizer_cache[model_name]

    start_time = time.time()
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Fall back to EOS only when the tokenizer ships without a pad token;
    # overwriting a real pad token makes pad and EOS indistinguishable.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        quantization_config=quant_config,
    )
    model_cache[model_name] = model
    tokenizer_cache[model_name] = tokenizer
    print(f"Model {model_name} loaded in {time.time() - start_time:.2f} seconds")
    return model, tokenizer

def clear_model_cache(model_name):
    """Evict *model_name* from both caches and release its GPU memory.

    Does nothing when the model is not currently cached.
    """
    if model_name not in model_cache:
        return

    # Drop the only long-lived references so the weights become collectable.
    model_cache.pop(model_name)
    tokenizer_cache.pop(model_name)

    gc.collect()
    torch.cuda.empty_cache()

def data_requirement(data_type, num_rows):
    """Build the format instruction and record count appended to the prompt.

    Unknown formats contribute no format instruction; the record-count
    sentence is always present.
    """
    format_hints = (
        ("CSV", "Provide data in valid CSV format with headers, using commas as separators and prices as numbers (e.g., 20000.00)."),
        ("JSON", "Provide data in valid JSON format."),
    )
    hint = next((text for kind, text in format_hints if data_type == kind), "")
    return hint + f" Generate {num_rows} records."

def full_prompt(user_input, data_type, num_rows):
    """Assemble the chat messages sent to the model.

    Args:
        user_input: Free-text description of the data the user wants.
        data_type: Requested output format ("CSV" or "JSON").
        num_rows: Number of records to request.

    Returns:
        A two-element list of chat messages (system, then user).
    """
    system_prompt = (
        "You are a synthetic data generator. Generate structured data in the requested format (CSV or JSON) with the specified number of records. "
        "Output only the data, no explanations or additional text. For CSV, include headers and use commas as separators, with prices as numbers (e.g., 20000.00, not 20.000$)."
    )
    user_prompt = (
        "Generate synthetic data based on the following requirements. "
        "Example for CSV: \n"
        "id,name,age,purchase_amount\n"
        "1,John Doe,30,150.50\n"
        "2,Jane Smith,25,200.75\n"
    )
    # Keep the example, the user's request and the format instruction on
    # separate lines; plain concatenation glued the user's last word to
    # "Provide data ..." and left the request unlabelled after the example.
    user_content = (
        f"{user_prompt}\n"
        f"Requirements: {(user_input or '').strip()}\n"
        f"{data_requirement(data_type, num_rows).strip()}"
    )
    return [
        {'role': 'system', 'content': system_prompt},
        {'role': 'user', 'content': user_content},
    ]

def sel_model(model_choice):
    """Translate a dropdown label into its Hugging Face repository id.

    Labels that are not recognised resolve to the Llama model.
    """
    labels = (
        "Llama3.1-8B-Inst",
        "Deepseek-llm-7b-chat",
        "Qwen2-7B-Inst",
        "Gemma2-2b-it",
        "Phi3-mini-4k-inst",
    )
    repo_ids = (LLAMA, DEEPSEEK, QWEN2, GEMMA2, PHI3)
    lookup = dict(zip(labels, repo_ids))
    return lookup.get(model_choice, LLAMA)

def _strip_code_fence(text):
    """Return *text* without a surrounding Markdown code fence, if it has one."""
    cleaned = text.strip()
    if cleaned.startswith("```"):
        # Drop the opening fence line, which may carry a language tag (```csv).
        cleaned = cleaned.partition("\n")[2].rstrip()
        if cleaned.endswith("```"):
            cleaned = cleaned[:-3]
    return cleaned.strip()


def generate(model_name, messages, data_type="CSV", max_new_tokens=200):
    """Generate synthetic data with *model_name* and save it to a file.

    Args:
        model_name: Hugging Face repository id, resolved through load_model().
        messages: Chat messages (list of ``{'role', 'content'}`` dicts).
        data_type: "CSV" or "JSON"; selects parsing and the file extension.
        max_new_tokens: Upper bound on generated tokens. Large record counts
            need a larger budget than the default.

    Returns:
        Path of the written file under ``/content``.

    Raises:
        gr.Error: If loading, generation or saving fails. The output
            component is a ``gr.File``, which interprets any returned string
            as a file path, so failures must be raised, not returned.
    """
    start_total = time.time()
    try:
        if data_type not in ("CSV", "JSON"):
            raise ValueError(f"Unsupported data format: {data_type!r}")

        model, tokenizer = load_model(model_name)
        start_gen = time.time()
        # return_dict=True also yields the attention mask, which cannot be
        # inferred when the pad token equals the EOS token.
        inputs = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt",
            return_dict=True,
        ).to("cuda")
        outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
        # generate() returns prompt + completion; keep only the completion so
        # the saved file does not start with the system/user prompt text.
        prompt_length = inputs["input_ids"].shape[-1]
        generated_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
        print(f"Inference took {time.time() - start_gen:.2f} seconds")

        payload = _strip_code_fence(generated_text)
        file_id = uuid.uuid4().hex[:6]
        if data_type == "CSV":
            filename = f"/content/synthetic_data_{file_id}.csv"
            try:
                from io import StringIO
                df = pd.read_csv(StringIO(payload))
                df.to_csv(filename, index=False)
            except (pd.errors.ParserError, ValueError) as e:
                # Best effort: keep the raw model output when it is not valid CSV.
                print(f"CSV parsing error: {e}")
                with open(filename, "w", encoding="utf-8") as f:
                    f.write(payload)
        else:
            filename = f"/content/synthetic_data_{file_id}.json"
            try:
                json_data = json.loads(payload)
            except json.JSONDecodeError as e:
                # Best effort: keep the raw model output when it is not valid JSON.
                print(f"JSON parsing error: {e}")
                with open(filename, "w", encoding="utf-8") as f:
                    f.write(payload)
            else:
                with open(filename, "w", encoding="utf-8") as f:
                    json.dump(json_data, f, indent=2)

        print(f"Total time: {time.time() - start_total:.2f} seconds")
        return filename

    except Exception as e:
        print(f"Error during generation: {e}")
        raise gr.Error(f"Error during generation: {e}") from e

def gen_syn_data(user_input, model_choice, data_type, num_rows):
    """Gradio click handler: build the prompt, pick the model and generate.

    The repository id used by the previous call is remembered on the
    function object; when the selection changes, the previous model is
    evicted from the cache before the new one is used.
    """
    chat = full_prompt(user_input, data_type, num_rows)
    repo_id = sel_model(model_choice)

    # On the first call there is no previous model, so nothing is evicted.
    previous = getattr(gen_syn_data, 'last_model', repo_id)
    if previous != repo_id:
        clear_model_cache(previous)

    gen_syn_data.last_model = repo_id
    return generate(repo_id, chat, data_type=data_type)

# Gradio interface: the prompt, model, format and record-count inputs are
# passed to gen_syn_data, and the file path it returns populates the
# download component.
with gr.Blocks(title='Synthetic Data Generator') as ui:
    gr.Markdown('# Synthetic Data Generator')
    with gr.Row():
        with gr.Column(min_width=400):
            user_inputs = gr.Textbox(
                label='Enter business intent and data requirements',
                placeholder='Type here ... ',
                lines=20
            )
            model_choice = gr.Dropdown(
                ["Llama3.1-8B-Inst", "Deepseek-llm-7b-chat", "Qwen2-7B-Inst", "Gemma2-2b-it", "Phi3-mini-4k-inst"],
                label="Choose llms",
                value="Llama3.1-8B-Inst"
            )
            req_data_format = gr.Dropdown(
                ['CSV', 'JSON'],
                label="Choose data format",
                value='CSV'
            )
            # step=1 so every count from 1 to 200 can be selected. With
            # step=2 starting at 1 the slider grid was 1, 3, 5, ..., which
            # excluded the default of 10 and the maximum of 200.
            num_records = gr.Slider(
                minimum=1, maximum=200, step=1, label="Number of Records", value=10
            )
            generate_button = gr.Button('Generate Data')
            output_file = gr.File(label="Download Synthetic Data")
    generate_button.click(
        fn=gen_syn_data,
        inputs=[user_inputs, model_choice, req_data_format, num_records],
        outputs=output_file
    )

ui.launch(inbrowser=True, debug=True)

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m121.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m92.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m56.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/27.8k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model-00002-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.56G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Model Qwen/Qwen2-7B-Instruct loaded in 233.36 seconds
Inference took 14.73 seconds
Total time: 248.13 seconds


tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/838 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/241M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

Model google/gemma-2-2b-it loaded in 80.54 seconds


Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/gradio/queueing.py", line 625, in process_events
    response = await route_utils.call_process_api(
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/gradio/route_utils.py", line 322, in call_process_api
    output = await app.get_blocks().process_api(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/gradio/blocks.py", line 2191, in process_api
    data = await self.postprocess_data(block_fn, result["prediction"], state)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/gradio/blocks.py", line 1973, in postprocess_data
    prediction_value = block.postprocess(prediction_value)
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/gradio/components/file.py", line 227, in postprocess


In [None]:
!pip install -q requests panda torch bitsandbytes transformers sentencepiece accelerate openai gradio
import os
import requests
from IPython.display import Markdown, display, update_display
from openai import OpenAI
from google.colab import drive
from huggingface_hub import login
from google.colab import userdata
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig
import torch
import gc
import json
import gradio as gr
import pandas as pd
import uuid

import bitsandbytes
# Log the installed bitsandbytes version (the package behind the
# BitsAndBytesConfig quantization settings defined further down).
print(bitsandbytes.__version__)


# Read the Hugging Face token from Colab's userdata store and authenticate
# with the Hub (also stored as a git credential).
hf_token = userdata.get('HF_TOKEN')
login(hf_token, add_to_git_credential=True)

# Hugging Face repository ids of the selectable chat models.
LLAMA = "meta-llama/Meta-Llama-3.1-8B-Instruct"
DEEPSEEK = "deepseek-ai/deepseek-llm-7b-chat"
PHI3 = "microsoft/Phi-3-mini-4k-instruct"
GEMMA2 = "google/gemma-2-2b-it"
QWEN2 = "Qwen/Qwen2-7B-Instruct"



def data_requirement(data_type, num_rows):
    """Build the format instruction and record count appended to the prompt.

    Args:
        data_type: Requested output format ("CSV" or "JSON"). Any other
            value contributes no format instruction.
        num_rows: Number of records to request.

    Returns:
        The instruction sentence(s), separated by a single space.
    """
    data_req_message = ''
    if data_type == "CSV":
        data_req_message = "Provide data in CSV file."
    elif data_type == "JSON":
        data_req_message = "Provide data in JSON file."

    # Join with a space: plain concatenation produced
    # "Provide data in CSV file.Generate N records".
    sentences = (data_req_message, f'Generate {num_rows} records')
    return ' '.join(sentence for sentence in sentences if sentence)

def full_prompt(user_input, data_type, num_rows):
    """Assemble the chat messages sent to the model.

    Args:
        user_input: Free-text description of the data the user wants.
        data_type: Requested output format ("CSV" or "JSON").
        num_rows: Number of records to request.

    Returns:
        A two-element list of chat messages (system, then user).
    """
    # NOTE(review): `system_prompt` and `user_prompt` are not defined in this
    # cell. Use notebook-level definitions when they exist; otherwise fall
    # back to built-in defaults instead of failing with NameError.
    system_text = globals().get(
        'system_prompt',
        "You are a synthetic data generator. Generate structured data in the "
        "requested format (CSV or JSON) with the specified number of records. "
        "Output only the data, no explanations or additional text.",
    )
    user_text = globals().get(
        'user_prompt',
        "Generate synthetic data based on the following requirements.",
    )

    # Put each part on its own line so the user's text does not run straight
    # into the format instruction.
    sections = (user_text, user_input or '', data_requirement(data_type, num_rows))
    user_content = "\n".join(section.strip() for section in sections if section.strip())

    messages = [
        {'role': 'system', 'content': system_text},
        {'role': 'user', 'content': user_content}
    ]

    return messages



def sel_model(model_choice):
    """Translate a dropdown label into its Hugging Face repository id.

    Args:
        model_choice: Label selected in the model dropdown.

    Returns:
        The matching repository id. Unrecognised labels fall back to the
        Llama model; previously they left the result unassigned and raised
        UnboundLocalError.
    """
    model_map = {
        "Llama3.1-8B-Inst": LLAMA,
        "Deepseek-llm-7b-chat": DEEPSEEK,
        "Qwen2-7B-Inst": QWEN2,
        "Gemma2-2b-it": GEMMA2,
        "Phi3-mini-4k-inst": PHI3,
    }
    return model_map.get(model_choice, LLAMA)


def gen_syn_data(user_input, model_choice, data_type, num_rows):
    """Gradio click handler: build the prompt, resolve the model, generate.

    Returns whatever generate() returns for the chosen model and format.
    """
    chat = full_prompt(user_input, data_type, num_rows)
    return generate(sel_model(model_choice), chat, data_type=data_type)


# 4-bit NF4 quantization settings passed to from_pretrained() in generate().
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,   # gives the largest memory reduction without affecting predictions much
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4"
)

def _strip_code_fence(text):
    """Return *text* without a surrounding Markdown code fence, if it has one."""
    cleaned = text.strip()
    if cleaned.startswith("```"):
        # Drop the opening fence line, which may carry a language tag (```csv).
        cleaned = cleaned.partition("\n")[2].rstrip()
        if cleaned.endswith("```"):
            cleaned = cleaned[:-3]
    return cleaned.strip()


def generate(model, messages, data_type="CSV"):
    """Load *model*, generate synthetic data and save it to a file.

    The tokenizer and the 4-bit quantized model are loaded on every call and
    released again before returning, whether or not generation succeeded.

    Args:
        model: Hugging Face repository id of the causal language model.
        messages: Chat messages (list of ``{'role', 'content'}`` dicts).
        data_type: "CSV" or "JSON"; selects parsing and the file extension.

    Returns:
        Path of the written file under ``/content``.

    Raises:
        gr.Error: If loading, generation or saving fails. The output
            component is a ``gr.File``, which interprets any returned string
            as a file path, so failures must be raised, not returned.
    """
    llm = tokenizer = inputs = outputs = None
    try:
        if data_type not in ("CSV", "JSON"):
            raise ValueError(f"Unsupported data format: {data_type!r}")

        tokenizer = AutoTokenizer.from_pretrained(model)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        llm = AutoModelForCausalLM.from_pretrained(model, device_map="auto", quantization_config=quant_config)

        # return_dict=True also yields the attention mask, which cannot be
        # inferred when the pad token equals the EOS token.
        inputs = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt",
            return_dict=True,
        ).to("cuda")
        outputs = llm.generate(**inputs, max_new_tokens=500)
        # generate() returns prompt + completion; keep only the completion so
        # the saved file does not start with the prompt text.
        prompt_length = inputs["input_ids"].shape[-1]
        generated_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
        payload = _strip_code_fence(generated_text)

        # Create unique file path
        file_id = uuid.uuid4().hex[:6]
        if data_type == "CSV":
            filename = f"/content/synthetic_data_{file_id}.csv"
            # Try to round-trip the text through a DataFrame to normalise it.
            try:
                from io import StringIO
                df = pd.read_csv(StringIO(payload))
                df.to_csv(filename, index=False)
            except (pd.errors.ParserError, ValueError) as e:
                # Fallback: keep the raw model output when it is not valid CSV.
                print(f"CSV parsing error: {e}")
                with open(filename, "w", encoding="utf-8") as f:
                    f.write(payload)
        else:
            filename = f"/content/synthetic_data_{file_id}.json"
            try:
                json_data = json.loads(payload)
            except json.JSONDecodeError as e:
                # Fallback: keep the raw model output when it is not valid JSON.
                print(f"JSON parsing error: {e}")
                with open(filename, "w", encoding="utf-8") as f:
                    f.write(payload)
            else:
                with open(filename, "w", encoding="utf-8") as f:
                    json.dump(json_data, f, indent=2)

        return filename

    except Exception as e:
        print(f"Error during generation: {e}")
        raise gr.Error(f"Error during generation: {e}") from e

    finally:
        # Release the model on success and failure alike so a failed run
        # does not leave GPU memory occupied for the next one.
        del llm, tokenizer, inputs, outputs
        gc.collect()
        torch.cuda.empty_cache()




# Gradio interface: the prompt, model, format and record-count inputs are
# passed to gen_syn_data, and the file path it returns populates the
# download component.
with gr.Blocks(title = 'Synthetic Data Generator') as ui:
    gr.Markdown('# Synthetic Data Generator')

    with gr.Row():
        with gr.Column(min_width=600):
            user_inputs = gr.Textbox(
                label = 'Enter business intent and data requirements',
                placeholder = 'Type here ... ', lines = 20
            )
            model_choice = gr.Dropdown(
                ["Llama3.1-8B-Inst","Deepseek-llm-7b-chat","Qwen2-7B-Inst","Gemma2-2b-it","Phi3-mini-4k-inst"],
                label = "Choose llms",
                value = "Llama3.1-8B-Inst"
            )
            req_data_format = gr.Dropdown(
                ['CSV', 'JSON'],
                label = "Choose data format",
                value = 'CSV'
            )
            # step=1 so every count from 1 to 200 can be selected. With
            # step=2 starting at 1 the slider grid was 1, 3, 5, ..., which
            # excluded round counts such as 10, 50, 100 and 200.
            num_records = gr.Slider(
                minimum=1, maximum=200, step=1, label="Number of Records")

            generate_button = gr.Button('Generate Data')

            output_file = gr.File(label = "Download Synthetic Data")

        with gr.Column():
            # NOTE(review): this textbox is displayed but is not an output of
            # the click handler below, so it stays empty.
            output = gr.Textbox(label = 'Generated Synthetic Data',
                                lines = 50)

    generate_button.click(
              fn = gen_syn_data,
              inputs = [user_inputs, model_choice, req_data_format, num_records],
              outputs = output_file)

ui.launch(inbrowser = True)



