- Implemented a locally hosted, 4-bit quantized Llama 3.1 8B model using Hugging Face (HF) Transformers and bitsandbytes.
- Created a prompt template to enforce strict JavaScript Object Notation (JSON) array outputs for generated job data.
- Built a Gradio web interface featuring dual outputs for readable Markdown and raw Pandas DataFrames.

In [1]:
!pip install -q transformers==4.48.3 bitsandbytes==0.46.0 accelerate==1.3.0 pandas gradio

[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m44.4/44.4 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m9.7/9.7 MB[0m [31m53.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m67.0/67.0 MB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m336.6/336.6 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m566.4/566.4 kB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
[

In [2]:
import gradio as gr
import pandas as pd
import json
import os
import tempfile
from google.colab import userdata
from huggingface_hub import login
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig



In [3]:
# Fetch the Hugging Face access token stored under the Colab secret name
# 'HF_TOKEN', then authenticate with the Hub and also store the token in the
# git credential helper.
hf_token = userdata.get('HF_TOKEN')
login(token=hf_token, add_to_git_credential=True)

In [4]:
model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"

print("Loading tokenizer and configuring memory...")

# Tokenizer: the end-of-sequence token doubles as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Quantization settings: 4-bit NF4 weights with double quantization enabled
# and bfloat16 as the compute dtype, to reduce GPU memory use.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

print("Downloading and loading model (this may take a few minutes)...")

# Load the quantized causal LM; device placement is delegated to
# device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    model_name, quantization_config=quant_config, device_map="auto"
)

print("Model loaded successfully!")

Loading tokenizer and configuring memory...


tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

Downloading and loading model (this may take a few minutes)...


config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

Model loaded successfully!


In [5]:
def _format_list(items):
    """Render a JSON array as Markdown bullets; fall back to str() otherwise."""
    if isinstance(items, list):
        return "\n".join(f"* {item}" for item in items)
    return str(items)  # The model may still emit a plain string here.


def _parse_job_records(raw_output):
    """Return a non-empty list of job dicts parsed from the model's raw text.

    The whole text is tried first, so a reply that is already valid JSON
    (including a single bare object) is never mangled by slicing. If that
    fails, the outermost ``[...]`` span and then the outermost ``{...}`` span
    are tried, which strips any chatter or code fences around the payload.

    Raises:
        json.JSONDecodeError: if no candidate is valid JSON.
        ValueError: if JSON was found but it is not a list of objects.
    """
    candidates = [raw_output]
    for opener, closer in (("[", "]"), ("{", "}")):
        start = raw_output.find(opener)
        end = raw_output.rfind(closer)
        if start != -1 and end > start:
            candidates.append(raw_output[start:end + 1])

    last_json_error = None
    wrong_shape = False
    for candidate in candidates:
        try:
            # strict=False tolerates raw control characters inside strings.
            parsed = json.loads(candidate, strict=False)
        except json.JSONDecodeError as exc:
            last_json_error = exc
            continue
        if isinstance(parsed, dict):
            parsed = [parsed]  # A single object is treated as one record.
        if (
            isinstance(parsed, list)
            and parsed
            and all(isinstance(record, dict) for record in parsed)
        ):
            return parsed
        wrong_shape = True

    if wrong_shape or last_json_error is None:
        raise ValueError(
            "Model output was valid JSON but not a non-empty list of job objects."
        )
    raise last_json_error


def _render_markdown(jobs):
    """Build the readable Markdown view for a list of job dicts."""
    sections = []
    for number, job in enumerate(jobs, start=1):
        # ATX headings need a space after the '#' run to render as headings.
        sections.append(
            f"## {number}. {job.get('Job Title', 'Job')} at {job.get('Company', 'Company')}\n"
            f"**Location:** {job.get('Location', 'N/A')} | **Salary Range:** {job.get('Salary Range', 'N/A')}\n\n"
            f"### Description\n{job.get('Description', 'N/A')}\n\n"
            f"### Responsibilities\n{_format_list(job.get('Responsibilities', []))}\n\n"
            f"### Minimum Qualifications\n{_format_list(job.get('Minimum Qualifications', []))}\n\n"
            f"### Preferred Qualifications\n{_format_list(job.get('Preferred Qualifications', []))}\n\n"
            # Blank line before '---' so it is a rule, never a setext underline.
            f"### Benefits\n{_format_list(job.get('Benefits', []))}\n\n"
            "---\n\n"
        )
    return "".join(sections)


def generate_job_descriptions(company, role, num_records):
    """Generate synthetic job descriptions with the module-level model.

    Args:
        company: Company name to embed in every record.
        role: Job title to embed in every record.
        num_records: Number of variations requested; coerced to an int >= 1
            (a UI slider may deliver a float).

    Returns:
        A ``(dataframe, markdown_text, status)`` tuple. On failure the
        dataframe has a single ``Error`` column, the markdown slot carries a
        diagnostic message, and ``status`` is one of ``"Input Error"``,
        ``"Formatting Error"`` or ``"System Error"``. The function does not
        raise.
    """
    # --- Validate inputs before spending GPU time on a useless prompt ---
    company = str(company or "").strip()
    role = str(role or "").strip()
    if not company or not role:
        message = "Please provide both a company name and a job title."
        return pd.DataFrame({"Error": [message]}), message, "Input Error"
    try:
        num_records = max(1, int(num_records))
    except (TypeError, ValueError):
        message = "Number of variations must be a whole number."
        return pd.DataFrame({"Error": [message]}), message, "Input Error"

    # System prompt: the schema uses arrays [] for the list-valued fields.
    system_prompt = f"""You are an expert HR data generator. Your task is to generate realistic, high-quality job descriptions.
    You MUST output ONLY a valid JSON array of objects.
    CRITICAL: Do NOT include introductory text. Do NOT include concluding text. Just output the raw JSON array.
    Generate exactly {num_records} distinct variation(s).

    Schema for every object:
    [
      {{
        "Company": "{company}",
        "Job Title": "{role}",
        "Location": "string",
        "Description": "string",
        "Responsibilities": ["string", "string"],
        "Minimum Qualifications": ["string", "string"],
        "Preferred Qualifications": ["string", "string"],
        "Salary Range": "string",
        "Benefits": ["string", "string"]
      }}
    ]
    """

    user_prompt = f"Generate {num_records} distinct and highly realistic job description(s) for the role of '{role}' at '{company}'."

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    # --- Generation: any failure here is a system error ---
    try:
        # add_generation_prompt=True appends the assistant header so the model
        # answers instead of continuing the user turn. return_dict=True yields
        # input_ids AND attention_mask; the mask matters because the pad token
        # is the EOS token.
        inputs = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt",
            return_dict=True,
        ).to(model.device)  # Follow the model's device rather than assuming "cuda".

        input_token_len = inputs["input_ids"].shape[-1]

        outputs = model.generate(
            **inputs,
            max_new_tokens=1500,
            temperature=0.7,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

        # Decode only the newly generated tokens, not the echoed prompt.
        raw_output = tokenizer.decode(
            outputs[0][input_token_len:], skip_special_tokens=True
        ).strip()
    except Exception as e:
        error_df = pd.DataFrame({"Error": [str(e)]})
        return error_df, f"System Error: {str(e)}", "System Error"

    # --- Parsing and formatting: show the raw text when the payload is bad ---
    try:
        data = _parse_job_records(raw_output)
        df = pd.DataFrame(data)
        markdown_text = _render_markdown(data)
    except ValueError as e:  # Includes json.JSONDecodeError.
        error_df = pd.DataFrame({"Error": ["Failed to parse JSON."]})
        debug_message = f"JSON Error: {str(e)}\n\n--- WHAT THE MODEL GENERATED ---\n{raw_output}"
        return error_df, debug_message, "Formatting Error"
    except Exception as e:
        error_df = pd.DataFrame({"Error": [str(e)]})
        return error_df, f"System Error: {str(e)}", "System Error"

    return df, markdown_text, "Dataset generated successfully!"

In [6]:
# Gradio UI: inputs and status on the left, generated output (Markdown view
# and raw table) on the right.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # The office-building emoji is written as an escape sequence so the title
    # survives re-encoding of this file (it had been corrupted to mojibake).
    gr.Markdown("# \U0001F3E2 HF Synthetic Job Description Generator")
    gr.Markdown("Powered by locally hosted open-source models via Hugging Face Transformers.")

    with gr.Row():
        # Left Column: Inputs
        with gr.Column(scale=1):
            company_input = gr.Textbox(label="Company Name", placeholder="e.g., Apple")
            role_input = gr.Textbox(label="Job Title", placeholder="e.g., Senior Software Engineer")
            num_records_input = gr.Slider(minimum=1, maximum=5, value=2, step=1, label="Number of Variations")

            submit_btn = gr.Button("Generate Job Descriptions", variant="primary")
            status_output = gr.Textbox(label="Status", interactive=False)

        # Right Column: Outputs
        with gr.Column(scale=2):
            with gr.Tabs():
                # Tab 1: Clean Markdown Display
                with gr.TabItem("Readable View"):
                    markdown_display = gr.Markdown("*Generated job descriptions will appear here...*")
                # Tab 2: Raw Data Table
                with gr.TabItem("Raw Data Table"):
                    dataset_preview = gr.Dataframe(label="Dataset Preview", wrap=True)

    # Wire up the button. The outputs list follows the order of the tuple
    # returned by generate_job_descriptions: (dataframe, markdown, status).
    submit_btn.click(
        fn=generate_job_descriptions,
        inputs=[company_input, role_input, num_records_input],
        outputs=[dataset_preview, markdown_display, status_output]
    )

  with gr.Blocks(theme=gr.themes.Soft()) as demo:


In [7]:
# Start the Gradio server for the `demo` interface. No arguments are passed,
# so Gradio's defaults apply.
demo.launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://e856df021bbf8fe634.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


