# Security Awareness Synthetic Q/A Dataset Generator (CSV)

This notebook generates **synthetic, public-safe training/evaluation data** for future projects such as **RAG (Retrieval-Augmented Generation)** and internal assistants.

Each click on **Generate 1 example** produces one structured example:
- **CONTEXT**: a short didactic mini-guide (120–180 words)
- **QUESTION**: a question answerable **only** from the context
- **ANSWER**: a grounded answer using **only** the context

The saved output is a CSV file with the following columns:
- `chunk_id` (incremental ID, e.g., `SA-000001`)
- `topic`, `difficulty`, `tags`
- `context`, `question`, `answer`




In [None]:

# Install/upgrade the extra packages this notebook needs:
# - accelerate: device_map="auto" support
# - bitsandbytes: 4-bit quantization
# - gradio: UI
!pip install -q --upgrade  accelerate bitsandbytes gradio

In [None]:
# Standard library imports
import os                      # File paths, environment variables
import json                    # JSON serialization
import uuid                    # Unique IDs for dataset entries
from datetime import datetime  # Timestamps for filenames/logging

# PyTorch imports
import torch                 # Tensors, GPU check

# Hugging Face / Transformers imports
from huggingface_hub import login                 # HF login to download gated models
from transformers import (
    AutoTokenizer,                                 # Tokenizer loader
    AutoModelForCausalLM,                           # Causal LM loader
    BitsAndBytesConfig                              # 4-bit quant config
)

# Colab-specific imports
from google.colab import userdata                  # Colab Secrets store

# UI imports
import gradio as gr                                # Gradio UI


In [None]:

# Hugging Face model identifier used by the rest of the notebook
LLAMA = "meta-llama/Llama-3.2-3B-Instruct"

# The access token is kept in the Colab "Secrets" store under the name HF_TOKEN
hf_token = userdata.get("HF_TOKEN")

# Authenticate with Hugging Face so the Llama weights can be downloaded
login(hf_token, add_to_git_credential=True)

# Report whether a GPU is visible (recommended for speed) and, if so, which one
cuda_available = torch.cuda.is_available()
print("CUDA available:", cuda_available)
if cuda_available:
    print("GPU:", torch.cuda.get_device_name(0))


In [None]:


# 4-bit quantization settings (bitsandbytes)
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # Enable 4-bit weight loading
    bnb_4bit_use_double_quant=True,         # Double quantization for improved compression
    bnb_4bit_compute_dtype=torch.bfloat16,  # Compute dtype for matmuls
    bnb_4bit_quant_type="nf4",              # NF4 quantization
)

# Load tokenizer for the chat model
tokenizer = AutoTokenizer.from_pretrained(LLAMA)

# Some Llama tokenizers do not define a pad token; fall back to EOS only in that
# case so a pad token shipped with the tokenizer is never overwritten.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the model with device_map="auto" so it can place layers on GPU/CPU as needed
model = AutoModelForCausalLM.from_pretrained(
    LLAMA,
    device_map="auto",                      # Automatically place layers to available devices
    quantization_config=quant_config,       # Apply the 4-bit quantization config
)

# Put the model into evaluation mode (inference only: disables dropout and
# other training-time behavior)
model.eval()

print("Model loaded:", LLAMA)


In [None]:


# Catalogue of security-awareness topics.
# Key   -> human-readable topic title (also shown in the UI dropdown).
# Value -> writing instructions for that topic, inserted into the generation prompt.
TOPICS = {
    "Phishing Emails — Red Flags": (
        "Explain common phishing indicators (sender spoofing, urgency, suspicious links/attachments, "
        "credential requests). Describe what a user should do if they suspect phishing."
    ),
    "Strong Passwords and Passphrases": (
        "Explain why unique passwords matter, what makes a password strong, and how passphrases work. "
        "Include practical guidance and common mistakes to avoid."
    ),
    "Multi-Factor Authentication (MFA) Basics": (
        "Explain what MFA is, why it helps, and common MFA methods. Mention 'push fatigue' and the "
        "importance of verifying login prompts."
    ),
    "Software Updates and Patching": (
        "Explain why updates matter, how vulnerabilities are exploited, and safe habits for updating "
        "operating systems and apps."
    ),
    "Safe Use of Public Wi-Fi": (
        "Explain risks of public Wi-Fi, what information should not be entered, and safer alternatives "
        "like mobile hotspots or VPN usage (conceptually, no vendor names)."
    ),
    "Handling Sensitive Data": (
        "Explain basic data sensitivity levels (public/internal/confidential), secure sharing habits, "
        "and how to reduce accidental exposure."
    ),
    "Social Engineering by Phone": (
        "Explain pretexting, why verification is necessary, and safe steps to validate identity before "
        "sharing information or granting access."
    ),
}

# List the available topic titles, one "- title" line each, under a "Topics:" heading
print("Topics:", *(f"- {title}" for title in TOPICS), sep="\n")


In [None]:


def _topic_to_tags(topic_name: str) -> str:
    """
    Derive short tags from the topic name.

    The lower-cased title is checked against a table of keyword rules; every
    rule with at least one keyword present contributes its tags, in table order.
    The result always starts with "security_awareness", contains no duplicates,
    is capped at six tags, and is returned as one string separated by '|'
    (the format stored in the CSV).
    """
    base = topic_name.lower()

    # (keywords that trigger the rule, tags the rule contributes)
    rules = (
        (("phishing",), ("phishing", "email", "links")),
        (("password", "passphrase"), ("passwords", "passphrases", "credentials")),
        (("mfa", "multi-factor"), ("mfa", "authentication", "push_fatigue")),
        (("update", "patch"), ("updates", "patching", "vulnerabilities")),
        # Match real Wi-Fi spellings only: a bare "wi" would also fire on
        # unrelated titles containing e.g. "with" or "windows".
        (("wi-fi", "wifi", "wi fi"), ("public_wifi", "network", "privacy")),
        (("sensitive data", "handling sensitive"), ("sensitive_data", "sharing", "classification")),
        (("social engineering", "phone"), ("social_engineering", "verification", "pretexting")),
    )

    tags = ["security_awareness"]
    for keywords, rule_tags in rules:
        if any(keyword in base for keyword in keywords):
            tags.extend(rule_tags)

    # dict.fromkeys removes duplicates while preserving first-seen order
    unique_tags = list(dict.fromkeys(tags))

    return "|".join(unique_tags[:6])


def _split_sections(decoded: str) -> tuple:
    """
    Split raw model output into (context, question, answer).

    Sections are located by the literal separators ===CONTEXT===, ===QUESTION===
    and ===ANSWER===. A section whose opening separator is missing comes back as
    an empty string; a section whose closing separator is missing runs to the
    end of the text.
    """
    def _between(text, start_marker, end_marker):
        start = text.find(start_marker)
        if start == -1:
            return ""
        start += len(start_marker)
        end = text.find(end_marker, start)
        if end == -1:
            return text[start:].strip()
        return text[start:end].strip()

    context = _between(decoded, "===CONTEXT===", "===QUESTION===")
    question = _between(decoded, "===QUESTION===", "===ANSWER===")
    # The answer is everything after the last ===ANSWER=== separator
    answer = decoded.split("===ANSWER===")[-1].strip() if "===ANSWER===" in decoded else ""
    return context, question, answer


def generate_one_text(topic_name: str, difficulty: str, temperature: float = 0.2, max_new_tokens: int = 450) -> dict:
    """
    Generate ONE example in English using fixed separators.

    Args:
        topic_name: A key of TOPICS; its description is inserted into the prompt.
        difficulty: Free-text difficulty label inserted into the prompt.
        temperature: Sampling temperature. Values <= 0 switch to greedy decoding,
            because sampling requires a strictly positive temperature.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        A dict with the keys topic, difficulty, tags, context, question, answer.
        If none of the separators can be found in the model output, context and
        question are empty and the raw output is returned in "answer" so that
        something is always visible.

    Raises:
        KeyError: If topic_name is not a key of TOPICS.
    """

    topic_desc = TOPICS[topic_name]
    tags = _topic_to_tags(topic_name)

    system_message = (
        "You are a technical instructor for security awareness (non-expert audience). "
        "You must write a didactic mini-manual chunk and a grounded Q/A pair. "
        "Use EXACTLY the separators requested and fill EVERY section. "
        "Do not mention vendor or brand names."
    )

    user_prompt = f"""
Topic: {topic_name}
Topic requirements: {topic_desc}
Difficulty: {difficulty}

Return output in ENGLISH using EXACTLY this structure:

===CONTEXT===
Write the context here.

===QUESTION===
Write the question here.

===ANSWER===
Write the answer here.

Rules:
- CONTEXT must be 120 to 180 words.
- QUESTION must be answerable ONLY using the CONTEXT.
- ANSWER must use ONLY the CONTEXT (no outside knowledge).
- Do not include any extra sections or commentary.
- Do not leave any section empty.
""".strip()

    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_prompt},
    ]

    # add_generation_prompt=True appends the assistant-turn header so the model
    # starts a fresh assistant reply instead of continuing the user message.
    tokenized = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
    )

    # Handle Tensor vs BatchEncoding
    if isinstance(tokenized, torch.Tensor):
        input_ids = tokenized
        attention_mask = None
    else:
        input_ids = tokenized["input_ids"]
        attention_mask = tokenized.get("attention_mask", None)

    # A single unpadded prompt attends to every token. Passing the mask
    # explicitly matters because pad and EOS share the same id here, so it
    # cannot be inferred reliably from the ids alone.
    if attention_mask is None:
        attention_mask = torch.ones_like(input_ids)

    # Move to GPU if available
    if torch.cuda.is_available():
        input_ids = input_ids.to("cuda")
        attention_mask = attention_mask.to("cuda")

    prompt_len = input_ids.shape[1]

    temperature = float(temperature)
    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "pad_token_id": tokenizer.eos_token_id,
    }
    if temperature > 0:
        generation_kwargs["do_sample"] = True
        generation_kwargs["temperature"] = temperature
    else:
        # Sampling is undefined for a non-positive temperature; decode greedily
        generation_kwargs["do_sample"] = False

    with torch.no_grad():
        output_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            **generation_kwargs,
        )

    # Decode ONLY newly generated tokens (the output starts with the prompt tokens)
    new_tokens = output_ids[0, prompt_len:]
    decoded_new = tokenizer.decode(new_tokens, skip_special_tokens=True)

    context, question, answer = _split_sections(decoded_new)

    # Fallback: if separators failed, show raw output so you always see something
    if not context and not question and not answer:
        answer = decoded_new.strip()

    return {
        "topic": topic_name,
        "difficulty": difficulty,
        "tags": tags,
        "context": context,
        "question": question,
        "answer": answer,
    }

# Smoke test: generate one medium-difficulty phishing example and
# print its tags followed by its question (one per line)
ex = generate_one_text("Phishing Emails — Red Flags", "medium", temperature=0.2)
print(ex["tags"], ex["question"], sep="\n")


In [None]:


import csv
import os

# Default location of the dataset file
CSV_PATH = "/content/security_awareness_qa.csv"

# Column order of the dataset file
CSV_COLUMNS = ["chunk_id", "topic", "difficulty", "tags", "context", "question", "answer"]

def _next_chunk_id(path: str = CSV_PATH) -> str:
    """
    Generate incremental IDs like SA-000001, SA-000002, ...

    The next ID is the number of existing data records in the CSV (header
    excluded) plus one. Records are counted with the csv module rather than by
    physical lines, because generated fields can contain embedded newlines and
    a single record may therefore span several lines.

    Returns "SA-000001" when the file does not exist yet.
    """
    if not os.path.exists(path):
        return "SA-000001"

    # newline="" leaves newline handling to the csv module, so line breaks
    # inside quoted fields are parsed as part of the field.
    with open(path, "r", encoding="utf-8", newline="") as f:
        n_records = sum(1 for record in csv.reader(f) if record)

    n_data = max(0, n_records - 1)  # subtract header
    return f"SA-{n_data + 1:06d}"

def append_example_to_csv(example: dict, path: str = CSV_PATH) -> str:
    """
    Append one example to CSV, auto-adding chunk_id.

    Args:
        example: Mapping with the keys topic, difficulty, tags, context,
            question and answer; any missing key is written as an empty string.
        path: CSV file to append to. It is created if it does not exist.

    Returns:
        The path that was written to.
    """
    # Write the header for a brand-new file AND for an existing but empty one;
    # otherwise an empty file would end up without a header row.
    needs_header = not os.path.exists(path) or os.path.getsize(path) == 0
    chunk_id = _next_chunk_id(path)

    row = {
        "chunk_id": chunk_id,
        "topic": example.get("topic", ""),
        "difficulty": example.get("difficulty", ""),
        "tags": example.get("tags", ""),
        "context": example.get("context", ""),
        "question": example.get("question", ""),
        "answer": example.get("answer", ""),
    }

    # newline="" lets the csv module control line endings itself
    with open(path, "a", encoding="utf-8", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=CSV_COLUMNS)
        if needs_header:
            writer.writeheader()
        writer.writerow(row)

    return path

def preview_csv(path: str = CSV_PATH, n_lines: int = 5) -> str:
    """
    Return the first n_lines physical lines of the CSV, joined with newlines.

    Each line has its trailing newline removed before joining. If the file
    does not exist, a placeholder message is returned instead.
    """
    if os.path.exists(path):
        with open(path, "r", encoding="utf-8") as handle:
            # zip() stops after n_lines lines or at end of file, whichever comes first
            head = [raw.rstrip("\n") for _, raw in zip(range(n_lines), handle)]
        return "\n".join(head)

    return "(CSV not created yet.)"

# Optional reset (uncomment if you want to start fresh)
# if os.path.exists(CSV_PATH): os.remove(CSV_PATH)
# print("Reset:", CSV_PATH)


In [None]:


import gradio as gr
import traceback

# Most recently generated example; kept here until the user saves it
_last_example = None

def ui_generate(topic_name, difficulty, temperature):
    """
    Gradio handler: generate one example and format it for display.

    Returns a (text, status) pair. On success, text is a readable rendering of
    the example and the example is remembered in _last_example. On any
    exception, _last_example is cleared and text is the formatted traceback.
    """
    global _last_example
    try:
        example = generate_one_text(topic_name, difficulty, temperature=float(temperature))
        _last_example = example

        # Joined with "\n"; the final empty item yields the trailing newline
        preview_lines = [
            f"TOPIC: {example['topic']}",
            f"DIFFICULTY: {example['difficulty']}",
            f"TAGS: {example['tags']}",
            "",
            "===CONTEXT===",
            f"{example['context']}",
            "",
            "===QUESTION===",
            f"{example['question']}",
            "",
            "===ANSWER===",
            f"{example['answer']}",
            "",
        ]
        return "\n".join(preview_lines), "Generated OK (not saved yet)."
    except Exception:
        _last_example = None
        return traceback.format_exc(), "Generation failed (see error above)."

def ui_save():
    """
    Gradio handler: append the last generated example to the CSV.

    Returns a status string: a hint when nothing has been generated yet, a
    confirmation with a short preview of the CSV after a successful save, or
    the formatted traceback if anything raised.
    """
    try:
        pending = _last_example
        if pending is None:
            return "Nothing to save. Generate first."

        append_example_to_csv(pending, CSV_PATH)
        csv_head = preview_csv(CSV_PATH, n_lines=5)
        return f"Saved to: {CSV_PATH}\n\nPreview:\n{csv_head}"
    except Exception:
        return traceback.format_exc()

# Assemble the Gradio interface: input controls, a generate button with its
# output boxes, and a separate save button with its own status box.
with gr.Blocks() as demo:
    gr.Markdown("# Security Awareness Q/A — CSV Dataset Generator")
    gr.Markdown("Generate 1 example (context + question + answer) and append it to a CSV with chunk_id and tags.")

    # Generation inputs, grouped in a single gr.Row
    with gr.Row():
        topic = gr.Dropdown(list(TOPICS.keys()), value="Phishing Emails — Red Flags", label="Topic")
        difficulty = gr.Dropdown(["easy", "medium", "hard"], value="medium", label="Difficulty")
        temperature = gr.Slider(0.1, 1.0, step=0.1, value=0.2, label="Temperature (lower = more stable)")

    # Generate button and the two textboxes that receive ui_generate's
    # (text, status) return values
    gen_btn = gr.Button("Generate 1 example")
    out_text = gr.Textbox(label="Output / Errors", lines=18)
    status = gr.Textbox(label="Status", lines=1)

    # Saving is a separate, explicit step with its own status box
    save_btn = gr.Button("Save last row to CSV")
    save_status = gr.Textbox(label="Save status", lines=8)

    # Wire the buttons to their handlers; ui_save takes no inputs
    gen_btn.click(ui_generate, inputs=[topic, difficulty, temperature], outputs=[out_text, status])
    save_btn.click(ui_save, inputs=None, outputs=[save_status])

# In Colab, share=True is typically the most reliable way to view Gradio.
demo.launch(share=True, debug=True, prevent_thread_lock=True)
