In [None]:
"""
Week 2 Assignment: LLM Engineering
Author: Nikhil Raut

Notebook: ai_domain_finder.ipynb

Purpose:
Build an agentic AI Domain Finder that proposes short, brandable .com names, verifies availability via RDAP, 
then returns: 
    a list of available .coms, 
    one preferred pick, 
    and a brief audio rationale.
"""


In [None]:
import os
import json
import requests
from typing import Dict, List, Tuple, Any, Optional
import re

from dotenv import load_dotenv
from openai import OpenAI
import gradio as gr

# Load variables from a local .env file; with override=True python-dotenv lets
# values from the file replace variables already present in the environment.
load_dotenv(override=True)

# Chat model used for every agent turn (see run_agent_with_tools).
OPENAI_MODEL = "gpt-5-nano-2025-08-07"
# Text-to-speech model used by synth_audio.
TTS_MODEL = "gpt-4o-mini-tts"

# Shared client instance. NOTE: the name `openai` refers to this client object,
# not to the `openai` package (only the OpenAI class is imported above).
# Presumably picks up OPENAI_API_KEY from the environment - confirm for your setup.
openai = OpenAI()

In [None]:
# --- robust logging that works inside VS Code notebooks + Gradio threads ---
import sys, logging, threading
from collections import deque
from typing import Any

# DEBUG_LLM selects the logger level in _setup_logger: DEBUG when True, INFO otherwise.
DEBUG_LLM = True          # toggle on/off noisy logs
# When True, initial_submit / refine_submit empty the log buffer before running.
CLEAR_LOG_ON_RUN = True   # clear panel before each submit

# Bounded in-memory store of formatted log lines; once full, the oldest lines are dropped.
_LOG_BUFFER = deque(maxlen=2000)   # keep ~2000 lines in memory
# Guards every read/write of _LOG_BUFFER (handler emit, get_log_text, clear_log_buffer).
_LOG_LOCK = threading.Lock()

class GradioBufferHandler(logging.Handler):
    """Logging handler that stores formatted records, line by line, in _LOG_BUFFER."""

    def emit(self, record: logging.LogRecord) -> None:
        """Format ``record`` and append each of its lines to the shared buffer."""
        try:
            text = self.format(record)
        except Exception:
            # Formatting failed; fall back to the raw message.
            text = record.getMessage()
        # An empty message still contributes one (blank) line.
        lines = text.splitlines() or [""]
        with _LOG_LOCK:
            _LOG_BUFFER.extend(lines)

def get_log_text() -> str:
    """Return the buffered log lines as one newline-separated string."""
    with _LOG_LOCK:
        snapshot = list(_LOG_BUFFER)
    return "\n".join(snapshot)

def clear_log_buffer() -> None:
    """Discard every line currently held in the in-memory log buffer."""
    with _LOG_LOCK:  # same lock the logging handler takes when appending
        _LOG_BUFFER.clear()

def _setup_logger() -> logging.Logger:
    """
    Configure and return the "aidf" logger.

    Existing handlers are removed first, then two handlers sharing one format
    are attached: a stdout stream handler and a GradioBufferHandler.
    Propagation to ancestor loggers is switched off.
    """
    log = logging.getLogger("aidf")
    level = logging.DEBUG if DEBUG_LLM else logging.INFO
    log.setLevel(level)
    log.handlers.clear()
    log.propagate = False

    formatter = logging.Formatter("%(asctime)s | %(levelname)s | %(message)s", "%H:%M:%S")
    handlers = (
        logging.StreamHandler(stream=sys.stdout),  # captured by VS Code notebook
        GradioBufferHandler(),                     # shown inside the Gradio panel
    )
    for handler in handlers:
        handler.setFormatter(formatter)
        log.addHandler(handler)
    return log

logger = _setup_logger()

def dbg_json(obj: Any, title: str = "") -> None:
    """
    Log ``obj`` at DEBUG level as indented JSON.

    Objects that cannot be serialised are logged via ``str(obj)`` instead.
    When ``title`` is non-empty it is logged on the line above the payload.
    """
    try:
        rendered = json.dumps(obj, ensure_ascii=False, indent=2)
    except Exception:
        rendered = str(obj)

    if not title:
        logger.debug("%s", rendered)
        return
    logger.debug("%s\n%s", title, rendered)


In [None]:
# RDAP lookup endpoint for .com names; "{}" is filled with the FQDN to query.
RDAP_URL = "https://rdap.verisign.com/com/v1/domain/{}"

# Matches a string consisting solely of the letters a-z, case-insensitively.
# NOTE(review): on a str pattern, [a-z] combined with re.IGNORECASE also matches
# a few non-ASCII letters (e.g. U+017F long s, U+0131 dotless i) unless re.ASCII
# is set as well.
_ALPHA_RE = re.compile(r"^[a-z]+$", re.IGNORECASE)

def _to_com(domain: str) -> str:
    d = domain.strip().lower()
    return d if d.endswith(".com") else f"{d}.com"

def _sld_is_english_alpha(fqdn: str) -> bool:
    """
    True only if the second-level label (just before .com) is made up
    exclusively of English letters (a-z).
    Examples:
      foo.com      -> True
      foo-bar.com  -> False
      foo1.com     -> False
      café.com     -> False
      xn--cafe.com -> False
      www.foo.com  -> True (checks 'foo')
    """
    if not fqdn.endswith(".com"):
        return False
    sld = fqdn[:-4].split(".")[-1]  # take label immediately before .com
    return bool(sld) and bool(_ALPHA_RE.fullmatch(sld))

def check_com_availability(domain: str) -> Dict:
    """
    Check a single .com name against the RDAP endpoint.

    Returns a dict with keys "domain" (normalised FQDN), "available" (True only
    when the lookup answered HTTP 404) and "status" (the HTTP status code, or 0
    when no request was made or the request failed).
    """
    fqdn = _to_com(domain)
    outcome = {"domain": fqdn, "available": False, "status": 0}

    # Skip API if not strictly English letters
    if _sld_is_english_alpha(fqdn):
        try:
            response = requests.get(RDAP_URL.format(fqdn), timeout=6)
        except requests.RequestException:
            return outcome
        outcome["available"] = response.status_code == 404
        outcome["status"] = response.status_code
    return outcome

def check_com_availability_bulk(domains: List[str]) -> Dict:
    """
    Check several .com names against the RDAP endpoint over one HTTP session.

    Input: list of domain roots or FQDNs. Because the argument ultimately comes
    from model-generated tool arguments, a few malformed shapes are tolerated:
    ``None`` / empty input yields an empty result, a bare string is treated as a
    one-element list, and non-string entries are skipped.

    Returns:
      {
        "results": [{"domain": "...", "available": bool, "status": int}, ...],
        "available": ["..."],                 # convenience
        "count_available": int
      }
    A name counts as available only when the lookup answers HTTP 404; "status"
    is 0 when no request was made or the request failed.
    """
    if not domains:
        return {"results": [], "available": [], "count_available": 0}
    if isinstance(domains, str):
        # Iterating a bare string would check each character as its own domain.
        domains = [domains]

    results: List[Dict] = []

    # The context manager closes the session (and its pooled connections) on exit.
    with requests.Session() as session:
        for d in domains:
            if not isinstance(d, str):
                continue
            fqdn = _to_com(d)

            # Skip API if not strictly English letters
            if not _sld_is_english_alpha(fqdn):
                results.append({"domain": fqdn, "available": False, "status": 0})
                continue

            try:
                r = session.get(RDAP_URL.format(fqdn), timeout=6)
            except requests.RequestException:
                results.append({"domain": fqdn, "available": False, "status": 0})
                continue
            results.append({
                "domain": fqdn,
                "available": r.status_code == 404,
                "status": r.status_code,
            })

    available = [x["domain"] for x in results if x["available"]]
    return {"results": results, "available": available, "count_available": len(available)}


In [None]:
# Tool (function-calling) schema advertised to the chat model for the bulk checker.
# The "name" must match the string handle_tool_calls dispatches on.
check_tool_bulk = {
    "type": "function",
    "function": {
        "name": "check_com_availability_bulk",
        "description": "Batch check .com availability via RDAP for a list of domains (roots or FQDNs).",
        "parameters": {
            "type": "object",
            "properties": {
                "domains": {
                    "type": "array",
                    "items": {"type": "string"},
                    "minItems": 1,
                    # NOTE(review): this limit is only stated in the schema; the
                    # Python function itself does not check the list length.
                    "maxItems": 50,
                    "description": "List of domain roots or .com FQDNs."
                }
            },
            "required": ["domains"],
            "additionalProperties": False
        }
    }
}

# Tools passed on every chat completion call. Only the bulk checker is exposed,
# although handle_tool_calls can also dispatch "check_com_availability".
TOOLS = [check_tool_bulk]


In [None]:
def handle_tool_calls(message) -> List[Dict]:
    """
    Execute every tool call requested in an assistant message.

    Args:
        message: Assistant message object exposing ``tool_calls``; each call is
            expected to carry ``id`` and ``function.name`` / ``function.arguments``
            (a JSON string).

    Returns:
        One ``{"role": "tool", "tool_call_id": ..., "content": <json str>}``
        dict per tool call, in the same order. Unknown tool names produce an
        ``{"error": ...}`` payload rather than an exception.

    Tool arguments are model-generated, so they are validated defensively:
    unparsable JSON or a non-object payload is logged and treated as ``{}``,
    a ``null`` "domains" becomes an empty list, and a bare string is wrapped
    in a list.
    """
    results = []
    for call in (message.tool_calls or []):
        fn = getattr(call.function, "name", None)
        args_raw = getattr(call.function, "arguments", "") or "{}"
        try:
            args = json.loads(args_raw)
        except Exception as exc:
            logger.warning("TOOL CALL %s: could not parse arguments (%s); using {}", fn, exc)
            args = {}
        if not isinstance(args, dict):
            # Valid JSON but not an object (e.g. a list): .get() below would fail.
            logger.warning("TOOL CALL %s: arguments are not a JSON object; using {}", fn)
            args = {}

        logger.debug("TOOL CALL -> %s | args=%s", fn, json.dumps(args, ensure_ascii=False))

        if fn == "check_com_availability_bulk":
            domains = args.get("domains") or []
            if isinstance(domains, str):
                domains = [domains]
            payload = check_com_availability_bulk(domains)
        elif fn == "check_com_availability":
            payload = check_com_availability(str(args.get("domain") or ""))
        else:
            payload = {"error": f"unknown tool {fn}"}

        logger.debug("TOOL RESULT <- %s | %s", fn, json.dumps(payload, ensure_ascii=False))

        results.append({
            "role": "tool",
            "tool_call_id": call.id,
            "content": json.dumps(payload),
        })
    return results


In [None]:
# System prompt sent as the first message of every chat completion.
# The three section headings it requests are what the reply parsers look for
# (_HDR_AVAIL, _HDR_PREF and the "Audio explanation:" marker in extract_audio_text),
# so heading text should stay in sync with those.
# NOTE(review): "avoid hyphens/numbers unless meaningful" conflicts with the last
# constraint and with _sld_is_english_alpha, which rejects any label that is not
# purely a-z; such names are always reported as unavailable.
SYSTEM_PROMPT = """You are the Agent for project "AI Domain Finder".
Goal: suggest .com domains and verify availability using the tool ONLY (no guessing).

Do this each interaction:
- Generate up to ~20 short, brandable .com candidates from:
  (1) Industry, (2) Target Customers, (3) Description.
- Use the BULK tool `check_com_availability_bulk` with a list of candidates
  (roots or FQDNs). Prefer a single call or very few batched calls.
- If >= 5 available .coms are found, STOP checking and finalize the answer.

Output Markdown with EXACT section headings:
1) Available .com domains:
   - itemized list of available .coms only (root + .com)
2) Preferred domain:
   - a single best pick
3) Audio explanation:
   - 1–2 concise sentences explaining the preference

Constraints:
- Use customer-familiar words where helpful.
- Keep names short, simple, pronounceable; avoid hyphens/numbers unless meaningful.
- Never include TLDs other than .com.
- domain is made up of english alphabets in lower case only no symbols or spaces to use
"""


In [None]:
def _asdict_tool_call(tc: Any) -> dict:
    try:
        return {
            "id": getattr(tc, "id", None),
            "type": "function",
            "function": {
                "name": getattr(tc.function, "name", None),
                "arguments": getattr(tc.function, "arguments", None),
            },
        }
    except Exception:
        return {"type": "function", "function": {"name": None, "arguments": None}}

def _asdict_message(msg: Any) -> dict:
    if isinstance(msg, dict):
        return msg
    role = getattr(msg, "role", None)
    content = getattr(msg, "content", None)
    tool_calls = getattr(msg, "tool_calls", None)
    out = {"role": role, "content": content}
    if tool_calls:
        out["tool_calls"] = [_asdict_tool_call(tc) for tc in tool_calls]
    return out

def _sanitized_messages_for_log(messages: list[dict | Any]) -> list[dict]:
    return [_asdict_message(m) for m in messages]

def _limit_text(s: str, limit: int = 40000) -> str:
    return s if len(s) <= limit else (s[:limit] + "\n... [truncated]")


In [None]:
def run_agent_with_tools(history: List[Dict], max_tool_rounds: int = 8) -> Tuple[str, List[str], str]:
    """
    Run one agent turn, executing tool calls until the model produces a final answer.

    Args:
        history: Prior chat messages (user/assistant dicts); the system prompt
            is prepended here and ``history`` itself is not modified.
        max_tool_rounds: Upper bound on tool-call round trips. Once reached, one
            last completion is requested with ``tool_choice="none"`` so the model
            must answer in text instead of looping on tool calls indefinitely.

    Returns:
      reply_md: final assistant markdown ("" if the model returned no text)
      tool_available: .coms marked available by RDAP tools (order-preserving, deduped)
      dbg_text: concatenated log buffer (for the UI panel)
    """
    messages: List[Dict] = [{"role": "system", "content": SYSTEM_PROMPT}] + history
    tool_available: List[str] = []

    dbg_json(_sanitized_messages_for_log(messages), "=== LLM REQUEST (initial messages) ===")
    resp = openai.chat.completions.create(model=OPENAI_MODEL, messages=messages, tools=TOOLS)

    rounds = 0
    while resp.choices[0].finish_reason == "tool_calls":
        tool_msg_sdk = resp.choices[0].message
        tool_msg = _asdict_message(tool_msg_sdk)
        dbg_json(tool_msg, "=== ASSISTANT (tool_calls) ===")

        tool_results = handle_tool_calls(tool_msg_sdk)

        # Parse each tool output once; accumulate authoritative availability
        # directly from it and reuse the parsed form for the debug log.
        parsed_results = []
        for tr in tool_results:
            try:
                data = json.loads(tr["content"])
            except (KeyError, TypeError, ValueError) as exc:
                logger.warning("Skipping unreadable tool result: %s", exc)
                continue
            parsed_results.append(data)
            if isinstance(data, dict) and isinstance(data.get("available"), list):
                tool_available.extend(_to_com(d) for d in data["available"] if isinstance(d, str))

        dbg_json(parsed_results, "=== TOOL RESULTS ===")

        messages.append(tool_msg)
        messages.extend(tool_results)
        dbg_json(_sanitized_messages_for_log(messages), "=== LLM REQUEST (messages + tools) ===")

        rounds += 1
        if rounds >= max_tool_rounds:
            logger.warning("Tool-call limit (%d) reached; requesting a final answer", max_tool_rounds)
            resp = openai.chat.completions.create(
                model=OPENAI_MODEL, messages=messages, tools=TOOLS, tool_choice="none"
            )
            break

        resp = openai.chat.completions.create(model=OPENAI_MODEL, messages=messages, tools=TOOLS)

    # Dedup preserve order
    uniq = list(dict.fromkeys(tool_available))

    # content can be None (e.g. a response without text); downstream parsing expects str.
    reply_md = resp.choices[0].message.content or ""
    logger.debug("=== FINAL ASSISTANT ===\n%s", _limit_text(reply_md))
    dbg_json(uniq, "=== AVAILABLE FROM TOOLS (authoritative) ===")

    # Return current buffer text for the UI panel
    dbg_text = _limit_text(get_log_text(), 40000)
    return reply_md, uniq, dbg_text


In [None]:
def extract_audio_text(markdown_reply: str) -> str:
    """
    Pull up to two sentences from the 'Audio explanation:' section for text-to-speech.

    The marker is matched case-insensitively. Markdown emphasis/backticks and
    leading bullet markers are removed so they are not read aloud, and the text
    is split on sentence-ending punctuation followed by whitespace, which keeps
    domain names such as "foo.com" intact (a plain split on "." would not).

    Args:
        markdown_reply: The assistant's markdown reply; None is treated as "".

    Returns:
        The first one or two sentences after the marker, ending in sentence
        punctuation. A fixed generic sentence is returned when the marker is
        missing or nothing follows it.
    """
    fallback = "This domain is the clearest, most memorable fit for the audience and brand goals."
    text = markdown_reply or ""

    found = re.search(r"audio explanation:", text, flags=re.IGNORECASE)
    if found is None:
        return fallback

    segment = text[found.end():]
    segment = re.sub(r"[*`]+", "", segment)                                # bold/italic/code markers
    segment = re.sub(r"^\s*[-•]\s+", "", segment, flags=re.MULTILINE)      # leading list bullets
    segment = " ".join(segment.split())                                     # collapse whitespace

    sentences = [s for s in re.split(r"(?<=[.!?])\s+", segment) if s]
    if not sentences:
        return fallback

    spoken = " ".join(sentences[:2])
    return spoken if spoken[-1] in ".!?" else spoken + "."

def synth_audio(text: str) -> bytes:
    """Synthesize ``text`` with the TTS model (voice "alloy") and return the response's ``content``."""
    speech = openai.audio.speech.create(model=TTS_MODEL, voice="alloy", input=text)
    return speech.content


In [None]:

_DOMAIN_RE = re.compile(r"\b[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?\.com\b", re.I)
_HDR_AVAIL = re.compile(r"^\s*[\d\.\)\-]*\s*available\s+.*\.com\s+domains", re.I)
_HDR_PREF  = re.compile(r"^\s*[\d\.\)\-]*\s*preferred\s+domain", re.I)

def _norm_domain(s: str) -> str:
    s = s.strip().lower()
    return s if s.endswith(".com") else f"{s}.com"

def parse_available(md: str) -> list[str]:
    """
    Extract the .com names listed in the reply's "Available .com domains" section.

    Lines between the available-domains heading and the preferred-domain heading
    are scanned for domain matches. If that yields nothing, every .com found
    anywhere in ``md`` is used instead. The result is normalised and
    de-duplicated, keeping first-seen order.
    """
    found = []
    collecting = False
    for line in md.splitlines():
        if _HDR_AVAIL.search(line):
            collecting = True
            continue
        if not collecting:
            continue
        if _HDR_PREF.search(line):
            break
        found.extend(_norm_domain(hit) for hit in _DOMAIN_RE.findall(line))

    # Fallback: if the header wasn't found, collect all .coms then we'll still
    # rely on agent instruction to list only available, which should be safe.
    if not found:
        found = [_norm_domain(hit) for hit in _DOMAIN_RE.findall(md)]

    # dict keys keep insertion order, giving an order-preserving dedupe
    return list(dict.fromkeys(found))

def parse_preferred(md: str) -> str:
    """
    Return the preferred .com named in the reply, or "" when none is found.

    The first domain within the 8 lines starting at the preferred-domain heading
    is used (or within the first 500 characters when there is no such heading);
    failing that, the first domain anywhere in ``md``.
    """
    lines = md.splitlines()
    header_at = next((i for i, ln in enumerate(lines) if _HDR_PREF.search(ln)), None)

    if header_at is None:
        window = md[:500]
    else:
        window = "\n".join(lines[header_at:header_at + 8])

    # Match objects are always truthy, so `or` falls through only on no match.
    hit = _DOMAIN_RE.search(window) or _DOMAIN_RE.search(md)
    if not hit:
        return ""
    return _norm_domain(hit.group(0))

def merge_and_sort(old: list[str], new: list[str]) -> list[str]:
    """Union both lists case-insensitively (lower-cased) and sort by length, then alphabetically."""
    unique = set()
    for group in (old, new):
        unique.update(name.lower() for name in group)
    return sorted(unique, key=lambda s: (len(s), s))

def fmt_available_md(domains: list[str]) -> str:
    """Render the cumulative available-domain list as a markdown section (placeholder when empty)."""
    if domains:
        bullets = [f"- `{d}`" for d in domains]
        items = "\n".join(bullets)
        return f"### Available .com domains (cumulative)\n\n{items}"
    return "### Available .com domains (cumulative)\n\n*– none yet –*"

def fmt_preferred_md(d: str) -> str:
    """Render the preferred domain as a markdown section (placeholder when ``d`` is empty)."""
    if d:
        return f"### Preferred domain\n\n`{d}`"
    return "### Preferred domain\n\n*– not chosen yet –*"

def build_context_msg(known_avail: Optional[List[str]], preferred_now: Optional[str]) -> str:
    """
    Build the short 'state so far' block that is prepended to the next user
    turn, so the model sees the current preferred pick and the cumulative list
    of available domains. Returns "" when there is nothing to carry over.
    """
    state = []

    preferred = (preferred_now or "").strip()
    if preferred:
        state.append(f"Preferred domain so far: {preferred.lower()}")

    if known_avail:
        state.append("Available .com domains discovered so far:")
        # Empty entries are skipped; the rest are trimmed and lower-cased.
        state.extend(f"- {d.strip().lower()}" for d in known_avail if d)

    if not state:
        return ""
    return "STATE TO CARRY OVER FROM PREVIOUS TURNS:\n" + "\n".join(state)

In [None]:
def run_and_extract(history: List[Dict]) -> Tuple[str, List[str], str, str, str]:
    """
    Run the agent on ``history`` and derive the UI fields from its reply.

    Returns (reply markdown, available domains, preferred domain, text for the
    audio explanation, debug log text). The available list is the sorted union
    of the tool-verified domains and the domains parsed from the reply.
    """
    reply_md, verified, dbg_text = run_agent_with_tools(history)

    # NOTE(review): names parsed from the reply are merged in without being
    # re-checked against the tool results.
    listed = parse_available(reply_md)
    available = merge_and_sort(verified, listed)

    return (
        reply_md,
        available,
        parse_preferred(reply_md),
        extract_audio_text(reply_md),
        dbg_text,
    )


In [None]:
def _run_turn(user_content: str, history: List[Dict], known_avail: List[str], preferred_now: str):
    """
    Shared core of initial_submit / refine_submit: run one agent turn.

    Prepends the carried-over state to ``user_content``, runs the agent, merges
    the newly found domains into the cumulative list and synthesizes the audio.

    Returns:
        (history, all_avail, preferred_final, audio_bytes, dbg_text) where
        ``history`` is a NEW list (the incoming one is not mutated) and
        ``audio_bytes`` is None if speech synthesis failed.
    """
    # Single user turn that includes state + prompt so the model always sees memory
    ctx = build_context_msg(known_avail or [], preferred_now or "")
    full_content = (ctx + "\n\n" if ctx else "") + user_content

    turn_history = (history or []) + [{"role": "user", "content": full_content}]
    reply_md, new_avail, preferred, audio_text, dbg_text = run_and_extract(turn_history)
    turn_history.append({"role": "assistant", "content": reply_md})

    all_avail = merge_and_sort(known_avail or [], new_avail or [])
    preferred_final = preferred or preferred_now or ""

    # Audio is a nice-to-have: a TTS failure must not throw away the agent's
    # results, so log it (visible in the debug panel) and continue without audio.
    try:
        audio_bytes = synth_audio(audio_text)
    except Exception:
        logger.exception("Audio synthesis failed; returning results without audio")
        audio_bytes = None
        dbg_text = _limit_text(get_log_text(), 40000)

    return turn_history, all_avail, preferred_final, audio_bytes, dbg_text


def initial_submit(industry: str, customers: str, desc: str,
                   history: List[Dict], known_avail: List[str], preferred_now: str):
    """
    Gradio handler for the "Find Domains" button (first turn).

    Builds the initial prompt from the three form fields, runs one agent turn
    and returns 10 values in the order of the button's ``outputs``: the three
    state values, the preferred/available markdown, then updates for the reply
    box, audio player, debug box, Find button (disabled) and Send button (shown).
    """
    if CLEAR_LOG_ON_RUN:
        clear_log_buffer()

    logger.info("Initial submit | industry=%r | customers=%r | desc_len=%d",
                industry, customers, len(desc or ""))

    user_msg = (
        "Please propose .com domains based on:\n"
        f"Industry: {industry}\n"
        f"Target Customers: {customers}\n"
        f"Description: {desc}"
    )

    history, all_avail, preferred_final, audio_bytes, dbg_text = _run_turn(
        user_msg, history, known_avail, preferred_now
    )

    return (
        history,                         # s_history
        all_avail,                       # s_available (cumulative)
        preferred_final,                 # s_preferred
        gr.update(value=fmt_preferred_md(preferred_final)),
        gr.update(value=fmt_available_md(all_avail)),
        gr.update(value="", visible=True),                 # reply_in: show after first run
        gr.update(value=audio_bytes, visible=True),        # audio_out
        gr.update(value=dbg_text),                         # debug_box
        gr.update(value="Find Domains (done)", interactive=False),  # disable Find
        gr.update(visible=True),                           # show Send button
    )

def refine_submit(reply: str,
                  history: List[Dict], known_avail: List[str], preferred_now: str):
    """
    Gradio handler for follow-up turns (Enter in the reply box or the Send button).

    A blank ``reply`` leaves all state and components untouched. Otherwise one
    agent turn is run with the user's refinement and 8 values are returned in
    the order of the event's ``outputs``: cleared reply text, the three state
    values, the preferred/available markdown, the audio and the debug text.
    """
    # If empty, do nothing (keeps UI state untouched)
    if not (reply or "").strip():
        return ("", history, known_avail, preferred_now,
                gr.update(), gr.update(), gr.update(), gr.update())

    if CLEAR_LOG_ON_RUN:
        clear_log_buffer()
    logger.info("Refine submit | user_reply_len=%d", len(reply))

    # Always prepend memory + the user's refinement so the model can iterate properly
    history, all_avail, preferred_final, audio_bytes, dbg_text = _run_turn(
        reply.strip(), history, known_avail, preferred_now
    )

    return (
        "",                                 # clear Reply box
        history,                            # s_history
        all_avail,                          # s_available (cumulative)
        preferred_final,                    # s_preferred
        gr.update(value=fmt_preferred_md(preferred_final)),
        gr.update(value=fmt_available_md(all_avail)),
        gr.update(value=audio_bytes, visible=True),
        gr.update(value=dbg_text),          # debug_box
    )

def clear_debug():
    """Gradio callback: empty the in-memory log buffer and return an update that blanks the bound output."""
    clear_log_buffer()
    return gr.update(value="")


In [None]:
with gr.Blocks(title="AI Domain Finder (.com only)") as ui:
    gr.Markdown("# AI Domain Finder (.com only)")
    gr.Markdown("Agent proposes .com domains, verifies via RDAP, picks a preferred choice, and explains briefly.")

    # Per-session state: chat history, cumulative available list, current preferred pick
    s_history = gr.State([])
    s_available = gr.State([])
    s_preferred = gr.State("")

    with gr.Row():
        # Left column (scale 7 of 10): inputs, audio and the follow-up box
        with gr.Column(scale=7):
            with gr.Group():
                industry_in = gr.Textbox(label="Industry")
                customers_in = gr.Textbox(label="Target Customers")
                desc_in = gr.Textbox(label="Description", lines=3)
                find_btn = gr.Button("Find Domains", variant="primary")

            audio_out = gr.Audio(label="Audio explanation", autoplay=True, visible=False)

            with gr.Row():
                # Reply box and Send button stay hidden until the first run completes
                reply_in = gr.Textbox(
                    label="Reply",
                    placeholder="Chat with the agent to refine the outputs",
                    lines=2,
                    visible=False,
                )
                send_btn = gr.Button("Send", variant="primary", visible=False)

        # Right column (scale 3 of 10): results and debug log
        with gr.Column(scale=3):
            preferred_md = gr.Markdown(fmt_preferred_md(""))
            available_md = gr.Markdown(fmt_available_md([]))

            with gr.Accordion("Debug log", open=False):
                debug_box = gr.Textbox(label="Log", value="", lines=16, interactive=False)
                clear_btn = gr.Button("Clear log", size="sm")

    # ---- Event wiring ----
    state_components = [s_history, s_available, s_preferred]
    result_panels = [preferred_md, available_md]

    # Initial run: also reveals the reply box / audio / Send and disables Find
    find_btn.click(
        initial_submit,
        inputs=[industry_in, customers_in, desc_in, *state_components],
        outputs=[
            *state_components,
            *result_panels,
            reply_in,
            audio_out,
            debug_box,
            find_btn,
            send_btn,
        ],
    )

    # Follow-up turns: Enter in the textbox and the Send button share one handler
    refine_inputs = [reply_in, *state_components]
    refine_outputs = [reply_in, *state_components, *result_panels, audio_out, debug_box]
    for trigger in (reply_in.submit, send_btn.click):
        trigger(refine_submit, inputs=refine_inputs, outputs=refine_outputs)

    clear_btn.click(clear_debug, inputs=[], outputs=[debug_box])

ui.launch(inbrowser=True, show_error=True)
