In [None]:
"""
Week 1 Assignment: LLM Engineering
Author: Nikhil Raut

Notebook: domain_name_generator.ipynb

Purpose:
Generate short, memorable domain root ideas (no TLD) from keywords using an OpenAI Chat Completions system+user prompt.

Quick setup:
1) pip install openai python-dotenv ipython
2) Add OPENAI_API_KEY to a .env file in the project root

How to use (Python script, after exporting this notebook to domain_name_generator.py):
from domain_name_generator import generate_domain_ideas
ideas = generate_domain_ideas(["fitness", "coach", "wellness"], target_country="India", n=20)
print(ideas)

How to use (Notebook):
# after running config/client cells
generate_domain_ideas(["fintech", "pay"], target_country="US", n=15)

Notes:
- n: 1-50 (returns list[str] of TLD-less roots)
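- Adjust MODEL and GENERATION_TEMPERATURE in the config cell (then re-run the function cell, since defaults are bound at definition time), or pass model=/temperature= to generate_domain_ideas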
"""

In [None]:

from dataclasses import dataclass, field
from typing import List, Dict, Tuple
from openai import OpenAI
from dotenv import load_dotenv
from IPython.display import Markdown, display
import json
import re
from typing import Optional

In [None]:
# --- Cell 2: Config & Client

# Load environment variables from a .env file (it should contain OPENAI_API_KEY).
# This runs before the client is constructed below.
load_dotenv()

# Initialize the OpenAI client. No key is passed explicitly, so it relies on
# OPENAI_API_KEY being available in the environment.
# NOTE(review): the variable shares its name with the `openai` package; a later
# `import openai` in this notebook would rebind it.
openai = OpenAI()

# Default chat model (feel free to change to another chat model).
MODEL = "gpt-4o-mini"

# Sampling temperatures. GENERATION_TEMPERATURE is the default used when
# generating ideas; 0.8 favours variety over determinism — lower it for more
# repeatable output, raise it for wilder ideas.
GENERATION_TEMPERATURE = 0.8
# NOTE(review): SCORING_TEMPERATURE is not referenced in the visible cells —
# presumably reserved for a scoring step; confirm or remove.
SCORING_TEMPERATURE = 0.2


In [None]:

# System message sent with every generation request: it states the naming rules
# and instructs the model to reply with a JSON object holding a "domains" array,
# which is the shape generate_domain_ideas parses.
SYSTEM_PROMPT = """You are a helpful brand-naming assistant.
Generate **domain ROOT** ideas (no TLD like .com) that are:
- short (ideally 5–12 chars), pronounceable, and memorable
- alphanumeric only (no spaces or hyphens), start with a letter
- avoid famous trademarks and sensitive terms
- diverse styles: blends, portmanteau, slight misspellings, synonyms
Return ONLY valid JSON: {"domains": ["idea1", "idea2", ...]}"""

def _build_user_prompt(keywords: list[str], target_country: Optional[str], n: int) -> str:
    kws = ", ".join(keywords)
    country_line = f"Target country/market: {target_country}" if target_country else "Target country/market: (general/global)"
    return (
        "Given the keywords below, propose exactly "
        f"{n} short, brandable domain roots **without any TLD**.\n"
        f"Keywords: {kws}\n"
        f"{country_line}\n"
        "Constraints:\n"
        "- 1–2 syllables if possible\n"
        "- No hyphens/underscores/spaces\n"
        "- Avoid numbers unless they genuinely help memorability\n"
        "Output format: a JSON object with a single key 'domains' whose value is an array of strings."
    )

# Validity check applied to sanitized roots: one lowercase letter followed by
# 2–49 lowercase letters or digits.
_valid_root = re.compile(r"^[a-z][a-z0-9]{2,49}$")  # 3–50 chars, starts with letter

def _sanitize_root(s: str) -> str:
    s = s.strip().lower()
    # remove anything after a dot (accidental TLDs)
    s = s.split(".", 1)[0]
    # drop spaces and hyphens just in case
    s = s.replace(" ", "").replace("-", "")
    # keep only a–z0–9
    s = re.sub(r"[^a-z0-9]", "", s)
    # ensure starts with letter
    if s and not s[0].isalpha():
        s = re.sub(r"^[^a-z]+", "", s)
    return s

def _unique_preserve_order(items: list[str]) -> list[str]:
    seen = set()
    out = []
    for it in items:
        if it not in seen:
            seen.add(it)
            out.append(it)
    return out

In [None]:
# --- Cell 4: Core generator function (Chat Completions)

def generate_domain_ideas(
    keywords: list[str],
    target_country: Optional[str] = None,
    n: int = 20,
    *,
    model: str = MODEL,
    temperature: float = GENERATION_TEMPERATURE,
) -> list[str]:
    """
    Generate up to `n` domain ROOT ideas (no TLD) via one Chat Completions call.

    Parameters:
    - keywords: list of seed terms; a single string is treated as one keyword
    - target_country: optional market hint (e.g., 'India', 'US', 'DE')
    - n: number of ideas to request and the maximum returned (1–50)
    - model: chat model name (default bound to MODEL when this cell is run)
    - temperature: sampling temperature (default bound to GENERATION_TEMPERATURE
      when this cell is run)

    Returns:
    A de-duplicated list of sanitized roots in the model's order. It can hold
    fewer than `n` items (or none) when the reply is empty, has an unexpected
    shape, or contains entries that fail validation.

    Raises:
    ValueError: if no non-empty keyword is given or `n` is outside 1–50.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(keywords, str):
        keywords = [keywords]
    cleaned_keywords = [k.strip() for k in (keywords or []) if k.strip()]
    if not cleaned_keywords:
        raise ValueError("Provide at least one non-empty keyword.")

    count = int(n)
    if not (1 <= count <= 50):
        raise ValueError("`n` must be between 1 and 50.")

    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": _build_user_prompt(cleaned_keywords, target_country, count)},
    ]

    resp = openai.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature,
        response_format={"type": "json_object"},  # ask for strict JSON
    )

    # The message content may be None (e.g. a refusal); treat it as an empty
    # reply so parsing below yields no ideas instead of raising TypeError.
    content = resp.choices[0].message.content or ""

    # Try to parse JSON; only a decode failure triggers the naive fallback.
    ideas: list[str] = []
    try:
        data = json.loads(content)
    except json.JSONDecodeError:
        # Fallback: split lines / commas
        ideas = [piece for piece in re.split(r"[\n,]+", content) if piece.strip()]
    else:
        # Valid JSON with an unexpected shape leaves `ideas` empty.
        if isinstance(data, dict) and isinstance(data.get("domains"), list):
            ideas = [str(x) for x in data["domains"]]

    # Sanitize, validate, dedupe, and enforce count
    ideas = [_sanitize_root(x) for x in ideas]
    ideas = [x for x in ideas if _valid_root.match(x)]
    return _unique_preserve_order(ideas)[:count]


In [None]:
# Example invocation with law/chatbot-themed keywords and India as the market hint.
# NOTE: running this cell performs a Chat Completions request, so it needs the
# config/client and function cells to have been run first.
example_keywords = ["law", "gpt", "chatbot", "lawyer helper"]
# `ideas` is read by the display cell that follows.
ideas = generate_domain_ideas(example_keywords, target_country="India", n=20)


In [None]:
# Render the generated roots as a numbered Markdown list under a heading.
display(
    Markdown(
        "## Domain ideas (no TLD)\n"
        + "\n".join(f"{rank}. `{root}`" for rank, root in enumerate(ideas, start=1))
    )
)