In [1]:
!pip install -q openai wordfreq

You should consider upgrading via the '/Users/michaelryaboy/recent-projects/inference-webhook/venv/bin/python3 -m pip install --upgrade pip' command.

In [58]:
# 🟢 Cell 2 – imports & client
import os, re, asyncio
from typing import List, Literal
from openai import OpenAI
from pydantic import BaseModel, Field

# Read the key explicitly and fail fast when it is missing. Passing api_key=None
# would let the OpenAI client fall back to its default OPENAI_API_KEY lookup,
# and that key must not be sent to a third-party endpoint.
_api_key = os.getenv("INFERENCE_API_KEY")
if not _api_key:
    raise RuntimeError(
        "Set the INFERENCE_API_KEY environment variable before running this notebook."
    )

client = OpenAI(
    base_url="https://api.inference.net/v1",          # inference.net endpoint
    api_key=_api_key,
)
MODEL = "mistralai/mistral-nemo-12b-instruct/fp-8"

In [4]:
# Fetch the candidate domain list from the gist below; each line of the
# response body is treated as one domain.
import requests

DOMAINS_URL = "https://gist.githubusercontent.com/mrmps/fbf29299d4b4401e6cfd659fd637fe41/raw/10920268c7a806778b954568f88a1734ea59ca8a/gistfile1.txt"

# The timeout keeps the cell from hanging indefinitely, and raise_for_status()
# stops an HTTP error page from being parsed as if it were a list of domains.
response = requests.get(DOMAINS_URL, timeout=30)
response.raise_for_status()

raw_domains = response.text.split("\n")

import re
from wordfreq import zipf_frequency   # pip install wordfreq

VOWELS = set("aeiou")

def looks_good(domain: str) -> bool:
    """
    Heuristically decide whether `domain` is worth a closer look.

    The input is stripped and lower‑cased, then has to pass every filter below.

    Structure
      • exactly one dot, with a non‑empty TLD (any TLD is accepted)
      • SLD is 4–16 ASCII letters (no digits, hyphens, punycode)

    Character patterns in the SLD
      • first two letters differ (rejects “aafoo”)
      • no letter repeated three times in a row
      • no run of 4+ vowels or 4+ consonants (“y” counts as a consonant)
      • not made purely of 4+ consonant‑vowel pairs (“gubarecoti”)
      • vowel ratio between 0.20 and 0.80 inclusive

    Word‑likeness (either one is enough)
      • Zipf frequency of the whole SLD ≥ 1.2, or
      • some 4‑letter window of the SLD has Zipf frequency ≥ 3.0

    Returns True when the domain passes all filters, False otherwise.
    """

    domain = domain.strip().lower()

    # structural sanity -------------------------------------------------
    if domain.startswith("xn--") or domain.count(".") != 1:
        return False

    sld, _, tld = domain.partition(".")   # every non-empty tld is accepted
    if not tld:                            # "name." has one dot but no TLD
        return False
    if not 4 <= len(sld) <= 16:
        return False
    if not re.fullmatch(r"[a-z]+", sld):
        return False                        # digits / hyphens → out

    # quick character‑pattern checks -----------------------------------
    if sld[0] == sld[1]:                   # reject “aafoo”, “bbtech”
        return False
    if re.search(r"(.)\1\1", sld):         # any triple repeat
        return False
    if re.search(r"[aeiou]{4,}", sld) or re.search(r"[bcdfghjklmnpqrstvwxyz]{4,}", sld):
        return False
    if re.fullmatch(r"(?:[bcdfghjklmnpqrstvwxyz][aeiou]){4,}", sld):
        return False                       # gubarecoti‑style generator junk

    # vowel / consonant balance ----------------------------------------
    v = sum(c in VOWELS for c in sld)
    if v == 0 or v == len(sld):
        return False
    ratio = v / len(sld)
    if not (0.20 <= ratio <= 0.80):
        return False

    # “looks like a word” signal ---------------------------------------
    if zipf_frequency(sld, "en") >= 1.2:
        return True                        # good enough

    # fallback: does any 4‑letter window of the SLD look like a real word?
    return any(
        zipf_frequency(sld[i:i + 4], "en") >= 3.0
        for i in range(len(sld) - 3)
    )

# Keep only the raw entries that pass the heuristic filter, then show a summary.
domains = list(filter(looks_good, raw_domains))
print("Len Filtered list →", len(domains))
print("first 10 domains →", domains[:10])

Len Filtered list → 52002
first 10 domains → ['ababanews.net', 'ababeelrealtors.com', 'abacombs.com', 'abacotoner.com', 'abadcult.org', 'abannebebek.com', 'abanoteolo.it', 'abarcart.com', 'abarrotesbombin.click', 'abaseballtime.com']


In [55]:
import json
from pydantic import BaseModel, Field, ConfigDict


# 🟢 Cell 3 – Pydantic schema (reasoning BEFORE score)
class DomainValuation(BaseModel):
    # Structured-output model for one domain appraisal; its JSON schema is what
    # the request's response_format carries.
    # NOTE(review): documented with `#` comments rather than a docstring because
    # pydantic is expected to copy a class docstring into the generated schema
    # as a description — confirm before adding one.
    # Field order is deliberate: `reasoning` is declared first so that it
    # precedes `score` in the generated schema.
    reasoning: str  # free-text justification for the score
    score:  str     # typed as a string; presumably the 1–100 value from the scoring guide — TODO confirm

    model_config = ConfigDict(extra='forbid')   # no keys beyond the two fields above (additionalProperties: false)


# System prompt for the appraiser model: a ten-factor evaluation rubric, the
# scoring bands, and the required output order (reasoning first, then score).
# The trailing .strip() removes the leading and trailing newlines of the literal.
SYSTEM_PROMPT = """
You are a senior domain‑name appraiser with 15 years of aftermarket sales data.
First think through the valuation factors, then output JSON **in this exact order**:
reasoning → score.  Follow the schema.

Evaluation rubric (consider ALL factors – absence of any single factor prevents a perfect score):

1 • Extension quality – .com > .net ≈ .org ≻ others  
2 • Length – ideal ≤ 8 characters, shorter is better  
3 • Word count – single meaningful word > two‑word phrase; no fragments  
4 • Pronounceability – passes “radio test” (clear spelling upon hearing)  
5 • Memorability & brandability – evokes imagery or category, no hyphens/numbers  
6 • Search demand – high monthly exact‑match keyword volume is a plus  
7 • Commercial intent – aligns with lucrative verticals (finance, AI, health, etc.)  
8 • Trademarks – zero obvious TM conflicts; risky names are penalised  
9 • Comparable sales – recent public comps of similar length/keywords/TLD  
10 • Market trends – includes rising buzzwords or technologies without being ephemeral  

Scoring guide:

  90‑100  excellent – premium asset, likely ≥ $50 k  
  70‑89   good – solid brand, low‑five‑figure potential  
  40‑69   average – resale value a few hundred‑few thousand USD  
   1‑39   poor – little intrinsic value  

‘reasoning’ = concise bullets (≤ 15 words each) justifying the score (max 5).  
If unsure, round the score DOWN.
""".strip()

In [56]:
# Wrapper around the schema generated from the pydantic model: a schema name,
# the strict flag, and the JSON schema itself.
DOMAIN_SCHEMA = dict(
    name="domain_valuation",
    strict=True,
    schema=DomainValuation.model_json_schema(),
)

print(DOMAIN_SCHEMA)

{'name': 'domain_valuation', 'strict': True, 'schema': {'additionalProperties': False, 'properties': {'reasoning': {'title': 'Reasoning', 'type': 'string'}, 'score': {'title': 'Score', 'type': 'string'}}, 'required': ['reasoning', 'score'], 'title': 'DomainValuation', 'type': 'object'}}


In [53]:
def make_line(domain: str, custom_id: str):
    """Build one batch-request record asking the model to appraise `domain`.

    Args:
        domain: Domain name inserted into the user message.
        custom_id: Identifier stored under "custom_id" in the record.

    Returns:
        A dict with the keys "custom_id", "method", "url" and "body", where
        "body" is a chat-completions payload using MODEL, SYSTEM_PROMPT and
        DOMAIN_SCHEMA as a json_schema response format.
    """
    chat_messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": f"Domain: {domain}"},
    ]
    # structured outputs: the reply is constrained to DOMAIN_SCHEMA
    structured_output = {"type": "json_schema", "json_schema": DOMAIN_SCHEMA}
    request_body = dict(
        model=MODEL,
        messages=chat_messages,
        response_format=structured_output,
        temperature=0.3,
        max_tokens=1200,
    )
    return dict(
        custom_id=custom_id,
        method="POST",
        url="/v1/chat/completions",
        body=request_body,
    )

In [59]:
import json
MAX_LINES = 1
items = domains[:MAX_LINES]  # use your actual list variable

# Write the batch input file: one JSON request per line, built by make_line.
with open("batchinput.jsonl", "w", encoding="utf-8") as f:
    for i, dom in enumerate(items):
        line = make_line(dom.strip(), custom_id=str(i))
        f.write(f"{json.dumps(line)}\n")

# Upload the file with client.files.create. The handle is opened in a `with`
# block so it is closed as soon as the upload call returns.
with open("batchinput.jsonl", "rb") as upload_fh:
    batch_input_file = client.files.create(
        file=upload_fh,
        purpose="batch"
    )

print(batch_input_file)

FileObject(id='oxygPm0EmUcA6ZrgMy0vd', bytes=None, created_at=1753835100345, filename=None, object=None, purpose=None, status=None, expires_at=None, status_details=None)
