# Finance Briefing Generator
B-Gemini (full content) + Chirp 3 HD TTS (Alnilam voice)

See: `docs/workflow/2-prds/3-prd-finance-briefing-v1.md`

In [1]:
import os, time, json, re, wave
from pathlib import Path
from datetime import date, timedelta
from collections import Counter
from dotenv import load_dotenv
from supabase import create_client
from google import genai
from google.genai import types
from google.cloud import texttospeech

# Load env
env_path = Path('../.env.local') if Path('../.env.local').exists() else Path('.env.local')
load_dotenv(env_path)

sb = create_client(os.getenv('NEXT_PUBLIC_SUPABASE_URL'), os.getenv('SUPABASE_SERVICE_ROLE_KEY'))
gemini_client = genai.Client(api_key=os.getenv('GEMINI_API_KEY'))
chirp_client = texttospeech.TextToSpeechClient()

print('Connected: Supabase, Gemini, Google Cloud TTS')

Connected: Supabase, Gemini, Google Cloud TTS


## Query Today's Articles

In [2]:
today = date.today()

# 48h lookback window + briefed filter (curated articles excluded)
cutoff = today - timedelta(hours=48)

items = sb.table('wsj_items') \
    .select('id,title,description,feed_name,published_at,link,briefed') \
    .gte('published_at', cutoff.strftime('%Y-%m-%dT%H:%M:%S')) \
    .order('published_at', desc=True) \
    .execute()

# Filter out articles already curated in previous briefings
before_count = len(items.data)
items.data = [i for i in items.data if not i.get('briefed')]

print(f'Articles in window (48h): {len(items.data)} (filtered {before_count - len(items.data)} previously briefed)')
cats = Counter(i['feed_name'] for i in items.data)
for cat, count in cats.most_common():
    print(f'  {cat}: {count}')

Articles in window (48h): 70 (filtered 0 previously briefed)
  BUSINESS_MARKETS: 38
  WORLD: 14
  ECONOMY: 8
  TECH: 5
  POLITICS: 5


In [3]:
# Validate: show all article titles with category and time
print(f'{"#":>3}  {"Category":<20} {"Published":<12} Title')
print('-' * 100)
for i, item in enumerate(items.data, 1):
    cat = item['feed_name']
    pub = item['published_at'][:16].replace('T', ' ')
    title = item['title'][:65]
    print(f'{i:>3}  {cat:<20} {pub:<12} {title}')

  #  Category             Published    Title
----------------------------------------------------------------------------------------------------
  1  BUSINESS_MARKETS     2026-02-17 01:00 Starboard to Push for Big Shake-Up of Tripadvisor’s Board
  2  WORLD                2026-02-17 01:00 Britain’s Monarchy Can’t Escape the Shadow of the Epstein Scandal
  3  BUSINESS_MARKETS     2026-02-17 00:33 Anderson Cooper to Leave CBS News’s ‘60 Minutes’
  4  TECH                 2026-02-17 00:12 Irish Data Watchdog Opens Inquiry into X Over Grok AI Images
  5  BUSINESS_MARKETS     2026-02-16 23:45 BHP Exploring Infrastructure Deals as It Chases $10 Billion Targe
  6  BUSINESS_MARKETS     2026-02-16 22:39 Nancy Guthrie’s Family Cleared as Suspects in Disappearance
  7  BUSINESS_MARKETS     2026-02-16 21:53 Thomas Pritzker, Named in Epstein Files, Retires as Hyatt Executi
  8  WORLD                2026-02-16 21:36 The Canada School Shooting Touched Almost Everyone at a U.S.-Back
  9  BUSINESS_MARK

## Join with Crawl Results + LLM Analysis

In [4]:
item_ids = [i['id'] for i in items.data]

crawl_map = {}
for i in range(0, len(item_ids), 100):
    batch = item_ids[i:i+100]
    crawls = sb.table('wsj_crawl_results') \
        .select('id,wsj_item_id,content,crawl_status,relevance_score,relevance_flag,llm_same_event,llm_score,resolved_domain') \
        .in_('wsj_item_id', batch) \
        .eq('crawl_status', 'success') \
        .execute()
    for c in crawls.data:
        wid = c['wsj_item_id']
        if wid not in crawl_map or (c.get('llm_score') or 0) > (crawl_map[wid].get('llm_score') or 0):
            crawl_map[wid] = c

quality_crawl_ids = [c['id'] for c in crawl_map.values()]
llm_map = {}
for i in range(0, len(quality_crawl_ids), 100):
    batch = quality_crawl_ids[i:i+100]
    analyses = sb.table('wsj_llm_analysis') \
        .select('crawl_result_id,summary,key_entities,key_numbers,event_type,sentiment') \
        .in_('crawl_result_id', batch) \
        .execute()
    for a in analyses.data:
        llm_map[a['crawl_result_id']] = a

print(f'Articles with crawl results: {len(crawl_map)}')
print(f'Articles with LLM analysis: {len(llm_map)}')

Articles with crawl results: 46
Articles with LLM analysis: 40


In [5]:
# Validate: crawl + LLM join results per article
print(f'{"#":>3}  {"Crawl":>5} {"Rel":>5} {"LLM":>4} {"Same":>5} {"Domain":<25} Title')
print('-' * 110)
for i, item in enumerate(items.data, 1):
    wid = item['id']
    crawl = crawl_map.get(wid)
    has_llm = crawl and crawl['id'] in llm_map
    if crawl:
        rel = f'{crawl.get("relevance_score") or 0:.2f}'
        llm_s = f'{crawl.get("llm_score") or 0:.1f}'
        same = 'Y' if crawl.get('llm_same_event') else 'N'
        domain = (crawl.get('resolved_domain') or '')[:25]
    else:
        rel = llm_s = same = '-'
        domain = ''
    crawl_flag = 'OK' if crawl else 'MISS'
    llm_flag = '+' if has_llm else ''
    title = item['title'][:45]
    print(f'{i:>3}  {crawl_flag:>5} {rel:>5} {llm_s:>4} {same:>5} {domain:<25} {title}')

missing = [item['title'] for item in items.data if item['id'] not in crawl_map]
print(f'\n--- Missing crawl ({len(missing)}): ---')
for t in missing:
    print(f'  - {t}')

  #  Crawl   Rel  LLM  Same Domain                    Title
--------------------------------------------------------------------------------------------------------------
  1   MISS     -    -     -                           Starboard to Push for Big Shake-Up of Tripadv
  2   MISS     -    -     -                           Britain’s Monarchy Can’t Escape the Shadow of
  3   MISS     -    -     -                           Anderson Cooper to Leave CBS News’s ‘60 Minut
  4   MISS     -    -     -                           Irish Data Watchdog Opens Inquiry into X Over
  5   MISS     -    -     -                           BHP Exploring Infrastructure Deals as It Chas
  6   MISS     -    -     -                           Nancy Guthrie’s Family Cleared as Suspects in
  7   MISS     -    -     -                           Thomas Pritzker, Named in Epstein Files, Reti
  8   MISS     -    -     -                           The Canada School Shooting Touched Almost Eve
  9   MISS     -    -     -  

## LLM Curation — Pick Top Stories

In [6]:
# LLM curation: Pick top 10-15 high-impact articles
# Strategy: try Pro (3 attempts), fallback to Flash if Pro keeps failing

CURATION_PROMPT = """You are a senior financial news editor. From the article list below, pick the 10-15 most important stories that deserve deep coverage in a daily briefing.

Selection criteria (in priority order):
1. Macroeconomic impact: interest rates, inflation, GDP, employment, central bank decisions
2. AI/Tech major moves: big product launches, regulatory shifts, large deals, industry trends
3. Market-wide impact: major M&A, significant earnings beats/misses, policy changes
4. Geopolitical events with direct market implications

Mandatory inclusion:
- ALWAYS include ALL articles tagged [TECH] or related to AI/technology, unless they are purely lifestyle/opinion pieces with no market relevance.

Exclusion:
- SKIP executive personnel stories (CEO/CFO/lawyer hired, fired, stepped down, pay raises) unless the departure signals a major corporate crisis or strategic shift.
- SKIP "Roundup: Market Talk" digest articles — they are low-value summaries.

Rules:
- If multiple articles cover the same event, pick only the one with the richest detail.
- Return ONLY a JSON array of article numbers (1-indexed), nothing else.
- No explanation, no text before or after. Just the array.
- Example: [3, 7, 12, 15, 22, 28, 33, 41, 45, 50, 55]
"""

# Build article list for curation
curation_lines = []
for i, item in enumerate(items.data, 1):
    wid = item['id']
    crawl = crawl_map.get(wid)
    llm = llm_map.get(crawl['id']) if crawl and crawl['id'] in llm_map else {}
    entities = ', '.join(llm.get('key_entities', []))[:80]
    line = f"{i}. [{item['feed_name']}] {item['title']}"
    if item.get('description'):
        line += f" — {item['description'][:120]}"
    if entities:
        line += f" (Entities: {entities})"
    curation_lines.append(line)

curation_input = CURATION_PROMPT + "\n\n" + "\n".join(curation_lines)

CURATION_THINKING_BUDGET = 1024

def try_curation(model, config, label):
    """Try curation with given model. Returns raw text or None."""
    try:
        resp = gemini_client.models.generate_content(
            model=model, contents=curation_input, config=config,
        )
        # Try .text first
        try:
            if resp.text:
                return resp.text, resp
        except Exception:
            pass
        # Manual extraction from parts
        for cand in (resp.candidates or []):
            for part in (cand.content.parts if cand.content and cand.content.parts else []):
                if hasattr(part, 'thought') and part.thought:
                    continue
                if hasattr(part, 'text') and part.text:
                    return part.text, resp
        print(f'\n  {label}: empty response')
        return None, resp
    except Exception as e:
        print(f'\n  {label}: {e.__class__.__name__}: {e}')
        return None, None

print(f'Curating {len(items.data)} articles...', end='')
start = time.time()

raw = None
resp = None

# Try Pro up to 3 times
for attempt in range(3):
    print(f' Pro({attempt+1})', end='')
    raw, resp = try_curation(
        'gemini-2.5-pro',
        types.GenerateContentConfig(
            max_output_tokens=4096,
            temperature=0.1,
            thinking_config=types.ThinkingConfig(thinking_budget=CURATION_THINKING_BUDGET),
        ),
        f'Pro attempt {attempt+1}',
    )
    if raw:
        break
    time.sleep(2)

# Fallback to Flash
if not raw:
    print(' -> Flash fallback', end='')
    raw, resp = try_curation(
        'gemini-2.5-flash',
        types.GenerateContentConfig(
            max_output_tokens=1024,
            temperature=0.0,
        ),
        'Flash',
    )

if not raw:
    raise ValueError('Curation failed on both Pro and Flash')

raw = raw.strip()
print(f'\nRaw: {raw}')

# Parse JSON array
cleaned = re.sub(r'```json\s*', '', raw)
cleaned = re.sub(r'```\s*', '', cleaned).strip()
match = re.search(r'\[[\d,\s]+\]', cleaned)
if not match:
    raise ValueError(f'Could not parse JSON array: {raw}')

curated_indices = json.loads(match.group())
curated_ids = set(items.data[i - 1]['id'] for i in curated_indices if 1 <= i <= len(items.data))

elapsed = time.time() - start
model_used = 'Pro' if 'pro' in (resp.model_version or '') else ('Flash' if resp else '?')

# Save token usage for summary
um = resp.usage_metadata
curation_usage = {
    'step': 'Curation',
    'model': resp.model_version if resp else '?',
    'thinking_budget': CURATION_THINKING_BUDGET,
    'thinking_used': getattr(um, 'thinking_token_count', None) or getattr(um, 'thoughts_token_count', None) or 0,
    'input_tokens': um.prompt_token_count if resp else 0,
    'output_tokens': um.candidates_token_count if resp else 0,
    'elapsed': elapsed,
}

print(f'{elapsed:.1f}s — picked {len(curated_ids)} articles')
print(f'Curation model: {model_used} ({resp.model_version})')
print(f'Thinking: {curation_usage["thinking_used"]:,} / {CURATION_THINKING_BUDGET:,} ({curation_usage["thinking_used"]/CURATION_THINKING_BUDGET*100:.0f}% used)')

print(f'\nCurated articles:')
for i in sorted(curated_indices):
    if 1 <= i <= len(items.data):
        item = items.data[i - 1]
        print(f'  {i:>3}. [{item["feed_name"]:<20}] {item["title"][:70]}')

Curating 70 articles... Pro(1)
Raw: [4, 10, 12, 14, 15, 22, 23, 24, 33, 37, 42, 46, 49, 55, 65]
8.1s — picked 15 articles
Curation model: Pro (gemini-2.5-pro)
Thinking: 774 / 1,024 (76% used)

Curated articles:
    4. [TECH                ] Irish Data Watchdog Opens Inquiry into X Over Grok AI Images
   10. [BUSINESS_MARKETS    ] Nasdaq Futures Fall Ahead of Busy Week
   12. [POLITICS            ] The Newest Old Tech in Warfare: Balloons
   14. [BUSINESS_MARKETS    ] Marketplaces Are the Next Frontier in Publisher Deals With AI Companie
   15. [TECH                ] With a Frugal AI Strategy, India Offers Blueprint for Developing World
   22. [BUSINESS_MARKETS    ] ECB’s Move to Boost Euro’s Global Role Looks Positive For Currency
   23. [ECONOMY             ] Eurozone Industrial Output Declines
   24. [BUSINESS_MARKETS    ] Hapag-Lloyd in Advanced Talks Over Potential Acquisition of Israeli Ri
   33. [ECONOMY             ] Japan’s Narrow Growth Tests Fiscal, Monetary Policy Paths
   3

## Filter & Assemble Input (Full Content)

In [9]:
RELEVANCE_THRESHOLD = 0.6

articles = []
quality_count = 0
curated_full_count = 0
title_only_count = 0

for item in items.data:
    wid = item['id']
    crawl = crawl_map.get(wid)
    is_curated = wid in curated_ids

    has_quality = False
    if crawl:
        score = crawl.get('relevance_score') or 0
        same_event = crawl.get('llm_same_event', False)
        has_quality = score >= RELEVANCE_THRESHOLD or same_event

    base = {
        'id': wid,
        'title': item['title'],
        'description': item.get('description') or '',
        'category': item['feed_name'],
        'published_at': item.get('published_at', ''),
        'has_quality_crawl': has_quality,
        'is_curated': is_curated,
    }

    if has_quality:
        quality_count += 1
        llm = llm_map.get(crawl['id'], {})
        # Curated articles get full content, others get [:800]
        content = crawl.get('content') or ''
        if is_curated:
            curated_full_count += 1
        else:
            content = content[:800]
        articles.append({**base,
            'content': content,
            'key_entities': llm.get('key_entities', []),
            'key_numbers': llm.get('key_numbers', []),
            'event_type': llm.get('event_type', ''),
        })
    else:
        title_only_count += 1
        articles.append(base)

print(f'Total articles: {len(articles)}')
print(f'  Quality crawl: {quality_count}')
print(f'    Curated (full content): {curated_full_count}')
print(f'    Standard ([:800]):      {quality_count - curated_full_count}')
print(f'  Title+desc only: {title_only_count}')

Total articles: 70
  Quality crawl: 30
    Curated (full content): 8
    Standard ([:800]):      22
  Title+desc only: 40


In [8]:
# Validate: final article list — curated vs standard vs title-only
print(f'{"#":>3}  {"Tier":>8} {"Chars":>6} {"Category":<20} {"Entities":<30} Title')
print('-' * 115)
for i, a in enumerate(articles, 1):
    if a.get('is_curated') and a.get('has_quality_crawl'):
        tier = 'CURATED'
    elif a.get('has_quality_crawl'):
        tier = 'STD'
    else:
        tier = 'TITLE'
    chars = f'{len(a.get("content", "")):,}' if a.get('content') else '-'
    cat = a['category']
    entities = ', '.join(a.get('key_entities', []))[:30]
    title = a['title'][:42]
    print(f'{i:>3}  {tier:>8} {chars:>6} {cat:<20} {entities:<30} {title}')

curated_chars = sum(len(a.get('content', '')) for a in articles if a.get('is_curated'))
std_chars = sum(len(a.get('content', '')) for a in articles if a.get('has_quality_crawl') and not a.get('is_curated'))
print(f'\n--- Summary ---')
print(f'CURATED (full content):  {sum(1 for a in articles if a.get("is_curated") and a.get("has_quality_crawl"))} articles, {curated_chars:,} chars')
print(f'STD ([:800]):            {sum(1 for a in articles if a.get("has_quality_crawl") and not a.get("is_curated"))} articles, {std_chars:,} chars')
print(f'TITLE only:              {sum(1 for a in articles if not a.get("has_quality_crawl"))}')
print(f'Total input:             ~{(curated_chars + std_chars) // 4:,} tokens (content only)')

  #      Tier  Chars Category             Entities                       Title
-------------------------------------------------------------------------------------------------------------------
  1     TITLE      - BUSINESS_MARKETS                                    Starboard to Push for Big Shake-Up of Trip
  2     TITLE      - WORLD                                               Britain’s Monarchy Can’t Escape the Shadow
  3     TITLE      - BUSINESS_MARKETS                                    Anderson Cooper to Leave CBS News’s ‘60 Mi
  4     TITLE      - TECH                                                Irish Data Watchdog Opens Inquiry into X O
  5     TITLE      - BUSINESS_MARKETS                                    BHP Exploring Infrastructure Deals as It C
  6     TITLE      - BUSINESS_MARKETS                                    Nancy Guthrie’s Family Cleared as Suspects
  7     TITLE      - BUSINESS_MARKETS                                    Thomas Pritzker, Named in Epstein Fi

## Build Prompt (V1 Formal / V2 Friendly / V3 Balanced)

In [10]:

# --- V2: Casual/friendly podcast tone --- (ACTIVE)
BRIEFING_SYSTEM_FRIENDLY = """You are the host of a daily finance podcast that's smart but never stuffy. Think of yourself as that friend who reads everything and gives you the rundown over coffee — sharp, a little witty, and genuinely interested in making sense of the chaos.

You will receive ~40–90 news items; each has a title and description, and roughly half include crawled content plus extracted entities and key numbers.

Thinking process (use your thinking capacity before writing):
1. Scan & Sort: Identify the top 5 stories with the biggest market impact.
2. Date Check: Verify the current date from the "Date:" header — use that exact date and day of week in your greeting.
3. Group: Cluster related stories (e.g., all inflation/Fed news together, all AI news together).
4. Flow: Plan smooth transitions between these clusters so it doesn't sound like a list.
5. Count: You MUST cover exactly 24–28 articles. Plan which ones to cover deeply and which to mention briefly.

Constraints:
1,800–2,000 words (approximately 12–13 minutes at ~150 wpm).
You MUST reference or mention 24–28 articles total. Top 8–12 get deep coverage, the rest get brief contextual mentions.
Output strictly plain text. No markdown, no bullet points, no numbered lists, no section headers or labels.

Opening structure:
Open with a casual, warm greeting and today's date (use the exact date and day of week from the "Date:" header — do NOT guess).
Right after the greeting, naturally preview the 3-5 key topics you'll cover in one or two sentences.
Don't list them — weave them into conversation ("Today we've got the latest inflation numbers, a wild AI story out of the Pentagon, and a trade policy curveball that could hit your grocery bill").

Factual integrity (CRITICAL — follow strictly):
Your ONLY source of truth is the provided articles. Do NOT use your training data or background knowledge to fill in gaps.
NEVER add specific numbers, dates, percentages, dollar amounts, or conditions that are not explicitly stated in the provided content.
NEVER dramatize with analogies like "like a movie," "like science fiction," or "unprecedented" unless the source itself uses that language.
If a detail is interesting but not in the source, leave it out entirely. A shorter, accurate briefing is always better than a longer one with invented details.
Do not infer or estimate figures — if the source says "rate cuts expected" but doesn't say "June" or "25 basis points," do not add those specifics.
Do not quote or paraphrase statements that are not in the provided content, even if you believe they are accurate from other sources.
Market data freshness: Each article includes a "Published" timestamp. For market data (index levels, yields, oil prices, commodity prices), use ONLY figures from the most recent trading day's close. If older articles mention different numbers, ignore them in favor of the latest data.

Style rules:
Write like you talk. Short punchy sentences. Then a longer one when you need to unpack something properly.
Use rhetorical questions to pull listeners in — "So why does this matter?", "Guess what happened next?"
It's fine to have a reaction — "That's a big deal," "Not great, honestly," "This one's interesting" — but don't force it. Keep it natural.
Transitions should flow like conversation, not a teleprompter. Connect stories through cause and effect ("Speaking of inflation, here's where it gets spicy...").
You can be lighthearted, but never flippant about serious topics like layoffs or geopolitical crises.

Editorial rules:
Don't read headlines one by one. Deduplicate immediately — merge overlapping coverage of the same event into one narrative using the richest details available.
Prioritize stories with specific figures, named entities, tickers, timing, and measurable market moves. Use those details naturally.
Spend ~60–70% of the script on the top 8–12 highest-impact stories. Cover the remaining 12–20 as brief contextual mentions (1–2 sentences each) without turning into a list.
For title/description-only items (no crawled content), keep it to 1–2 cautious sentences — do not invent details.
You MUST cover 24–28 articles total. Count them as you write. If you're under 24, add more brief mentions. If you're over 28, cut the least important ones.
Entities and numbers are extracted hints — only mention them when clearly supported by the source material and relevant to why the story matters.
If sources conflict or details are uncertain, be honest about it ("Reports are a bit mixed on this one") rather than picking a side.
After major story arcs, add a short "what to watch" only if supported by the provided items.

Chapter markers (for navigation — IMPORTANT):
Before each new topic cluster, insert a marker on its own line: [CHAPTER: Topic Name]
Use short, clear topic names (e.g., "Fed & Inflation", "AI & Big Tech", "Energy & Oil", "Market Snapshot").
The opening greeting/agenda should be marked as [CHAPTER: Opening].
These markers will be stripped before audio generation — they do NOT affect your script's flow or tone.
Continue writing naturally as before; just add the marker line where the topic transition happens.

Closing structure:
After the main stories, wrap up with a quick "market snapshot" summarizing key numbers: major indexes (S&P 500, Nasdaq, Dow), Treasury yields (10-year), oil (Brent), gold, etc.
Only include items where the provided articles contain actual figures. Do not invent data.
CRITICAL: Only use market figures from articles published on the SAME trading day as today's date. If today is a weekend or holiday and no same-day market data exists, skip the market snapshot entirely — do NOT use stale figures from previous days.
After the market snapshot (or after the main stories if snapshot is skipped), note how many articles you covered out of the total (e.g., "We hit about X of today's Y stories").
End with a brief, warm sign-off."""


# ---- Active prompt: FRIENDLY ----
BRIEFING_SYSTEM = BRIEFING_SYSTEM_FRIENDLY

def format_article(article):
    parts = [f"[{article['category']}] {article['title']}"]
    if article.get('published_at'):
        parts.append(f"  Published: {article['published_at'][:16].replace('T', ' ')}")
    if article.get('description'): parts.append(f"  Desc: {article['description']}")
    if article.get('content'): parts.append(f"  Content: {article['content']}")
    if article.get('key_entities'): parts.append(f"  Entities: {', '.join(article['key_entities'])}")
    if article.get('key_numbers'): parts.append(f"  Numbers: {', '.join(str(n) for n in article['key_numbers'])}")
    return '\n'.join(parts)

today_str = today.strftime('%A, %B %d, %Y')
articles_text = f"Date: {today_str}\nToday's articles ({len(articles)} total):\n\n" + '\n\n'.join(format_article(a) for a in articles)
full_prompt = BRIEFING_SYSTEM + "\n\n" + articles_text

print(f'Using: FRIENDLY (V2)')
print(f'System prompt: {len(BRIEFING_SYSTEM.split())} words')
print(f'Articles: {len(articles_text):,} chars (~{len(articles_text)//4:,} tokens)')

Using: FRIENDLY (V2)
System prompt: 956 words
Articles: 67,787 chars (~16,946 tokens)


## Save Prompt + Articles Input

In [11]:
# Save articles input with friendly prompt
today_file = date.today().strftime('%Y-%m-%d')
text_dir = Path('../notebooks/tts_outputs/text')
text_dir.mkdir(exist_ok=True)

path = text_dir / f'articles-input-friendly-{today_file}.txt'
with open(str(path), 'w') as f:
    f.write(f'SYSTEM PROMPT — FRIENDLY ({len(BRIEFING_SYSTEM.split())} words):\n')
    f.write('=' * 80 + '\n')
    f.write(BRIEFING_SYSTEM + '\n\n')
    f.write('=' * 80 + '\n')
    f.write(f'ARTICLES INPUT ({len(articles)} articles)\n')
    f.write('=' * 80 + '\n\n')
    f.write(articles_text)
print(f'Saved: {path.name} ({path.stat().st_size / 1024:.0f}KB)')

Saved: articles-input-friendly-2026-02-16.txt (73KB)


## Generate Briefing (Gemini Flash + Pro)

In [12]:
# Generate briefing — Friendly only (Gemini 2.5 Pro)
today_file = date.today().strftime('%Y-%m-%d')
text_dir = Path('../notebooks/tts_outputs/text')

EN_THINKING_BUDGET = 4096

full = BRIEFING_SYSTEM + "\n\n" + articles_text
print(f'Generating [friendly] (Gemini 2.5 Pro, thinking={EN_THINKING_BUDGET:,})...', end=' ')
start = time.time()

resp = gemini_client.models.generate_content(
    model='gemini-2.5-pro',
    contents=full,
    config=types.GenerateContentConfig(
        max_output_tokens=8192,
        temperature=0.6,
        thinking_config=types.ThinkingConfig(thinking_budget=EN_THINKING_BUDGET),
    ),
)

briefing_pro = resp.text
elapsed = time.time() - start

# --- Extract chapter markers and clean text ---
import re, json as _json

chapter_pattern = re.compile(r'\[CHAPTER:\s*(.+?)\]\s*\n?')
matches = list(chapter_pattern.finditer(briefing_pro))

# Build chapters list with position as ratio (0.0-1.0)
briefing_clean = chapter_pattern.sub('', briefing_pro)
total_len = len(briefing_clean)
chapters_en = []
for m in matches:
    # Calculate position in cleaned text
    preceding = chapter_pattern.sub('', briefing_pro[:m.start()])
    pos = len(preceding) / total_len if total_len > 0 else 0.0
    chapters_en.append({"title": m.group(1).strip(), "position": round(pos, 4)})

print(f'Chapters found: {len(chapters_en)}')
for ch in chapters_en:
    print(f'  {ch["position"]:.1%} — {ch["title"]}')

# Save cleaned text (no markers) for TTS
txt_path = text_dir / f'briefing-pro-friendly-{today_file}.txt'
with open(str(txt_path), 'w') as f:
    f.write(briefing_clean)

# Save chapter metadata
ch_path = text_dir / f'chapters-en-{today_file}.json'
with open(str(ch_path), 'w') as f:
    _json.dump(chapters_en, f, indent=2, ensure_ascii=False)
print(f'Saved: {ch_path.name}')

# Use cleaned text for downstream (TTS)
briefing_pro = briefing_clean

# Save token usage for summary
en_usage = {
    'step': 'EN Briefing',
    'model': resp.model_version,
    'thinking_budget': EN_THINKING_BUDGET,
    'thinking_used': getattr(resp.usage_metadata, 'thinking_token_count', 0) or 0,
    'input_tokens': resp.usage_metadata.prompt_token_count,
    'output_tokens': resp.usage_metadata.candidates_token_count,
    'elapsed': elapsed,
}

print(f'{elapsed:.1f}s')
print(f'Words: {len(briefing_pro.split())} | Chars: {len(briefing_pro):,}')
print(f'Tokens in: {en_usage["input_tokens"]:,} | out: {en_usage["output_tokens"]:,}')
print(f'Thinking: {en_usage["thinking_used"]:,} / {EN_THINKING_BUDGET:,} ({en_usage["thinking_used"]/EN_THINKING_BUDGET*100:.0f}% used)')
print(f'Saved: {txt_path.name}')
print(f'\nPreview: {briefing_pro[:300]}...')

Generating [friendly] (Gemini 2.5 Pro, thinking=4,096)... Chapters found: 6
  0.0% — Opening
  5.6% — The Economy's Pulse
  33.6% — AI & The Disruption Engine
  53.8% — Geopolitics & Global Trade
  76.5% — Deals, Departures & Corporate Drama
  90.8% — Market Snapshot
Saved: chapters-en-2026-02-16.json
43.4s
Words: 1618 | Chars: 9,822
Tokens in: 17,248 | out: 2,203
Thinking: 0 / 4,096 (0% used)
Saved: briefing-pro-friendly-2026-02-16.txt

Preview: Good morning, and welcome to the show. It's Monday, February 16th, 2026. Hope you had a great weekend. Here in the U.S., it's Presidents Day, so markets are quiet, but the news cycle, as always, is not. Today, we're looking at some fresh inflation numbers that are making economists whisper that magi...


## TTS: Chirp 3 HD (Alnilam Voice)

In [14]:
today_str = date.today().strftime('%Y-%m-%d')
output_dir = Path('../notebooks/tts_outputs/audio')
output_dir.mkdir(exist_ok=True)

# Clean text: remove non-ASCII symbols Gemini may insert
clean_text = re.sub(r'[^\x00-\x7F]+', ' ', briefing_pro)
clean_text = re.sub(r'\s+', ' ', clean_text).strip()

print(f'Briefing: {len(clean_text):,} chars ({len(clean_text.split())} words)')
print('Generating Chirp 3 HD (Alnilam)...', end=' ')

start = time.time()

MAX_CHARS = 4000
MAX_SENTENCE = 500

raw_sentences = re.split(r'(?<=[.!?])\s+', clean_text)
sentences = []
for s in raw_sentences:
    if len(s) <= MAX_SENTENCE:
        sentences.append(s)
    else:
        parts = re.split(r'(?<=[,;])\s+', s)
        current = ''
        for part in parts:
            if len(current) + len(part) + 1 > MAX_SENTENCE and current:
                sentences.append(current.strip())
                current = part
            else:
                current = (current + ' ' + part).strip()
        if current:
            sentences.append(current)

chunks = []
current = ''
for s in sentences:
    if len(current) + len(s) + 1 > MAX_CHARS and current:
        chunks.append(current.strip())
        current = s
    else:
        current = (current + ' ' + s).strip()
if current:
    chunks.append(current.strip())

print(f'({len(chunks)} chunks, {len(sentences)} sentences)...', end=' ')

audio_parts = []
for i, chunk_text in enumerate(chunks):
    for attempt in range(3):
        try:
            resp = chirp_client.synthesize_speech(
                input=texttospeech.SynthesisInput(text=chunk_text),
                voice=texttospeech.VoiceSelectionParams(
                    language_code='en-US',
                    name='en-US-Chirp3-HD-Alnilam',
                ),
                audio_config=texttospeech.AudioConfig(
                    audio_encoding=texttospeech.AudioEncoding.LINEAR16,
                    sample_rate_hertz=24000,
                    speaking_rate=1.0,
                ),
            )
            audio_parts.append(resp.audio_content)
            print(f'{i+1}', end=' ')
            break
        except Exception as e:
            if attempt < 2:
                wait = 2 ** (attempt + 1)
                print(f'\n  Chunk {i+1} failed ({e.__class__.__name__}), retrying in {wait}s...', end=' ')
                time.sleep(wait)
            else:
                raise RuntimeError(f'Chunk {i+1} failed after 3 attempts: {e}') from e

elapsed = time.time() - start

import io
all_pcm = b''
for part in audio_parts:
    with wave.open(io.BytesIO(part), 'rb') as wf:
        all_pcm += wf.readframes(wf.getnframes())

out_path = output_dir / f'chirp3-en-pro-friendly-{today_str}.wav'
with wave.open(str(out_path), 'wb') as wf:
    wf.setnchannels(1)
    wf.setsampwidth(2)
    wf.setframerate(24000)
    wf.writeframes(all_pcm)

size_kb = out_path.stat().st_size / 1024
duration_sec = len(all_pcm) / (24000 * 2)
print(f'\nDone! {elapsed:.1f}s, {size_kb:.0f}KB (~{duration_sec/60:.1f}min)')
print(f'Saved: {out_path}')

Briefing: 9,805 chars (1634 words)
Generating Chirp 3 HD (Alnilam)... (3 chunks, 100 sentences)... 1 2 3 
Done! 98.8s, 28005KB (~10.0min)
Saved: ../notebooks/tts_outputs/audio/chirp3-en-pro-friendly-2026-02-16.wav


In [15]:
# Generate Korean briefing directly from articles (not translation)
# Uses the same articles_text input but with a Korean system prompt (V2 friendly style)

BRIEFING_SYSTEM_KO = """당신은 매일 금융 뉴스 팟캐스트의 진행자입니다. 똑똑하지만 딱딱하지 않은, 모든 뉴스를 읽고 커피 한 잔 하면서 핵심을 정리해주는 친구 같은 존재입니다.

약 40~90개의 뉴스 항목을 받게 됩니다. 각 항목에는 제목과 설명이 포함되어 있으며, 약 절반은 크롤링된 콘텐츠와 핵심 인물/수치가 포함되어 있습니다.

사고 과정 (작성 전에 thinking을 활용하세요):
1. 스캔 & 정렬: 시장 영향이 가장 큰 상위 5개 스토리를 파악하세요.
2. 날짜 확인: "Date:" 헤더에서 정확한 날짜와 요일을 확인하세요 — 인사말에 그대로 사용하세요.
3. 그룹핑: 관련 스토리를 묶으세요 (예: 인플레이션/연준 뉴스, AI 뉴스 등).
4. 흐름: 그룹 간 자연스러운 전환을 계획하세요 — 목록처럼 들리면 안 됩니다.
5. 카운트: 반드시 24~28개 기사를 다뤄야 합니다. 깊게 다룰 것과 간략히 언급할 것을 계획하세요.

필수 조건:
- 1,800~2,000단어 분량 (150wpm 기준 약 12~13분).
- 반드시 24~28개 기사를 언급해야 합니다. 상위 8~12개는 깊게, 나머지는 간략한 맥락적 언급으로.
- 순수 텍스트만 출력. 마크다운, 글머리 기호, 번호 목록, 섹션 헤더 없이.

오프닝 구조:
- 편안하고 따뜻한 인사와 오늘 날짜로 시작하세요 ("Date:" 헤더의 정확한 날짜와 요일을 사용 — 추측하지 마세요).
- 인사 직후, 오늘 다룰 핵심 토픽 3~4개를 한두 문장으로 자연스럽게 미리 알려주세요.
- 목록처럼 나열하지 말고, 대화체로 ("오늘은 A, B, 그리고 좀 놀라운 C 이야기까지 준비했어요").

팩트 무결성 (반드시 준수):
- 제공된 기사만이 유일한 사실 출처입니다. 학습 데이터나 배경 지식으로 빈칸을 채우지 마세요.
- 제공된 콘텐츠에 명시되지 않은 구체적 수치, 날짜, 퍼센트, 달러 금액, 조건을 절대 추가하지 마세요.
- "영화 같은", "SF 같은", "전례 없는" 같은 과장 비유를 쓰지 마세요 — 소스 자체가 그런 표현을 사용한 경우에만 허용됩니다.
- 소스에 없는 흥미로운 디테일이 있어도 빼세요. 짧지만 정확한 브리핑이 길지만 지어낸 내용이 있는 브리핑보다 항상 낫습니다.
- 수치를 추론하거나 추정하지 마세요 — 소스가 "금리 인하 기대"라고만 했으면 "6월", "25bp" 같은 구체적 시점이나 수치를 넣지 마세요.
- 제공된 콘텐츠에 없는 발언을 인용하거나 의역하지 마세요 — 다른 출처에서 정확하다고 믿더라도 넣지 마세요.
- 시장 데이터 최신성: 각 기사에는 "Published" 타임스탬프가 포함되어 있습니다. 시장 데이터(지수, 금리, 유가, 원자재 가격)는 가장 최근 거래일 마감 시점의 수치만 사용하세요. 이전 기사에 다른 수치가 있으면 최신 데이터를 우선하세요.

스타일 규칙:
- 말하듯이 쓰세요. 짧고 임팩트 있는 문장. 그리고 뭔가를 제대로 풀어야 할 때는 좀 더 긴 문장으로.
- 해요체를 사용하세요 — 격식체(합니다)가 아닌, 자연스러운 대화체로.
- 청취자를 끌어들이는 수사적 질문을 활용하세요 — "그래서 이게 왜 중요할까요?", "다음에 무슨 일이 벌어졌는지 아세요?"
- 리액션을 넣어도 좋습니다 — "이건 꽤 큰 일이에요", "솔직히 별로 좋지 않죠", "이 부분이 흥미롭습니다" — 하지만 억지로 넣지는 마세요. 자연스럽게.
- 전환은 텔레프롬프터가 아닌 대화처럼 흘러가야 합니다. 인과관계로 연결하세요 ("인플레이션 얘기가 나왔으니까, 여기서 좀 재밌어지는데요...").
- 해고나 지정학적 위기 같은 심각한 주제에 대해서는 가볍게 넘기지 마세요.

편집 규칙:
- 헤드라인을 하나씩 읽지 마세요. 같은 사건의 중복 보도는 즉시 합쳐서 가장 풍부한 내용으로 하나의 서사로 만드세요.
- 구체적 수치, 인물, 티커, 시점, 시장 움직임이 있는 항목을 우선하세요. 자연스럽게 녹여내세요.
- 스크립트의 60~70%를 영향력이 큰 상위 8~12개 스토리에 할애하세요. 나머지 12~20개는 간략한 맥락적 언급(1~2문장)으로.
- 제목/설명만 있는 항목(크롤링 콘텐츠 없음)은 1~2문장으로 신중하게 — 세부 사항을 만들어내지 마세요.
- 반드시 24~28개 기사를 다뤄야 합니다. 작성하면서 세세요. 24개 미만이면 간략한 언급을 추가하고, 28개 초과면 중요도가 낮은 것을 빼세요.
- 핵심 인물과 수치는 출처에서 명확히 뒷받침될 때만 언급하세요.
- 출처가 상충하면 솔직하게 ("이 부분은 보도가 좀 엇갈리고 있어요") 말하세요.
- 주요 스토리 후, 제공된 항목으로 뒷받침되는 경우에만 짧은 "주목할 점"을 추가하세요.
- 회사명, 티커, 고유명사는 영어 그대로 사용하세요 (예: Goldman Sachs, S&P 500, CPI).
- 달러 금액은 자연스럽게 변환하세요: "$42 million" → "4,200만 달러".

챕터 마커 (네비게이션용 — 중요):
각 새로운 토픽 그룹이 시작될 때, 별도 줄에 마커를 삽입하세요: [CHAPTER: 토픽 이름]
짧고 명확한 토픽 이름을 사용하세요 (예: "연준 & 인플레이션", "AI & 빅테크", "에너지 & 유가", "마켓 스냅샷").
오프닝 인사/어젠다는 [CHAPTER: 오프닝]으로 마킹하세요.
이 마커들은 오디오 생성 전에 제거됩니다 — 스크립트의 흐름이나 톤에 영향을 주지 않습니다.
기존처럼 자연스럽게 작성하되, 토픽 전환이 일어나는 곳에 마커 줄만 추가하세요.

클로징 구조:
- 본문이 끝나면, "마켓 스냅샷"으로 주요 시장 숫자를 간단히 정리하세요: 주요 지수(S&P 500, Nasdaq, Dow), 국채 금리(10yr Treasury), 유가(Brent), 금 가격 등.
- 제공된 기사에 수치가 포함된 항목만 언급하세요. 데이터가 없는 항목은 넣지 마세요.
- 중요: 오늘 날짜와 같은 거래일에 발행된 기사의 시장 수치만 사용하세요. 주말이나 휴일이라 당일 시장 데이터가 없으면 마켓 스냅샷을 통째로 건너뛰세요 — 이전 날의 오래된 수치를 사용하지 마세요.
- 마켓 스냅샷 후 (또는 스냅샷을 건너뛴 경우 본문 후) 전체 기사 수 대비 다룬 기사 수를 언급하세요 (예: "오늘 총 Y개 기사 중 약 X개를 다뤘습니다").
- 간단한 마무리 인사로 끝내세요."""

KO_THINKING_BUDGET = 4096

print(f'Generating Korean briefing (Gemini 2.5 Pro, thinking={KO_THINKING_BUDGET:,})...')
print(f'Articles: {len(articles_text):,} chars')
start = time.time()

ko_full_prompt = BRIEFING_SYSTEM_KO + "\n\n" + articles_text

ko_resp = gemini_client.models.generate_content(
    model='gemini-2.5-pro',
    contents=ko_full_prompt,
    config=types.GenerateContentConfig(
        max_output_tokens=8192,
        temperature=0.6,
        thinking_config=types.ThinkingConfig(thinking_budget=KO_THINKING_BUDGET),
    ),
)

briefing_ko = ko_resp.text
elapsed = time.time() - start

# Save Korean text
today_file = date.today().strftime('%Y-%m-%d')
text_dir = Path('../notebooks/tts_outputs/text')
# --- Extract chapter markers and clean text ---
import re, json as _json

chapter_pattern = re.compile(r'\[CHAPTER:\s*(.+?)\]\s*\n?')
matches = list(chapter_pattern.finditer(briefing_ko))

briefing_ko_clean = chapter_pattern.sub('', briefing_ko)
total_len = len(briefing_ko_clean)
chapters_ko = []
for m in matches:
    preceding = chapter_pattern.sub('', briefing_ko[:m.start()])
    pos = len(preceding) / total_len if total_len > 0 else 0.0
    chapters_ko.append({"title": m.group(1).strip(), "position": round(pos, 4)})

print(f'Chapters found: {len(chapters_ko)}')
for ch in chapters_ko:
    print(f'  {ch["position"]:.1%} — {ch["title"]}')

# Save cleaned text (no markers) for TTS
ko_txt_path = text_dir / f'briefing-ko-pro-{today_file}.txt'
with open(str(ko_txt_path), 'w') as f:
    f.write(briefing_ko_clean)

# Save chapter metadata
ch_path = text_dir / f'chapters-ko-{today_file}.json'
with open(str(ch_path), 'w') as f:
    _json.dump(chapters_ko, f, indent=2, ensure_ascii=False)
print(f'Saved: {ch_path.name}')

# Use cleaned text for downstream (TTS)
briefing_ko = briefing_ko_clean

# Save token usage for summary
ko_usage = {
    'step': 'KO Briefing',
    'model': ko_resp.model_version,
    'thinking_budget': KO_THINKING_BUDGET,
    'thinking_used': getattr(ko_resp.usage_metadata, 'thinking_token_count', 0) or 0,
    'input_tokens': ko_resp.usage_metadata.prompt_token_count,
    'output_tokens': ko_resp.usage_metadata.candidates_token_count,
    'elapsed': elapsed,
}

print(f'Done! {elapsed:.1f}s')
print(f'Output: {len(briefing_ko):,} chars')
print(f'Tokens in: {ko_usage["input_tokens"]:,} | out: {ko_usage["output_tokens"]:,}')
print(f'Thinking: {ko_usage["thinking_used"]:,} / {KO_THINKING_BUDGET:,} ({ko_usage["thinking_used"]/KO_THINKING_BUDGET*100:.0f}% used)')
print(f'Saved: {ko_txt_path.name}')
print(f'\nPreview:\n{briefing_ko[:500]}...')

Generating Korean briefing (Gemini 2.5 Pro, thinking=4,096)...
Articles: 67,787 chars
Chapters found: 9
  0.0% — 오프닝
  5.2% — 글로벌 경제 바로미터
  27.7% — 아시아 시장 동향
  36.1% — 기업 인수합병 소식
  50.7% — 엡스타인 스캔들 후폭풍
  60.2% — AI와 빅테크 전쟁
  75.3% — 지정학과 에너지
  88.2% — 기타 주목할 뉴스
  95.9% — 마켓 스냅샷
Saved: chapters-ko-2026-02-16.json
Done! 63.3s
Output: 5,577 chars
Tokens in: 17,616 | out: 3,190
Thinking: 0 / 4,096 (0% used)
Saved: briefing-ko-pro-2026-02-16.txt

Preview:
안녕하세요, 커피 한 잔과 함께하는 금융 뉴스, 2026년 2월 16일 월요일입니다. 오늘은 미국이 대통령의 날로 휴장이라 시장은 조금 조용하지만, 그 이면에서는 정말 많은 일들이 벌어지고 있어요. 오늘은 미국 경제가 정말 '연착륙'에 성공하고 있는지에 대한 흥미로운 논쟁을 깊이 파고들어 보고요, 유럽중앙은행이 유로화의 위상을 높이기 위해 꺼내든 카드에 대해서도 이야기해 볼게요. 그리고 월스트리트를 계속해서 흔들고 있는 엡스타인 스캔들의 후폭풍이 어떻게 거물급 인사들의 경력을 끝내고 있는지도 따라가 보겠습니다.

가장 먼저, 모두가 궁금해하는 미국 경제 이야기부터 시작해 볼까요? 소위 '연착륙', 즉 경기 침체 없이 인플레이션을 잡는 것이 가능할지에 대한 기대감이 커지고 있어요. 최근 나온 데이터를 보면 그럴듯해 보이기도 해요. 변동성이 큰 식품과 에너지를 제외한 근원 물가 상승률이 1월에 전년 대비 2.5%로, 2021년 팬데믹 이후 가격 급등이 시작된 이래 가장 낮은 수준을 기록...


In [16]:
# Korean TTS: Gemini Pro TTS (single pass)
# Uses gemini_client (genai.Client) — NOT chirp_client (Cloud TTS)
# Output: raw PCM 16-bit, 24kHz, mono → WAV

today_str = date.today().strftime('%Y-%m-%d')
output_dir = Path('../notebooks/tts_outputs/audio')
output_dir.mkdir(exist_ok=True)

# Load Korean briefing from file (authoritative version)
ko_text_path = Path(f'../notebooks/tts_outputs/text/briefing-ko-pro-{today_str}.txt')
with open(str(ko_text_path), 'r') as f:
    briefing_ko_file = f.read().strip()

print(f'Loaded: {ko_text_path.name} ({len(briefing_ko_file):,} chars, {len(briefing_ko_file.encode("utf-8")):,} bytes)')

# --- Config ---
TTS_MODEL = 'gemini-2.5-pro-preview-tts'
TTS_VOICE = 'Kore'  # Female, Firm

# Style instruction — calm, steady pacing (avoid breathless delivery)
style_prefix = "[차분하고 또렷한 팟캐스트 진행자 톤, 적당한 속도로 명확하게] "
tts_input = style_prefix + briefing_ko_file

print(f'Model: {TTS_MODEL} | Voice: {TTS_VOICE}')
print(f'Input: {len(tts_input):,} chars')
print(f'Generating...', end=' ')
start = time.time()

try:
    tts_resp = gemini_client.models.generate_content(
        model=TTS_MODEL,
        contents=tts_input,
        config=types.GenerateContentConfig(
            response_modalities=["AUDIO"],
            speech_config=types.SpeechConfig(
                voice_config=types.VoiceConfig(
                    prebuilt_voice_config=types.PrebuiltVoiceConfig(
                        voice_name=TTS_VOICE,
                    )
                )
            ),
        ),
    )
    audio_data = tts_resp.candidates[0].content.parts[0].inline_data.data
except Exception as e:
    print(f'\nFailed: {e.__class__.__name__}: {e}')
    raise

elapsed = time.time() - start

out_path = output_dir / f'gemini-tts-ko-{TTS_VOICE.lower()}-{today_str}.wav'
with wave.open(str(out_path), 'wb') as wf:
    wf.setnchannels(1)
    wf.setsampwidth(2)
    wf.setframerate(24000)
    wf.writeframes(audio_data)

size_kb = out_path.stat().st_size / 1024
duration_sec = len(audio_data) / (24000 * 2)
print(f'{elapsed:.1f}s')
print(f'Output: {size_kb:.0f}KB (~{duration_sec/60:.1f}min)')
print(f'Saved: {out_path}')

Loaded: briefing-ko-pro-2026-02-16.txt (5,577 chars, 12,951 bytes)
Model: gemini-2.5-pro-preview-tts | Voice: Kore
Input: 5,613 chars
Generating... 406.8s
Output: 30709KB (~10.9min)
Saved: ../notebooks/tts_outputs/audio/gemini-tts-ko-kore-2026-02-16.wav


## Whisper Forced Alignment — Sentence Timestamps
Run Whisper on the generated audio to get word-level timestamps, then group into sentences.
This produces `sentences-en-{date}.json` and `sentences-ko-{date}.json` for frontend transcript sync.

In [None]:
# Whisper forced alignment: extract sentence-level timestamps from TTS audio
import whisper
import json
from pathlib import Path

today_str = '2026-02-16'  # Change to date.today().strftime('%Y-%m-%d') for production
audio_dir = Path('../notebooks/tts_outputs/audio')
text_dir = Path('../notebooks/tts_outputs/text')

# Load Whisper model (base is fast, small is more accurate)
model = whisper.load_model('base')

def merge_into_sentences(segments: list[dict]) -> list[dict]:
    """Merge Whisper segments into full sentences (split on . ! ? and equivalents)."""
    sentences, buf_text, buf_start, buf_end = [], [], None, 0
    for seg in segments:
        text = seg['text'].strip()
        if not text:
            continue
        if buf_start is None:
            buf_start = seg['start']
        buf_text.append(text)
        buf_end = seg['end']
        if text[-1] in '.!?\u3002\uff1f\uff01':
            sentences.append({'text': ' '.join(buf_text), 'start': round(buf_start, 2), 'end': round(buf_end, 2)})
            buf_text, buf_start = [], None
    if buf_text:
        sentences.append({'text': ' '.join(buf_text), 'start': round(buf_start, 2), 'end': round(buf_end, 2)})
    return sentences

def extract_and_merge(audio_path: str, language: str) -> list[dict]:
    """Run Whisper on audio, merge segments into full sentences."""
    result = model.transcribe(audio_path, language=language, word_timestamps=True, verbose=False)
    segments = [{'text': s['text'].strip(), 'start': round(s['start'], 2), 'end': round(s['end'], 2)} for s in result['segments']]
    return merge_into_sentences(segments)

# --- EN ---
en_audio = audio_dir / f'chirp3-en-pro-friendly-{today_str}.wav'
if en_audio.exists():
    print(f'Processing EN: {en_audio.name}...')
    en_sentences = extract_and_merge(str(en_audio), language='en')
    en_out = text_dir / f'sentences-en-{today_str}.json'
    en_out.write_text(json.dumps(en_sentences, indent=2, ensure_ascii=False))
    print(f'  -> {len(en_sentences)} sentences, saved to {en_out.name}')
    for s in en_sentences[:3]:
        print(f'  [{s["start"]:6.1f}s - {s["end"]:6.1f}s] {s["text"][:80]}')
else:
    print(f'EN audio not found: {en_audio}')

# --- KO ---
ko_audio = audio_dir / f'gemini-tts-ko-kore-{today_str}.wav'
if ko_audio.exists():
    print(f'\nProcessing KO: {ko_audio.name}...')
    ko_sentences = extract_and_merge(str(ko_audio), language='ko')
    ko_out = text_dir / f'sentences-ko-{today_str}.json'
    ko_out.write_text(json.dumps(ko_sentences, indent=2, ensure_ascii=False))
    print(f'  -> {len(ko_sentences)} sentences, saved to {ko_out.name}')
    for s in ko_sentences[:3]:
        print(f'  [{s["start"]:6.1f}s - {s["end"]:6.1f}s] {s["text"][:80]}')
else:
    print(f'KO audio not found: {ko_audio}')


In [None]:
# Save EN + KO briefings to Supabase

def save_briefing(briefing_text, category, model_name):
    """Upsert a briefing row and link articles."""
    record = {
        'date': str(today),
        'category': category,
        'briefing_text': briefing_text,
        'item_count': len(articles),
        'model': model_name,
    }
    result = sb.table('wsj_briefings').upsert(
        record, on_conflict='date,category'
    ).execute()
    bid = result.data[0]['id']

    junction = [{'briefing_id': bid, 'wsj_item_id': a['id']} for a in articles]
    for i in range(0, len(junction), 100):
        sb.table('wsj_briefing_items').upsert(
            junction[i:i+100], on_conflict='briefing_id,wsj_item_id'
        ).execute()
    return bid

# EN briefing (from cell-18)
en_id = save_briefing(briefing_pro, 'EN', 'gemini-2.5-pro')
print(f'EN saved: {en_id} ({len(briefing_pro):,} chars)')

# KO briefing (from cell-21)
ko_id = save_briefing(briefing_ko, 'KO', 'gemini-2.5-pro')
print(f'KO saved: {ko_id} ({len(briefing_ko):,} chars)')

print(f'Linked {len(articles)} articles to each briefing')

EN saved: da6d31d0-9c2e-47a9-9e1b-5bc12dc598aa (9,075 chars)
KO saved: 0b8485d2-690f-427a-9ed7-e9f1cb897fef (5,872 chars)
Linked 40 articles to each briefing


In [None]:
# Mark all briefing articles as briefed in wsj_items
# Prevents these articles from appearing in future briefings
from datetime import datetime

ids = [a['id'] for a in articles]
for i in range(0, len(ids), 100):
    batch = ids[i:i+100]
    sb.table('wsj_items').update(
        {'briefed': True, 'briefed_at': datetime.now().isoformat()}
    ).in_('id', batch).execute()
print(f'Marked {len(ids)} articles as briefed')

In [None]:
# Save briefing to Supabase
briefing_record = {
    'date': str(today),
    'category': 'ALL',
    'briefing_text': briefing_text,
    'item_count': len(articles),
    'model': 'gemini-2.5-flash',
}

result = sb.table('wsj_briefings').upsert(
    briefing_record, on_conflict='date,category'
).execute()

briefing_id = result.data[0]['id']
print(f'Saved briefing: {briefing_id}')

junction_records = [{'briefing_id': briefing_id, 'wsj_item_id': a['id']} for a in articles]
for i in range(0, len(junction_records), 100):
    batch = junction_records[i:i+100]
    sb.table('wsj_briefing_items').upsert(batch, on_conflict='briefing_id,wsj_item_id').execute()

print(f'Linked {len(junction_records)} articles to briefing')

Saved briefing: 6d02a403-60ae-4861-b5ef-52393d5d80bd
Linked 40 articles to briefing


In [None]:

# --- V1: Formal/premium broadcast tone ---
BRIEFING_SYSTEM_FORMAL = """You are the host-writer of a premium daily financial news audio briefing. You will receive ~80–100 news items; each has a title and description, and ~50% include a short summary plus extracted entities and key numbers. Produce one continuous, broadcast-ready script optimized for text-to-speech.

Constraints:
1,800–2,000 words (approximately 12–13 minutes at ~150 wpm).
Output strictly plain text. No markdown, no bullet points, no numbered lists, and no section headers or labels (avoid "Markets:", "Tech:", etc.).
Open with a warm greeting and today's date (use the date provided with the input). Close with a brief sign-off.

Editorial rules:
Don't read headlines one by one. Deduplicate immediately: merge overlapping coverage of the same event into one narrative using the richest available details.
Prioritize stories with specific figures, named entities, tickers, timing, and measurable market moves. Use those details naturally.
Allocate depth: spend ~60–70% of the script on the top 8–12 highest-impact narratives; cover the rest as brief contextual mentions without turning into a list.
Mixed-quality handling: for rich-metadata items, include concrete specifics (earnings, guidance, deal values, macro prints). For title/description-only items, keep it to 1–2 cautious sentences and do not invent details.
Entities/Numbers are extracted hints—only mention them when they are clearly supported by the description/summary and relevant to why the story matters.
If sources conflict or details are uncertain, avoid false precision; use cautious language (e.g., "reports vary") and stick to the most supported facts.
Use smooth transitions that connect cause and effect across themes (rates → tech multiples, oil → inflation, regulation → sector winners/losers). After major arcs, add a short "what to watch next" only if supported by the provided items.
At the end, briefly note how many articles were covered out of the total provided (e.g., "We covered X of today's Y stories")."""


# --- V3: Balanced — conversational but fact-first ---
BRIEFING_SYSTEM_V3 = """You are the host-writer of a daily financial news audio briefing optimized for text-to-speech. You will receive ~70–100 news items. Each item includes a title and description; about half include a short summary plus extracted entities and key numbers.

Goal: Produce ONE continuous, broadcast-ready script that sounds conversational but remains fact-first and premium.

Hard constraints:
- 1,800–2,000 words total.
- Output strictly plain text.
- No markdown, no bullet points, no numbered lists, no section headers or labels (avoid "Markets:", "Tech:", etc.).
- Open with a warm greeting and today's date (use the date provided in the input).
- Close with a brief sign-off.
- Paragraph breaks are allowed, but do not format as a list.

Editorial rules (highest priority):
1) Do NOT read headlines one by one. Immediately deduplicate: merge overlapping coverage of the same underlying event into a single narrative using the richest available details.
2) Allocation: spend ~60–70% of the script on the top 8–12 highest-impact narratives. Cover the rest as brief contextual mentions without turning into a list.
3) Prioritize items with specific figures, named entities, tickers, timing, and measurable market moves, but ONLY mention numbers/entities that are clearly supported by the provided description/summary.
4) Mixed-quality handling:
   - Rich-metadata items: include concrete specifics (earnings, guidance, deal values, macro prints) when present.
   - Title/description-only items: limit to 1–2 cautious sentences; do not invent details.
5) Conflicts/uncertainty: avoid false precision. If sources conflict, say so briefly ("Reports are mixed…") and stick to the most supported facts.

Style rules (second priority, for listenability):
- Write like you speak: mostly short punchy sentences, with occasional longer sentences when you need to explain.
- Use smooth cause-and-effect transitions across themes (rates → multiples, oil → inflation, regulation → sector moves).
- You may use rhetorical questions sparingly (max 3–6 times total) to pull listeners in, but never as filler.
- Light reactions are allowed only when grounded in facts (e.g., surprising guidance, outsized move), and should not become opinionated.
- Never be flippant about serious topics (layoffs, conflict, disasters).

End requirement:
- End by stating how many articles you covered out of the total provided (e.g., "We covered X of today's Y stories.")."""


## Token Usage & Cost Summary

In [None]:
# Aggregate all LLM usage — thinking budget utilization & cost
COST_PER_1M = {
    'pro-input': 1.25, 'pro-output': 10.0, 'pro-thinking': 3.75,
    'flash-input': 0.15, 'flash-output': 0.60,
}

all_usage = [u for u in [curation_usage, en_usage, ko_usage] if u is not None]

print('=' * 70)
print('THINKING BUDGET UTILIZATION')
print('-' * 70)
print(f'{"Step":<16} {"Model":<28} {"Budget":>8} {"Used":>8} {"Util%":>6}')
print('-' * 70)

total_thinking_budget = 0
total_thinking_used = 0

for u in all_usage:
    budget = u['thinking_budget']
    used = u['thinking_used']
    pct = used / budget * 100 if budget > 0 else 0
    total_thinking_budget += budget
    total_thinking_used += used
    print(f'{u["step"]:<16} {u["model"]:<28} {budget:>8,} {used:>8,} {pct:>5.0f}%')

total_pct = total_thinking_used / total_thinking_budget * 100 if total_thinking_budget else 0
print('-' * 70)
print(f'{"TOTAL":<16} {"":<28} {total_thinking_budget:>8,} {total_thinking_used:>8,} {total_pct:>5.0f}%')

print()
print('=' * 70)
print('FULL TOKEN BREAKDOWN')
print('-' * 70)
print(f'{"Step":<16} {"Input":>10} {"Output":>10} {"Thinking":>10} {"Time":>8}')
print('-' * 70)

total_in = total_out = total_think = total_time = 0
for u in all_usage:
    print(f'{u["step"]:<16} {u["input_tokens"]:>10,} {u["output_tokens"]:>10,} {u["thinking_used"]:>10,} {u["elapsed"]:>7.1f}s')
    total_in += u['input_tokens']
    total_out += u['output_tokens']
    total_think += u['thinking_used']
    total_time += u['elapsed']

print('-' * 70)
print(f'{"TOTAL":<16} {total_in:>10,} {total_out:>10,} {total_think:>10,} {total_time:>7.1f}s')

# Cost estimate
cost = 0
for u in all_usage:
    is_pro = 'pro' in (u['model'] or '')
    prefix = 'pro' if is_pro else 'flash'
    cost += u['input_tokens'] / 1e6 * COST_PER_1M[f'{prefix}-input']
    cost += u['output_tokens'] / 1e6 * COST_PER_1M[f'{prefix}-output']
    if is_pro and u['thinking_used']:
        cost += u['thinking_used'] / 1e6 * COST_PER_1M['pro-thinking']

thinking_cost = total_think / 1e6 * COST_PER_1M['pro-thinking']
print(f'\n--- Cost Estimate ---')
print(f'Total LLM cost:     ${cost:.4f}')
if cost > 0:
    print(f'  Thinking portion: ${thinking_cost:.4f} ({thinking_cost/cost*100:.0f}% of total)')
    print(f'\nTip: If thinking utilization is consistently < 50%, consider lowering the budget to save ~${thinking_cost/2:.4f}/run')

In [None]:
# Curation Thinking Budget A/B Test
# Requires: cell-1 (imports), cell-3 (articles), cell-6 (crawl+llm), cell-9 (curation_input, try_curation)

BUDGETS = [0, 1024, 2048, 4096, 32768]
results = {}

for budget in BUDGETS:
    print(f'\n--- think={budget} ---')
    if budget == 0:
        cfg = types.GenerateContentConfig(max_output_tokens=4096, temperature=0.1)
    else:
        cfg = types.GenerateContentConfig(
            max_output_tokens=4096, temperature=0.1,
            thinking_config=types.ThinkingConfig(thinking_budget=budget),
        )

    t0 = time.time()
    raw, resp = try_curation('gemini-2.5-pro', cfg, f'think={budget}')
    elapsed = time.time() - t0

    if not raw:
        print('  FAILED'); continue

    # Debug: print actual usage_metadata fields (first run only)
    if not results:
        print(f'  [DEBUG] usage_metadata type: {type(resp.usage_metadata)}')
        print(f'  [DEBUG] usage_metadata: {resp.usage_metadata}')
        print(f'  [DEBUG] dir: {[a for a in dir(resp.usage_metadata) if not a.startswith("_")]}')

    cleaned = re.sub(r'```json\s*', '', raw.strip())
    cleaned = re.sub(r'```\s*', '', cleaned).strip()
    m = re.search(r'\[[\d,\s]+\]', cleaned)
    if not m:
        print(f'  Parse error: {raw[:80]}'); continue

    indices = sorted(json.loads(m.group()))
    um = resp.usage_metadata
    think_used = getattr(um, 'thinking_token_count', None) or getattr(um, 'thoughts_token_count', None) or 0
    in_tok = um.prompt_token_count or 0
    out_tok = um.candidates_token_count or 0
    cost = in_tok/1e6*1.25 + out_tok/1e6*10.0 + think_used/1e6*3.75

    results[budget] = {'indices': indices, 'think_used': think_used, 'in': in_tok, 'out': out_tok, 'cost': cost, 'time': elapsed}
    print(f'  Articles: {indices}')
    print(f'  Think: {think_used:,}/{budget:,}' if budget else '  Think: off')
    print(f'  Cost: ${cost:.4f} | Time: {elapsed:.1f}s')
    time.sleep(2)

# --- Summary ---
print(f'\n\n{"="*80}')
print(f'{"Budget":>8} {"#":>4} {"Think":>8} {"Cost":>9} {"Time":>7}  Selected')
print(f'{"-"*80}')
for b in BUDGETS:
    r = results.get(b)
    if not r: continue
    t = f'{r["think_used"]:,}' if b else 'off'
    print(f'{b:>8} {len(r["indices"]):>4} {t:>8} ${r["cost"]:.4f} {r["time"]:>6.1f}s  {r["indices"]}')

# --- Overlap ---
if len(results) >= 2:
    base_b = max(results.keys())
    base_set = set(results[base_b]['indices'])
    print(f'\nOverlap vs think={base_b}:')
    for b in BUDGETS:
        r = results.get(b)
        if not r: continue
        cur = set(r['indices'])
        overlap = cur & base_set
        pct = len(overlap)/len(base_set)*100 if base_set else 0
        diff = sorted(cur - base_set)
        miss = sorted(base_set - cur)
        print(f'  think={b:>5}: {len(overlap)}/{len(base_set)} ({pct:.0f}%)', end='')
        if diff:
            print(f'  +{diff}', end='')
        if miss and b != base_b:
            print(f'  -{miss}', end='')
        print()

# --- Save to file ---
today_file = date.today().strftime('%Y-%m-%d')
out_dir = Path('../notebooks/tts_outputs/text')
out_dir.mkdir(exist_ok=True)
out_path = out_dir / f'curation-ab-{today_file}.json'

save_data = {
    'date': today_file,
    'article_count': len(items.data),
    'budgets_tested': BUDGETS,
    'results': {
        str(b): {
            'indices': r['indices'],
            'titles': [items.data[i-1]['title'] for i in r['indices'] if 1 <= i <= len(items.data)],
            'think_used': r['think_used'],
            'input_tokens': r['in'],
            'output_tokens': r['out'],
            'cost_usd': round(r['cost'], 6),
            'elapsed_s': round(r['time'], 1),
        } for b, r in results.items()
    },
}

with open(str(out_path), 'w') as f:
    json.dump(save_data, f, indent=2, ensure_ascii=False)
print(f'\nSaved: {out_path.name}')


--- think=0 ---
  [DEBUG] usage_metadata type: <class 'google.genai.types.GenerateContentResponseUsageMetadata'>
  [DEBUG] usage_metadata: cache_tokens_details=None cached_content_token_count=None candidates_token_count=57 candidates_tokens_details=None prompt_token_count=2454 prompt_tokens_details=[ModalityTokenCount(
  modality=<MediaModality.TEXT: 'TEXT'>,
  token_count=2454
)] thoughts_token_count=3955 tool_use_prompt_token_count=None tool_use_prompt_tokens_details=None total_token_count=6466 traffic_type=None
  [DEBUG] dir: ['cache_tokens_details', 'cached_content_token_count', 'candidates_token_count', 'candidates_tokens_details', 'construct', 'copy', 'dict', 'from_orm', 'json', 'model_computed_fields', 'model_config', 'model_construct', 'model_copy', 'model_dump', 'model_dump_json', 'model_extra', 'model_fields', 'model_fields_set', 'model_json_schema', 'model_parametrized_name', 'model_post_init', 'model_rebuild', 'model_validate', 'model_validate_json', 'model_validate_string

## Briefing Thinking Budget A/B Test

In [22]:
# Briefing Thinking Budget A/B Test
# Requires: cell-14 (BRIEFING_SYSTEM, articles_text), cell-1 (gemini_client)
# Tests EN briefing with different thinking budgets to find optimal quality/cost tradeoff
# Similar methodology to curation A/B test (cell-28)

BUDGETS = [0, 1024, 2048, 4096, 8192, 32768]
results = {}
today_file = date.today().strftime('%Y-%m-%d')
text_dir = Path('../notebooks/tts_outputs/text')
text_dir.mkdir(exist_ok=True)

full = BRIEFING_SYSTEM + "\n\n" + articles_text

for budget in BUDGETS:
    label = f'think={budget}' if budget else 'think=off'
    print(f'\n{"="*60}')
    print(f'Generating EN briefing ({label})...')

    if budget == 0:
        cfg = types.GenerateContentConfig(
            max_output_tokens=8192, temperature=0.6,
        )
    else:
        cfg = types.GenerateContentConfig(
            max_output_tokens=8192, temperature=0.6,
            thinking_config=types.ThinkingConfig(thinking_budget=budget),
        )

    t0 = time.time()
    try:
        resp = gemini_client.models.generate_content(
            model='gemini-2.5-pro', contents=full, config=cfg,
        )
        text = resp.text
    except Exception as e:
        print(f'  FAILED: {e.__class__.__name__}: {e}')
        time.sleep(5)
        continue
    elapsed = time.time() - t0

    um = resp.usage_metadata
    think_used = getattr(um, 'thoughts_token_count', None) or getattr(um, 'thinking_token_count', None) or 0
    in_tok = um.prompt_token_count or 0
    out_tok = um.candidates_token_count or 0
    cost = in_tok/1e6*1.25 + out_tok/1e6*10.0 + think_used/1e6*3.75

    words = len(text.split())
    chars = len(text)

    # Save each variant
    txt_path = text_dir / f'briefing-en-think{budget}-{today_file}.txt'
    with open(str(txt_path), 'w') as f:
        f.write(text)

    results[budget] = {
        'text': text,
        'words': words,
        'chars': chars,
        'think_used': think_used,
        'in': in_tok,
        'out': out_tok,
        'cost': cost,
        'time': elapsed,
        'model': resp.model_version,
        'file': txt_path.name,
    }

    print(f'  Words: {words} | Chars: {chars:,}')
    print(f'  Think: {think_used:,}/{budget:,}' if budget else '  Think: off')
    print(f'  Cost: ${cost:.4f} | Time: {elapsed:.1f}s')
    print(f'  Saved: {txt_path.name}')
    print(f'  Preview: {text[:200]}...')
    time.sleep(3)  # rate limit buffer

# --- Summary Table ---
print(f'\n\n{"="*90}')
print(f'BRIEFING THINKING BUDGET A/B RESULTS')
print(f'{"="*90}')
print(f'{"Budget":>8} {"Words":>7} {"Chars":>7} {"Think":>8} {"Cost":>9} {"Time":>7}  Preview')
print(f'{"-"*90}')
for b in BUDGETS:
    r = results.get(b)
    if not r: continue
    t = f'{r["think_used"]:,}' if b else 'off'
    preview = r['text'][:60].replace('\n', ' ')
    print(f'{b:>8} {r["words"]:>7} {r["chars"]:>7,} {t:>8} ${r["cost"]:.4f} {r["time"]:>6.1f}s  {preview}...')

# --- Word count target check ---
print(f'\n--- Word Count vs Target (1800-2000) ---')
for b in BUDGETS:
    r = results.get(b)
    if not r: continue
    w = r['words']
    status = '✓ IN RANGE' if 1800 <= w <= 2000 else ('↓ SHORT' if w < 1800 else '↑ LONG')
    bar = '█' * min(w // 100, 25)
    print(f'  think={b:>5}: {w:>5} words {status} {bar}')

# --- Cost comparison ---
if results:
    max_cost = max(r['cost'] for r in results.values())
    min_cost = min(r['cost'] for r in results.values())
    print(f'\n--- Cost Range ---')
    print(f'  Cheapest: ${min_cost:.4f} (think={min(b for b, r in results.items() if r["cost"] == min_cost)})')
    print(f'  Most expensive: ${max_cost:.4f} (think={max(b for b, r in results.items() if r["cost"] == max_cost)})')
    print(f'  Savings (min vs max): ${max_cost - min_cost:.4f} ({(max_cost - min_cost)/max_cost*100:.0f}%)')

# --- Save full results to JSON ---
out_path = text_dir / f'briefing-ab-{today_file}.json'
save_data = {
    'date': today_file,
    'test': 'briefing_thinking_budget',
    'model': 'gemini-2.5-pro',
    'temperature': 0.6,
    'prompt': 'BRIEFING_SYSTEM_FRIENDLY (EN)',
    'article_count': len(articles),
    'budgets_tested': BUDGETS,
    'results': {
        str(b): {
            'words': r['words'],
            'chars': r['chars'],
            'think_used': r['think_used'],
            'input_tokens': r['in'],
            'output_tokens': r['out'],
            'cost_usd': round(r['cost'], 6),
            'elapsed_s': round(r['time'], 1),
            'preview': r['text'][:300],
            'file': r['file'],
        } for b, r in results.items()
    },
}

with open(str(out_path), 'w') as f:
    json.dump(save_data, f, indent=2, ensure_ascii=False)
print(f'\nSaved: {out_path.name}')
print(f'\nREAD THE FILES to compare quality — word count and cost alone don\'t tell the full story.')


Generating EN briefing (think=off)...
  Words: 1788 | Chars: 10,853
  Think: off
  Cost: $0.0686 | Time: 51.5s
  Saved: briefing-en-think0-2026-02-15.txt
  Preview: Good morning, and welcome to the show. It’s Thursday, February 15th, 2026. Thanks for starting your day with us. We’ve got a lot to get through today, from the latest inflation numbers that are giving...

Generating EN briefing (think=1024)...
  Words: 1675 | Chars: 10,212
  Think: 763/1,024
  Cost: $0.0606 | Time: 31.8s
  Saved: briefing-en-think1024-2026-02-15.txt
  Preview: Good morning, and welcome to the show. It is Sunday, February 15th, 2026.

So glad you could tune in. Today, we've got a lot to unpack, starting with the latest inflation numbers and what they mean fo...

Generating EN briefing (think=2048)...
  Words: 1653 | Chars: 10,135
  Think: 1,589/2,048
  Cost: $0.0631 | Time: 40.6s
  Saved: briefing-en-think2048-2026-02-15.txt
  Preview: Good morning, it's February 15th, 2026. Great to have you with me.

Toda