In [41]:
# --- Config ---
# Strategy selection: 'naive', 'statistical', 'llm', or 'all' to run every one.
RUN_CRITERIA = 'all'
# Model identifiers; consulted only when RUN_CRITERIA is 'llm' or 'all'.
RUN_MODELS = [
    'google/gemini-2.5-flash-lite',
    'deepseek/deepseek-chat-v3.1',
    'openai/gpt-4o-mini',
]

# Table selection. Entries are the table 'name' taken from meta.
# Whitelist of None means "every table", e.g. ['south_african_class_15f_4_8_2'].
TABLE_WHITELIST = None
TABLE_BLACKLIST = []
# Optional cap on how many tables are processed (e.g. 5); None means no cap.
TABLE_LIMIT = None

# Heuristic targets for the pagination recommendations.
PAGE_SIZE_TARGET = 50
MIN_PAGES = 3
MAX_PAGES = 10
# Tables are only considered when they have more than this many rows.
MIN_ROWS_THRESHOLD = 100

# Re-run behavior: when True, do not write an output that already exists
# for the given criteria/model/table combination.
SKIP_IF_EXISTS = False

# OpenRouter settings. The API key itself is looked up under the environment
# variable name held in OPENROUTER_API_KEY_ENV; OPENROUTER_ORIGIN is a label only.
OPENROUTER_BASE_URL = 'https://openrouter.ai/api/v1'
OPENROUTER_API_KEY_ENV = 'OPENROUTER_API_KEY'
OPENROUTER_ORIGIN = 'openrouter'

import os, json, time, re, math, random
from datetime import datetime
import pandas as pd
from pathlib import Path

# Tag for this run, rendered as YYYYMMDD_HHMMSS from the current local time.
RUN_TS = f'{datetime.now():%Y%m%d_%H%M%S}'
print('Run timestamp:', RUN_TS)

Run timestamp: 20250830_215331


In [43]:
# --- Load OpenRouter API key from launchctl and set in os.environ ---
import subprocess
def load_openrouter_api_key():
    """Load the OpenRouter API key and make sure it is present in ``os.environ``.

    The key is first requested from ``launchctl getenv`` (macOS). When that
    yields nothing -- the command is missing, fails, or has no value -- the
    value already present in ``os.environ`` under ``OPENROUTER_API_KEY_ENV``
    is used instead, so the notebook also works where the key was exported
    through the regular process environment.

    Returns:
        The API key as a string, or '' when no key could be found.
    """
    env_name = OPENROUTER_API_KEY_ENV
    api_key = ''
    try:
        result = subprocess.run(['launchctl', 'getenv', env_name], capture_output=True, text=True)
        api_key = (result.stdout or '').strip()
    except Exception as e:
        # Best effort: launchctl does not exist outside macOS. Report and fall
        # through to the process environment instead of giving up.
        print('Error loading API key:', e)

    if api_key:
        os.environ[env_name] = api_key
        return api_key

    # Fallback: honour a key that is already exported in the environment.
    api_key = os.environ.get(env_name, '').strip()
    if not api_key:
        print(f'API key not found in launchctl for {OPENROUTER_API_KEY_ENV}')
    return api_key

# Fetch the key once for this session; report only whether one was obtained,
# never the secret value itself.
api_key = load_openrouter_api_key()
print('Loaded API key:', '(none)' if not api_key else '***')

Loaded API key: ***


In [45]:
# Confirm the key is present without echoing the secret into the saved notebook
# output: displays '<redacted>' when set and '' when unset.
'<redacted>' if os.environ.get(OPENROUTER_API_KEY_ENV, '') else ''

'<redacted>'

In [47]:
# --- LLM prompt for pagination criteria ---
def build_llm_prompt(meta: dict, df: pd.DataFrame) -> str:
    """Build the prompt asking an LLM to recommend pagination strategies for a table.

    Args:
        meta: Table metadata. The query line uses the first truthy value among
            'query_without_cutoff', 'table_title' and 'name'; it is left empty
            when none of them is available.
        df: The table. Its column labels and the text rendering of
            ``df.head()`` (without the index) are embedded in the prompt.

    Returns:
        The prompt text, including the JSON shape the model is asked to return
        and the PAGE_SIZE_TARGET / MAX_PAGES preferences.
    """
    # Column labels are not necessarily strings (e.g. integer labels), and
    # str.join rejects non-strings, so stringify them first.
    cols = [str(col) for col in df.columns]
    head = df.head().to_string(index=False)
    # Fall back to '' so a missing query is not rendered as the literal "None".
    query = meta.get('query_without_cutoff') or meta.get('table_title') or meta.get('name') or ''
    tmpl = f'''
### Task
Recommend efficient pagination strategies for the following table.
- Aim for balanced, predictable page sizes.
- Base on available columns when possible.
- Include a complete list of page keys to request.
- If the full list cannot be determined, return an empty list of recommendations.
- Prefer around {PAGE_SIZE_TARGET} rows per page and no more than {MAX_PAGES} pages.

Return raw JSON only: 
[{{
  "criteria": "comma-separated column names used for pagination",
  "expected_page_size": 50,
  "expected_page_variance": 10,
  "estimated_pages": 3,
  "pages": ["full list of page keys/ranges to request downstream"]
}}]

### Input
Query: {query}
Columns: {', '.join(cols)}
Table head:
{head}
'''
    return tmpl

def _coerce_int(value, default=0):
    """Best-effort conversion of an LLM-supplied numeric field to ``int``.

    Accepts ints, floats (truncated), and strings that contain a number
    somewhere (e.g. "50 rows", "~50", "50.0"). Anything without a usable
    number (None, '', "unknown", NaN) yields ``default``.
    """
    if isinstance(value, (int, float)):
        try:
            return int(value)
        except (ValueError, OverflowError):
            # NaN / infinity cannot be represented as an int.
            return default
    match = re.search(r'[-+]?\d+(?:\.\d+)?', str(value or ''))
    if not match:
        return default
    token = match.group()
    # Only go through float when there is a fractional part, so large
    # integers keep full precision.
    return int(float(token)) if '.' in token else int(token)


def parse_llm_recommendations(txt: str):
    """Parse the LLM's JSON answer into a list of normalized recommendations.

    Args:
        txt: Raw model output. Code fences are removed with
            ``strip_code_fences`` before JSON decoding.

    Returns:
        A list of dicts with the keys 'criteria', 'expected_page_size',
        'expected_page_variance', 'estimated_pages' and 'pages'. A single JSON
        object is treated as a one-element list and non-dict entries are
        skipped. Returns [] for empty input or when the text is not valid JSON
        (the failure is printed, not raised).
    """
    if not txt:
        return []
    s = strip_code_fences(txt)
    try:
        data = json.loads(s)
        if isinstance(data, dict):
            data = [data]
        out = []
        for r in data:
            if not isinstance(r, dict):
                continue
            # Numeric fields are coerced one by one so that a single odd value
            # (null, "", "10.5", "about 50") falls back to 0 instead of
            # discarding every recommendation.
            rec = {
                'criteria': r.get('criteria', ''),
                'expected_page_size': _coerce_int(r.get('expected_page_size', 0)),
                'expected_page_variance': _coerce_int(r.get('expected_page_variance', 0)),
                'estimated_pages': _coerce_int(r.get('estimated_pages', 0)),
                'pages': r.get('pages', []) or []
            }
            out.append(rec)
        return out
    except Exception as e:
        print('Failed to parse LLM JSON:', e)
        return []

In [49]:
# --- Naive recommendation ---
def naive_recommendation(meta: dict, df: pd.DataFrame):
    """Return the baseline recommendation: fetch the whole table as one page.

    The page size is the row count declared in ``meta['numDataRows']`` when
    that value is truthy, otherwise the number of rows in ``df``.
    """
    declared_rows = meta.get('numDataRows')
    row_count = int(declared_rows if declared_rows else df.shape[0])

    recommendation = dict(criteria='', estimated_pages=1, pages=['ALL'])
    recommendation['expected_page_size'] = row_count
    recommendation['expected_page_variance'] = 0
    return recommendation