# Stage 1: Pagination Criteria Notebook

This notebook generates pagination criteria for each table using three methods: naive (full-table fetch), LLM-based (via OpenRouter), and statistical cardinality estimation. For every processed table, the results are saved as a JSON file, together with a copy of the table's CSV, in a timestamped folder under `processing/1_pagination/`.

In [116]:
import os
import glob
import json
import pandas as pd
from datetime import datetime

## Locate Latest Data Folder

In [117]:
# Select the newest snapshot folder under the stage-0 output directory.
# "Newest" means last in lexicographic order of the folder names
# (presumably timestamp-named -- confirm against the stage-0 notebook).
data_root = 'processing/0_data/'
# Guard the listing: os.listdir raises on a missing root, which would hide the
# clearer "no data folder" error raised below.
folders = (
    [f for f in os.listdir(data_root) if os.path.isdir(os.path.join(data_root, f))]
    if os.path.isdir(data_root)
    else []
)
latest_folder = max(folders, default=None)
data_path = os.path.join(data_root, latest_folder) if latest_folder else None
# Explicit raise instead of `assert`: assertions are stripped under `python -O`.
if not data_path or not os.path.exists(data_path):
    raise FileNotFoundError(f'No data folder found under {data_root!r}.')

## Load Table JSONs

In [118]:
# Pair every table descriptor (*.json) in the snapshot with its sibling CSV.
# The listing is sorted because glob returns matches in arbitrary order; a
# stable order keeps any later slice of `tables` reproducible between runs.
json_files = sorted(glob.glob(os.path.join(data_path, '*.json')))
tables = []
for jf in json_files:
    with open(jf, 'r', encoding='utf-8') as f:
        obj = json.load(f)
    # Swap only the file extension. str.replace would rewrite every '.json'
    # occurring anywhere in the path (e.g. inside a directory name).
    csv_path = os.path.splitext(jf)[0] + '.csv'
    if not os.path.exists(csv_path):
        print(f'Missing CSV for {jf}')
        continue
    # Each entry keeps the descriptor path, its 'meta' section and the CSV path.
    tables.append({'path': jf, 'meta': obj['meta'], 'csv': csv_path})

## Define Pagination Criteria Methods

In [119]:
def naive_criteria(table):
    """Return the trivial plan: fetch the whole table as a single page.

    Args:
        table: Any sized object; ``len(table)`` becomes the expected page size.

    Returns:
        A one-element list holding the recommendation dict.
    """
    single_page = dict(
        criteria='none',
        expected_page_size=len(table),
        estimated_pages=1,
        expected_page_variance=0,
        pages=['all'],
    )
    return [single_page]

def statistical_cardinality_criteria(df):
    """Placeholder heuristic: paginate on the numeric column with the lowest variance.

    Args:
        df: DataFrame holding the table.

    Returns:
        A one-element list with the recommendation dict (fixed at 5 estimated
        pages, empty ``pages`` list), or an empty list when no numeric column
        has a defined variance (no numeric columns, a single row, all-NaN data).
    """
    # Drop undefined (NaN) variances: they cannot be ranked, and int(NaN)
    # raises ValueError.
    variances = df.var(numeric_only=True).dropna()
    if variances.empty:
        # Always return a list so callers get one consistent type.
        return []
    col = variances.idxmin()
    return [{
        'criteria': col,
        'expected_page_size': int(df.shape[0] / 5),
        'estimated_pages': 5,
        'expected_page_variance': int(variances.min()),
        'pages': []  # Could be filled with unique values or ranges
    }]

In [None]:
# --- Config ---
# Which recommendation strategies the driver loop runs:
# 'naive', 'statistical', 'llm', or 'all'.
RUN_CRITERIA = 'all'

# Model ids queried when RUN_CRITERIA is 'llm' or 'all'.
RUN_MODELS = [
    'google/gemini-2.5-flash-lite',
    'deepseek/deepseek-chat-v3.1',
    'openai/gpt-4o-mini',
]

# Cap on how many loaded tables the driver loop visits (falsy = no cap).
NUM_TABLES = 5

# Table selection by the 'name' field of each table's meta.
# A None/empty whitelist or blacklist disables that filter.
TABLE_WHITELIST = None  # e.g. ['south_african_class_15f_4_8_2']
TABLE_BLACKLIST = []
# NOTE(review): TABLE_LIMIT is not read by any cell visible here -- confirm it is still needed.
TABLE_LIMIT = None  # e.g. 5

# Heuristic targets: desired rows per page and bounds on the page count.
PAGE_SIZE_TARGET = 50
MIN_PAGES = 3
MAX_PAGES = 10
# Tables with fewer rows than this are skipped by the driver loop.
MIN_ROWS_THRESHOLD = 100

# Re-run behavior: intended to skip writing when output already exists for a
# given criteria/model/table.
# NOTE(review): not read by any cell visible here -- confirm where it is honored.
SKIP_IF_EXISTS = False

# OpenRouter settings. The API key itself is never stored here; only the name
# of the environment variable that holds it.
OPENROUTER_BASE_URL = 'https://openrouter.ai/api/v1'
OPENROUTER_API_KEY_ENV = 'OPENROUTER_API_KEY'
OPENROUTER_ORIGIN = 'openrouter'  # label only

import os, json, time, re, math, random
from datetime import datetime
import pandas as pd
from pathlib import Path

# One timestamp per kernel run (YYYYMMDD_HHMMSS, local time); the driver cell
# uses it as the name of this run's output folder.
RUN_TS = f'{datetime.now():%Y%m%d_%H%M%S}'
print(f'Run timestamp: {RUN_TS}')

Run timestamp: 20250831_003454


In [121]:
# --- Load OpenRouter API key from launchctl and set in os.environ ---
import subprocess
def load_openrouter_api_key():
    """Resolve the OpenRouter API key and make sure it is set in ``os.environ``.

    Lookup order:
      1. ``launchctl getenv <OPENROUTER_API_KEY_ENV>`` (macOS). A non-empty
         value is copied into ``os.environ`` and returned.
      2. The value already present in ``os.environ``. This is used when
         launchctl is unavailable (e.g. non-macOS hosts) or has no value.

    Returns:
        The API key, or an empty string when neither source provides one.
        Failures are reported with ``print`` and never raised.
    """
    env_name = OPENROUTER_API_KEY_ENV
    api_key = ''
    try:
        result = subprocess.run(['launchctl', 'getenv', env_name], capture_output=True, text=True)
        api_key = result.stdout.strip()
    except (OSError, ValueError, subprocess.SubprocessError) as e:
        # launchctl is missing or could not be run; fall through to os.environ.
        print('Error loading API key:', e)

    if api_key:
        os.environ[env_name] = api_key
        return api_key

    # Fallback: honor a key exported in the environment that started the kernel.
    api_key = os.environ.get(env_name, '').strip()
    if not api_key:
        print(f'API key not found in launchctl or the environment for {env_name}')
    return api_key

# Resolve the key once for the rest of the notebook. Only a masked marker is
# printed so the secret never ends up in saved cell output.
api_key = load_openrouter_api_key()
print(f"Loaded API key: {'***' if api_key else '(none)'}")

Loaded API key: ***


In [122]:
# --- LLM prompt for pagination criteria ---
def build_llm_prompt(meta: dict, csv_path: str) -> str:
    """Compose the prompt asking an LLM for pagination strategies for one table.

    Args:
        meta: Table metadata. The query line uses the first truthy value among
            ``query_without_cutoff``, ``table_title`` and ``name``.
        csv_path: CSV of the table; its column names and first 20 rows are
            embedded in the prompt.

    Returns:
        The prompt text, including the PAGE_SIZE_TARGET / MAX_PAGES preferences
        and the JSON shape the model is asked to return.
    """
    table_df = pd.read_csv(csv_path)
    column_list = ', '.join(table_df.columns)
    preview = table_df.head(20).to_string(index=False)

    # First truthy candidate wins; if none is truthy the last lookup's value is kept.
    query = None
    for key in ('query_without_cutoff', 'table_title', 'name'):
        query = meta.get(key)
        if query:
            break

    return f'''
### Task
Recommend efficient pagination strategies for the following table.
- Aim for balanced, predictable page sizes.
- Base on available columns when possible.
- Include a complete list of page keys to request.
- If the full list cannot be determined, return an empty list of recommendations.
- Prefer around {PAGE_SIZE_TARGET} rows per page and no more than {MAX_PAGES} pages.

Return raw JSON only: 
[{{
  "criteria": "comma-separated column names used for pagination",
  "expected_page_size": 50,
  "expected_page_variance": 10,
  "estimated_pages": 3,
  "pages": ["full list of page keys/ranges to request downstream"]
}}]

### Input
Query: {query}
Columns: {column_list}
Table head:
{preview}
'''

In [123]:
# --- Naive recommendation ---
def naive_recommendation(meta: dict, csv_path: str):
    """Baseline recommendation: request the whole table as one page.

    Args:
        meta: Table metadata (not used by this strategy).
        csv_path: CSV of the table; only its row count is needed.

    Returns:
        The recommendation dict with the row count as the expected page size,
        or an empty dict if the CSV cannot be read (the error is printed).
    """
    try:
        row_count = len(pd.read_csv(csv_path))
    except Exception as e:
        print('Naive recommendation error:', e)
        return {}
    return {
        'criteria': '',
        'estimated_pages': 1,
        'pages': ['ALL'],
        'expected_page_size': row_count,
        'expected_page_variance': 0
    }

In [124]:
# --- Statistical cardinality recommendation ---
def statistical_cardinality_recommendation(meta: dict, csv_path: str, sample_size: int = 100):
    """Recommend paginating on the numeric column with the lowest sample variance.

    Args:
        meta: Table metadata (not used by this strategy).
        csv_path: CSV of the table.
        sample_size: Maximum number of rows used to estimate the variances;
            larger tables are sampled with a fixed seed (42).

    Returns:
        The recommendation dict. The page count is the row count divided by
        PAGE_SIZE_TARGET (truncated), clamped to [MIN_PAGES, MAX_PAGES]. An
        empty dict is returned for an empty table, a table without numeric
        columns, or on any error (which is printed).
    """
    try:
        frame = pd.read_csv(csv_path)
        n = len(frame)
        if n == 0:
            return {}

        sample_n = min(sample_size, n)
        if n > sample_n:
            frame_sample = frame.sample(n=sample_n, random_state=42)
        else:
            frame_sample = frame

        variances = frame_sample.var(numeric_only=True)
        if variances.empty:
            return {}

        col = variances.idxmin()
        # Computed once and reused for both the page count and the page size.
        page_count = min(max(int(n / PAGE_SIZE_TARGET), MIN_PAGES), MAX_PAGES)
        return {
            'criteria': col,
            'estimated_pages': page_count,
            'pages': [],
            'expected_page_size': int(n / max(1, page_count)),
            'expected_page_variance': int(variances.min())
        }
    except Exception as e:
        print('Statistical cardinality error:', e)
    return {}

In [125]:
# Everything produced by this run goes into processing/1_pagination/<RUN_TS>/.
output_root = 'processing/1_pagination/'
timestamp = RUN_TS
output_folder = os.path.join(output_root, timestamp)
os.makedirs(output_folder, exist_ok=True)

# NUM_TABLES caps how many loaded tables are visited (falsy = visit all).
# NOTE(review): the cap is applied before the name and row-count filters below,
# so fewer than NUM_TABLES tables may actually produce output.
table_iter = tables[:NUM_TABLES] if NUM_TABLES else tables
for table in table_iter:
    meta, csv_path = table['meta'], table['csv']

    # Name filters: a None/empty whitelist or blacklist disables that filter.
    if (TABLE_WHITELIST and meta.get('name') not in TABLE_WHITELIST) or \
            (TABLE_BLACKLIST and meta.get('name') in TABLE_BLACKLIST):
        continue

    try:
        df = pd.read_csv(csv_path)
    except Exception as e:
        print(f'Error loading CSV {csv_path}:', e)
        continue
    # Small tables are skipped.
    if len(df) < MIN_ROWS_THRESHOLD:
        continue

    # Collect one recommendation per enabled strategy.
    result = {}
    if RUN_CRITERIA in {'naive', 'all'}:
        result['naive'] = naive_recommendation(meta, csv_path)
    if RUN_CRITERIA in {'statistical', 'all'}:
        result['statistical'] = statistical_cardinality_recommendation(meta, csv_path)
    if RUN_CRITERIA in {'llm', 'all'}:
        result['llm'] = {}
        for model in RUN_MODELS:
            # Up to three attempts per model.
            # NOTE(review): query_llm_for_criteria is not defined in the cells
            # visible here -- confirm it is defined before this cell on a fresh kernel.
            for attempt in range(3):
                llm_result = query_llm_for_criteria(meta, csv_path, model)
                # A response is unusable when it is None or an empty list.
                if llm_result is None or (isinstance(llm_result, list) and len(llm_result) == 0):
                    print(f'LLM retry {attempt+1} for model {model} on table {meta.get("name", "?")}')
                    continue
                result['llm'][model] = llm_result
                break
            else:
                # No attempt produced a usable answer; this model gets no entry.
                print(f'Skipping table {meta.get("name", "?")} for model {model} due to repeated LLM failure.')

    # Persist the recommendations under the same file name as the source descriptor.
    out_payload = {'meta': meta, 'pagination_criteria': result}
    out_path = os.path.join(output_folder, os.path.basename(table['path']))
    with open(out_path, 'w') as f:
        json.dump(out_payload, f, indent=2)

    # Write the table next to its JSON. The loaded frame is re-serialised,
    # so this is not a byte-for-byte copy of the source CSV.
    out_csv_path = os.path.join(output_folder, os.path.basename(csv_path))
    df.to_csv(out_csv_path, index=False)
print('Done. Results saved to', output_folder)

LLM retry 1 for model deepseek/deepseek-chat-v3.1 on table australia_demographics_1900_2010
LLM retry 1 for model openai/gpt-4o-mini on table rock_band_downloadable_2011
LLM retry 2 for model openai/gpt-4o-mini on table rock_band_downloadable_2011
LLM retry 3 for model openai/gpt-4o-mini on table rock_band_downloadable_2011
Skipping table rock_band_downloadable_2011 for model openai/gpt-4o-mini due to repeated LLM failure.
LLM retry 1 for model deepseek/deepseek-chat-v3.1 on table living_proof_the_farewell_tour
Done. Results saved to processing/1_pagination/20250831_003454
