# Stage 3: Fetch Pages

This notebook implements the third stage of the pipeline: fetching pages using LLMs, recording metrics, and saving results.

In [17]:
# Import Required Libraries
import os
import glob
import json
import pandas as pd
import time
import requests
from datetime import datetime
from pathlib import Path

In [18]:
# Identify Latest Folder Under Pagination
pagination_root = 'processing/1_pagination/'
folders = [f for f in os.listdir(pagination_root) if os.path.isdir(os.path.join(pagination_root, f))]
latest_folder = sorted(folders)[-1] if folders else None
pagination_path = os.path.join(pagination_root, latest_folder) if latest_folder else None
assert pagination_path and os.path.exists(pagination_path), 'No pagination folder found.'
print('Using pagination folder:', pagination_path)

Using pagination folder: processing/1_pagination/20250831_003454


In [19]:
# Load Pagination Criteria
json_files = glob.glob(os.path.join(pagination_path, '*.json'))
tables = []
for jf in json_files:
    with open(jf, 'r') as f:
        obj = json.load(f)
    tables.append({'path': jf, 'meta': obj['meta'], 'criteria': obj['pagination_criteria']})
print(f'Loaded {len(tables)} tables.')

Loaded 5 tables.


In [20]:
# Fetch Pages Using LLM and Record Metrics

from io import StringIO

PROVIDE_SOURCE_TABLE = True  # If True, include source table as CSV in the prompt
LLM_TIMEOUT = 30  # seconds
LLM_MODEL = 'x-ai/grok-3-mini'  # Use this model for all criteria

OPENROUTER_API_KEY = os.environ.get('OPENROUTER_API_KEY', '')

output_root = 'processing/2_fetched_pages/'
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
output_folder = os.path.join(output_root, timestamp)
os.makedirs(output_folder, exist_ok=True)

def fetch_page_llm(prompt, model, api_key):
    url = 'https://openrouter.ai/api/v1/chat/completions'
    headers = {'Authorization': f'Bearer {api_key}', 'Content-Type': 'application/json'}
    payload = {
        'model': model,
        'messages': [{'role': 'user', 'content': prompt}],
        'max_tokens': 2048
    }
    start = time.time()
    try:
        resp = requests.post(url, headers=headers, json=payload, timeout=LLM_TIMEOUT)
        latency = time.time() - start
        resp.raise_for_status()
        result = resp.json()
        content = result['choices'][0]['message']['content'] if 'choices' in result else ''
        usage = result.get('usage', {})
        return content, latency, usage, None
    except Exception as e:
        return None, None, None, str(e)

for table in tables:
    meta = table['meta']
    criteria = table['criteria']
    source_csv_path = meta.get('source_file')
    source_csv_str = ''
    source_columns = None
    if PROVIDE_SOURCE_TABLE and source_csv_path and os.path.exists(source_csv_path):
        try:
            source_df = pd.read_csv(source_csv_path)
            source_csv_str = source_df.to_csv(index=False)
            source_columns = list(source_df.columns)
        except Exception as e:
            print(f'[WARNING] Could not load source CSV for {meta.get("name")}: {e}')
            source_csv_str = ''
            source_columns = None
    table_results = {}
    for method, crit in criteria.items():
        # For llm, collect top recommendation from all listed models
        if method == 'llm':
            criteria_list = []
            for model_name, model_criteria in crit.items():
                if not model_criteria:
                    continue
                top_crit = model_criteria[0] if isinstance(model_criteria, list) else model_criteria
                criteria_list.append((model_name, top_crit))
        else:
            criteria_list = [(LLM_MODEL, crit)]
        for model_name, crit_obj in criteria_list:
            model_name = model_name.replace('/', '_')
            pages = crit_obj.get('pages', [])
            if source_columns:
                merged_df = pd.DataFrame(columns=source_columns)
            else:
                merged_df = pd.DataFrame()
            metrics = []
            for page_key in pages:
                prompt = (
                    f"Fetch page for {meta.get('query_without_cutoff')} using criteria: {crit_obj.get('criteria','')} with values: {page_key}. "
                    f"Return the result as CSV only. Expected columns: {', '.join(source_columns) if source_columns else ''}."
                )
                if PROVIDE_SOURCE_TABLE and source_csv_str:
                    prompt += f"\n\nSource table as CSV:\n{source_csv_str}"
                content, latency, usage, error = fetch_page_llm(prompt, LLM_MODEL, OPENROUTER_API_KEY)
                print(f"[FETCH] Table: {meta.get('name')}, Page: {page_key}, Criteria: {crit_obj.get('criteria','')}, Model: {model_name}")
                metrics.append({'page_key': page_key, 'latency': latency, 'usage': usage, 'error': error})
                if content:
                    try:
                        df_page = pd.read_csv(StringIO(content))
                    except Exception:
                        print(f"[WARNING] Skipping non-CSV response for table {meta.get('name')}, page {page_key}, model {model_name}")
                        df_page = None
                    if df_page is not None:
                        merged_df = pd.concat([merged_df, df_page], ignore_index=True)
            table_results[f'{method}_{model_name}'] = {'merged_df': merged_df, 'metrics': metrics, 'criteria': crit_obj}
            if merged_df is not None and not merged_df.empty:
                csv_name = f"{meta.get('id','')}_{meta.get('name','')}_{method}_{model_name}.csv"
                csv_path = os.path.join(output_folder, csv_name)
                os.makedirs(os.path.dirname(csv_path), exist_ok=True)
                merged_df.to_csv(csv_path, index=False)
                print(f'Saved merged CSV: {csv_path}')
    out_json = {
        'meta': meta,
        'results': {}
    }
    for key, res in table_results.items():
        out_json['results'][key] = {
            'criteria': res.get('criteria'),
            'metrics': res.get('metrics')
        }
    json_name = f"{meta.get('id','')}_{meta.get('name','')}_metrics.json"
    json_path = os.path.join(output_folder, json_name)
    os.makedirs(os.path.dirname(json_path), exist_ok=True)
    with open(json_path, 'w') as f:
        json.dump(out_json, f, indent=2)
    print(f'Saved JSON metadata: {json_path}')

[FETCH] Table: english_latin_rivalry_1887_2012, Page: ALL, Criteria: , Model: x-ai_grok-3-mini
Saved merged CSV: processing/2_fetched_pages/20250831_015204/25_english_latin_rivalry_1887_2012_naive_x-ai_grok-3-mini.csv
[FETCH] Table: english_latin_rivalry_1887_2012, Page: 1887-1906, Criteria: Year, Model: google_gemini-2.5-flash-lite
Saved merged CSV: processing/2_fetched_pages/20250831_015204/25_english_latin_rivalry_1887_2012_llm_google_gemini-2.5-flash-lite.csv
[FETCH] Table: english_latin_rivalry_1887_2012, Page: 1887-1906, Criteria: Year, Model: google_gemini-2.5-flash-lite
Saved merged CSV: processing/2_fetched_pages/20250831_015204/25_english_latin_rivalry_1887_2012_llm_google_gemini-2.5-flash-lite.csv
[FETCH] Table: english_latin_rivalry_1887_2012, Page: all, Criteria: Year, Model: deepseek_deepseek-chat-v3.1
Saved merged CSV: processing/2_fetched_pages/20250831_015204/25_english_latin_rivalry_1887_2012_llm_deepseek_deepseek-chat-v3.1.csv
[FETCH] Table: english_latin_rivalry_188

  merged_df = pd.concat([merged_df, df_page], ignore_index=True)


[FETCH] Table: australia_demographics_1900_2010, Page: 1900, Criteria: Year, Model: google_gemini-2.5-flash-lite


  merged_df = pd.concat([merged_df, df_page], ignore_index=True)


KeyboardInterrupt: 