# Stage 3: Fetch Pages

This notebook implements the third stage of the pipeline: fetching pages using LLMs, recording metrics, and saving results.

In [7]:
# Import Required Libraries
import os
import glob
import json
import pandas as pd
import time
import requests
from datetime import datetime
from pathlib import Path

In [8]:
# Identify Latest Folder Under Pagination
pagination_root = 'processing/1_pagination/'
folders = [f for f in os.listdir(pagination_root) if os.path.isdir(os.path.join(pagination_root, f))]
latest_folder = sorted(folders)[-1] if folders else None
pagination_path = os.path.join(pagination_root, latest_folder) if latest_folder else None
assert pagination_path and os.path.exists(pagination_path), 'No pagination folder found.'
print('Using pagination folder:', pagination_path)

Using pagination folder: processing/1_pagination/20250831_003454


In [9]:
# Load Pagination Criteria
json_files = glob.glob(os.path.join(pagination_path, '*.json'))
tables = []
for jf in json_files:
    with open(jf, 'r') as f:
        obj = json.load(f)
    tables.append({'path': jf, 'meta': obj['meta'], 'criteria': obj['pagination_criteria']})
print(f'Loaded {len(tables)} tables.')

Loaded 5 tables.


In [10]:
# Fetch Pages Using LLM and Record Metrics

from io import StringIO

def fetch_page_llm(prompt, model, api_key):
    # Example OpenRouter API call
    url = 'https://openrouter.ai/api/v1/chat/completions'
    headers = {'Authorization': f'Bearer {api_key}', 'Content-Type': 'application/json'}
    payload = {
        'model': model,
        'messages': [{'role': 'user', 'content': prompt}],
        'max_tokens': 2048
    }
    start = time.time()
    try:
        resp = requests.post(url, headers=headers, json=payload)
        latency = time.time() - start
        resp.raise_for_status()
        result = resp.json()
        # Extract content and metrics
        content = result['choices'][0]['message']['content'] if 'choices' in result else ''
        usage = result.get('usage', {})
        return content, latency, usage, None
    except Exception as e:
        return None, None, None, str(e)

OPENROUTER_API_KEY = os.environ.get('OPENROUTER_API_KEY', '')

output_root = 'processing/2_fetched_pages/'
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
output_folder = os.path.join(output_root, timestamp)
os.makedirs(output_folder, exist_ok=True)

for table in tables:
    meta = table['meta']
    criteria = table['criteria']
    table_results = {}
    for method, crit in criteria.items():
        if method == 'llm':
            for model, model_criteria in crit.items():
                if not model_criteria:
                    continue
                # Use the top pagination criteria
                top_crit = model_criteria[0] if isinstance(model_criteria, list) else model_criteria
                pages = top_crit.get('pages', [])
                merged_rows = []
                metrics = []
                for page_key in pages:
                    prompt = f"Fetch page for {meta.get('name')} using criteria: {top_crit['criteria']} and page key: {page_key}" 
                    content, latency, usage, error = fetch_page_llm(prompt, model, OPENROUTER_API_KEY)
                    print(f"[FETCH] Table: {meta.get('name')}, Page: {page_key}, Criteria: {top_crit['criteria']}, Model: {model}")
                    metrics.append({'page_key': page_key, 'latency': latency, 'usage': usage, 'error': error})
                    if content:
                        # Assume content is CSV or JSON rows, parse accordingly
                        try:
                            df_page = pd.read_csv(StringIO(content))
                        except Exception:
                            try:
                                df_page = pd.read_json(StringIO(content))
                            except Exception:
                                df_page = None
                        if df_page is not None:
                            merged_rows.append(df_page)
                if merged_rows:
                    merged_df = pd.concat(merged_rows, ignore_index=True)
                else:
                    merged_df = pd.DataFrame()
                table_results[f'{method}_{model}'] = {'merged_df': merged_df, 'metrics': metrics, 'criteria': top_crit}
        else:
            # For naive/statistical, just record criteria
            table_results[method] = {'criteria': crit}
    # Save merged CSVs for each method/model
    for key, res in table_results.items():
        merged_df = res.get('merged_df')
        if merged_df is not None and not merged_df.empty:
            csv_name = f"{meta.get('id','')}_{meta.get('name','')}_{key}.csv"
            csv_path = os.path.join(output_folder, csv_name)
            merged_df.to_csv(csv_path, index=False)
            print(f'Saved merged CSV: {csv_path}')
    # Save JSON metadata for this table
    out_json = {
        'meta': meta,
        'results': {}
    }
    for key, res in table_results.items():
        out_json['results'][key] = {
            'criteria': res.get('criteria'),
            'metrics': res.get('metrics')
        }
    json_name = f"{meta.get('id','')}_{meta.get('name','')}_metrics.json"
    json_path = os.path.join(output_folder, json_name)
    with open(json_path, 'w') as f:
        json.dump(out_json, f, indent=2)
    print(f'Saved JSON metadata: {json_path}')

[FETCH] Table: english_latin_rivalry_1887_2012, Page: 1887-1906, Criteria: Year, Model: google/gemini-2.5-flash-lite
[FETCH] Table: english_latin_rivalry_1887_2012, Page: all, Criteria: Year, Model: deepseek/deepseek-chat-v3.1
[FETCH] Table: english_latin_rivalry_1887_2012, Page: all, Criteria: Year, Model: deepseek/deepseek-chat-v3.1
[FETCH] Table: english_latin_rivalry_1887_2012, Page: 1887-1896, Criteria: Year, Model: openai/gpt-4o-mini
[FETCH] Table: english_latin_rivalry_1887_2012, Page: 1887-1896, Criteria: Year, Model: openai/gpt-4o-mini


OSError: Cannot save file into a non-existent directory: 'processing/2_fetched_pages/20250831_005427/25_english_latin_rivalry_1887_2012_llm_openai'