# Stage 3: Fetch Pages


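This notebook implements the third stage of the pipeline: for each table produced by the pagination stage, it asks an LLM to fetch every page of each pagination criterion, merges the returned pages, records latency, token-usage, error-rate and row-level accuracy metrics, and saves the merged CSVs together with one metrics JSON per table.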

In [49]:
# Import Required Libraries
import os
import glob
import json
import pandas as pd
import time
import requests
from datetime import datetime
from pathlib import Path

In [50]:
# Select the pagination run to consume: the sub-folder whose name sorts last.
# (Run folders in the recorded output are named like 20250831_003454, so the
# lexicographically last name is presumably the most recent run.)
pagination_root = 'processing/1_pagination/'
folders = [
    entry
    for entry in os.listdir(pagination_root)
    if os.path.isdir(os.path.join(pagination_root, entry))
]
if folders:
    latest_folder = sorted(folders)[-1]
    pagination_path = os.path.join(pagination_root, latest_folder)
else:
    # No run folder at all: the assertion below reports it.
    latest_folder = None
    pagination_path = None
assert pagination_path and os.path.exists(pagination_path), 'No pagination folder found.'
print('Using pagination folder:', pagination_path)

Using pagination folder: processing/1_pagination/20250831_003454


In [51]:
# Load Pagination Criteria
# Each JSON file in the selected pagination folder describes one table: its
# 'meta' block and its 'pagination_criteria' block.
# glob returns matches in a filesystem-dependent order, so sort them to keep
# the table processing order stable from run to run.
json_files = sorted(glob.glob(os.path.join(pagination_path, '*.json')))
tables = []
for jf in json_files:
    # Explicit encoding: do not depend on the platform's default locale.
    with open(jf, 'r', encoding='utf-8') as f:
        obj = json.load(f)
    tables.append({'path': jf, 'meta': obj['meta'], 'criteria': obj['pagination_criteria']})
print(f'Loaded {len(tables)} tables.')

Loaded 5 tables.


In [52]:
# Fetch Pages Using LLM and Record Metrics
from io import StringIO

# --- Run configuration ---
PROVIDE_SOURCE_TABLE = True     # when True, the source table is appended to every prompt as CSV
LLM_TIMEOUT = 30                # per-request timeout, in seconds
LLM_MODEL = 'x-ai/grok-3-mini'  # model used for all criteria

# The API key is read from the environment; an unset variable yields ''.
OPENROUTER_API_KEY = os.getenv('OPENROUTER_API_KEY', '')

# --- Output location: one timestamped sub-folder per run ---
output_root = 'processing/2_fetched_pages/'
timestamp = f'{datetime.now():%Y%m%d_%H%M%S}'
output_folder = os.path.join(output_root, timestamp)
os.makedirs(output_folder, exist_ok=True)

def fetch_page_llm(prompt, model, api_key, timeout=None):
    """Send one chat-completion request to OpenRouter and time it.

    Args:
        prompt: Text sent as the single user message.
        model: Model identifier placed in the request payload.
        api_key: Token sent as ``Bearer`` in the Authorization header.
        timeout: Request timeout in seconds. When None (the default) the
            module-level ``LLM_TIMEOUT`` is used.

    Returns:
        A 4-tuple ``(content, latency, usage, error)``.

        * Success: ``content`` is the first choice's message content (``''``
          when the response has no ``'choices'`` key), ``latency`` is the
          elapsed time of the HTTP call in seconds, ``usage`` is the
          response's ``'usage'`` value (``{}`` when absent) and ``error`` is
          None.
        * Any failure (network error, timeout, HTTP error status, malformed
          response): ``(None, None, None, str(exception))``. Nothing is raised.
    """
    url = 'https://openrouter.ai/api/v1/chat/completions'
    headers = {'Authorization': f'Bearer {api_key}', 'Content-Type': 'application/json'}
    payload = {
        'model': model,
        'messages': [{'role': 'user', 'content': prompt}],
        'max_tokens': 2048
    }
    # perf_counter is monotonic, so the measured latency cannot be skewed by
    # wall-clock adjustments the way time.time() differences can.
    start = time.perf_counter()
    try:
        # Resolved inside the try so that a missing LLM_TIMEOUT is reported
        # through the error slot like every other failure.
        effective_timeout = LLM_TIMEOUT if timeout is None else timeout
        resp = requests.post(url, headers=headers, json=payload, timeout=effective_timeout)
        latency = time.perf_counter() - start
        resp.raise_for_status()
        result = resp.json()
        content = result['choices'][0]['message']['content'] if 'choices' in result else ''
        usage = result.get('usage', {})
        return content, latency, usage, None
    except Exception as e:
        # Deliberately best-effort: the caller records the message per page
        # and keeps going with the remaining pages.
        return None, None, None, str(e)

# Accuracy metric functions (from old/2_Metrics_calculation.ipynb)
def accuracy_metrics(merged_df, source_df):
    """Row-level recall, precision and F1 of ``merged_df`` against ``source_df``.

    Only columns present in both frames are compared. Values are cast to
    strings first and rows are de-duplicated afterwards, so rows that differ
    only by type (e.g. ``1`` vs ``'1'``) count as one row and every metric
    stays within [0, 1].

    Note: the comparison is purely textual, so values whose string forms
    differ (e.g. ``1`` vs ``1.0``) are treated as different.

    Args:
        merged_df: DataFrame with the predicted / fetched rows.
        source_df: DataFrame with the reference rows.

    Returns:
        dict with keys ``'row_recall'``, ``'row_precision'`` and ``'row_f1'``.
        Recall / precision are None when the corresponding frame has no rows;
        F1 is None when it cannot be computed (a component is None or both
        are zero). When the frames share no columns nothing can match, so
        recall / precision are 0.0 for non-empty frames and F1 is None.
    """
    # Only compare columns present in both
    common_cols = [col for col in source_df.columns if col in merged_df.columns]
    if not common_cols:
        # Merging on zero columns raises in pandas; report "nothing matched".
        return {
            'row_recall': 0.0 if len(source_df) > 0 else None,
            'row_precision': 0.0 if len(merged_df) > 0 else None,
            'row_f1': None
        }
    # Cast to string BEFORE de-duplicating so type-only differences collapse
    src = source_df[common_cols].astype(str).drop_duplicates().reset_index(drop=True)
    pred = merged_df[common_cols].astype(str).drop_duplicates().reset_index(drop=True)
    # Both sides are unique, so the inner join yields each matching row once;
    # the same count serves as numerator for recall and precision.
    matched_rows = src.merge(pred, how='inner', on=common_cols).shape[0]
    total_rows = src.shape[0]
    total_pred_rows = pred.shape[0]
    # Recall: fraction of source rows present in merged
    row_recall = matched_rows / total_rows if total_rows > 0 else None
    # Precision: fraction of merged rows that are correct
    row_precision = matched_rows / total_pred_rows if total_pred_rows > 0 else None
    # F1 score
    if row_precision is not None and row_recall is not None and (row_precision + row_recall) > 0:
        row_f1 = 2 * row_precision * row_recall / (row_precision + row_recall)
    else:
        row_f1 = None
    return {
        'row_recall': row_recall,
        'row_precision': row_precision,
        'row_f1': row_f1
    }

# Main driver. For every table loaded from the pagination stage:
#   1. load the source CSV (used in the prompt and as the accuracy reference),
#   2. for each pagination criterion, fetch every page through the LLM,
#   3. merge the returned pages and compute per-criterion metrics,
#   4. write one merged CSV per criterion and one metrics JSON per table.
for table in tables:
    meta = table['meta']
    criteria = table['criteria']
    source_csv_path = meta.get('source_file')
    source_csv_str = ''
    source_columns = None
    source_df = None
    if PROVIDE_SOURCE_TABLE and source_csv_path and os.path.exists(source_csv_path):
        try:
            source_df = pd.read_csv(source_csv_path)
            source_csv_str = source_df.to_csv(index=False)
            source_columns = list(source_df.columns)
            # Keep a copy of the source table next to the fetched results.
            csv_path = os.path.join(output_folder, meta['file'])
            source_df.to_csv(csv_path, index=False)
        except Exception as e:
            print(f'[WARNING] Could not load source CSV for {meta.get("name")}: {e}')
            # Reset everything derived from the source, including the frame
            # itself, so a half-loaded source is never used as a reference.
            source_df = None
            source_csv_str = ''
            source_columns = None
    table_results = {}
    for method, crit in criteria.items():
        if method == 'llm':
            # For llm, the value maps model name -> recommendation(s); keep
            # the top recommendation of every model that produced one.
            criteria_list = [
                (name, recs[0] if isinstance(recs, list) else recs)
                for name, recs in crit.items()
                if recs
            ]
        else:
            criteria_list = [(LLM_MODEL, crit)]
        for model_name, crit_obj in criteria_list:
            print(f"[INFO] Using model: {model_name} for criteria {method}: {crit_obj}")
            # '/' is not usable in file names; the sanitized name is used in keys and CSV names.
            model_name = model_name.replace('/', '_')
            pages = crit_obj.get('pages', [])
            if not source_columns:
                raise ValueError("Source columns are not available.")
            # Start from an empty frame carrying the source columns so they
            # come first in the merged result; pages are concatenated once
            # after the loop instead of growing a DataFrame page by page.
            page_frames = [pd.DataFrame(columns=source_columns)]
            metrics = []
            for page_key in pages:
                prompt = (
                    f"Fetch page for {meta.get('query_without_cutoff')} using criteria: {crit_obj.get('criteria','')} with values: {page_key}. "
                    f"Return the result as CSV only. Expected columns: {', '.join(source_columns) if source_columns else ''}."
                )
                if PROVIDE_SOURCE_TABLE and source_csv_str:
                    prompt += f"\n\nSource table as CSV:\n{source_csv_str}"
                # NOTE(review): the request always goes to LLM_MODEL; model_name
                # only identifies where the criteria came from. Confirm this is intended.
                content, latency, usage, error = fetch_page_llm(prompt, LLM_MODEL, OPENROUTER_API_KEY)
                print(f"[FETCH] Table: {meta.get('name')}, Page: {page_key}, Criteria: {crit_obj.get('criteria','')}, Model: {model_name}")
                metrics.append({'page_key': page_key, 'latency': latency, 'usage': usage, 'error': error})
                if content:
                    try:
                        page_frames.append(pd.read_csv(StringIO(content)))
                    except Exception:
                        # Best effort: an unparsable response drops this page only.
                        print(f"[WARNING] Skipping non-CSV response for table {meta.get('name')}, page {page_key}, model {model_name}")
            merged_df = pd.concat(page_frames, ignore_index=True)
            row_count = len(merged_df)
            col_consistency = (set(merged_df.columns) == set(source_columns)) if not merged_df.empty else None
            error_count = sum(1 for m in metrics if m['error'])
            total_pages = len(metrics)
            error_rate = error_count / total_pages if total_pages > 0 else None
            latencies = [m['latency'] for m in metrics if m['latency'] is not None]
            avg_latency = sum(latencies) / len(latencies) if latencies else None
            token_counts = [m['usage']['total_tokens'] for m in metrics if m['usage'] and 'total_tokens' in m['usage']]
            avg_tokens = sum(token_counts) / len(token_counts) if token_counts else None
            # Accuracy metrics
            acc_metrics = None
            if source_df is not None and not merged_df.empty:
                acc_metrics = accuracy_metrics(merged_df, source_df)
            table_results[f'{method}_{model_name}'] = {
                'merged_df': merged_df,
                'metrics': metrics,
                'criteria': crit_obj,
                'row_count': row_count,
                'column_consistency': col_consistency,
                'error_rate': error_rate,
                'avg_latency': avg_latency,
                'avg_tokens': avg_tokens,
                'accuracy': acc_metrics
            }
            if not merged_df.empty:
                csv_name = f"{meta.get('id','')}_{meta.get('name','')}_{method}_{model_name}.csv"
                csv_path = os.path.join(output_folder, csv_name)
                merged_df.to_csv(csv_path, index=False)
                print(f'Saved merged CSV: {csv_path}')
    # Everything except the merged DataFrame goes into the metrics JSON.
    json_fields = (
        'criteria', 'metrics', 'row_count', 'column_consistency',
        'error_rate', 'avg_latency', 'avg_tokens', 'accuracy'
    )
    out_json = {
        'meta': meta,
        'results': {
            key: {field: res.get(field) for field in json_fields}
            for key, res in table_results.items()
        }
    }
    json_name = f"{meta.get('id','')}_{meta.get('name','')}_metrics.json"
    json_path = os.path.join(output_folder, json_name)
    os.makedirs(os.path.dirname(json_path), exist_ok=True)
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(out_json, f, indent=2)
    print(f'Saved JSON metadata: {json_path}')

[INFO] Using model: x-ai/grok-3-mini for criteria naive: {'criteria': '', 'estimated_pages': 1, 'pages': ['ALL'], 'expected_page_size': 126, 'expected_page_variance': 0}
[FETCH] Table: english_latin_rivalry_1887_2012, Page: ALL, Criteria: , Model: x-ai_grok-3-mini
Saved merged CSV: processing/2_fetched_pages/20250831_023122/25_english_latin_rivalry_1887_2012_naive_x-ai_grok-3-mini.csv
[INFO] Using model: x-ai/grok-3-mini for criteria statistical: {'criteria': 'English', 'estimated_pages': 3, 'pages': [], 'expected_page_size': 42, 'expected_page_variance': 76}
[INFO] Using model: google/gemini-2.5-flash-lite for criteria llm: {'criteria': 'Year', 'expected_page_size': 50, 'expected_page_variance': 10, 'estimated_pages': 1, 'pages': ['1887-1906']}
[FETCH] Table: english_latin_rivalry_1887_2012, Page: ALL, Criteria: , Model: x-ai_grok-3-mini
Saved merged CSV: processing/2_fetched_pages/20250831_023122/25_english_latin_rivalry_1887_2012_naive_x-ai_grok-3-mini.csv
[INFO] Using model: x-ai/g

KeyboardInterrupt: 