# Stage 3: Fetch Pages


This notebook implements the third stage of the pipeline: fetching pages using LLMs, recording metrics, and saving results.

In [1]:
# Import Required Libraries
import os
import glob
import json
import pandas as pd
import time
import requests
from datetime import datetime
from pathlib import Path

In [2]:
# Identify Latest Folder Under Pagination
pagination_root = 'processing/1_pagination/'
folders = [f for f in os.listdir(pagination_root) if os.path.isdir(os.path.join(pagination_root, f))]
latest_folder = sorted(folders)[-1] if folders else None
pagination_path = os.path.join(pagination_root, latest_folder) if latest_folder else None
assert pagination_path and os.path.exists(pagination_path), 'No pagination folder found.'
print('Using pagination folder:', pagination_path)

# INCREMENT MODE: Build a set of already processed table IDs/names
INCREMENT_MODE = True  # Set to False for normal full run

Using pagination folder: processing/1_pagination/20250917_135316


In [3]:
# Load Pagination Criteria
json_files = glob.glob(os.path.join(pagination_path, '*.json'))
tables = []
for jf in json_files:
    with open(jf, 'r') as f:
        obj = json.load(f)
    tables.append({'path': jf, 'meta': obj['meta'], 'criteria': obj['pagination_criteria']})
print(f'Loaded {len(tables)} tables.')

Loaded 26 tables.


In [4]:
import re

class TableGenerator_JSON():
    TEMPLATE = """
    List %s - as many as possible to fit into response.
    The response will be formatted as JSON.
    Each element of the response will contain %d fields: %s.
    Do not output any additional text that is not in JSON format.
    %s
    
    """   

    def _norm_field(self, s):
        s = s.lower().replace(" ","_").replace("-","_").replace(".", "").replace(",","_")\
                .replace("(", "").replace(")", "").replace(":", "").replace('"','').replace("'","")\
                .replace("/", "")
        return re.sub('_+', '_', s)
        
    def generate_prompts(self, query, fields, paging: dict | None):
        system_msg = "You are a retriever of facts."

        num_fields = len(fields)
        fields_json = []
        fields = [f for f in fields]
        for field in fields:
            fields_json.append('"%s": "%s"' % ('_'.join(field.replace("-", " ").split()), field))
        response_format = ', '.join(fields)
        if paging:
            paging_criteria = ('Only fetch the results where values for %s match: %s.' % (paging['field'], paging['value']))
        else:
            paging_criteria = ''
        user_msg = self.TEMPLATE % (query, num_fields, response_format, paging_criteria)
        return system_msg, user_msg

    def parse_llm_response(self, response): 
        res = []
        try:
            if not response.startswith("[") and "[" in response:
                response = response[response.find("["):]

            if not response.endswith("]") and "]" in response:
                response = response[:response.rfind("]")+1]

            if '[' not in response and ']' not in response and '{' in response and '}' in response:
                response = '[' + response + ']'    

            response_json = json.loads(response)

            if isinstance(response_json, dict) and len(response_json.keys()) == 1:
                response_json = list(response_json.values())[0]    
        except:  
            split_response = response.split("{")
            response_json = []
            for s in split_response[1:]:
                split_s = s.split("}")
                if len(split_s) > 1:
                    content = split_s[0]
                    attributes = content.split(",")
                    elements = {}
                    for attr in attributes:
                        knv = attr.split(":")   
                        if len(knv) > 1:
                            parsed_k = "%s" % knv[0].replace('"','').strip()
                            parsed_v = "%s" % knv[1].replace('"','').strip()
                            elements[parsed_k] = parsed_v

                    response_json.append(elements)  

        df = pd.DataFrame.from_records(response_json) 
        return df

In [5]:
def compute_metrics(merged_df, source_columns, metrics, source_df):
    row_count = len(merged_df) if merged_df is not None else 0
    col_consistency = (set(merged_df.columns) == set(source_columns)) if source_columns and not merged_df.empty else None
    error_count = sum(1 for m in metrics if m['error'])
    total_pages = len(metrics)
    error_rate = error_count / total_pages if total_pages > 0 else None
    latencies = [m['latency'] for m in metrics if m['latency'] is not None]
    avg_latency = sum(latencies) / len(latencies) if latencies else None
    token_counts = [m['usage'].get('total_tokens', 0) for m in metrics if m['usage'] and 'total_tokens' in m['usage']]
    sum_tokens = sum(token_counts) if token_counts else None
    acc_metrics = None
    if source_df is not None and not merged_df.empty:
        acc_metrics = accuracy_metrics(merged_df, source_df)
    return {
        'row_count': row_count,
        'column_consistency': col_consistency,
        'error_rate': error_rate,
        'avg_latency': avg_latency,
        'sum_tokens': sum_tokens,
        'accuracy': acc_metrics
    }

# Accuracy metric functions (from old/2_Metrics_calculation.ipynb)
def accuracy_metrics(merged_df, source_df):
    # Only compare columns present in both
    common_cols = [col for col in source_df.columns if col in merged_df.columns]
    # Prepare DataFrames and stringify all values (including lists/dicts)
    src = source_df[common_cols].reset_index(drop=True).copy()
    pred = merged_df[common_cols].reset_index(drop=True).copy()
    for col in common_cols:
        src[col] = src[col].apply(lambda x: json.dumps(x, sort_keys=True) if isinstance(x, (dict, list)) else str(x))
        pred[col] = pred[col].apply(lambda x: json.dumps(x, sort_keys=True) if isinstance(x, (dict, list)) else str(x))
    src = src.drop_duplicates().reset_index(drop=True)
    pred = pred.drop_duplicates().reset_index(drop=True)
    # Row-level accuracy: fraction of source rows present in merged
    correct_rows = src.merge(pred, how='inner').shape[0]
    total_rows = src.shape[0]
    row_recall = correct_rows / total_rows if total_rows > 0 else None
    # Precision: fraction of merged rows that are correct
    correct_pred_rows = pred.merge(src, how='inner').shape[0]
    total_pred_rows = pred.shape[0]
    row_precision = correct_pred_rows / total_pred_rows if total_pred_rows > 0 else None
    # F1 score
    if row_precision is not None and row_recall is not None and (row_precision + row_recall) > 0:
        row_f1 = 2 * row_precision * row_recall / (row_precision + row_recall)
    else:
        row_f1 = None
    return {
        'row_recall': row_recall,
        'row_precision': row_precision,
        'row_f1': row_f1
    }

In [6]:
def fetch_page_llm(prompt, model, api_key, system_msg: str = ''):
    url = 'https://openrouter.ai/api/v1/chat/completions'
    headers = {'Authorization': f'Bearer {api_key}', 'Content-Type': 'application/json'}
    payload = {
        'model': model,
        'messages': [
            {'role': 'system', 'content': system_msg},
            {'role': 'user', 'content': prompt}
        ],
        'max_tokens': 20000
    }
    start = time.time()
    try:
        resp = requests.post(url, headers=headers, json=payload, timeout=LLM_TIMEOUT)
        latency = time.time() - start
        resp.raise_for_status()
        result = resp.json()
        raw_response = result['choices'][0]['message']['content'] if 'choices' in result else ''
        content = table_generator.parse_llm_response(raw_response)
        usage = result.get('usage', {})
        return content, latency, usage, None, raw_response
    except Exception as e:
        print(f"[ERROR] Failed to fetch page: {e}")
        return None, None, None, str(e), None

def fetch_page_task(args):
    # args: (meta, crit_obj, page_key, model_name, source_columns, source_csv_str)
    meta, crit_obj, page_key, model_name, source_columns, source_csv_str = args
    page_content = {'field': crit_obj.get('criteria',''), 'value': page_key} if page_key != 'ALL' else None
    system_msg, user_msg = table_generator.generate_prompts(meta.get('query_without_cutoff'), source_columns, page_content)
    if PROVIDE_SOURCE_TABLE and source_csv_str:
        user_msg += f"\n\nSource table as CSV:\n{source_csv_str}"
    print(f"[FETCH] Table: {meta.get('name')}, Page: {page_content}, Model: {model_name}")
    content, latency, usage, error, raw_response = fetch_page_llm(user_msg, LLM_MODEL, OPENROUTER_API_KEY, system_msg)
    return {
        'content': content,
        'latency': latency,
        'usage': usage,
        'error': error,
        'page_key': page_key,
        'raw_response': raw_response
    }

In [None]:
# Fetch Pages Using LLM and Record Metrics (Parallelized)
from io import StringIO
from concurrent.futures import ThreadPoolExecutor, as_completed
import json  # ensure json is imported at the top

PROVIDE_SOURCE_TABLE = False  # If True, include source table as CSV in the prompt
NUM_WORKERS = 8
LLM_TIMEOUT = 30  # seconds
LLM_MODEL = 'openai/gpt-4o-mini'  # Use this model for all criteria
OPENROUTER_API_KEY=''
TOGGLE_USE_EXECUTOR = True  # Toggle whether to use ThreadPoolExecutor for parallel fetching
# OPENROUTER_API_KEY = os.environ.get('OPENROUTER_API_KEY', '')
# print('OPENROUTER_API_KEY:', OPENROUTER_API_KEY)

output_root = 'processing/2_fetched_pages/'

table_generator = TableGenerator_JSON()

def get_latest_timestamped_dir(root):
    folders = [f for f in os.listdir(root) if os.path.isdir(os.path.join(root, f))]
    if not folders:
        return None
    # Sort by timestamp in folder name (assuming YYYYMMDD_HHMMSS)
    return sorted(folders)[-1]

if INCREMENT_MODE:
    output_root = 'processing/2_fetched_pages/'
    latest_dir = get_latest_timestamped_dir(output_root)
    if latest_dir:
        output_folder = os.path.join(output_root, latest_dir)
        print(f"[INCREMENT] Using latest output folder: {output_folder}")
    else:
        output_folder = os.path.join(output_root, timestamp)
        os.makedirs(output_folder, exist_ok=True)
        print(f"[INCREMENT] No previous folder found, using new: {output_folder}")
    # Build set of already processed table names
    existing_metrics = set()
    for fname in os.listdir(output_folder):
        if fname.endswith('_metrics.json'):
            parts = fname.split('_metrics.json')[0].split('_', 1)
            if len(parts) == 2:
                existing_metrics.add(parts[1])  # table name
            else:
                existing_metrics.add(parts[0])  # fallback
    print(f"[INCREMENT] Existing metrics found for tables: {existing_metrics}")
else:
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_folder = os.path.join(output_root, timestamp)
    os.makedirs(output_folder, exist_ok=True)
    print(f"[FULL RUN] Using new output folder: {output_folder}")

for table in tables:
    meta = table['meta']
    criteria = table['criteria']
    if INCREMENT_MODE and meta.get('name') in existing_metrics:
        print(f"[INCREMENT] Skipping table {meta.get('name')} (metrics.json exists)")
        continue
    source_csv_path = meta.get('source_file')
    source_csv_str = ''
    source_columns = None
    source_df = None
    if source_csv_path and os.path.exists(source_csv_path):
        try:
            source_df = pd.read_csv(source_csv_path)
            source_csv_str = source_df.to_csv(index=False)
            source_columns = list(source_df.columns)
            csv_path = os.path.join(output_folder, f"{meta.get('id','')}_{meta['file']}")
            source_df.to_csv(csv_path, index=False)
        except Exception as e:
            print(f'[WARNING] Could not load source CSV for {meta.get("name")}: {e}')
            source_csv_str = ''
            source_columns = None
    print(f"[PROCESSING] Table: {meta.get('name')}, Source Columns: {source_columns}")
    table_results = {}
    for method, crit in criteria.items():
        # For llm, collect top recommendation from all listed models
        if method == 'llm':
            criteria_list = []
            for model_name, model_criteria in crit.items():
                if not model_criteria:
                    continue
                top_crit = model_criteria[0] if isinstance(model_criteria, list) else model_criteria
                criteria_list.append((model_name, top_crit))
        # For oracle_single_row and single_row_llm, treat as single-criteria methods
        elif method in {'oracle_single_row', 'single_row_llm'}:
            criteria_list = [(LLM_MODEL, crit)]
        else:
            criteria_list = [(LLM_MODEL, crit)]
        for model_name, crit_obj in criteria_list:
            model_name = model_name.replace('/', '_')
            pages = crit_obj.get('pages', [])
            if source_columns:
                merged_df = pd.DataFrame(columns=source_columns)
            else:
                merged_df = pd.DataFrame()
            metrics = []
            llm_logs = []  # Collect LLM queries and responses for each page
            # Prepare tasks for all pages
            tasks = []
            page_prompts = {}  # page_key -> prompt string
            raw_responses = {}  # page_key -> raw response string
            print(f"[PROCESSING] Table: {meta.get('name')}, Method: {method}, Model: {model_name}, Pages: {pages}")
            for page_key in pages:
                # Build the prompt string for each page
                page_content = {'field': crit_obj.get('criteria',''), 'value': page_key} if page_key != 'ALL' else None
                system_msg, user_msg = table_generator.generate_prompts(meta.get('query_without_cutoff'), source_columns, page_content)
                if PROVIDE_SOURCE_TABLE and source_csv_str:
                    user_msg += f"\n\nSource table as CSV:\n{source_csv_str}"
                key_str = json.dumps(page_key, sort_keys=True) if isinstance(page_key, dict) else str(page_key)
                tasks.append((meta, crit_obj, page_key, model_name, source_columns, source_csv_str if PROVIDE_SOURCE_TABLE else None))
                page_prompts[key_str] = user_msg
            results = []
            if TOGGLE_USE_EXECUTOR:
                with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:
                    future_to_page = {executor.submit(fetch_page_task, t): t[2] for t in tasks}
                    for future in as_completed(future_to_page):
                        page_key = future_to_page[future]
                        res = future.result()
                        metrics.append({'latency': res['latency'], 'usage': res['usage'], 'error': res['error']})
                        if res['content'] is not None:
                            try:
                                df_page = res['content']
                                merged_df = pd.concat([merged_df, df_page], ignore_index=True)
                            except Exception:
                                print(f"[WARNING] Skipping non-CSV response for table {meta.get('name')}, page {page_key}, model {model_name}")
                        # Save LLM query, raw response, and parsed response for this page
                        key_str = json.dumps(page_key, sort_keys=True) if isinstance(page_key, dict) else str(page_key)
                        llm_logs.append({
                            'page_key': page_key,
                            'query': page_prompts.get(key_str, ''),
                            'raw_response': res.get('raw_response', None),
                            'response': res.get('content', None),
                            'latency': res['latency'],
                            'usage': res['usage'],
                            'error': res['error']
                        })
            else:
                # Sequential execution
                for idx, t in enumerate(tasks):
                    page_key = t[2]
                    res = fetch_page_task(t)
                    metrics.append({'latency': res['latency'], 'usage': res['usage'], 'error': res['error']})
                    if res['content'] is not None:
                        try:
                            df_page = res['content']
                            merged_df = pd.concat([merged_df, df_page], ignore_index=True)
                        except Exception:
                            print(f"[WARNING] Skipping non-CSV response for table {meta.get('name')}, page {page_key}, model {model_name}")
                    # Save LLM query, raw response, and parsed response for this page
                    key_str = json.dumps(page_key, sort_keys=True) if isinstance(page_key, dict) else str(page_key)
                    llm_logs.append({
                        'page_key': page_key,
                        'query': page_prompts.get(key_str, ''),
                        'raw_response': res.get('raw_response', None),
                        'response': res.get('content', None),
                        'latency': res['latency'],
                        'usage': res['usage'],
                        'error': res['error']
                    })
            if merged_df is not None and not merged_df.empty:
                csv_name = f"{meta.get('id','')}_{meta.get('name','')}_{method}_{model_name}.csv"
                csv_path = os.path.join(output_folder, csv_name)
                merged_df.to_csv(csv_path, index=False)
                print(f'Saved merged CSV: {csv_path}')
                # Save LLM logs for this table/method/model
                llm_log_name = f"{meta.get('id','')}_{meta.get('name','')}_{method}_{model_name}_llm_logs.json"
                llm_log_path = os.path.join(output_folder, llm_log_name)
                # Convert DataFrames in 'response' to dicts for serialization
                for log in llm_logs:
                    if hasattr(log['response'], 'to_dict'):
                        log['response'] = log['response'].to_dict()
                with open(llm_log_path, 'w') as f:
                    json.dump(llm_logs, f, indent=2)
                print(f'Saved LLM logs: {llm_log_path}')
            else:
                metrics.append({'latency': None, 'usage': None, 'error': 'Merged DataFrame is empty'})
            # After building merged_df and metrics:
            metric_result = compute_metrics(merged_df, source_columns, metrics, source_df)
            table_results[f'{method}_{model_name}'] = {
                'merged_df': merged_df,
                'metrics': metrics,
                'criteria': crit_obj,
                **metric_result
            }
    out_json = {
        'meta': meta,
        'results': {}
    }
    for key, res in table_results.items():
        out_json['results'][key] = {
            'criteria': res.get('criteria'),
            'metrics': res.get('metrics'),
            'row_count': res.get('row_count'),
            'column_consistency': res.get('column_consistency'),
            'error_rate': res.get('error_rate'),
            'avg_latency': res.get('avg_latency'),
            'sum_tokens': res.get('sum_tokens'),
            'accuracy': res.get('accuracy')
        }
    json_name = f"{meta.get('id','')}_{meta.get('name','')}_metrics.json"
    json_path = os.path.join(output_folder, json_name)
    os.makedirs(os.path.dirname(json_path), exist_ok=True)
    with open(json_path, 'w') as f:
        json.dump(out_json, f, indent=2)
    print(f'Saved JSON metadata: {json_path}')

[INCREMENT] Using latest output folder: processing/2_fetched_pages/20250917_142817
[INCREMENT] Existing metrics found for tables: {'australia_demographics_1900_2010', 'english_latin_rivalry_1887_2012'}
[INCREMENT] Skipping table english_latin_rivalry_1887_2012 (metrics.json exists)
[INCREMENT] Skipping table australia_demographics_1900_2010 (metrics.json exists)
[PROCESSING] Table: elements, Source Columns: ['Z', 'Sym', 'Element', 'Group', 'Period', 'Atomic weight u', 'Density g / cm 3', 'Melt K', 'Boil K', 'Heat J / g * K', 'Neg']
[PROCESSING] Table: elements, Method: naive, Model: openai_gpt-4o-mini, Pages: ['ALL']
[FETCH] Table: elements, Page: None, Model: openai_gpt-4o-mini


  merged_df = pd.concat([merged_df, df_page], ignore_index=True)


Saved merged CSV: processing/2_fetched_pages/20250917_142817/15_elements_naive_openai_gpt-4o-mini.csv
Saved LLM logs: processing/2_fetched_pages/20250917_142817/15_elements_naive_openai_gpt-4o-mini_llm_logs.json
[PROCESSING] Table: elements, Method: statistical, Model: openai_gpt-4o-mini, Pages: [1, 2, 3, 4]
[FETCH] Table: elements, Page: {'field': 'Period', 'value': 1}, Model: openai_gpt-4o-mini
[FETCH] Table: elements, Page: {'field': 'Period', 'value': 2}, Model: openai_gpt-4o-mini
[FETCH] Table: elements, Page: {'field': 'Period', 'value': 3}, Model: openai_gpt-4o-mini
[FETCH] Table: elements, Page: {'field': 'Period', 'value': 4}, Model: openai_gpt-4o-mini


  merged_df = pd.concat([merged_df, df_page], ignore_index=True)


Saved merged CSV: processing/2_fetched_pages/20250917_142817/15_elements_statistical_openai_gpt-4o-mini.csv
Saved LLM logs: processing/2_fetched_pages/20250917_142817/15_elements_statistical_openai_gpt-4o-mini_llm_logs.json
[PROCESSING] Table: elements, Method: oracle_single_row, Model: openai_gpt-4o-mini, Pages: [{'Element': 'Hydrogen'}, {'Element': 'Helium'}, {'Element': 'Lithium'}, {'Element': 'Beryllium'}, {'Element': 'Boron'}, {'Element': 'Carbon'}, {'Element': 'Nitrogen'}, {'Element': 'Oxygen'}, {'Element': 'Fluorine'}, {'Element': 'Neon'}, {'Element': 'Sodium'}, {'Element': 'Magnesium'}, {'Element': 'Aluminium'}, {'Element': 'Silicon'}, {'Element': 'Phosphorus'}, {'Element': 'Sulfur'}, {'Element': 'Chlorine'}, {'Element': 'Argon'}, {'Element': 'Potassium'}, {'Element': 'Calcium'}, {'Element': 'Scandium'}, {'Element': 'Titanium'}, {'Element': 'Vanadium'}, {'Element': 'Chromium'}, {'Element': 'Manganese'}, {'Element': 'Iron'}, {'Element': 'Cobalt'}, {'Element': 'Nickel'}, {'Elemen

  merged_df = pd.concat([merged_df, df_page], ignore_index=True)


[FETCH] Table: elements, Page: {'field': 'Element', 'value': {'Element': 'Neon'}}, Model: openai_gpt-4o-mini
[FETCH] Table: elements, Page: {'field': 'Element', 'value': {'Element': 'Sodium'}}, Model: openai_gpt-4o-mini
[FETCH] Table: elements, Page: {'field': 'Element', 'value': {'Element': 'Magnesium'}}, Model: openai_gpt-4o-mini
[FETCH] Table: elements, Page: {'field': 'Element', 'value': {'Element': 'Aluminium'}}, Model: openai_gpt-4o-mini
[FETCH] Table: elements, Page: {'field': 'Element', 'value': {'Element': 'Silicon'}}, Model: openai_gpt-4o-mini
[FETCH] Table: elements, Page: {'field': 'Element', 'value': {'Element': 'Phosphorus'}}, Model: openai_gpt-4o-mini
[FETCH] Table: elements, Page: {'field': 'Element', 'value': {'Element': 'Silicon'}}, Model: openai_gpt-4o-mini
[FETCH] Table: elements, Page: {'field': 'Element', 'value': {'Element': 'Phosphorus'}}, Model: openai_gpt-4o-mini
[FETCH] Table: elements, Page: {'field': 'Element', 'value': {'Element': 'Sulfur'}}, Model: openai

  merged_df = pd.concat([merged_df, df_page], ignore_index=True)


[FETCH] Table: elements, Page: {'field': 'Element', 'value': {'Element': 'Potassium'}}, Model: openai_gpt-4o-mini
[FETCH] Table: elements, Page: {'field': 'Element', 'value': {'Element': 'Copernicium'}}, Model: openai_gpt-4o-mini
[FETCH] Table: elements, Page: {'field': 'Element', 'value': {'Element': 'Magnesium'}}, Model: openai_gpt-4o-mini
[FETCH] Table: elements, Page: {'field': 'Element', 'value': {'Element': 'Berkelium'}}, Model: openai_gpt-4o-mini
[FETCH] Table: elements, Page: {'field': 'Element', 'value': {'Element': 'Magnesium'}}, Model: openai_gpt-4o-mini
[FETCH] Table: elements, Page: {'field': 'Element', 'value': {'Element': 'Berkelium'}}, Model: openai_gpt-4o-mini
[FETCH] Table: elements, Page: {'field': 'Element', 'value': {'Element': 'Technetium'}}, Model: openai_gpt-4o-mini
[FETCH] Table: elements, Page: {'field': 'Element', 'value': {'Element': 'Germanium'}}, Model: openai_gpt-4o-mini
[FETCH] Table: elements, Page: {'field': 'Element', 'value': {'Element': 'Technetium'

  merged_df = pd.concat([merged_df, df_page], ignore_index=True)


Saved merged CSV: processing/2_fetched_pages/20250917_142817/15_elements_llm_google_gemini-2.5-flash-lite.csv
Saved LLM logs: processing/2_fetched_pages/20250917_142817/15_elements_llm_google_gemini-2.5-flash-lite_llm_logs.json
[PROCESSING] Table: elements, Method: llm, Model: deepseek_deepseek-chat-v3.1, Pages: ['1-2', '3-4', '5-6', '7']
[FETCH] Table: elements, Page: {'field': 'Period', 'value': '1-2'}, Model: deepseek_deepseek-chat-v3.1
[FETCH] Table: elements, Page: {'field': 'Period', 'value': '3-4'}, Model: deepseek_deepseek-chat-v3.1
[FETCH] Table: elements, Page: {'field': 'Period', 'value': '5-6'}, Model: deepseek_deepseek-chat-v3.1
[FETCH] Table: elements, Page: {'field': 'Period', 'value': '7'}, Model: deepseek_deepseek-chat-v3.1


  merged_df = pd.concat([merged_df, df_page], ignore_index=True)


Saved merged CSV: processing/2_fetched_pages/20250917_142817/15_elements_llm_deepseek_deepseek-chat-v3.1.csv
Saved LLM logs: processing/2_fetched_pages/20250917_142817/15_elements_llm_deepseek_deepseek-chat-v3.1_llm_logs.json
[PROCESSING] Table: elements, Method: llm, Model: openai_gpt-4o-mini, Pages: ['1.0', '2.0', '3.0', '4.0', '5.0', '6.0', '7.0', '8.0', '9.0', '10.0', '11.0', '12.0', '13.0', '14.0', '15.0', '16.0', '17.0']
[FETCH] Table: elements, Page: {'field': 'Group', 'value': '1.0'}, Model: openai_gpt-4o-mini
[FETCH] Table: elements, Page: {'field': 'Group', 'value': '2.0'}, Model: openai_gpt-4o-mini
[FETCH] Table: elements, Page: {'field': 'Group', 'value': '3.0'}, Model: openai_gpt-4o-mini
[FETCH] Table: elements, Page: {'field': 'Group', 'value': '4.0'}, Model: openai_gpt-4o-mini
[FETCH] Table: elements, Page: {'field': 'Group', 'value': '5.0'}, Model: openai_gpt-4o-mini
[FETCH] Table: elements, Page: {'field': 'Group', 'value': '6.0'}, Model: openai_gpt-4o-mini
[FETCH] Tab

  merged_df = pd.concat([merged_df, df_page], ignore_index=True)


[FETCH] Table: elements, Page: {'field': 'Group', 'value': '10.0'}, Model: openai_gpt-4o-mini
[FETCH] Table: elements, Page: {'field': 'Group', 'value': '11.0'}, Model: openai_gpt-4o-mini
[FETCH] Table: elements, Page: {'field': 'Group', 'value': '11.0'}, Model: openai_gpt-4o-mini
[FETCH] Table: elements, Page: {'field': 'Group', 'value': '12.0'}, Model: openai_gpt-4o-mini
[FETCH] Table: elements, Page: {'field': 'Group', 'value': '12.0'}, Model: openai_gpt-4o-mini
[FETCH] Table: elements, Page: {'field': 'Group', 'value': '13.0'}, Model: openai_gpt-4o-mini
[FETCH] Table: elements, Page: {'field': 'Group', 'value': '13.0'}, Model: openai_gpt-4o-mini
[FETCH] Table: elements, Page: {'field': 'Group', 'value': '14.0'}, Model: openai_gpt-4o-mini
[FETCH] Table: elements, Page: {'field': 'Group', 'value': '14.0'}, Model: openai_gpt-4o-mini
[FETCH] Table: elements, Page: {'field': 'Group', 'value': '15.0'}, Model: openai_gpt-4o-mini
[FETCH] Table: elements, Page: {'field': 'Group', 'value': '

  merged_df = pd.concat([merged_df, df_page], ignore_index=True)


[FETCH] Table: belgium_demographics_1900_2011, Page: {'field': 'Crude death rate (per 1,000)', 'value': '29.0-30.9'}, Model: openai_gpt-4o-mini
Saved merged CSV: processing/2_fetched_pages/20250917_142817/2_belgium_demographics_1900_2011_statistical_openai_gpt-4o-mini.csv
Saved LLM logs: processing/2_fetched_pages/20250917_142817/2_belgium_demographics_1900_2011_statistical_openai_gpt-4o-mini_llm_logs.json
[PROCESSING] Table: belgium_demographics_1900_2011, Method: oracle_single_row, Model: openai_gpt-4o-mini, Pages: [{'Year': 1900}, {'Year': 1901}, {'Year': 1902}, {'Year': 1903}, {'Year': 1904}, {'Year': 1905}, {'Year': 1906}, {'Year': 1907}, {'Year': 1908}, {'Year': 1909}, {'Year': 1910}, {'Year': 1911}, {'Year': 1912}, {'Year': 1913}, {'Year': 1914}, {'Year': 1915}, {'Year': 1916}, {'Year': 1917}, {'Year': 1918}, {'Year': 1919}, {'Year': 1920}, {'Year': 1921}, {'Year': 1922}, {'Year': 1923}, {'Year': 1924}, {'Year': 1925}, {'Year': 1926}, {'Year': 1927}, {'Year': 1928}, {'Year': 192

  merged_df = pd.concat([merged_df, df_page], ignore_index=True)


[FETCH] Table: belgium_demographics_1900_2011, Page: {'field': 'Year', 'value': {'Year': 1909}}, Model: openai_gpt-4o-mini
[FETCH] Table: belgium_demographics_1900_2011, Page: {'field': 'Year', 'value': {'Year': 1910}}, Model: openai_gpt-4o-mini
[FETCH] Table: belgium_demographics_1900_2011, Page: {'field': 'Year', 'value': {'Year': 1911}}, Model: openai_gpt-4o-mini
[FETCH] Table: belgium_demographics_1900_2011, Page: {'field': 'Year', 'value': {'Year': 1912}}, Model: openai_gpt-4o-mini
[FETCH] Table: belgium_demographics_1900_2011, Page: {'field': 'Year', 'value': {'Year': 1913}}, Model: openai_gpt-4o-mini
[FETCH] Table: belgium_demographics_1900_2011, Page: {'field': 'Year', 'value': {'Year': 1914}}, Model: openai_gpt-4o-mini
[FETCH] Table: belgium_demographics_1900_2011, Page: {'field': 'Year', 'value': {'Year': 1910}}, Model: openai_gpt-4o-mini
[FETCH] Table: belgium_demographics_1900_2011, Page: {'field': 'Year', 'value': {'Year': 1911}}, Model: openai_gpt-4o-mini
[FETCH] Table: b

  merged_df = pd.concat([merged_df, df_page], ignore_index=True)


Saved merged CSV: processing/2_fetched_pages/20250917_142817/2_belgium_demographics_1900_2011_single_row_llm_openai_gpt-4o-mini.csv
Saved LLM logs: processing/2_fetched_pages/20250917_142817/2_belgium_demographics_1900_2011_single_row_llm_openai_gpt-4o-mini_llm_logs.json
[PROCESSING] Table: belgium_demographics_1900_2011, Method: llm, Model: google_gemini-2.5-flash-lite, Pages: ['1900-1910', '1911-1920', '1921-1930', '1940-1950', '1951-1960', '1961-1970', '1971-1980', '1981-2003']
[FETCH] Table: belgium_demographics_1900_2011, Page: {'field': 'Year', 'value': '1900-1910'}, Model: google_gemini-2.5-flash-lite
[FETCH] Table: belgium_demographics_1900_2011, Page: {'field': 'Year', 'value': '1911-1920'}, Model: google_gemini-2.5-flash-lite
[FETCH] Table: belgium_demographics_1900_2011, Page: {'field': 'Year', 'value': '1921-1930'}, Model: google_gemini-2.5-flash-lite
[FETCH] Table: belgium_demographics_1900_2011, Page: {'field': 'Year', 'value': '1940-1950'}, Model: google_gemini-2.5-flash

  merged_df = pd.concat([merged_df, df_page], ignore_index=True)


Saved merged CSV: processing/2_fetched_pages/20250917_142817/2_belgium_demographics_1900_2011_llm_google_gemini-2.5-flash-lite.csv
Saved LLM logs: processing/2_fetched_pages/20250917_142817/2_belgium_demographics_1900_2011_llm_google_gemini-2.5-flash-lite_llm_logs.json
[PROCESSING] Table: belgium_demographics_1900_2011, Method: llm, Model: deepseek_deepseek-chat-v3.1, Pages: ['1900-1919', '1920-1939', '1940-1959', '1960-1979', '1980-2003']
[FETCH] Table: belgium_demographics_1900_2011, Page: {'field': 'Year', 'value': '1900-1919'}, Model: deepseek_deepseek-chat-v3.1
[FETCH] Table: belgium_demographics_1900_2011, Page: {'field': 'Year', 'value': '1920-1939'}, Model: deepseek_deepseek-chat-v3.1
[FETCH] Table: belgium_demographics_1900_2011, Page: {'field': 'Year', 'value': '1940-1959'}, Model: deepseek_deepseek-chat-v3.1
[FETCH] Table: belgium_demographics_1900_2011, Page: {'field': 'Year', 'value': '1960-1979'}, Model: deepseek_deepseek-chat-v3.1
[FETCH] Table: belgium_demographics_1900

  merged_df = pd.concat([merged_df, df_page], ignore_index=True)


Saved merged CSV: processing/2_fetched_pages/20250917_142817/2_belgium_demographics_1900_2011_llm_deepseek_deepseek-chat-v3.1.csv
Saved LLM logs: processing/2_fetched_pages/20250917_142817/2_belgium_demographics_1900_2011_llm_deepseek_deepseek-chat-v3.1_llm_logs.json
[PROCESSING] Table: belgium_demographics_1900_2011, Method: llm, Model: openai_gpt-4o-mini, Pages: ['1900-1940', '1941-1960', '1961-1980', '1981-2000', '2001-2020']
[FETCH] Table: belgium_demographics_1900_2011, Page: {'field': 'Year', 'value': '1900-1940'}, Model: openai_gpt-4o-mini
[FETCH] Table: belgium_demographics_1900_2011, Page: {'field': 'Year', 'value': '1941-1960'}, Model: openai_gpt-4o-mini
[FETCH] Table: belgium_demographics_1900_2011, Page: {'field': 'Year', 'value': '1961-1980'}, Model: openai_gpt-4o-mini
[FETCH] Table: belgium_demographics_1900_2011, Page: {'field': 'Year', 'value': '1981-2000'}, Model: openai_gpt-4o-mini
[FETCH] Table: belgium_demographics_1900_2011, Page: {'field': 'Year', 'value': '2001-2

  merged_df = pd.concat([merged_df, df_page], ignore_index=True)


[FETCH] Table: ramsar_convention_parties, Page: {'field': 'Ramsar sites', 'value': 64.0}, Model: openai_gpt-4o-mini
Saved merged CSV: processing/2_fetched_pages/20250917_142817/34_ramsar_convention_parties_statistical_openai_gpt-4o-mini.csv
Saved LLM logs: processing/2_fetched_pages/20250917_142817/34_ramsar_convention_parties_statistical_openai_gpt-4o-mini_llm_logs.json
[PROCESSING] Table: ramsar_convention_parties, Method: oracle_single_row, Model: openai_gpt-4o-mini, Pages: [{'Country': 'Algeria'}, {'Country': 'Andorra'}, {'Country': 'Antigua and Barbuda'}, {'Country': 'Argentina'}, {'Country': 'Armenia'}, {'Country': 'Australia'}, {'Country': 'Austria'}, {'Country': 'Azerbaijan'}, {'Country': 'The Bahamas'}, {'Country': 'Bahrain'}, {'Country': 'Bangladesh'}, {'Country': 'Barbados'}, {'Country': 'Belarus'}, {'Country': 'Belgium'}, {'Country': 'Belize'}, {'Country': 'Benin'}, {'Country': 'Bhutan'}, {'Country': 'Bolivia'}, {'Country': 'Bosnia and Herzegovina'}, {'Country': 'Botswana'}

  merged_df = pd.concat([merged_df, df_page], ignore_index=True)


Saved merged CSV: processing/2_fetched_pages/20250917_142817/13_figure_skating_ladies_2009_2010_naive_openai_gpt-4o-mini.csv
Saved LLM logs: processing/2_fetched_pages/20250917_142817/13_figure_skating_ladies_2009_2010_naive_openai_gpt-4o-mini_llm_logs.json
[PROCESSING] Table: figure_skating_ladies_2009_2010, Method: statistical, Model: openai_gpt-4o-mini, Pages: [{'range': '160-170'}, {'range': '170-180'}, {'range': '180-190'}, {'range': '190-200'}, {'range': '200-210'}, {'range': '210-230'}]
[FETCH] Table: figure_skating_ladies_2009_2010, Page: {'field': 'Points', 'value': {'range': '160-170'}}, Model: openai_gpt-4o-mini
[FETCH] Table: figure_skating_ladies_2009_2010, Page: {'field': 'Points', 'value': {'range': '170-180'}}, Model: openai_gpt-4o-mini
[FETCH] Table: figure_skating_ladies_2009_2010, Page: {'field': 'Points', 'value': {'range': '180-190'}}, Model: openai_gpt-4o-mini
[FETCH] Table: figure_skating_ladies_2009_2010, Page: {'field': 'Points', 'value': {'range': '190-200'}},

  merged_df = pd.concat([merged_df, df_page], ignore_index=True)


Saved merged CSV: processing/2_fetched_pages/20250917_142817/13_figure_skating_ladies_2009_2010_statistical_openai_gpt-4o-mini.csv
Saved LLM logs: processing/2_fetched_pages/20250917_142817/13_figure_skating_ladies_2009_2010_statistical_openai_gpt-4o-mini_llm_logs.json
[PROCESSING] Table: figure_skating_ladies_2009_2010, Method: oracle_single_row, Model: openai_gpt-4o-mini, Pages: [{'Name': 'Yu-Na Kim'}, {'Name': 'Mao Asada'}, {'Name': 'Joannie Rochette'}, {'Name': 'Mirai Nagasu'}, {'Name': 'Miki Ando'}, {'Name': 'Laura Lepistö'}, {'Name': 'Rachael Flatt'}, {'Name': 'Akiko Suzuki'}, {'Name': 'Cynthia Phaneuf'}, {'Name': 'Carolina Kostner'}, {'Name': 'Alena Leonova'}, {'Name': 'Ksenia Makarova'}, {'Name': 'Yukari Nakano'}, {'Name': 'Kanako Murakami'}, {'Name': 'Elene Gedevanishvili'}, {'Name': 'Ashley Wagner'}, {'Name': 'Kiira Korpi'}, {'Name': 'Alissa Czisny'}, {'Name': 'Viktoria Helgesson'}, {'Name': 'Caroline Zhang'}, {'Name': 'Polina Shelepen'}, {'Name': 'Júlia Sebestyén'}, {'Name':

  merged_df = pd.concat([merged_df, df_page], ignore_index=True)


[FETCH] Table: figure_skating_ladies_2009_2010, Page: {'field': 'Name', 'value': {'Name': 'Carolina Kostner'}}, Model: openai_gpt-4o-mini
[FETCH] Table: figure_skating_ladies_2009_2010, Page: {'field': 'Name', 'value': {'Name': 'Alena Leonova'}}, Model: openai_gpt-4o-mini
[FETCH] Table: figure_skating_ladies_2009_2010, Page: {'field': 'Name', 'value': {'Name': 'Ksenia Makarova'}}, Model: openai_gpt-4o-mini
[FETCH] Table: figure_skating_ladies_2009_2010, Page: {'field': 'Name', 'value': {'Name': 'Alena Leonova'}}, Model: openai_gpt-4o-mini
[FETCH] Table: figure_skating_ladies_2009_2010, Page: {'field': 'Name', 'value': {'Name': 'Ksenia Makarova'}}, Model: openai_gpt-4o-mini
[FETCH] Table: figure_skating_ladies_2009_2010, Page: {'field': 'Name', 'value': {'Name': 'Yukari Nakano'}}, Model: openai_gpt-4o-mini
[FETCH] Table: figure_skating_ladies_2009_2010, Page: {'field': 'Name', 'value': {'Name': 'Kanako Murakami'}}, Model: openai_gpt-4o-mini
[FETCH] Table: figure_skating_ladies_2009_2010

  merged_df = pd.concat([merged_df, df_page], ignore_index=True)


Saved merged CSV: processing/2_fetched_pages/20250917_142817/13_figure_skating_ladies_2009_2010_single_row_llm_openai_gpt-4o-mini.csv
Saved LLM logs: processing/2_fetched_pages/20250917_142817/13_figure_skating_ladies_2009_2010_single_row_llm_openai_gpt-4o-mini_llm_logs.json
[PROCESSING] Table: figure_skating_ladies_2009_2010, Method: llm, Model: google_gemini-2.5-flash-lite, Pages: ['2009-10-01 to 2009-10-15', '2009-10-16 to 2009-10-31']
[FETCH] Table: figure_skating_ladies_2009_2010, Page: {'field': 'Date', 'value': '2009-10-01 to 2009-10-15'}, Model: google_gemini-2.5-flash-lite
[FETCH] Table: figure_skating_ladies_2009_2010, Page: {'field': 'Date', 'value': '2009-10-16 to 2009-10-31'}, Model: google_gemini-2.5-flash-lite


  merged_df = pd.concat([merged_df, df_page], ignore_index=True)


Saved merged CSV: processing/2_fetched_pages/20250917_142817/13_figure_skating_ladies_2009_2010_llm_google_gemini-2.5-flash-lite.csv
Saved LLM logs: processing/2_fetched_pages/20250917_142817/13_figure_skating_ladies_2009_2010_llm_google_gemini-2.5-flash-lite_llm_logs.json
[PROCESSING] Table: figure_skating_ladies_2009_2010, Method: llm, Model: deepseek_deepseek-chat-v3.1, Pages: ['ISU JGP Lake Placid 2009', 'ISU GP Rostelecom Cup 2009', 'ISU GP Trophee Eric Bompard', 'ISU JGP Bosphorus 2009', 'ISU JGP Croatia Cup 2009', 'ISU JGP Pokal d. Blauen Schwerter 2009', 'ISU JGP Minsk Ice 2009', 'World Junior Championships 2010', 'Four Continents Championships 2010', 'ISU JGP Torun Cup 2009']
[FETCH] Table: figure_skating_ladies_2009_2010, Page: {'field': 'Event', 'value': 'ISU JGP Lake Placid 2009'}, Model: deepseek_deepseek-chat-v3.1
[FETCH] Table: figure_skating_ladies_2009_2010, Page: {'field': 'Event', 'value': 'ISU GP Rostelecom Cup 2009'}, Model: deepseek_deepseek-chat-v3.1
[FETCH] Tabl

  merged_df = pd.concat([merged_df, df_page], ignore_index=True)


Saved merged CSV: processing/2_fetched_pages/20250917_142817/13_figure_skating_ladies_2009_2010_llm_deepseek_deepseek-chat-v3.1.csv
Saved LLM logs: processing/2_fetched_pages/20250917_142817/13_figure_skating_ladies_2009_2010_llm_deepseek_deepseek-chat-v3.1_llm_logs.json
[PROCESSING] Table: figure_skating_ladies_2009_2010, Method: llm, Model: openai_gpt-4o-mini, Pages: ['ISU JGP Lake Placid 2009', 'ISU GP Rostelecom Cup 2009', 'ISU GP Trophee Eric Bompard', 'ISU JGP Bosphorus 2009', 'World Junior Championships 2010']
[FETCH] Table: figure_skating_ladies_2009_2010, Page: {'field': 'Event', 'value': 'ISU JGP Lake Placid 2009'}, Model: openai_gpt-4o-mini
[FETCH] Table: figure_skating_ladies_2009_2010, Page: {'field': 'Event', 'value': 'ISU GP Rostelecom Cup 2009'}, Model: openai_gpt-4o-mini
[FETCH] Table: figure_skating_ladies_2009_2010, Page: {'field': 'Event', 'value': 'ISU GP Trophee Eric Bompard'}, Model: openai_gpt-4o-mini
[FETCH] Table: figure_skating_ladies_2009_2010, Page: {'field

  merged_df = pd.concat([merged_df, df_page], ignore_index=True)


Saved merged CSV: processing/2_fetched_pages/20250917_142817/13_figure_skating_ladies_2009_2010_llm_openai_gpt-4o-mini.csv
Saved LLM logs: processing/2_fetched_pages/20250917_142817/13_figure_skating_ladies_2009_2010_llm_openai_gpt-4o-mini_llm_logs.json
Saved JSON metadata: processing/2_fetched_pages/20250917_142817/13_figure_skating_ladies_2009_2010_metrics.json
[PROCESSING] Table: liechtenstein_demographics_1901_2011, Source Columns: ['Year', 'Average population', 'Live births', 'Deaths', 'Natural change', 'Crude birth rate (per 1000)', 'Crude death rate (per 1000)', 'Natural change (per 1000)']
[PROCESSING] Table: liechtenstein_demographics_1901_2011, Method: naive, Model: openai_gpt-4o-mini, Pages: ['ALL']
[FETCH] Table: liechtenstein_demographics_1901_2011, Page: None, Model: openai_gpt-4o-mini
Saved merged CSV: processing/2_fetched_pages/20250917_142817/21_liechtenstein_demographics_1901_2011_naive_openai_gpt-4o-mini.csv
Saved LLM logs: processing/2_fetched_pages/20250917_142817/

  merged_df = pd.concat([merged_df, df_page], ignore_index=True)


[FETCH] Table: liechtenstein_demographics_1901_2011, Page: {'field': 'Year', 'value': {'Year': 1910}}, Model: openai_gpt-4o-mini
[FETCH] Table: liechtenstein_demographics_1901_2011, Page: {'field': 'Year', 'value': {'Year': 1911}}, Model: openai_gpt-4o-mini
[FETCH] Table: liechtenstein_demographics_1901_2011, Page: {'field': 'Year', 'value': {'Year': 1912}}, Model: openai_gpt-4o-mini
[FETCH] Table: liechtenstein_demographics_1901_2011, Page: {'field': 'Year', 'value': {'Year': 1913}}, Model: openai_gpt-4o-mini
[FETCH] Table: liechtenstein_demographics_1901_2011, Page: {'field': 'Year', 'value': {'Year': 1913}}, Model: openai_gpt-4o-mini
[FETCH] Table: liechtenstein_demographics_1901_2011, Page: {'field': 'Year', 'value': {'Year': 1914}}, Model: openai_gpt-4o-mini
[FETCH] Table: liechtenstein_demographics_1901_2011, Page: {'field': 'Year', 'value': {'Year': 1915}}, Model: openai_gpt-4o-mini
[FETCH] Table: liechtenstein_demographics_1901_2011, Page: {'field': 'Year', 'value': {'Year': 19

  merged_df = pd.concat([merged_df, df_page], ignore_index=True)


[FETCH] Table: liechtenstein_demographics_1901_2011, Page: {'field': 'Year', 'value': {'Year': 1919}}, Model: openai_gpt-4o-mini
[FETCH] Table: liechtenstein_demographics_1901_2011, Page: {'field': 'Year', 'value': {'Year': 1971}}, Model: openai_gpt-4o-mini
[FETCH] Table: liechtenstein_demographics_1901_2011, Page: {'field': 'Year', 'value': {'Year': 1957}}, Model: openai_gpt-4o-mini
[FETCH] Table: liechtenstein_demographics_1901_2011, Page: {'field': 'Year', 'value': {'Year': 1973}}, Model: openai_gpt-4o-mini
[FETCH] Table: liechtenstein_demographics_1901_2011, Page: {'field': 'Year', 'value': {'Year': 1957}}, Model: openai_gpt-4o-mini
[FETCH] Table: liechtenstein_demographics_1901_2011, Page: {'field': 'Year', 'value': {'Year': 1973}}, Model: openai_gpt-4o-mini
[FETCH] Table: liechtenstein_demographics_1901_2011, Page: {'field': 'Year', 'value': {'Year': 2010}}, Model: openai_gpt-4o-mini
[FETCH] Table: liechtenstein_demographics_1901_2011, Page: {'field': 'Year', 'value': {'Year': 19

  merged_df = pd.concat([merged_df, df_page], ignore_index=True)


Saved merged CSV: processing/2_fetched_pages/20250917_142817/21_liechtenstein_demographics_1901_2011_llm_openai_gpt-4o-mini.csv
Saved LLM logs: processing/2_fetched_pages/20250917_142817/21_liechtenstein_demographics_1901_2011_llm_openai_gpt-4o-mini_llm_logs.json
Saved JSON metadata: processing/2_fetched_pages/20250917_142817/21_liechtenstein_demographics_1901_2011_metrics.json
[PROCESSING] Table: new_brunswick_parishes_2006_2011, Source Columns: ['Name', 'County', 'Population (2011)', 'Population (2006)', 'Change (%)', 'Area (km^2)', 'Population density']
[PROCESSING] Table: new_brunswick_parishes_2006_2011, Method: naive, Model: openai_gpt-4o-mini, Pages: ['ALL']
[FETCH] Table: new_brunswick_parishes_2006_2011, Page: None, Model: openai_gpt-4o-mini


  merged_df = pd.concat([merged_df, df_page], ignore_index=True)


Saved merged CSV: processing/2_fetched_pages/20250917_142817/4_new_brunswick_parishes_2006_2011_naive_openai_gpt-4o-mini.csv
Saved LLM logs: processing/2_fetched_pages/20250917_142817/4_new_brunswick_parishes_2006_2011_naive_openai_gpt-4o-mini_llm_logs.json
[PROCESSING] Table: new_brunswick_parishes_2006_2011, Method: statistical, Model: openai_gpt-4o-mini, Pages: [{'min': 0, 'max': 0.5}, {'min': 0.5, 'max': 1}, {'min': 1, 'max': 2}, {'min': 2, 'max': 3}, {'min': 3, 'max': 4}, {'min': 4, 'max': 5}, {'min': 5, 'max': 6}, {'min': 6, 'max': 7}, {'min': 7, 'max': 8}, {'min': 8, 'max': 9}, {'min': 9, 'max': 10}, {'min': 10, 'max': 15}, {'min': 15, 'max': 20}, {'min': 20, 'max': 25}, {'min': 25, 'max': 30}, {'min': 30, 'max': 35}, {'min': 35, 'max': 40}, {'min': 40, 'max': 50}, {'min': 50, 'max': 60}]
[FETCH] Table: new_brunswick_parishes_2006_2011, Page: {'field': 'Population density', 'value': {'min': 0, 'max': 0.5}}, Model: openai_gpt-4o-mini
[FETCH] Table: new_brunswick_parishes_2006_201

  merged_df = pd.concat([merged_df, df_page], ignore_index=True)


[FETCH] Table: new_brunswick_parishes_2006_2011, Page: {'field': 'Population density', 'value': {'min': 8, 'max': 9}}, Model: openai_gpt-4o-mini
[FETCH] Table: new_brunswick_parishes_2006_2011, Page: {'field': 'Population density', 'value': {'min': 9, 'max': 10}}, Model: openai_gpt-4o-mini
[FETCH] Table: new_brunswick_parishes_2006_2011, Page: {'field': 'Population density', 'value': {'min': 10, 'max': 15}}, Model: openai_gpt-4o-mini
[FETCH] Table: new_brunswick_parishes_2006_2011, Page: {'field': 'Population density', 'value': {'min': 9, 'max': 10}}, Model: openai_gpt-4o-mini
[FETCH] Table: new_brunswick_parishes_2006_2011, Page: {'field': 'Population density', 'value': {'min': 10, 'max': 15}}, Model: openai_gpt-4o-mini
[FETCH] Table: new_brunswick_parishes_2006_2011, Page: {'field': 'Population density', 'value': {'min': 15, 'max': 20}}, Model: openai_gpt-4o-mini
[FETCH] Table: new_brunswick_parishes_2006_2011, Page: {'field': 'Population density', 'value': {'min': 15, 'max': 20}}, M

  merged_df = pd.concat([merged_df, df_page], ignore_index=True)


[FETCH] Table: new_brunswick_parishes_2006_2011, Page: {'field': 'Name,County', 'value': {'Name': 'Blackville', 'County': 'Northumberland'}}, Model: openai_gpt-4o-mini
[FETCH] Table: new_brunswick_parishes_2006_2011, Page: {'field': 'Name,County', 'value': {'Name': 'Blissfield', 'County': 'Northumberland'}}, Model: openai_gpt-4o-mini
[FETCH] Table: new_brunswick_parishes_2006_2011, Page: {'field': 'Name,County', 'value': {'Name': 'Blissville', 'County': 'Sunbury'}}, Model: openai_gpt-4o-mini
[FETCH] Table: new_brunswick_parishes_2006_2011, Page: {'field': 'Name,County', 'value': {'Name': 'Blissfield', 'County': 'Northumberland'}}, Model: openai_gpt-4o-mini
[FETCH] Table: new_brunswick_parishes_2006_2011, Page: {'field': 'Name,County', 'value': {'Name': 'Blissville', 'County': 'Sunbury'}}, Model: openai_gpt-4o-mini
[FETCH] Table: new_brunswick_parishes_2006_2011, Page: {'field': 'Name,County', 'value': {'Name': 'Botsford', 'County': 'Westmorland'}}, Model: openai_gpt-4o-mini
[FETCH] Tab

  merged_df = pd.concat([merged_df, df_page], ignore_index=True)


[FETCH] Table: new_brunswick_parishes_2006_2011, Page: {'field': 'Name,County', 'value': {'Name': 'Shediac', 'County': 'Westmorland'}}, Model: openai_gpt-4o-mini
[FETCH] Table: new_brunswick_parishes_2006_2011, Page: {'field': 'Name,County', 'value': {'Name': 'Saint Martins', 'County': 'Saint John'}}, Model: openai_gpt-4o-mini
[FETCH] Table: new_brunswick_parishes_2006_2011, Page: {'field': 'Name,County', 'value': {'Name': 'Cambridge', 'County': 'Queens'}}, Model: openai_gpt-4o-mini
[FETCH] Table: new_brunswick_parishes_2006_2011, Page: {'field': 'Name,County', 'value': {'Name': 'Lac-Baker', 'County': 'Madawaska'}}, Model: openai_gpt-4o-mini
[FETCH] Table: new_brunswick_parishes_2006_2011, Page: {'field': 'Name,County', 'value': {'Name': 'Cambridge', 'County': 'Queens'}}, Model: openai_gpt-4o-mini
[FETCH] Table: new_brunswick_parishes_2006_2011, Page: {'field': 'Name,County', 'value': {'Name': 'Lac-Baker', 'County': 'Madawaska'}}, Model: openai_gpt-4o-mini
[FETCH] Table: new_brunswick_

  merged_df = pd.concat([merged_df, df_page], ignore_index=True)


[FETCH] Table: new_brunswick_parishes_2006_2011, Page: {'field': 'County', 'value': 'Restigouche'}, Model: deepseek_deepseek-chat-v3.1
[FETCH] Table: new_brunswick_parishes_2006_2011, Page: {'field': 'County', 'value': 'Kent'}, Model: deepseek_deepseek-chat-v3.1
[FETCH] Table: new_brunswick_parishes_2006_2011, Page: {'field': 'County', 'value': 'Kent'}, Model: deepseek_deepseek-chat-v3.1
Saved merged CSV: processing/2_fetched_pages/20250917_142817/4_new_brunswick_parishes_2006_2011_llm_deepseek_deepseek-chat-v3.1.csv
Saved LLM logs: processing/2_fetched_pages/20250917_142817/4_new_brunswick_parishes_2006_2011_llm_deepseek_deepseek-chat-v3.1_llm_logs.json
[PROCESSING] Table: new_brunswick_parishes_2006_2011, Method: llm, Model: openai_gpt-4o-mini, Pages: ['Kings', 'Westmorland', 'York', 'Northumberland', 'Saint John', 'Gloucester', 'Restigouche', 'Kent', 'Madawaska']
[FETCH] Table: new_brunswick_parishes_2006_2011, Page: {'field': 'County', 'value': 'Kings'}, Model: openai_gpt-4o-mini
[