# Compile Starting Data

- Parse `benchmark_tables/cfg.json`
- Load each CSV from `benchmark_tables/` into a DataFrame
- Keep only tables with more than 200 cells and up to 10 columns
- Write one JSON per table (metadata plus table records) and a CSV copy of the table under a timestamped folder in `processing/0_data/`

In [1]:
from __future__ import annotations

import json
from pathlib import Path
from datetime import datetime
import pandas as pd

# Locations used by the notebook, all relative to the current working directory
ROOT = Path('.')
TABLES_DIR = ROOT / 'benchmark_tables'
CFG_PATH = TABLES_DIR / 'cfg.json'
OUTPUT_ROOT = ROOT / 'processing' / '0_data'

# Echo the absolute form of every location so a wrong working directory is easy to spot
print(f'Using config at: {CFG_PATH.resolve()}')
print(f'Tables directory: {TABLES_DIR.resolve()}')
print(f'Output base: {OUTPUT_ROOT.resolve()}')

Using config at: /Users/bef/Desktop/TablePagination/benchmark_tables/cfg.json
Tables directory: /Users/bef/Desktop/TablePagination/benchmark_tables
Output base: /Users/bef/Desktop/TablePagination/processing/0_data


In [2]:
# Load and validate cfg.json
with open(CFG_PATH, 'r', encoding='utf-8') as f:
    cfg = json.load(f)

# cfg is expected to be a non-empty JSON object mapping string keys to metadata dicts.
# Raise explicitly instead of using `assert`: assertions are stripped when Python
# runs with -O, which would let a malformed config through silently.
if not isinstance(cfg, dict) or not cfg:
    raise ValueError('cfg.json should be a non-empty object')
len(cfg)

100

In [3]:
# Create this run's output folder, named after the current local time (YYYYMMDD_HHMMSS)
timestamp = f'{datetime.now():%Y%m%d_%H%M%S}'
out_dir = OUTPUT_ROOT / timestamp
# parents=True builds missing ancestors; exist_ok=True tolerates an already existing folder
out_dir.mkdir(exist_ok=True, parents=True)
out_dir

PosixPath('processing/0_data/20251003_115653')

In [None]:
# Helper: read one CSV, tolerating files that are not valid UTF-8
def load_table(csv_path: Path) -> pd.DataFrame:
    """Read the CSV at *csv_path* into a DataFrame.

    The file is first read with pandas' default encoding. If that raises
    ``UnicodeDecodeError``, the read is retried once with ``latin-1``.
    Any other exception propagates to the caller.
    """
    try:
        frame = pd.read_csv(csv_path)
    except UnicodeDecodeError:
        # Second attempt with latin-1 for files the default decoding rejects
        frame = pd.read_csv(csv_path, encoding='latin-1')
    return frame

# Turn a DataFrame into plain, JSON-ready row dictionaries
def df_to_records(df: pd.DataFrame):
    """Return the rows of *df* as a list of dicts, one dict per row.

    The frame is serialised by pandas (``orient='records'``) and parsed back
    with ``json``, so the result contains only JSON-native values; missing
    values (NaN) come back as ``None``.
    """
    serialized = df.to_json(orient='records')
    rows = json.loads(serialized)
    return rows

# Size filter: a table is kept only when it has strictly more than
# MIN_CELLS_EXCLUSIVE cells and no more than MAX_COLUMNS columns.
MIN_CELLS_EXCLUSIVE = 200
MAX_COLUMNS = 10


def _safe_filename_part(text: str) -> str:
    """Return *text* with every character other than alphanumerics, '-' and '_' replaced by '_'."""
    return ''.join(c if c.isalnum() or c in ('-', '_') else '_' for c in text)


# Per-run outcome logs, one tuple per cfg entry:
#   processed: (key, output JSON file name, row count, column count)
#   skipped:   (key, reason) for tables rejected by the size filter
#   errors:    (key, reason) for bad metadata, missing files or unreadable CSVs
processed = []
skipped = []
errors = []

for key, meta in cfg.items():
    # Defensive checks on the metadata entry before touching the filesystem
    if not isinstance(meta, dict):
        errors.append((key, 'invalid_meta'))
        continue
    csv_name = meta.get('file')
    if not csv_name:
        errors.append((key, 'missing_file'))
        continue
    csv_path = TABLES_DIR / csv_name
    if not csv_path.exists():
        errors.append((key, f'file_not_found: {csv_path}'))
        continue

    # A table that cannot be read is recorded and the run moves on to the next one
    try:
        df = load_table(csv_path)
    except Exception as e:
        errors.append((key, f'read_error: {e}'))
        continue

    # Filter: only keep tables with enough cells and not too many columns
    num_rows, num_cols = df.shape
    total_cells = num_rows * num_cols

    if total_cells <= MIN_CELLS_EXCLUSIVE:
        skipped.append((key, f'too_few_cells: {total_cells}'))
        continue

    if num_cols > MAX_COLUMNS:
        skipped.append((key, f'too_many_columns: {num_cols}'))
        continue

    # Build output payload: the original metadata plus the CSV it came from,
    # and the table rows as JSON-ready records
    payload = {
        'meta': {**meta, 'source_file': str(csv_path)},
        'table': df_to_records(df)
    }

    # Filename: `<id>_<name>.json` with safe characters.
    # Both parts are sanitised so that an id or name containing a path
    # separator (or other unsafe character) cannot point outside out_dir.
    id_part = _safe_filename_part(str(meta.get('id', key)))
    name_part = str(meta.get('name', Path(csv_name).stem))
    safe_name = _safe_filename_part(name_part)
    file_stem = f'{id_part}_{safe_name}'
    out_path = out_dir / f'{file_stem}.json'

    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(payload, f, ensure_ascii=False)
    # A CSV copy of the table is written next to the JSON, under the same stem
    out_csv = out_dir / f'{file_stem}.csv'
    df.to_csv(out_csv, index=False)

    processed.append((key, out_path.name, num_rows, num_cols))

len(processed), len(skipped), len(errors)

(26, 74, 0)

In [5]:
# Summary
print('Output directory:', out_dir)
print('Processed tables:', len(processed))
# Tables are skipped by the size filter (cell count / column count), not by a
# row threshold; the exact reason is stored with each entry in `skipped`.
print('Skipped (size filter):', len(skipped))
print('Errors:', len(errors))

# Show a few sample outputs
processed[:5]

Output directory: processing/0_data/20251003_115653
Processed tables: 26
Skipped (<=100 rows): 74
Errors: 0


[('0', '0_republican_straw_polls_2012.json', 113, 11),
 ('2', '2_belgium_demographics_1900_2011.json', 112, 9),
 ('3', '3_australia_demographics_1900_2010.json', 111, 9),
 ('4', '4_new_brunswick_parishes_2006_2011.json', 152, 7),
 ('5', '5_ice_hockey_2006.json', 244, 7)]