This notebook builds the dataset inventory for the Labor Force Survey.

- It loads the settings from config.json, the file produced by 00_Settings.ipynb.

- It scans only the valid year folders (2018, 2019, 2022, 2023, 2024).

- It detects survey files (CSV) and metadata files (XLSX).

- It organizes them by year and month, and prints a summary with progress feedback.

- It saves the inventory into data/interim/inventory.json for reuse in later notebooks.

**INTENT:** The inventory is the foundation of the pipeline. It ensures we know exactly what data is available before reshaping metadata or loading surveys. This step avoids surprises and makes the workflow reproducible.

In [None]:
# Load settings from config.json (produced by 00_Settings.ipynb)
import json
from pathlib import Path
import os
import re

# Parse the shared settings file into a plain dict.
config_file = Path("./data/interim/config.json")
with config_file.open() as fh:
    cfg = json.load(fh)

# Settings used by the remaining cells of this notebook.
BASE_PATH = Path(cfg["BASE_PATH"])
INTERIM_DIR = Path(cfg["INTERIM_DIR"])
MONTH_ORDER = cfg["MONTH_ORDER"]

# Filename detectors:
#   MONTH_PATTERN - a full English month name, matched case-insensitively
#   YEAR_PATTERN  - four digits beginning with "20"
MONTH_PATTERN = re.compile(
    r"(JANUARY|FEBRUARY|MARCH|APRIL|MAY|JUNE|JULY|AUGUST|SEPTEMBER|OCTOBER|NOVEMBER|DECEMBER)",
    flags=re.IGNORECASE,
)
YEAR_PATTERN = re.compile(r"(20\d{2})")


In [None]:
# Only directories named after one of these survey years are scanned;
# every other entry under BASE_PATH is ignored.
VALID_YEARS = {"2018", "2019", "2022", "2023", "2024"}

year_folders = []
for entry_name in os.listdir(BASE_PATH):
    is_year_dir = (BASE_PATH / entry_name).is_dir() and entry_name in VALID_YEARS
    if is_year_dir:
        year_folders.append(entry_name)

print("Detected year folders:", sorted(year_folders))


In [None]:
# Inventory layout: {folder year: {month label: [file records]}}.
# Each record holds the filename, its type ("survey" for CSV, "metadata" for
# XLSX) and the year detected inside the filename.
inventory = {}

for year in sorted(year_folders):
    year_path = BASE_PATH / year
    print(f"\nScanning year: {year}")  # progress feedback

    # Accept both CSV and XLSX.
    # - os.listdir() returns names in arbitrary order, so sort them to keep the
    #   saved inventory identical from run to run.
    # - Skip Excel's "~$" owner/lock files, which appear next to a workbook
    #   while it is open and are not real data.
    # - Keep regular files only, so a directory that happens to carry a data
    #   extension is not recorded as a dataset.
    data_files = sorted(
        name
        for name in os.listdir(year_path)
        if name.lower().endswith((".csv", ".xlsx"))
        and not name.startswith("~$")
        and (year_path / name).is_file()
    )
    print(f"  Found {len(data_files)} files")

    inventory[year] = {}

    for file in data_files:
        upper = file.upper()
        filetype = "metadata" if upper.endswith(".XLSX") else "survey"

        # Detect month: first full month name in the filename, stored in
        # capitalized form (e.g. "January"); files without one are grouped
        # under "Unmatched".
        month_match = MONTH_PATTERN.search(upper)
        month = month_match.group(1).capitalize() if month_match else "Unmatched"

        # Detect year inside filename: first "20xx" occurrence. This can differ
        # from the folder year, so it is recorded separately.
        year_match = YEAR_PATTERN.search(upper)
        file_year = year_match.group(1) if year_match else "UNKNOWN"

        inventory[year].setdefault(month, []).append({
            "filename": file,
            "filetype": filetype,
            "file_year": file_year
        })


In [None]:
def _month_rank(month_name):
    """Sort key for month labels: the MONTH_ORDER value, or 99 when the label is missing."""
    return MONTH_ORDER.get(month_name, 99)


# Human-readable listing: years ascending, months ordered by _month_rank,
# files in the order they were recorded.
print("\n=== DATASET INVENTORY SUMMARY ===\n")
for yr, months in sorted(inventory.items()):
    print(f"Year {yr}:")
    for month in sorted(months, key=_month_rank):
        print(f"  {month}:")
        for item in months[month]:
            print(f"    {item['filename']} ({item['filetype']})")
    print()


In [None]:
# Save inventory to interim folder for reuse.
# INTERIM_DIR is taken from the config and may not exist yet; create it
# (including parents) so the write below cannot fail on a missing directory.
# This is a no-op when the directory is already there.
INTERIM_DIR.mkdir(parents=True, exist_ok=True)

inv_path = INTERIM_DIR / "inventory.json"
# Explicit encoding keeps the file independent of the platform's locale default.
with open(inv_path, "w", encoding="utf-8") as f:
    json.dump(inventory, f, indent=2)

print(f"Saved inventory to {inv_path}")
