This notebook builds the dataset inventory for the Labor Force Survey.

- It loads the settings written by 00_Settings.ipynb (specifically `data/interim/config.json`).

- It scans only the valid year folders (2018, 2019, 2022, 2023, 2024).

- It detects survey files (CSV) and metadata files (XLSX).

- It organizes them by year and month, and prints a summary with progress feedback.

- It saves the inventory to `data/interim/inventory.json` for reuse in later notebooks.

**INTENT:** The inventory is the foundation of the pipeline. It ensures we know exactly what data is available before reshaping metadata or loading surveys. This step avoids surprises and makes the workflow reproducible.

In [None]:
import json
import os
import re
from pathlib import Path

import pandas as pd

# ------------------------------------------------------------
# Load settings from config.json (produced by 00_Settings.ipynb)
# ------------------------------------------------------------
with open(Path("./data/interim/config.json")) as f:
    cfg = json.load(f)

BASE_PATH = Path(cfg["BASE_PATH"])          # root folder that holds one sub-folder per survey year
INTERIM_DIR = Path(cfg["INTERIM_DIR"])      # where inventory.json is written at the end of this notebook
PROCESSED_DIR = Path(cfg["PROCESSED_DIR"])
LOG_DIR = Path(cfg["LOG_DIR"])
MONTH_ORDER = cfg["MONTH_ORDER"]            # mapping month name -> sort rank (read with .get() in the summary cell)

# ------------------------------------------------------------
# Filename patterns used when scanning the year folders
# ------------------------------------------------------------
# NOTE: inventory.json is deliberately NOT loaded here. This notebook is the
# one that creates that file, so reading it up front fails on a fresh run and
# the loaded value would be overwritten by the scan anyway.
if not MONTH_ORDER:
    # An empty alternation would compile to "()" and match every filename.
    raise ValueError("MONTH_ORDER in config.json is empty; cannot build MONTH_PATTERN")

# The scan upper-cases each filename, searches it, and capitalizes group(1),
# so the alternatives are the upper-cased MONTH_ORDER keys.
# Assumes the keys are month names such as "January" — TODO confirm against 00_Settings.ipynb.
MONTH_PATTERN = re.compile(
    "(" + "|".join(re.escape(str(name).upper()) for name in MONTH_ORDER) + ")"
)

# First four-digit run starting with "20" (e.g. 2018, 2024); group(1) is the year.
YEAR_PATTERN = re.compile(r"(20\d{2})")

# Alias for compatibility
base_path = str(BASE_PATH)


In [2]:
# Only these survey years are inventoried; every other entry under BASE_PATH is ignored.
VALID_YEARS = {"2018", "2019", "2022", "2023", "2024"}

# Keep an entry only when it is a directory AND its name is one of the valid years.
year_folders = []
for entry in os.listdir(BASE_PATH):
    if (BASE_PATH / entry).is_dir() and entry in VALID_YEARS:
        year_folders.append(entry)

print("Detected year folders:", sorted(year_folders))


Detected year folders: ['2018', '2019', '2022', '2023', '2024']


In [3]:
# Build the inventory as {folder_year: {month: [file records]}}.
# MONTH_PATTERN and YEAR_PATTERN must already exist as compiled regexes whose
# first capture group holds the month name / the year: .group(1) is read below.
inventory = {}

for year in sorted(year_folders):
    year_path = BASE_PATH / year
    print(f"\nScanning year: {year}")  # progress feedback

    # Accept both CSV and XLSX. os.listdir() returns names in arbitrary order,
    # so sort them to keep the saved inventory identical across runs and
    # machines, and ignore anything that is not a regular file.
    data_files = sorted(
        f for f in os.listdir(year_path)
        if f.lower().endswith((".csv", ".xlsx")) and (year_path / f).is_file()
    )
    print(f"  Found {len(data_files)} files")

    inventory[year] = {}

    for file in data_files:
        # Matching is done on the upper-cased name so file-name case does not matter.
        upper = file.upper()
        # The extension alone decides the role: XLSX -> metadata, otherwise (CSV) -> survey.
        filetype = "metadata" if upper.endswith(".XLSX") else "survey"

        # Detect month; files without a recognizable month are grouped under "Unmatched".
        month_match = MONTH_PATTERN.search(upper)
        month = month_match.group(1).capitalize() if month_match else "Unmatched"

        # Detect year inside filename (recorded separately from the folder year).
        year_match = YEAR_PATTERN.search(upper)
        file_year = year_match.group(1) if year_match else "UNKNOWN"

        inventory[year].setdefault(month, []).append({
            "filename": file,
            "filetype": filetype,
            "file_year": file_year
        })



Scanning year: 2018
  Found 8 files

Scanning year: 2019
  Found 8 files

Scanning year: 2022
  Found 24 files

Scanning year: 2023
  Found 24 files

Scanning year: 2024
  Found 16 files


In [4]:
# Human-readable dump of the inventory: years ascending, months in MONTH_ORDER order.
print("\n=== DATASET INVENTORY SUMMARY ===\n")

for survey_year in sorted(inventory):
    print(f"Year {survey_year}:")
    months_for_year = inventory[survey_year]

    # Month names that are absent from MONTH_ORDER are ranked 99.
    ordered_months = sorted(
        months_for_year,
        key=lambda name: MONTH_ORDER.get(name, 99),
    )

    for month_name in ordered_months:
        print(f"  {month_name}:")
        for entry in months_for_year[month_name]:
            print(f"    {entry['filename']} ({entry['filetype']})")

    # Blank line between years.
    print()



=== DATASET INVENTORY SUMMARY ===

Year 2018:
  January:
    JANUARY_2018.CSV (survey)
    JANUARY_2018_METADATA.xlsx (metadata)
  April:
    APRIL_2018.CSV (survey)
    APRIL_2018_METADATA.xlsx (metadata)
  July:
    JULY_2018.CSV (survey)
    JULY_2018_METADATA.xlsx (metadata)
  October:
    OCTOBER_2018.CSV (survey)
    OCTOBER_2018_METADATA.xlsx (metadata)

Year 2019:
  January:
    JANUARY_2019.CSV (survey)
    JANUARY_2019_METADATA.xlsx (metadata)
  April:
    APRIL_2019_METADATA.xlsx (metadata)
    APRIL_2019.CSV (survey)
  July:
    JULY_2019_METADATA.xlsx (metadata)
    JULY_2019.CSV (survey)
  October:
    OCTOBER_2019_METADATA.xlsx (metadata)
    OCTOBER_2019.CSV (survey)

Year 2022:
  January:
    JANUARY_2022.csv (survey)
    JANUARY_2022_METADATA.xlsx (metadata)
  February:
    FEBRUARY_2022.csv (survey)
    FEBRUARY_2022_METADATA.xlsx (metadata)
  March:
    MARCH_2022.csv (survey)
    MARCH_2022_METADATA.xlsx (metadata)
  April:
    APRIL_2022_METADATA.xlsx (metadata)


In [5]:
# Persist the inventory as pretty-printed JSON in the interim folder so that
# later notebooks can read it instead of rescanning the raw data.
inv_path = INTERIM_DIR / "inventory.json"
inv_path.write_text(json.dumps(inventory, indent=2))

print(f"Saved inventory to {inv_path}")


Saved inventory to data\interim\inventory.json
