This notebook sets up the environment for the Labor Force Survey (LFS) preprocessing pipeline.

- Defines the base path where the raw LFS files are stored in Google Drive.

- Creates local folders for saving **interim outputs** (temporary cached results) and **processed outputs** (final cleaned datasets).

- Establishes rules for month ordering and filename patterns so later notebooks can consistently detect survey months and years.

- Saves all settings into a small file (config.json) that other notebooks will read.

**INTENT:** Centralizing settings here makes the workflow reproducible and team-friendly. If someone has a different Google Drive path for the data, they need to edit ONLY this notebook; all other notebooks will still run correctly.

In [1]:

from pathlib import Path
import json
import re
import os
import platform

# ------------------------------------------------------------
# Base path: folder that holds the raw LFS files
# ------------------------------------------------------------
# Resolution order:
#   1. the LFS_BASE_PATH environment variable, when set and non-empty
#      (lets each machine point at its own Drive mount without editing code)
#   2. the default literal below -- EDIT this only when cloning on a new
#      machine and no environment variable is used
BASE_PATH = Path(os.environ.get("LFS_BASE_PATH") or r"G:\My Drive\Labor Force Survey")

# ------------------------------------------------------------
# Local project directories (inside the repo)
# ------------------------------------------------------------
# NOTE: these are relative paths, so they are created under the current
# working directory of the kernel that runs this cell.
DATA_DIR = Path("./data")
INTERIM_DIR = DATA_DIR / "interim"     # temporary outputs (inventory.json, reshaped metadata, parquet surveys)
PROCESSED_DIR = DATA_DIR / "processed" # final cleaned datasets ready for analysis
LOG_DIR = Path("./logs")               # optional folder for debugging logs

# Create every folder up front; exist_ok=True makes re-running the cell safe.
for project_dir in (DATA_DIR, INTERIM_DIR, PROCESSED_DIR, LOG_DIR):
    project_dir.mkdir(parents=True, exist_ok=True)

# ------------------------------------------------------------
# Month order for chronological sorting
# ------------------------------------------------------------
# Title-case month name -> calendar number (1-12), in calendar order.
MONTH_ORDER = {
    month_name: month_number
    for month_number, month_name in enumerate(
        (
            "January", "February", "March", "April", "May", "June",
            "July", "August", "September", "October", "November", "December",
        ),
        start=1,
    )
}

# Regex patterns for detecting month and year in filenames.
# The month alternation is built from the MONTH_ORDER keys (upper-cased), so
# the two definitions cannot drift apart. Matching ignores case, but the
# captured text keeps the filename's casing while MONTH_ORDER keys are
# title-case -- normalize (e.g. with .title()) before looking a match up.
MONTH_PATTERN = re.compile(
    "(" + "|".join(name.upper() for name in MONTH_ORDER) + ")",
    flags=re.IGNORECASE,
)
# Captures a four-digit year that starts with "20".
YEAR_PATTERN = re.compile(r"(20\d{2})")

# ------------------------------------------------------------
# Validate the raw-data location BEFORE persisting anything, so a config
# that points at a missing folder is never written for other notebooks
# ------------------------------------------------------------
assert BASE_PATH.exists(), f"Base path does not exist: {BASE_PATH}"

# ------------------------------------------------------------
# Save all settings into a JSON file so other notebooks can load them
# ------------------------------------------------------------
config = {
    "BASE_PATH": str(BASE_PATH.resolve()),  # normalize to absolute path
    "DATA_DIR": str(DATA_DIR.resolve()),
    "INTERIM_DIR": str(INTERIM_DIR.resolve()),
    "PROCESSED_DIR": str(PROCESSED_DIR.resolve()),
    "LOG_DIR": str(LOG_DIR.resolve()),
    "MONTH_ORDER": MONTH_ORDER,
    # Regex sources only: compiled patterns are not JSON-serializable and
    # flags are not part of `.pattern`, so readers must recompile
    # MONTH_PATTERN with re.IGNORECASE themselves.
    "MONTH_PATTERN": MONTH_PATTERN.pattern,
    "YEAR_PATTERN": YEAR_PATTERN.pattern,
}

CONFIG_PATH = INTERIM_DIR / "config.json"
# Explicit encoding keeps the file identical across platforms/locales.
with open(CONFIG_PATH, "w", encoding="utf-8") as f:
    json.dump(config, f, indent=2)

print(f"Saved config to {CONFIG_PATH}")

# ------------------------------------------------------------
# Validation checks
# ------------------------------------------------------------
assert CONFIG_PATH.exists(), "Config file was not written."
# Round-trip check: what is on disk must parse back to exactly what was saved.
assert json.loads(CONFIG_PATH.read_text(encoding="utf-8")) == config, \
    "Config file content does not match the saved settings."

print("Settings validated. You can proceed to other notebooks.")


Saved config to data\interim\config.json
Settings validated. You can proceed to other notebooks.
