In [5]:
# Michigan QCEW Employment Trends (2001–2024)
#
# This notebook visualizes employment time series for the ten supply-chain segments
# and three stages using the BLS QCEW data (2001–2024).
# The aggregates are generated by scripts/process_mi_qcew_segments.py and
# stored under data/interim/.

from pathlib import Path
import sys
import pandas as pd
import matplotlib.pyplot as plt

from pathlib import Path
import os

def find_repo_root(markers=("data", "scripts")) -> Path:
    """Locate the repository root by walking upward from the current directory.

    Starting at ``Path.cwd()`` and then visiting each of its parents in turn,
    the first directory that directly contains every name in ``markers`` is
    returned.

    Parameters
    ----------
    markers : iterable of str
        File or directory names that must all exist inside a candidate
        directory for it to be accepted.

    Returns
    -------
    Path
        The nearest matching directory, or the current working directory when
        no directory on the way up contains all markers.
    """
    start = Path.cwd()
    candidates = (start, *start.parents)
    match = next(
        (
            folder
            for folder in candidates
            if all((folder / name).exists() for name in markers)
        ),
        None,
    )
    # Nothing matched anywhere up the tree: fall back to where we started.
    return start if match is None else match

# Resolve the repository root once and make it the working directory so that
# every relative path used below is interpreted from the same place.
REPO_ROOT = find_repo_root()
os.chdir(REPO_ROOT)  # ensure all relative paths are from repo root
print("Working dir ->", Path.cwd())

# Input aggregates are read from data/interim; figures are written to
# reports/figures (created on demand, including missing parents).
DATA_INTERIM = REPO_ROOT / "data" / "interim"
FIG_DIR = REPO_ROOT / "reports" / "figures"
FIG_DIR.mkdir(parents=True, exist_ok=True)

print("Expecting CSVs in ->", DATA_INTERIM)

# --- Optional: use seaborn for nicer defaults if available ---
# Styling stays best-effort (the notebook never fails because of it), but the
# reason for falling back is reported instead of being silently swallowed, so
# a broken or outdated seaborn install is distinguishable from a missing one.
try:
    import seaborn as sns
    sns.set_theme(context="talk", style="whitegrid")
except ImportError:
    print("seaborn not installed; using matplotlib defaults")
except Exception as exc:
    print(f"seaborn styling failed ({exc!r}); using matplotlib defaults")


# Aggregated time series expected under data/interim.
SEGMENT_CSV = DATA_INTERIM / "mi_qcew_segment_employment_timeseries.csv"
STAGE_CSV   = DATA_INTERIM / "mi_qcew_stage_employment_timeseries.csv"

# ---- Helpers
def require_exists(p: Path, label: str):
    """Raise a descriptive error when an expected input file is absent.

    Parameters
    ----------
    p : Path
        Location that must exist.
    label : str
        Human-readable name of the file, interpolated into the error message.

    Raises
    ------
    FileNotFoundError
        If ``p`` does not exist; the message points at the script that
        generates the file.
    """
    if p.exists():
        return
    message = (
        f"Missing {label} file at {p}. "
        "Generate it by running scripts/process_mi_qcew_segments.py"
    )
    raise FileNotFoundError(message)

def validate_columns(df: pd.DataFrame, required: list, df_name: str):
    """Check that a frame carries every required column.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose columns are inspected.
    required : list
        Column names that must be present.
    df_name : str
        Name used for the frame in the error message.

    Raises
    ------
    ValueError
        If one or more required columns are absent. The message lists the
        absent names (in the order given in ``required``) followed by the
        sorted names of the columns that were found.
    """
    present = df.columns
    absent = []
    for column in required:
        if column not in present:
            absent.append(column)

    if not absent:
        return

    found = sorted(df.columns.tolist())
    raise ValueError(
        f"{df_name} is missing required column(s): {absent}\n"
        f"Columns found: {found}"
    )

def coerce_year(df: pd.DataFrame, year_col: str = "year", min_year: int = 2001, max_year: int = 2024):
    """Return a copy of ``df`` restricted to rows with a usable, in-range year.

    The year column is parsed with ``pd.to_numeric(..., errors="coerce")``.
    Rows whose year cannot be parsed, is missing, or is infinite are removed;
    the remaining years are cast to ``int`` (the cast discards any fractional
    part) and rows outside ``[min_year, max_year]`` are filtered out. Rows are
    removed, not clamped to the range.

    The input frame is never modified: all work happens on a copy, so callers
    that keep a reference to the raw frame still see the raw values.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing ``year_col``.
    year_col : str
        Name of the column holding the year.
    min_year, max_year : int
        Inclusive bounds of the years to keep.

    Returns
    -------
    pd.DataFrame
        New frame with the surviving rows (original index labels preserved)
        and ``year_col`` stored as integers.
    """
    years = pd.to_numeric(df[year_col], errors="coerce")

    # NaN (unparseable or missing) and +/-inf cannot be cast to int, so both
    # are treated as invalid years and excluded before the cast.
    usable = years.notna() & ~years.isin([float("inf"), float("-inf")])

    out = df.loc[usable].copy()
    # Assign positionally: the mask above selected the same rows in the same
    # order, so no index alignment is needed.
    out[year_col] = years[usable].astype(int).to_numpy()

    in_range = (out[year_col] >= min_year) & (out[year_col] <= max_year)
    return out.loc[in_range].copy()

def maybe_annualize(df: pd.DataFrame, group_cols: list, value_col: str, year_col: str = "year", agg="sum"):
    """Collapse sub-annual rows to one row per group and year, if needed.

    If any combination of ``group_cols`` and ``year_col`` occurs on more than
    one row (e.g. quarterly data), the frame is aggregated to a single row per
    combination. Otherwise ``df`` is returned unchanged (the same object).

    Parameters
    ----------
    df : pd.DataFrame
        Input frame containing ``group_cols``, ``year_col`` and ``value_col``.
    group_cols : list
        Grouping column names. Any sequence of names is accepted, and a single
        column name may be given as a plain string.
    value_col : str
        Column to aggregate.
    year_col : str
        Name of the year column (default ``"year"``).
    agg : str or callable
        Aggregation passed to ``GroupBy.agg``. Defaults to ``"sum"``, which is
        the original behavior.
        NOTE(review): if ``value_col`` holds employment *levels*, summing
        quarters multiplies the annual figure; ``agg="mean"`` may be the
        appropriate choice for such data -- confirm against the pipeline.

    Returns
    -------
    pd.DataFrame
        ``df`` itself when no aggregation is needed; otherwise a new frame
        containing only the key columns and ``value_col`` (all other columns
        are dropped by the aggregation).
    """
    if isinstance(group_cols, str):
        group_cols = [group_cols]
    keys = [*group_cols, year_col]

    rows_per_key = df.groupby(keys).size()
    if not (rows_per_key > 1).any():
        return df

    return df.groupby(keys)[value_col].agg(agg).reset_index()

# ---- Load
# Fail early, with a pointer to the generating script, if either input is missing.
require_exists(SEGMENT_CSV, "segment timeseries")
require_exists(STAGE_CSV, "stage timeseries")

segment_ts = pd.read_csv(SEGMENT_CSV)
stage_ts   = pd.read_csv(STAGE_CSV)

# ---- Validate expected columns
# Expected column names (adjust here if your pipeline uses slightly different labels)
SEGMENT_REQUIRED = ["segment_id", "segment_label", "year", "employment_qcew"]
STAGE_REQUIRED   = ["stage", "year", "employment_qcew"]

validate_columns(segment_ts, SEGMENT_REQUIRED, "segment_ts")
validate_columns(stage_ts, STAGE_REQUIRED, "stage_ts")

# ---- Clean & standardize
# Keep only rows with a valid year inside the 2001-2024 study window.
segment_ts = coerce_year(segment_ts, "year", 2001, 2024)
stage_ts   = coerce_year(stage_ts, "year", 2001, 2024)

# Drop any accidental duplicates
# NOTE(review): duplicates are judged on the required columns only and are
# removed *before* annualizing. For sub-annual inputs, two periods of the same
# year with identical employment would be collapsed here and then be missing
# from the annual aggregate -- confirm inputs are annual or carry distinct values.
segment_ts = segment_ts.drop_duplicates(SEGMENT_REQUIRED).copy()
stage_ts   = stage_ts.drop_duplicates(STAGE_REQUIRED).copy()

# If your inputs are quarterly/monthly, aggregate to annual totals
segment_ts = maybe_annualize(segment_ts, ["segment_id", "segment_label"], "employment_qcew")
stage_ts   = maybe_annualize(stage_ts, ["stage"], "employment_qcew")

# Sort for plotting
segment_ts = segment_ts.sort_values(["segment_id", "year"])
stage_ts   = stage_ts.sort_values(["stage", "year"])

# ---- Fail fast on empty results
# An empty frame would otherwise print "nan–nan" below and produce blank
# figures further down; stop here with an actionable message instead.
if segment_ts.empty:
    raise ValueError(
        f"No segment rows remain for 2001–2024 after cleaning {SEGMENT_CSV}. "
        "Check the 'year' column of the input file."
    )
if stage_ts.empty:
    raise ValueError(
        f"No stage rows remain for 2001–2024 after cleaning {STAGE_CSV}. "
        "Check the 'year' column of the input file."
    )

# ---- Quick sanity prints
print("Segment series shape:", segment_ts.shape)
print("Stage series shape  :", stage_ts.shape)
print("Segment years:", f"{segment_ts['year'].min()}–{segment_ts['year'].max()}")
print("Stage years  :", f"{stage_ts['year'].min()}–{stage_ts['year'].max()}")

def plot_employment_trends(df, group_col, title, legend_title, out_path, figsize, legend_kwargs=None):
    """Draw one employment line per group, save the figure, and close it.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with ``group_col``, ``"year"`` and ``"employment_qcew"`` columns.
    group_col : str
        Column whose distinct values each become one line (drawn in order of
        first appearance in ``df``).
    title : str
        Axes title.
    legend_title : str
        Title shown above the legend entries.
    out_path : Path
        Destination file for the figure (saved at 300 dpi).
    figsize : tuple
        Figure size passed to ``plt.subplots``.
    legend_kwargs : dict, optional
        Extra keyword arguments forwarded to ``ax.legend`` (e.g. placement).

    Returns
    -------
    Path
        ``out_path``, for convenient assignment by the caller.
    """
    fig, ax = plt.subplots(figsize=figsize)
    # Use matplotlib directly for reliability; seaborn lineplot also works if imported
    for name, g in df.groupby(group_col, sort=False):
        ax.plot(g["year"], g["employment_qcew"], label=name)

    ax.set_ylabel("Employment (QCEW)")
    ax.set_xlabel("Year")
    ax.set_title(title)
    ax.legend(title=legend_title, **(legend_kwargs or {}))
    fig.tight_layout()

    fig.savefig(out_path, dpi=300, bbox_inches="tight")
    # Close explicitly so figures do not accumulate in memory across re-runs.
    plt.close(fig)
    print(f"Saved: {out_path}")
    return out_path


# =========================
# Segment-Level Employment
# =========================
# Employment totals (jobs) for each supply-chain segment. Values are summed across
# the NAICS industries assigned to each segment.

segment_fig_path = plot_employment_trends(
    segment_ts,
    group_col="segment_label",
    title="Michigan Employment by Supply-Chain Segment (QCEW, 2001–2024)",
    legend_title="Segment",
    out_path=FIG_DIR / "mi_qcew_segment_employment_trends.png",
    figsize=(14, 8),
    # Many segments: place the legend outside the axes, to the right.
    legend_kwargs={"bbox_to_anchor": (1.02, 1), "loc": "upper left"},
)

# =======================
# Stage-Level Employment
# =======================
# Employment totals aggregated to Upstream, OEM, and Downstream stages.

stage_fig_path = plot_employment_trends(
    stage_ts,
    group_col="stage",
    title="Michigan Employment by Stage (QCEW, 2001–2024)",
    legend_title="Stage",
    out_path=FIG_DIR / "mi_qcew_stage_employment_trends.png",
    figsize=(12, 6),
)

# ---- Notes
# - Segment aggregates use the NAICS-to-segment mapping in data/lookups/segment_assignments.csv.
# - Stage totals are summed across all segments mapped to each stage (Upstream, OEM, Downstream).
# - To refresh, rerun scripts/process_mi_qcew_segments.py before executing this notebook.


Working dir -> c:\Users\vasilauskas\GitHub\EV-Transition
Expecting CSVs in -> c:\Users\vasilauskas\GitHub\EV-Transition\data\interim
Segment series shape: (240, 4)
Stage series shape  : (72, 3)
Segment years: 2001–2024
Stage years  : 2001–2024
Saved: c:\Users\vasilauskas\GitHub\EV-Transition\reports\figures\mi_qcew_segment_employment_trends.png
Saved: c:\Users\vasilauskas\GitHub\EV-Transition\reports\figures\mi_qcew_stage_employment_trends.png
