In [2]:
# Moody's Employment & Output Time Series (1970–2055)
#
# Plots Michigan & US time series by Stage and Segment for Employment and Output (GDP).
# Source CSVs are generated by scripts/process_moodys_timeseries.py into data/interim/.

from pathlib import Path
import os
import pandas as pd
import matplotlib.pyplot as plt

# --- Optional nicer defaults if seaborn is available ---
try:
    import seaborn as sns
    sns.set_theme(context="talk", style="whitegrid")
except ImportError:
    # seaborn is optional: without it the figures simply use matplotlib's
    # default styling.
    pass
except Exception as exc:
    # seaborn is importable but could not be applied. Styling is cosmetic, so
    # keep going, but report the problem instead of hiding it.
    print(f"[warn] seaborn styling skipped: {exc!r}")

# ---------------------------
# Repo paths & figure folder
# ---------------------------
def find_repo_root(markers=("data", "scripts")) -> Path:
    """Return the nearest directory that contains every entry in *markers*.

    The search starts at the current working directory and then walks up
    through its parents. If no directory contains all markers, the current
    working directory itself is returned.
    """
    start = Path.cwd()
    candidates = [start, *start.parents]
    matches = (
        folder
        for folder in candidates
        if all(folder.joinpath(marker).exists() for marker in markers)
    )
    # First match wins (closest to the working directory); fall back to cwd.
    return next(matches, start)

# Resolve the repository root once and make it the working directory.
REPO_ROOT = find_repo_root()
os.chdir(REPO_ROOT)
print("Working dir ->", Path.cwd())

# Figures are written under reports/figures (created if it does not exist yet);
# the input CSVs are expected under data/interim.
FIG_DIR = REPO_ROOT.joinpath("reports", "figures")
FIG_DIR.mkdir(parents=True, exist_ok=True)
DATA_INTERIM = REPO_ROOT.joinpath("data", "interim")
print("Expecting CSVs in ->", DATA_INTERIM)

# ---------------------------
# Files from Moody's pipeline
# ---------------------------
# Short key -> CSV path for each of the four input time series
# (Michigan / US, by segment / by stage).
FILES = dict(
    mi_segments=DATA_INTERIM.joinpath("moodys_michigan_segments_timeseries.csv"),
    us_segments=DATA_INTERIM.joinpath("moodys_us_segments_timeseries.csv"),
    mi_stages=DATA_INTERIM.joinpath("moodys_michigan_stages_timeseries.csv"),
    us_stages=DATA_INTERIM.joinpath("moodys_us_stages_timeseries.csv"),
)

# ---------------------------
# Helpers
# ---------------------------
def require_exists(p: Path, label: str):
    """Raise ``FileNotFoundError`` unless the path *p* exists.

    *label* is a short human-readable name for the file; it is included in the
    error message together with the path and a hint on how to generate it.
    """
    if p.exists():
        return
    hint = "Generate it by running scripts/process_moodys_timeseries.py"
    raise FileNotFoundError(f"Missing {label} at {p}. " + hint)

# Fail early, with a helpful message, if any expected input CSV is absent.
for label, csv_path in FILES.items():
    require_exists(csv_path, label)

def load_csv(path: Path) -> pd.DataFrame:
    """Read one time-series CSV and normalise the columns used downstream.

    Normalisation steps:

    * ``segment_label`` is renamed to ``segment_name`` when only the former
      is present.
    * ``employment`` and ``gdp`` are created as all-missing columns when
      absent, and are always coerced to numeric (unparseable values become
      NaN).
    * ``year``, when present, is coerced to the nullable ``Int64`` dtype
      (unparseable values become ``<NA>``).

    A file without a ``year`` column is returned without one rather than
    failing here with a bare ``KeyError``; column validation performed on the
    result (``validate_columns`` in this file) can then report the missing
    column together with the columns that were found.

    Args:
        path: Location of the CSV file to read.

    Returns:
        The loaded, normalised DataFrame.
    """
    df = pd.read_csv(path)

    # Segment files may use either name for the label column.
    if "segment_label" in df.columns and "segment_name" not in df.columns:
        df = df.rename(columns={"segment_label": "segment_name"})

    for col in ("employment", "gdp"):
        if col not in df.columns:
            # Placeholder so later code can rely on the column existing.
            df[col] = float("nan")
        df[col] = pd.to_numeric(df[col], errors="coerce")

    # Nullable integer keeps rows whose year could not be parsed (as <NA>).
    if "year" in df.columns:
        df["year"] = pd.to_numeric(df["year"], errors="coerce").astype("Int64")
    return df

# Load the four inputs in a fixed order: Michigan/US segments, then
# Michigan/US stages.
mi_segments, us_segments, mi_stages, us_stages = (
    load_csv(FILES[key])
    for key in ("mi_segments", "us_segments", "mi_stages", "us_stages")
)

# Validate essentials
def validate_columns(df: pd.DataFrame, required: list, df_name: str):
    """Raise ``ValueError`` if *df* lacks any column named in *required*.

    The error message names the frame (*df_name*), lists the missing columns
    in the order they appear in *required*, and shows the sorted list of
    columns that are actually present.
    """
    absent = [name for name in required if name not in df.columns]
    if not absent:
        return
    present = sorted(df.columns.tolist())
    raise ValueError(
        f"{df_name} is missing required column(s): {absent}\n"
        f"Columns found: {present}"
    )

# Segment frames and stage frames each share one set of required columns.
segment_required = ["segment_id", "segment_name", "year", "employment", "gdp"]
stage_required = ["stage", "year", "employment", "gdp"]
for frame_name, frame, needed in (
    ("mi_segments", mi_segments, segment_required),
    ("us_segments", us_segments, segment_required),
    ("mi_stages", mi_stages, stage_required),
    ("us_stages", us_stages, stage_required),
):
    validate_columns(frame, needed, frame_name)

# Clean
def clean_years(df, min_y=1970, max_y=2055):
    """Return a copy of *df* limited to rows with a year in [min_y, max_y].

    Rows whose ``year`` is missing are dropped, ``year`` is converted to a
    plain integer column, and both bounds are inclusive. The input frame is
    not modified.
    """
    with_year = df.dropna(subset=["year"]).copy()
    with_year["year"] = with_year["year"].astype(int)
    in_window = with_year["year"].ge(min_y) & with_year["year"].le(max_y)
    return with_year.loc[in_window].copy()

# Apply the default year window to all four frames.
mi_segments, us_segments, mi_stages, us_stages = map(
    clean_years,
    (mi_segments, us_segments, mi_stages, us_stages),
)

# Order stages if present
stage_order = ["Upstream", "OEM", "Downstream"]
for d in (mi_stages, us_stages):
    if "stage" not in d.columns:
        continue
    # Converting to a Categorical with fixed categories turns every label that
    # is not listed in stage_order into a missing value. Report such labels
    # before they disappear so a spelling or case mismatch in the CSV is noticed.
    unknown = sorted({str(s) for s in d["stage"].dropna().unique()} - set(stage_order))
    if unknown:
        print(f"[warn] stage value(s) not in {stage_order} will be treated as missing: {unknown}")
    # Ordered categorical so that sorting follows stage_order, not the alphabet.
    d["stage"] = pd.Categorical(d["stage"], categories=stage_order, ordered=True)
    d.sort_values(["stage", "year"], inplace=True)

# ---------------------------
# Plotting helpers
# ---------------------------
def plot_timeseries(
    df: pd.DataFrame,
    group_col: str,
    metric_col: str,
    title: str,
    outfile: Path,
    yr_min: int = 1970,
    yr_max: int = 2055,
    legend_title: str | None = None,
    figsize=(14, 8),
):
    sub = df[(df["year"] >= yr_min) & (df["year"] <= yr_max)].copy()
    sub = sub.dropna(subset=[metric_col])
    if sub.empty:
        print(f"[skip] No data for {title} in {yr_min}-{yr_max}")
        return

    fig, ax = plt.subplots(figsize=figsize)
    for name, g in sub.groupby(group_col, sort=False):
        ax.plot(g["year"], g[metric_col], label=str(name))

    ax.set_xlabel("Year")
    ax.set_ylabel(metric_col.capitalize())  # simple label; adjust if you prefer units
    ax.set_title(title)
    if legend_title is None:
        legend_title = group_col.replace("_", " ").title()
    ax.legend(title=legend_title, bbox_to_anchor=(1.02, 1), loc="upper left")
    fig.tight_layout()
    fig.savefig(outfile, dpi=300, bbox_inches="tight")
    plt.close(fig)
    print(f"Saved: {outfile}")

def batch_plots(
    df: pd.DataFrame,
    group_col: str,
    geo_label: str,
    base_filename_prefix: str,
):
    """
    Emits four plots for this dataframe:
      - Employment (1970–2055)
      - Employment (1990–2030)
      - Output/GDP (1970–2055)
      - Output/GDP (1990–2030)

    Each figure is written to FIG_DIR as
    ``{base_filename_prefix}_{metric}_{ymin}_{ymax}.png``.

    Args:
        df: Frame to plot; passed through to ``plot_timeseries``.
        group_col: Column that defines the lines. ``"stage"`` selects the
            "Stage" legend title and a smaller figure; any other value uses
            "Segment" and the larger figure.
        geo_label: Geography name used as the start of each title.
        base_filename_prefix: Start of each output file name.
    """
    # Everything that does not depend on the metric or year range is computed once.
    group_title = group_col.replace("_", " ").title()
    is_stage = group_col == "stage"
    legend_title = "Stage" if is_stage else "Segment"
    figsize = (12, 6) if is_stage else (14, 8)

    # (column name, label shown in the title)
    metrics = (("employment", "Employment"), ("gdp", "Output"))
    year_ranges = ((1970, 2055), (1990, 2030))

    for metric, metric_label in metrics:
        for ymin, ymax in year_ranges:
            # Title and file name are both derived from the same (ymin, ymax)
            # pair, so they cannot disagree with the range actually plotted.
            plot_timeseries(
                df=df,
                group_col=group_col,
                metric_col=metric,
                title=f"{geo_label} {group_title} – {metric_label} ({ymin}–{ymax})",
                outfile=FIG_DIR / f"{base_filename_prefix}_{metric}_{ymin}_{ymax}.png",
                yr_min=ymin,
                yr_max=ymax,
                legend_title=legend_title,
                figsize=figsize,
            )

# ---------------------------
# Run all plot batches
# ---------------------------
# One row per batch: (frame, grouping column, geography label, file prefix).
# Michigan first, then United States; stages before segments within each.
plot_jobs = (
    (mi_stages, "stage", "Michigan", "moodys_mi_stage"),
    (mi_segments, "segment_name", "Michigan", "moodys_mi_segment"),
    (us_stages, "stage", "United States", "moodys_us_stage"),
    (us_segments, "segment_name", "United States", "moodys_us_segment"),
)
for frame, grouping, geo, prefix in plot_jobs:
    batch_plots(frame, group_col=grouping, geo_label=geo, base_filename_prefix=prefix)

print("Done.")


Working dir -> c:\Users\vasilauskas\GitHub\EV-Transition
Expecting CSVs in -> c:\Users\vasilauskas\GitHub\EV-Transition\data\interim


  for name, g in sub.groupby(group_col, sort=False):


Saved: c:\Users\vasilauskas\GitHub\EV-Transition\reports\figures\moodys_mi_stage_employment_1970_2055.png


  for name, g in sub.groupby(group_col, sort=False):


Saved: c:\Users\vasilauskas\GitHub\EV-Transition\reports\figures\moodys_mi_stage_employment_1990_2030.png


  for name, g in sub.groupby(group_col, sort=False):


Saved: c:\Users\vasilauskas\GitHub\EV-Transition\reports\figures\moodys_mi_stage_gdp_1970_2055.png


  for name, g in sub.groupby(group_col, sort=False):


Saved: c:\Users\vasilauskas\GitHub\EV-Transition\reports\figures\moodys_mi_stage_gdp_1990_2030.png
Saved: c:\Users\vasilauskas\GitHub\EV-Transition\reports\figures\moodys_mi_segment_employment_1970_2055.png
Saved: c:\Users\vasilauskas\GitHub\EV-Transition\reports\figures\moodys_mi_segment_employment_1990_2030.png
Saved: c:\Users\vasilauskas\GitHub\EV-Transition\reports\figures\moodys_mi_segment_gdp_1970_2055.png
Saved: c:\Users\vasilauskas\GitHub\EV-Transition\reports\figures\moodys_mi_segment_gdp_1990_2030.png


  for name, g in sub.groupby(group_col, sort=False):


Saved: c:\Users\vasilauskas\GitHub\EV-Transition\reports\figures\moodys_us_stage_employment_1970_2055.png


  for name, g in sub.groupby(group_col, sort=False):


Saved: c:\Users\vasilauskas\GitHub\EV-Transition\reports\figures\moodys_us_stage_employment_1990_2030.png


  for name, g in sub.groupby(group_col, sort=False):


Saved: c:\Users\vasilauskas\GitHub\EV-Transition\reports\figures\moodys_us_stage_gdp_1970_2055.png


  for name, g in sub.groupby(group_col, sort=False):


Saved: c:\Users\vasilauskas\GitHub\EV-Transition\reports\figures\moodys_us_stage_gdp_1990_2030.png
Saved: c:\Users\vasilauskas\GitHub\EV-Transition\reports\figures\moodys_us_segment_employment_1970_2055.png
Saved: c:\Users\vasilauskas\GitHub\EV-Transition\reports\figures\moodys_us_segment_employment_1990_2030.png
Saved: c:\Users\vasilauskas\GitHub\EV-Transition\reports\figures\moodys_us_segment_gdp_1970_2055.png
Saved: c:\Users\vasilauskas\GitHub\EV-Transition\reports\figures\moodys_us_segment_gdp_1990_2030.png
Done.
