<a href="https://colab.research.google.com/github/eshaanraj/cdc_2025/blob/main/cdc_data_forecast.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
Forecast each sector's growth over the next 5 years using Holt's exponential smoothing method.
plain_cdc_dataset.xlsx contains the data from the BEA survey.
"""
!pip install pandas statsmodels openpyxl xlsxwriter

import pandas as pd
from statsmodels.tsa.holtwinters import Holt

# Excel workbook that load_plain_dataset() reads (first sheet only).
INPUT_PATH = "plain_cdc_dataset.xlsx"
# Files that main() writes the combined forecast table to.
OUTPUT_CSV = "sector_forecasts.csv"
OUTPUT_XLSX = "sector_forecasts.xlsx"

def load_plain_dataset():
    """Read the first sheet of INPUT_PATH and reshape it to long format.

    The industry label column is the first object-dtype column of the sheet
    (or the very first column when there is none). Year columns are the
    columns whose name consists solely of digits. Blank industry labels are
    forward-filled from the row above.

    Returns a DataFrame with columns "Industry", "Year" (nullable integer)
    and "Value" (numeric), without the rows where any of the three is missing.
    """
    sheet = pd.read_excel(INPUT_PATH, sheet_name=0)

    # Locate the label column: first text-like column, else the first column.
    text_columns = [col for col in sheet.columns if sheet[col].dtype == "O"]
    if text_columns:
        label_column = text_columns[0]
    else:
        label_column = sheet.columns[0]

    # Keep only the label column and the columns named like a year.
    year_columns = [col for col in sheet.columns if str(col).isdigit()]
    wide = sheet[[label_column, *year_columns]].rename(
        columns={label_column: "Industry"}
    )
    wide["Industry"] = wide["Industry"].ffill()

    # One row per (Industry, Year) pair.
    long = pd.melt(wide, id_vars=["Industry"], var_name="Year", value_name="Value")
    long = long.assign(
        Year=pd.to_numeric(long["Year"], errors="coerce").astype("Int64"),
        # Non-numeric cells become NaN and are dropped below.
        Value=pd.to_numeric(long["Value"], errors="coerce"),
    )
    return long.dropna(subset=["Industry", "Year", "Value"])

def forecast_industry(series, horizon=5):
    """Fit statsmodels' Holt exponential smoothing model and forecast ahead.

    Args:
        series: values indexed by year; the largest index value must be
            convertible to ``int``. Missing values are dropped before fitting.
        horizon: number of steps to forecast past the last observed year.

    Returns:
        ``None`` when fewer than 3 non-missing observations remain.
        Otherwise a DataFrame with one row per forecast step and the columns
        "Year", "Forecast", "alpha" and "beta" (the fitted smoothing level
        and smoothing trend parameters) and "rmse_in_sample" (root mean
        squared difference between fitted values and observations).
    """
    history = series.dropna().sort_index()
    # Not enough points to fit: tell the caller there is no forecast.
    if history.size < 3:
        return None

    fitted = Holt(history, initialization_method="estimated").fit(optimized=True)

    first_future_year = int(history.index.max()) + 1
    residuals = fitted.fittedvalues - history
    columns = {
        "Year": list(range(first_future_year, first_future_year + horizon)),
        "Forecast": fitted.forecast(horizon),
        # Scalars below are broadcast to every forecast row.
        "alpha": fitted.params.get("smoothing_level"),
        "beta": fitted.params.get("smoothing_trend"),
        "rmse_in_sample": (residuals ** 2).mean() ** 0.5,
    }
    return pd.DataFrame(columns)

def main():
    """Forecast every industry in the dataset and save the combined table.

    Loads the tidy dataset, averages rows that share the same
    (Industry, Year), runs ``forecast_industry`` on each industry's yearly
    series, and writes the stacked forecasts to ``OUTPUT_CSV`` and
    ``OUTPUT_XLSX``. Industries for which ``forecast_industry`` returns
    ``None`` are skipped. When no industry yields a forecast, nothing is
    written and a message is printed instead.
    """
    df = load_plain_dataset()
    # Collapse repeated (Industry, Year) rows into a single value per year.
    df = df.groupby(["Industry", "Year"], as_index=False)["Value"].mean()

    results = []
    for ind, grp in df.groupby("Industry"):
        # forecast_industry sorts the series itself, so no sort is needed here.
        res = forecast_industry(grp.set_index("Year")["Value"])
        if res is not None:
            res.insert(0, "Industry", ind)
            results.append(res)

    if not results:
        # pd.concat raises ValueError on an empty list; report the situation
        # explicitly instead of failing with "No objects to concatenate".
        print("No industry had enough data to forecast; nothing was saved.")
        return

    out = pd.concat(results, ignore_index=True)
    out.to_csv(OUTPUT_CSV, index=False)
    with pd.ExcelWriter(OUTPUT_XLSX, engine="xlsxwriter") as writer:
        out.to_excel(writer, sheet_name="Forecasts", index=False)
    print("Saved forecasts to:", OUTPUT_CSV, "and", OUTPUT_XLSX)

# Run the forecasting pipeline only when executed as a script or notebook
# cell, not when this module is imported.
if __name__ == "__main__":
    main()

Saved: plain_format_forecasts.xlsx
Saved: plain_format_extended.xlsx
