In [12]:
# ===== ARIMA for one sheet: "age group final" → CSVs + Plots to 2043 =====
# pip install pandas numpy matplotlib statsmodels openpyxl
import re, warnings
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter
from statsmodels.tsa.arima.model import ARIMA
warnings.filterwarnings("ignore")

# ---------- CONFIG ----------
DATA_PATH   = Path(r"D:\arima project\Task5\cdc (IHD AND VFF) finals.xlsx")   # <— your file
SHEET_NAME = "overall final"                        # <— only this sheet
RESULTS_DIR = Path(r"D:\arima project\Task5\results")  # output root
RESULTS_DIR.mkdir(parents=True, exist_ok=True)
END_YEAR    = 2043

# If auto-detect picks the wrong metric, set this to the exact column name
# Example: MANUAL_AAMR_COL = "Age Adjusted Rate"
MANUAL_AAMR_COL = None

# ---------- PLOTTING STYLE ----------
plt.rcParams.update({
    "figure.figsize": (13.5, 8.0),
    "figure.dpi": 160,
    "axes.grid": True,
    "grid.alpha": 0.25,
    "font.size": 14,
    "font.weight": "bold",
    "axes.labelsize": 18,
    "axes.labelweight": "bold",
    "axes.titlesize": 22,
    "axes.titleweight": "bold",
    "legend.fontsize": 12,
    "legend.frameon": True,
    "legend.title_fontsize": 13,
    "lines.linewidth": 2.7,
    "lines.markersize": 6.7,
})

def style_axes(ax):
    ax.tick_params(axis="both", labelsize=15, width=2.0, length=7)
    for s in ax.spines.values():
        s.set_linewidth(2.2)

# ---------- HELPERS ----------
def _flatten_columns(df: pd.DataFrame) -> pd.DataFrame:
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = [
            " ".join([str(x) for x in tup if str(x) and str(x) != "nan"]).strip()
            for tup in df.columns.to_list()
        ]
    df.columns = [re.sub(r"\s+", " ", str(c)).strip() for c in df.columns]
    seen = {}
    newcols = []
    for c in df.columns:
        if c in seen:
            seen[c] += 1
            newcols.append(f"{c}.{seen[c]}")
        else:
            seen[c] = 0
            newcols.append(c)
    df.columns = newcols
    return df

def _pick_col(df, patterns, required=True, exclude_regex=None):
    low = {c: str(c).strip().lower() for c in df.columns}
    for pat in patterns:
        rx = re.compile(pat)
        for c, lc in low.items():
            if rx.search(lc):
                if exclude_regex and re.search(exclude_regex, lc):
                    continue
                return c
    if required:
        raise KeyError(f"Could not find any of: {patterns}")
    return None

def find_year(df):
    return _pick_col(df, [r"\byear\b", r"^yr$", r"\bcalendar\s*year\b"])

def find_aamr(df):
    if MANUAL_AAMR_COL and MANUAL_AAMR_COL in df.columns:
        return MANUAL_AAMR_COL
    for p in [r"\bage[-\s]*adjust(ed)?\s*rate\b", r"\baamr\b", r"\bage[-\s]*standard(ized)?\s*rate\b"]:
        try:
            return _pick_col(df, [p])
        except:
            pass
    # heuristic fallback
    low = {c: str(c).strip().lower() for c in df.columns}
    for c, lc in low.items():
        if "age" in lc and "adjust" in lc and "rate" in lc and "group" not in lc:
            return c
    return _pick_col(df, [r"\baamr\b"])

def find_group(df):
    exclude = r"age\s*adjust|aamr|rate|ci|conf|se|stderr|mean|median|total|overall|deaths|population|percent"
    for pat in [r"\bage\s*group(s)?\b|age\s*cat|age\-group", r"\bgroup\b", r"\bvariable\b"]:
        c = _pick_col(df, [pat], required=False, exclude_regex=exclude)
        if c:
            return c
    # If not found, but first column looks categorical, use it
    first = df.columns[0]
    if df[first].dtype == "object":
        return first
    return None

def load_observed_excel(xls: pd.ExcelFile, sheet_name: str, overall_label="Overall"):
    df = pd.read_excel(xls, sheet_name=sheet_name)
    df = _flatten_columns(df)

    ycol = find_year(df)
    tcol = find_aamr(df)
    gcol = find_group(df)

    # Convert numerics (also handle % strings)
    def _to_num(s):
        if isinstance(s, str) and s.endswith("%"):
            try:
                return float(s.strip("%")) / 100.0
            except:
                return np.nan
        return pd.to_numeric(s, errors="coerce")

    df[ycol] = pd.to_numeric(df[ycol], errors="coerce")
    df[tcol] = df[tcol].map(_to_num)

    if gcol and gcol == tcol:
        gcol = None

    if gcol is None:
        tidy = (df[[ycol, tcol]].dropna(subset=[ycol, tcol])
                .groupby(ycol, as_index=False)[tcol].mean()
                .rename(columns={ycol:"Year", tcol:"AAMR"}))
        tidy["Group"] = overall_label
        tidy = tidy[["Year","Group","AAMR"]]
    else:
        df[gcol] = df[gcol].astype(str).str.strip()
        tidy = (df[[ycol, gcol, tcol]].dropna(subset=[ycol, gcol, tcol])
                .groupby([ycol, gcol], as_index=False)[tcol].mean()
                .rename(columns={ycol:"Year", gcol:"Group", tcol:"AAMR"}))

    tidy["Year"] = tidy["Year"].astype(int)
    tidy = tidy.sort_values(["Group","Year"]).reset_index(drop=True)

    print(f"\nSheet: {sheet_name}")
    print(f"Detected → Year: '{ycol}' | AAMR: '{tcol}' | Group: '{(gcol or overall_label)}'")
    print(tidy.head(6))
    return tidy

def _sanitize_series_for_arima(y: pd.Series):
    y = y.sort_index()
    full = pd.Index(range(int(y.index.min()), int(y.index.max())+1), name=y.index.name)
    y = y.reindex(full)
    if y.isna().any():
        y = y.interpolate(limit_direction="both")
    return y

def select_arima_order(y):
    best = None
    for d in [0,1,2]:
        for p in range(4):
            for q in range(4):
                if (p,d,q) == (0,0,0):
                    continue
                try:
                    trend = "n" if d>0 else "c"
                    res = ARIMA(y, order=(p,d,q), trend=trend,
                                enforce_stationarity=False, enforce_invertibility=False
                               ).fit(method_kwargs={"warn_convergence":False})
                    aic = res.aic
                    if (best is None) or (aic < best[0]):
                        best = (aic, (p,d,q,trend), res)
                except Exception:
                    pass
    if best is None:
        res = ARIMA(y, order=(1,1,0), trend="n",
                    enforce_stationarity=False, enforce_invertibility=False
                   ).fit(method_kwargs={"warn_convergence":False})
        return (1,1,0,"n"), res
    return best[1], best[2]

def forecast_to(y, end_year, conf=0.95):
    last_year = int(y.index.max())
    steps = max(0, end_year - last_year)
    if steps == 0:
        raise ValueError("Observed already reaches END_YEAR.")
    order, res = select_arima_order(y)
    fc  = res.get_forecast(steps=steps)
    ci  = fc.conf_int(alpha=1-conf)
    yrs = list(range(last_year+1, end_year+1))
    return order, pd.DataFrame({
        "Year": yrs,
        "Point.Forecast": fc.predicted_mean.values,
        "Lo.95": ci.iloc[:,0].values,
        "Hi.95": ci.iloc[:,1].values,
        "Order": [f"{order[0]},{order[1]},{order[2]} ({order[3]})"]*steps
    })

def safe_name(s: str) -> str:
    return re.sub(r"[^a-zA-Z0-9_-]+", "_", str(s)).strip("_")

# ---------- RUN (one sheet) ----------
xls = pd.ExcelFile(DATA_PATH)
assert SHEET_NAME in xls.sheet_names, f"Sheet '{SHEET_NAME}' not found. Available: {xls.sheet_names}"

observed = load_observed_excel(xls, sheet_name=SHEET_NAME, overall_label="Overall")
groups   = sorted(observed["Group"].unique())
print("Detected groups:", groups)

out_root = RESULTS_DIR / safe_name(DATA_PATH.stem) / safe_name(SHEET_NAME)
out_root.mkdir(parents=True, exist_ok=True)

all_rows = []
last_obs_year = int(observed["Year"].max())

for g in groups:
    sub = observed[observed["Group"]==g].copy().dropna(subset=["Year","AAMR"]).sort_values("Year")
    if sub.empty:
        print(f" - [{g}] no data, skipping.")
        continue

    y   = pd.Series(sub["AAMR"].values, index=sub["Year"].astype(int), name="AAMR")
    y   = _sanitize_series_for_arima(y)

    try:
        order, fc = forecast_to(y, end_year=END_YEAR, conf=0.95)
    except Exception as e:
        print(f" - [{g}] forecast failed: {e}")
        continue

    # Per-group CSV
    csv_path = out_root / f"{safe_name(g)}_forecast_to_{END_YEAR}.csv"
    out = fc.copy()
    for c in ["Point.Forecast","Lo.95","Hi.95"]:
        out[c] = out[c].round(2)
    out.insert(0, "Series", g)
    out.to_csv(csv_path, index=False)
    print(f" [{g}] order={order} → CSV: {csv_path}")

    # Keep for combined CSV
    tmp = out.copy(); tmp["Group"] = g
    all_rows.append(tmp)

    # Plot
    fig, ax = plt.subplots()
    ax.plot(y.index, y.values, marker="o", label=f"{g} (obs)")
    ln, = ax.plot(fc["Year"], fc["Point.Forecast"], linestyle="--", marker="o", label=f"{g} (fc)")
    lo = np.maximum(fc["Lo.95"].values, 0.0)
    ax.fill_between(fc["Year"], lo, fc["Hi.95"].values, alpha=0.15, color=ln.get_color(), label="95% CI (fc)")
    ax.axvline(x=last_obs_year+0.5, linestyle=":", linewidth=2.2)

    ax.set_title(f"{SHEET_NAME} — {g}: obs (≤{last_obs_year}) & ARIMA ({last_obs_year+1}–{END_YEAR})")
    ax.set_xlabel("Year"); ax.set_ylabel("AAMR (per 100,000)")
    ax.yaxis.set_major_formatter(FormatStrFormatter("%.2f"))
    style_axes(ax)
    ax.legend(ncols=2)
    plt.tight_layout()
    png_path = out_root / f"{safe_name(g)}_timeseries_to_{END_YEAR}.png"
    plt.savefig(png_path, dpi=300, bbox_inches="tight"); plt.close()
    print(f"    Plot: {png_path}")

# Combined outputs
if all_rows:
    all_df = pd.concat(all_rows, ignore_index=True)
    all_df.to_csv(out_root / f"ALL_GROUPS_forecasts_to_{END_YEAR}.csv", index=False)
    print(" Consolidated CSV:", out_root / f"ALL_GROUPS_forecasts_to_{END_YEAR}.csv")

    # Combined plot
    fig, ax = plt.subplots()
    added_ci = False
    for g in groups:
        sub = observed[observed["Group"]==g].copy().sort_values("Year")
        if sub.empty: 
            continue
        ax.plot(sub["Year"], sub["AAMR"], marker="o", label=f"{g} (obs)")
        fc_path = out_root / f"{safe_name(g)}_forecast_to_{END_YEAR}.csv"
        if fc_path.exists():
            fc = pd.read_csv(fc_path)
            ln, = ax.plot(fc["Year"], fc["Point.Forecast"], linestyle="--", marker="o", label=f"{g} (fc)")
            lo = np.maximum(fc["Lo.95"].values, 0.0)
            if not added_ci:
                ax.fill_between(fc["Year"], lo, fc["Hi.95"].values, alpha=0.12, color=ln.get_color(), label="95% CI (fc)")
                added_ci = True
            else:
                ax.fill_between(fc["Year"], lo, fc["Hi.95"].values, alpha=0.12, color=ln.get_color())

    ax.axvline(x=last_obs_year+0.5, linestyle=":", linewidth=2.2)
    ax.set_title(f"{SHEET_NAME} (≤{last_obs_year}) ARIMA ({last_obs_year+1}–{END_YEAR})")
    ax.set_xlabel("Year"); ax.set_ylabel("AAMR (per 100,000)")
    ax.yaxis.set_major_formatter(FormatStrFormatter("%.2f"))
    style_axes(ax)
    ax.legend(ncols=2)
    ax.margins(x=0.02)
    plt.tight_layout()
    combined_png = out_root / f"COMBINED_timeseries_to_{END_YEAR}.png"
    plt.savefig(combined_png, dpi=300, bbox_inches="tight"); plt.close()
    print(" Combined plot:", combined_png)

print("\nDone →", out_root)



Sheet: overall final
Detected → Year: 'Year' | AAMR: 'Age Adjusted Rate' | Group: 'Sex'
   Year   Group  AAMR
0  1999  Female  4.01
1  2000  Female  3.46
2  2001  Female  3.03
3  2002  Female  2.68
4  2003  Female  2.41
5  2004  Female  2.09
Detected groups: ['Female', 'Male', 'overall']
 [Female] order=(1, 0, 1, 'c') → CSV: D:\arima project\Task5\results\cdc_IHD_AND_VFF_finals\overall_final\Female_forecast_to_2043.csv
    Plot: D:\arima project\Task5\results\cdc_IHD_AND_VFF_finals\overall_final\Female_timeseries_to_2043.png
 [Male] order=(2, 0, 0, 'c') → CSV: D:\arima project\Task5\results\cdc_IHD_AND_VFF_finals\overall_final\Male_forecast_to_2043.csv
    Plot: D:\arima project\Task5\results\cdc_IHD_AND_VFF_finals\overall_final\Male_timeseries_to_2043.png
 [overall] order=(1, 0, 0, 'c') → CSV: D:\arima project\Task5\results\cdc_IHD_AND_VFF_finals\overall_final\overall_forecast_to_2043.csv
    Plot: D:\arima project\Task5\results\cdc_IHD_AND_VFF_finals\overall_final\overall_timeseries

In [13]:
# ===== Post-process totals across groups =====
import pandas as pd
from pathlib import Path

# Point this to your results folder where ALL_GROUPS_forecasts_to_2043.csv is saved
all_groups_path = Path(r"D:\arima project\Task5\results\cdc_IHD_AND_VFF_finals\overall_final\ALL_GROUPS_forecasts_to_2043.csv")

# Load combined forecasts
df = pd.read_csv(all_groups_path)

# Calculate totals across groups per year
totals = (
    df.groupby("Year")
      .agg({
          "Point.Forecast": "sum",
          "Lo.95": "sum",
          "Hi.95": "sum"
      })
      .reset_index()
)

# Save totals
out_path = all_groups_path.parent / "TOTAL_forecasts_to_2043.csv"
totals.to_csv(out_path, index=False)
print("Saved:", out_path)
print(totals.head(10))   # peek at first 10 years


Saved: D:\arima project\Task5\results\cdc_IHD_AND_VFF_finals\overall_final\TOTAL_forecasts_to_2043.csv
   Year  Point.Forecast  Lo.95  Hi.95
0  2024            6.08   5.76   6.40
1  2025            6.10   5.70   6.48
2  2026            6.10   5.67   6.54
3  2027            6.12   5.66   6.57
4  2028            6.13   5.65   6.59
5  2029            6.13   5.65   6.62
6  2030            6.13   5.64   6.61
7  2031            6.13   5.64   6.62
8  2032            6.14   5.65   6.64
9  2033            6.14   5.65   6.64


In [None]:
# ===== Totals of observed AAMRs (1999–2023) by variable =====
# pip install pandas openpyxl

import re
from pathlib import Path
import numpy as np
import pandas as pd

# ---- CONFIG ----
DATA_PATH   = Path(r"D:\arima project\Task5\cdc (IHD AND VFF) finals.xlsx")  # your workbook
SHEET_NAMES = ["overall final", "age group final", "Race final ", "census final", "urbanization final"]
OUT_DIR     = Path(r"D:\arima project\Task5\observed_totals_1999_2023")      # where to save
YEAR_MIN, YEAR_MAX = 1999, 2023

OUT_DIR.mkdir(parents=True, exist_ok=True)

# ---- HELPERS ----
def _flatten_columns(df: pd.DataFrame) -> pd.DataFrame:
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = [
            " ".join([str(x) for x in tup if str(x) and str(x) != "nan"]).strip()
            for tup in df.columns.to_list()
        ]
    df.columns = [re.sub(r"\s+", " ", str(c)).strip() for c in df.columns]
    # disambiguate duplicates
    seen = {}
    newcols = []
    for c in df.columns:
        if c in seen:
            seen[c] += 1
            newcols.append(f"{c}.{seen[c]}")
        else:
            seen[c] = 0
            newcols.append(c)
    df.columns = newcols
    return df

def _pick_col(df, patterns, required=True, exclude_regex=None):
    low = {c: str(c).lower().strip() for c in df.columns}
    for pat in patterns:
        rx = re.compile(pat)
        for c, lc in low.items():
            if rx.search(lc):
                if exclude_regex and re.search(exclude_regex, lc):
                    continue
                return c
    if required:
        raise KeyError(f"Could not find any of: {patterns}")
    return None

def find_year(df):     return _pick_col(df, [r"\byear\b", r"^yr$", r"calendar\s*year"])
def find_group(df):
    exclude = r"age\s*adjust|aamr|rate|ci|conf|se|stderr|mean|median|total|overall|deaths|population|percent"
    for pat in [r"\bage\s*group(s)?\b|age\s*cat|age\-group", r"\bgroup\b", r"\bvariable\b"]:
        c = _pick_col(df, [pat], required=False, exclude_regex=exclude)
        if c: return c
    # fallback: if first column looks categorical, use it
    first = df.columns[0]
    return first if df[first].dtype == "object" else None

def find_aamr(df):
    for p in [r"\bage[-\s]*adjust(ed)?\s*rate\b", r"\baamr\b", r"age[-\s]*standard(ized)?\s*rate"]:
        try: return _pick_col(df, [p])
        except: pass
    # heuristic
    low = {c: str(c).lower() for c in df.columns}
    for c, lc in low.items():
        if "age" in lc and "adjust" in lc and "rate" in lc and "group" not in lc:
            return c
    return _pick_col(df, [r"\baamr\b"])

def find_ci_low(df):
    return _pick_col(df, [r"\blo(\.?\s*95)?\b", r"\blower\b", r"\bci\s*low\b", r"\blcl\b"], required=False)

def find_ci_high(df):
    return _pick_col(df, [r"\bhi(\.?\s*95)?\b", r"\bupper\b", r"\bci\s*high\b", r"\bucl\b"], required=False)

def _to_num(x):
    if isinstance(x, str) and x.endswith("%"):
        try: return float(x.strip("%")) / 100.0
        except: return np.nan
    return pd.to_numeric(x, errors="coerce")

def load_observed_tidy(xls: pd.ExcelFile, sheet: str):
    df = pd.read_excel(xls, sheet_name=sheet)
    df = _flatten_columns(df)

    ycol  = find_year(df)
    gcol  = find_group(df)
    rcol  = find_aamr(df)
    loCol = find_ci_low(df)
    hiCol = find_ci_high(df)

    # coerce numerics
    df[ycol] = pd.to_numeric(df[ycol], errors="coerce")
    df[rcol] = df[rcol].map(_to_num)
    if loCol: df[loCol] = df[loCol].map(_to_num)
    if hiCol: df[hiCol] = df[hiCol].map(_to_num)

    # keep needed cols
    cols = [c for c in [ycol, gcol, rcol, loCol, hiCol] if c is not None]
    df = df[cols].dropna(subset=[ycol, rcol])

    # standardize names
    rename_map = {ycol: "Year", rcol: "AAMR"}
    if gcol:  rename_map[gcol] = "Group"
    if loCol: rename_map[loCol] = "Lo.95"
    if hiCol: rename_map[hiCol] = "Hi.95"
    df = df.rename(columns=rename_map)

    # aggregate if duplicates
    by = ["Year"] + (["Group"] if "Group" in df.columns else [])
    agg_map = {"AAMR":"mean"}
    if "Lo.95" in df.columns: agg_map["Lo.95"] = "mean"
    if "Hi.95" in df.columns: agg_map["Hi.95"] = "mean"
    df = df.groupby(by, as_index=False).agg(agg_map)

    # filter observed window
    df = df[(df["Year"] >= YEAR_MIN) & (df["Year"] <= YEAR_MAX)].copy()
    df = df.sort_values(by).reset_index(drop=True)
    return df

def compute_totals(df: pd.DataFrame) -> pd.DataFrame:
    """Sum across groups per year (AAMR, Lo.95, Hi.95). If CI columns
    are missing, return NaN for them."""
    if "Group" not in df.columns:
        # single series (no groups) -> 'Total' is just the series itself
        out = df[["Year", "AAMR"]].copy()
        out["Lo.95"] = df.get("Lo.95", np.nan)
        out["Hi.95"] = df.get("Hi.95", np.nan)
        out = out.rename(columns={"AAMR":"Total.Point", "Lo.95":"Total.Lo.95", "Hi.95":"Total.Hi.95"})
        return out

    agg_cols = {"AAMR": "sum"}
    if "Lo.95" in df.columns: agg_cols["Lo.95"] = "sum"
    if "Hi.95" in df.columns: agg_cols["Hi.95"] = "sum"

    out = (df.groupby("Year", as_index=False)
             .agg(agg_cols)
             .rename(columns={"AAMR":"Total.Point",
                              "Lo.95":"Total.Lo.95",
                              "Hi.95":"Total.Hi.95"}))
    return out

# ---- RUN ----
xls = pd.ExcelFile(DATA_PATH)
master_rows = []

for sheet in SHEET_NAMES:
    if sheet not in xls.sheet_names:
        print(f"⚠️ Sheet not found: {sheet}. Available: {xls.sheet_names}")
        continue

    print(f"\nProcessing sheet: {sheet}")
    tidy = load_observed_tidy(xls, sheet)

    totals = compute_totals(tidy)
    totals["Variable"] = sheet

    # Save per-sheet totals
    out_file = OUT_DIR / f"{sheet.replace(' ', '_')}_TOTALS_1999_2023.csv"
    totals.to_csv(out_file, index=False)
    print("  Saved:", out_file)

    master_rows.append(totals)

# Save a master file with all variables appended
if master_rows:
    master = pd.concat(master_rows, ignore_index=True)
    master = master[["Variable", "Year", "Total.Point", "Total.Lo.95", "Total.Hi.95"]]
    master_file = OUT_DIR / "MASTER_TOTALS_1999_2023.csv"
    master.to_csv(master_file, index=False)
    print("\n✅ Master totals saved:", master_file)
else:
    print("\nNo outputs created. Check sheet names and column detection.")



Processing sheet: overall final
  Saved: D:\arima project\Task5\observed_totals_1999_2023\overall_final_TOTALS_1999_2023.csv

Processing sheet: age group final
  Saved: D:\arima project\Task5\observed_totals_1999_2023\age_group_final_TOTALS_1999_2023.csv
⚠️ Sheet not found: Race final. Available: ['overall final', 'Race final ', 'census final', 'state final', 'urbanization final', 'age group final']

Processing sheet: census final
  Saved: D:\arima project\Task5\observed_totals_1999_2023\census_final_TOTALS_1999_2023.csv

Processing sheet: urbanization final
  Saved: D:\arima project\Task5\observed_totals_1999_2023\urbanization_final_TOTALS_1999_2023.csv

✅ Master totals saved: D:\arima project\Task5\observed_totals_1999_2023\MASTER_TOTALS_1999_2023.csv
