In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

In [4]:
# === Step 1: Load Historical Core Data and Projection Data ===
core_data = pd.read_csv("Historical.csv")  # State-level shares over years
projection_data = pd.read_excel("Forecast.xlsx")  # National projections by variant

In [None]:
# === Step 2: Preprocess Historical State Shares ===
# Remove 'Total' rows
core_data = core_data[core_data["Age"] != "Total"]
core_data["Age"] = core_data["Age"].astype(int)

# Filter for required columns only (states + Age + Year)
state_columns = [col for col in core_data.columns if col not in ["Year", "Age"]]

# === Step 3: Predict Future State Shares (Wide Format) ===
future_years = np.arange(2022, 2071)
all_preds = []

# For each state column, predict share for each age using linear regression
for state in state_columns:
    state_preds = {"Age": [], "Year": [], state: []}
    for age in core_data["Age"].unique():
        subset = core_data[core_data["Age"] == age][["Year", state]].dropna()
        if len(subset) < 3:
            continue  # Not enough data to fit a model

        X = subset["Year"].values.reshape(-1, 1)
        y = subset[state].values
        model = LinearRegression().fit(X, y)
        y_pred = model.predict(future_years.reshape(-1, 1))
        y_pred = np.clip(y_pred, 0, 1)  # Ensure shares stay within [0, 1]

        state_preds["Age"].extend([age] * len(future_years))
        state_preds["Year"].extend(future_years)
        state_preds[state].extend(y_pred)

    all_preds.append(pd.DataFrame(state_preds))

# Merge all predicted state share dataframes on (Age, Year)
from functools import reduce
predicted_shares_df = reduce(lambda left, right: pd.merge(left, right, on=["Age", "Year"], how="outer"), all_preds)

# === Step 4: Preprocess Projection Data ===
projection_data["Variant"] = projection_data["Variant"].ffill()
projection_data["Variant Description"] = projection_data["Variant Description"].ffill()

# Drop 'Total' if present
projection_data = projection_data[projection_data["Age"] != "Total"]
projection_data["Age"] = projection_data["Age"].astype(int)

# Melt projections: one row per (Variant, Age, Year)
proj_long = projection_data.melt(
    id_vars=["Variant", "Variant Description", "Age"],
    var_name="Year",
    value_name="National_Pop_Thousands"
)
proj_long["Year"] = proj_long["Year"].astype(int)
proj_long["National_Pop"] = proj_long["National_Pop_Thousands"] * 1000

# === Step 5: Merge National Projections with Predicted Shares ===
merged = proj_long.merge(predicted_shares_df, on=["Age", "Year"], how="left")

# === Step 6: Multiply Shares with National Population for Each State ===
for state in state_columns:
    merged[state] = merged[state] * merged["National_Pop"]

# === Step 7: Final Output in Wide Format ===
# Keep only necessary columns
output_columns = ["Variant", "Variant Description", "Year", "Age"] + state_columns
final_df = merged[output_columns]

In [17]:
# Sort to ensure order
final_df.sort_values(by=["Variant", "Variant Description", "Year", "Age"], inplace=True)

# Reset repeated values to empty strings
final_df[["Variant", "Variant Description", "Year"]] = final_df[["Variant", "Variant Description", "Year"]].mask(
    final_df[["Variant", "Variant Description", "Year"]].eq(
        final_df[["Variant", "Variant Description", "Year"]].shift()
    )
)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df.sort_values(by=["Variant", "Variant Description", "Year", "Age"], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df[["Variant", "Variant Description", "Year"]] = final_df[["Variant", "Variant Description", "Year"]].mask(


In [18]:
final_df.to_excel("state_level_projection_wide.xlsx", index=False)