In [1]:
! pip install pygam

Collecting pygam
  Downloading pygam-0.9.1-py3-none-any.whl.metadata (7.1 kB)
Collecting scipy<1.12,>=1.11.1 (from pygam)
  Downloading scipy-1.11.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.4/60.4 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Downloading pygam-0.9.1-py3-none-any.whl (522 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m522.0/522.0 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hDownloading scipy-1.11.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (36.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m36.4/36.4 MB[0m [31m47.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: scipy, pygam
  Attempting uninstall: scipy
    Found existing installation: scipy 1.15.2
    Uninstalling scipy-1.15.2:
      Successfully uninstalled scipy-1.15.2
[31mERROR: pip's dependency resolve

In [5]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from pygam import LinearGAM, s
import xgboost as xgb
from functools import reduce
import warnings

warnings.filterwarnings("ignore")

# Load your historical data
core_data = pd.read_csv("/kaggle/input/data-science/Historical.csv")
core_data = core_data[core_data["Age"] != "Total"]
core_data["Age"] = core_data["Age"].astype(int)

state_columns = [col for col in core_data.columns if col not in ["Year", "Age"]]
future_years = np.arange(2022, 2071)

all_preds = []

def predict_best_model(X, y, future_X):
    # Split into train/test
    split = int(0.8 * len(X))
    X_train, X_test = X[:split], X[split:]
    y_train, y_test = y[:split], y[split:]

    results = {}

    # Linear Regression
    lr = LinearRegression().fit(X_train, y_train)
    results["LR"] = (lr, mean_squared_error(y_test, lr.predict(X_test)))

    # XGBoost
    xgb_model = xgb.XGBRegressor(objective="reg:squarederror", n_estimators=100, max_depth=3)
    xgb_model.fit(X_train, y_train)
    results["XGB"] = (xgb_model, mean_squared_error(y_test, xgb_model.predict(X_test)))

    # GAM
    try:
        gam = LinearGAM(s(0)).fit(X_train.ravel(), y_train)
        results["GAM"] = (gam, mean_squared_error(y_test, gam.predict(X_test.ravel())))
    except:
        pass

    # ETS (needs univariate series)
    try:
        ets_model = ExponentialSmoothing(y_train, trend="add", seasonal=None).fit()
        y_ets_pred = ets_model.forecast(len(y_test))
        results["ETS"] = (ets_model, mean_squared_error(y_test, y_ets_pred))
    except:
        pass

    # Choose best
    best_name = min(results, key=lambda k: results[k][1])
    best_model = results[best_name][0]

    # Predict future
    if best_name == "GAM":
        preds = best_model.predict(future_X.ravel())
    elif best_name == "ETS":
        preds = best_model.forecast(len(future_X))
    else:
        preds = best_model.predict(future_X)

    return np.clip(preds, 0, 1), best_name

best_models_list = []

for state in state_columns:
    state_preds = {"Age": [], "Year": [], state: []}
    for age in core_data["Age"].unique():
        subset = core_data[core_data["Age"] == age][["Year", state]].dropna()
        if len(subset) < 4:
            continue

        X = subset["Year"].values.reshape(-1, 1)
        y = subset[state].values

        y_pred, chosen_model = predict_best_model(X, y, future_years.reshape(-1, 1))
        # print(f"Best model for {state} age {age}: {chosen_model}")
        best_models_list.append({"State": state, "Age": age, "BestModel": chosen_model})


        state_preds["Age"].extend([age] * len(future_years))
        state_preds["Year"].extend(future_years)
        state_preds[state].extend(y_pred)

    all_preds.append(pd.DataFrame(state_preds))

# # Merge all predicted state share dataframes
# predicted_shares_df = reduce(lambda left, right: pd.merge(left, right, on=["Age", "Year"], how="outer"), all_preds)

# # Optional: Normalize to sum to 1 across states
# predicted_shares_df[state_columns] = predicted_shares_df[state_columns].div(
#     predicted_shares_df[state_columns].sum(axis=1), axis=0
# )

# # Save to file (optional)
# # predicted_shares_df.to_csv("predicted_shares_by_best_model.csv", index=False)


In [6]:
# === Count how many times each model was used ===
best_models_df = pd.DataFrame(best_models_list)

# Count overall
model_counts = best_models_df["BestModel"].value_counts()
print("Overall model usage counts:")
print(model_counts)

# Optional: Count per state
model_counts_by_state = best_models_df.groupby("State")["BestModel"].value_counts().unstack(fill_value=0)
print("\nModel usage per state:")
print(model_counts_by_state)


Overall model usage counts:
BestModel
XGB    515
ETS    510
LR     337
GAM     94
Name: count, dtype: int64

Model usage per state:
BestModel               ETS  GAM  LR  XGB
State                                    
Baden-Württemberg        32    0  23   36
Bayern                   21    3  30   37
Berlin                   30    5  20   36
Brandenburg              31    9  20   31
Bremen                   27    5  30   29
Hamburg                  40    2  28   21
Hessen                   30    2  25   34
Mecklenburg-Vorpommern   32   16  15   28
Niedersachsen            34    0  20   37
Nordrhein-Westfalen      33    1  19   38
Rheinland-Pfalz          40    2  22   27
Saarland                 27    9  21   34
Sachsen                  30   14  16   31
Sachsen-Anhalt           26   12  16   37
Schleswig-Holstein       47    3  18   23
Thüringen                30   11  14   36


In [4]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from functools import reduce

# === Step 1: Load Historical Core Data and Projection Data ===
core_data = pd.read_csv("/kaggle/input/data-science/Historical.csv")
projection_data = pd.read_excel("/kaggle/input/data-science/Forecast.xlsx")

# === Step 2: Preprocess Historical State Shares ===
core_data = core_data[core_data["Age"] != "Total"]
core_data["Age"] = core_data["Age"].astype(int)
state_columns = [col for col in core_data.columns if col not in ["Year", "Age"]]

# === Step 3: Predict Future State Shares (Wide Format) ===
future_years = np.arange(2022, 2071)
all_preds = []

for state in state_columns:
    # Prepare training data
    df = core_data[["Year", "Age", state]].dropna()
    if len(df) < 3:
        continue

    X_train = df[["Year", "Age"]]
    y_train = df[state]

    model = XGBRegressor(n_estimators=100, max_depth=3, learning_rate=0.1)
    model.fit(X_train, y_train)

    # Prepare future prediction grid
    future_grid = pd.DataFrame([(y, a) for y in future_years for a in core_data["Age"].unique()],
                               columns=["Year", "Age"])
    y_pred = model.predict(future_grid)
    y_pred = np.clip(y_pred, 0, 1)

    state_preds = pd.DataFrame({
        "Year": future_grid["Year"],
        "Age": future_grid["Age"],
        state: y_pred
    })

    all_preds.append(state_preds)

# Merge all state predictions
predicted_shares_df = reduce(lambda left, right: pd.merge(left, right, on=["Age", "Year"], how="outer"), all_preds)

In [5]:
# === Step 4: Preprocess Projection Data ===
projection_data["Variant"] = projection_data["Variant"].ffill()
projection_data["Variant Description"] = projection_data["Variant Description"].ffill()
projection_data = projection_data[projection_data["Age"] != "Total"]
projection_data["Age"] = projection_data["Age"].astype(int)

proj_long = projection_data.melt(
    id_vars=["Variant", "Variant Description", "Age"],
    var_name="Year",
    value_name="National_Pop_Thousands"
)
proj_long["Year"] = proj_long["Year"].astype(int)
proj_long["National_Pop"] = proj_long["National_Pop_Thousands"] * 1000

In [6]:
# === Step 5: Merge National Projections with Predicted Shares ===
merged = proj_long.merge(predicted_shares_df, on=["Age", "Year"], how="left")

# === Step 6: Multiply Shares with National Population for Each State ===
for state in state_columns:
    if state in merged:
        merged[state] = merged[state] * merged["National_Pop"]

# === Step 7: Final Output in Wide Format ===
output_columns = ["Variant", "Variant Description", "Year", "Age"] + state_columns
final_df = merged[output_columns]
final_df.sort_values(by=["Variant", "Variant Description", "Year", "Age"], inplace=True)

final_df[["Variant", "Variant Description", "Year"]] = final_df[["Variant", "Variant Description", "Year"]].mask(
    final_df[["Variant", "Variant Description", "Year"]].eq(
        final_df[["Variant", "Variant Description", "Year"]].shift()
    )
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df.sort_values(by=["Variant", "Variant Description", "Year", "Age"], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df[["Variant", "Variant Description", "Year"]] = final_df[["Variant", "Variant Description", "Year"]].mask(


In [7]:
# === Step 8: Save to Excel ===
final_df.to_excel("state_level_projection_wide_xgboost.xlsx", index=False)
