In [25]:
from pathlib import Path
import os
import subprocess
import sys
import pandas as pd

ROOT = Path.cwd()
PYTHON = sys.executable

print(f"Working directory: {ROOT}")
print(f"Python executable: {PYTHON}")

Working directory: c:\Users\calli\Desktop\GIT Projects\peak-bloom-prediction
Python executable: c:\Users\calli\AppData\Local\Python\pythoncore-3.14-64\python.exe


In [26]:
step_4 = sorted(
    p.name for p in ROOT.glob("4*.py") if p.is_file()
)

pipeline_steps = [
    "0a_generate_metadata.py",
    "0b_generate_blossom_site_metadata.py",
    "1a_aggregate_bloom_data.py",
    "1b_aggregate_climate.py",
    "2_forecast_2026_climate.py",
    "3_feature_engineering.py",
    *step_4,
    "5_model_selection.py",
    "5_stacked_ensemble.py",
]

# "4_process_based_thermal_prediction.py",
#    

for step in pipeline_steps:
    if not (ROOT / step).exists():
        raise FileNotFoundError(f"Missing pipeline script: {step}")

print(f"Validated {len(pipeline_steps)} pipeline scripts.")

Validated 17 pipeline scripts.


In [27]:
# Run the pipeline steps sequentially, measuring execution time for each
import time

RUN_FULL_PIPELINE = True

if RUN_FULL_PIPELINE:
    env = os.environ.copy()
    env["PYTHONUTF8"] = "1"
    env["PYTHONIOENCODING"] = "utf-8"

    total = len(pipeline_steps)
    for idx, script in enumerate(pipeline_steps, start=1):
        print(f"Step {idx}/{total}: {script}")

        start = time.time()
        proc = subprocess.Popen(
            [PYTHON, script],
            cwd=ROOT,
            env=env,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )

        while proc.poll() is None:
            elapsed = int(time.time() - start)
            print(f"  running... {elapsed}s", flush=True)
            time.sleep(5)

        elapsed = int(time.time() - start)
        if proc.returncode != 0:
            raise RuntimeError(
                f"Pipeline failed at {script} (exit code {proc.returncode}) after {elapsed}s."
            )

        print(f"  completed in {elapsed}s", flush=True)

    print("\nPipeline finished successfully.")
else:
    print("RUN_FULL_PIPELINE is False. Skipping execution.")

Step 1/17: 0a_generate_metadata.py
  running... 0s
  completed in 5s
Step 2/17: 0b_generate_blossom_site_metadata.py
  running... 0s
  completed in 5s
Step 3/17: 1a_aggregate_bloom_data.py
  running... 0s
  completed in 5s
Step 4/17: 1b_aggregate_climate.py
  running... 0s
  completed in 5s
Step 5/17: 2_forecast_2026_climate.py
  running... 0s
  completed in 5s
Step 6/17: 3_feature_engineering.py
  running... 0s
  completed in 5s
Step 7/17: 4_arimax_prediction_model.py
  running... 0s
  completed in 5s
Step 8/17: 4_bayseian_ridge_train_and_predict.py
  running... 0s
  completed in 5s
Step 9/17: 4_gradient_boosting_quantile_train_and_predict.py
  running... 0s
  completed in 5s
Step 10/17: 4_lm_train_and_predict.py
  running... 0s
  completed in 5s
Step 11/17: 4_process_based_dts_model.py
  running... 0s
  running... 5s
  running... 10s
  completed in 15s
Step 12/17: 4_process_based_thermal_prediction.py
  running... 0s
  running... 5s
  completed in 10s
Step 13/17: 4_random_forest_trai

In [28]:
# Dynamically check for output files based on config and model selection
from phenology_config import USE_CV_FOLDS, HOLDOUT_LAST_N_YEARS

output_checks = {}

# Check model selection outputs
model_selection_summary = ROOT / "data/model_outputs/model_selection_metrics_summary.csv"
model_selection_recommended = ROOT / "data/model_outputs/model_selection_recommended_for_ensemble.csv"

output_checks["Model selection summary"] = "data/model_outputs/model_selection_metrics_summary.csv"
output_checks["Model selection recommended"] = "data/model_outputs/model_selection_recommended_for_ensemble.csv"

# Check recommended model holdouts
if model_selection_recommended.exists():
    recommended_df = pd.read_csv(model_selection_recommended)
    recommended_models = recommended_df["model"].tolist()
    
    # Determine holdout file suffix based on USE_CV_FOLDS
    holdout_type = "CV" if USE_CV_FOLDS else f"last{HOLDOUT_LAST_N_YEARS}y"
    
    # Map model names to holdout file paths
    holdout_mapping = {
        "linear_ols": f"holdout_{'cv' if USE_CV_FOLDS else f'last{HOLDOUT_LAST_N_YEARS}y'}_linear_ols.csv",
        "weighted_lm": f"holdout_{'cv' if USE_CV_FOLDS else f'last{HOLDOUT_LAST_N_YEARS}y'}_weighted_lm.csv",
        "bayesian_ridge": f"holdout_{'cv' if USE_CV_FOLDS else f'last{HOLDOUT_LAST_N_YEARS}y'}_bayesian_ridge.csv",
        "ridge": f"holdout_{'cv' if USE_CV_FOLDS else f'last{HOLDOUT_LAST_N_YEARS}y'}_ridge_lasso.csv",
        "lasso": f"holdout_{'cv' if USE_CV_FOLDS else f'last{HOLDOUT_LAST_N_YEARS}y'}_ridge_lasso.csv",
        "gradient_boosting_quantile": f"holdout_{'cv' if USE_CV_FOLDS else f'last{HOLDOUT_LAST_N_YEARS}y'}_gradient_boosting_quantile.csv",
        "arimax": f"holdout_{'cv' if USE_CV_FOLDS else f'last{HOLDOUT_LAST_N_YEARS}y'}_arimax.csv",
        "process_based_thermal": f"holdout_last{HOLDOUT_LAST_N_YEARS}y_process_based_thermal.csv",
        "dts": f"holdout_last{HOLDOUT_LAST_N_YEARS}y_dts.csv",
        "random_forest": f"holdout_last{HOLDOUT_LAST_N_YEARS}y_random_forest.csv",
    }
    
    for model in recommended_models:
        if model in holdout_mapping:
            filename = holdout_mapping[model]
            output_checks[f"{model} holdout ({holdout_type})"] = f"data/model_outputs/holdout/{filename}"

# Check ensemble outputs
output_checks["Stacked predictions"] = "data/model_outputs/predictions/final_2026_predictions_stacked_ensemble.csv"
output_checks["Stacked weights"] = "data/model_outputs/stacked_ensemble_meta_model_weights.csv"
output_checks["Stacked metrics"] = "data/model_outputs/stacked_ensemble_model_metrics.csv"

# Create status dataframe
status_df = pd.DataFrame(
    [
        {"artifact": name, "path": rel_path, "exists": (ROOT / rel_path).exists()}
        for name, rel_path in output_checks.items()
    ]
)

print(f"Config: USE_CV_FOLDS={USE_CV_FOLDS}, HOLDOUT_LAST_N_YEARS={HOLDOUT_LAST_N_YEARS}")
print(f"Checking {len(output_checks)} output files...\n")

status_df

Config: USE_CV_FOLDS=False, HOLDOUT_LAST_N_YEARS=5
Checking 10 output files...



Unnamed: 0,artifact,path,exists
0,Model selection summary,data/model_outputs/model_selection_metrics_sum...,True
1,Model selection recommended,data/model_outputs/model_selection_recommended...,True
2,weighted_lm holdout (last5y),data/model_outputs/holdout/holdout_last5y_weig...,False
3,linear_ols holdout (last5y),data/model_outputs/holdout/holdout_last5y_line...,False
4,bayesian_ridge holdout (last5y),data/model_outputs/holdout/holdout_last5y_baye...,False
5,ridge holdout (last5y),data/model_outputs/holdout/holdout_last5y_ridg...,False
6,lasso holdout (last5y),data/model_outputs/holdout/holdout_last5y_ridg...,False
7,Stacked predictions,data/model_outputs/predictions/final_2026_pred...,True
8,Stacked weights,data/model_outputs/stacked_ensemble_meta_model...,True
9,Stacked metrics,data/model_outputs/stacked_ensemble_model_metr...,True


In [29]:
final_path = ROOT / "data/model_outputs/predictions/final_2026_predictions_stacked_ensemble.csv"
if not final_path.exists():
    raise FileNotFoundError(f"Expected final predictions file not found: {final_path}")

final_pred = pd.read_csv(final_path)
final_pred

Unnamed: 0,location,predicted_date,predicted_doy,90_pi_lower,90_pi_upper,interval_halfwidth_days,90_pi_lower_date,90_pi_upper_date,simple_average,stacked_ensemble,pred_weighted_lm,pred_linear_ols,pred_bayesian_ridge,pred_ridge,pred_lasso
0,kyoto,Apr 02,92.8,86.3,99.3,6.486613,Mar 27,Apr 09,98.2,92.802614,96.7,97.3,99.3,99.1,98.6
1,liestal,Apr 03,93.0,86.5,99.4,6.486613,Mar 27,Apr 09,102.5,92.953485,96.4,97.6,106.8,106.3,105.4
2,newyorkcity,Apr 09,99.7,93.2,106.2,6.486613,Apr 03,Apr 16,106.38,99.697068,104.7,105.5,108.2,107.8,105.7
3,vancouver,Mar 30,89.5,83.0,96.0,6.486613,Mar 24,Apr 06,97.36,89.495903,92.8,93.2,100.3,99.8,100.7
4,washingtondc,Apr 02,92.2,85.7,98.7,6.486613,Mar 26,Apr 08,98.9,92.222817,96.0,96.5,101.0,100.7,100.3


In [30]:
weights_path = ROOT / "data/model_outputs/stacked_ensemble_meta_model_weights.csv"
if not weights_path.exists():
    raise FileNotFoundError(f"Expected weights file not found: {weights_path}")

weights = pd.read_csv(weights_path)
weights

Unnamed: 0,feature,coefficient,weight_percent
0,pred_weighted_lm,0.44894,53.3
1,pred_linear_ols,0.372911,44.2
2,pred_bayesian_ridge,0.025491,3.0
3,pred_ridge,0.030848,3.7
4,pred_lasso,-0.035237,-4.2


In [31]:
submission_cols = [col for col in [
    "location",
    "predicted_date",
    "predicted_doy",
    "90_pi_lower",
    "90_pi_upper",
] if col in final_pred.columns]

final_pred[submission_cols].sort_values("location").reset_index(drop=True)

Unnamed: 0,location,predicted_date,predicted_doy,90_pi_lower,90_pi_upper
0,kyoto,Apr 02,92.8,86.3,99.3
1,liestal,Apr 03,93.0,86.5,99.4
2,newyorkcity,Apr 09,99.7,93.2,106.2
3,vancouver,Mar 30,89.5,83.0,96.0
4,washingtondc,Apr 02,92.2,85.7,98.7


In [32]:
# Model metrics
metrics_file = "data/model_outputs/stacked_ensemble_model_metrics.csv"

metrics = pd.read_csv(metrics_file)

metrics


Unnamed: 0,model,mae_days,mse_days2,rmse_days,holdout_rows,holdout_start_year,holdout_end_year,pi_alpha,pi_halfwidth_days,empirical_coverage,mae_improvement_pct_vs_simple_avg,mse_improvement_vs_simple_avg,rmse_improvement_vs_simple_avg_days
0,pred_weighted_lm,4.7457,29.4306,5.425,35,2016,2025,0.1,6.4866,0.8857,23.3656,15.2023,1.4721
1,pred_linear_ols,4.9714,33.7846,5.8124,35,2016,2025,0.1,6.4866,0.8857,23.3656,15.2023,1.4721
2,pred_bayesian_ridge,5.1143,37.3503,6.1115,35,2016,2025,0.1,6.4866,0.8857,23.3656,15.2023,1.4721
3,pred_ridge,5.1229,37.3774,6.1137,35,2016,2025,0.1,6.4866,0.8857,23.3656,15.2023,1.4721
4,pred_lasso,5.4971,44.1234,6.6425,35,2016,2025,0.1,6.4866,0.8857,23.3656,15.2023,1.4721
5,stacked_ensemble,3.8089,19.6011,4.4273,35,2016,2025,0.1,6.4866,0.8857,23.3656,15.2023,1.4721
6,simple_average,4.9703,34.8034,5.8994,35,2016,2025,0.1,6.4866,0.8857,23.3656,15.2023,1.4721
