In [22]:
from pathlib import Path
import os
import subprocess
import sys
import pandas as pd

ROOT = Path.cwd()
PYTHON = sys.executable

print(f"Working directory: {ROOT}")
print(f"Python executable: {PYTHON}")

Working directory: c:\Users\calli\Desktop\GIT Projects\peak-bloom-prediction
Python executable: c:\Users\calli\AppData\Local\Python\pythoncore-3.14-64\python.exe


In [23]:
step_4 = sorted(
    p.name for p in ROOT.glob("4*.py") if p.is_file()
)

pipeline_steps = [
    "0a_generate_metadata.py",
    "0b_generate_blossom_site_metadata.py",
    "1a_aggregate_bloom_data.py",
    "1b_aggregate_climate.py",
    "2_forecast_2026_climate.py",
    "3_feature_engineering.py",
    *step_4,
    "5_model_selection.py",
    "5_stacked_ensemble.py",
]

# "4_process_based_thermal_prediction.py",
#    

for step in pipeline_steps:
    if not (ROOT / step).exists():
        raise FileNotFoundError(f"Missing pipeline script: {step}")

print(f"Validated {len(pipeline_steps)} pipeline scripts.")

Validated 17 pipeline scripts.


In [24]:
# Run the pipeline steps sequentially, measuring execution time for each
import time

RUN_FULL_PIPELINE = True

if RUN_FULL_PIPELINE:
    env = os.environ.copy()
    env["PYTHONUTF8"] = "1"
    env["PYTHONIOENCODING"] = "utf-8"

    total = len(pipeline_steps)
    for idx, script in enumerate(pipeline_steps, start=1):
        print(f"Step {idx}/{total}: {script}")

        start = time.time()
        proc = subprocess.Popen(
            [PYTHON, script],
            cwd=ROOT,
            env=env,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )

        while proc.poll() is None:
            elapsed = int(time.time() - start)
            print(f"  running... {elapsed}s", flush=True)
            time.sleep(5)

        elapsed = int(time.time() - start)
        if proc.returncode != 0:
            raise RuntimeError(
                f"Pipeline failed at {script} (exit code {proc.returncode}) after {elapsed}s."
            )

        print(f"  completed in {elapsed}s", flush=True)

    print("\nPipeline finished successfully.")
else:
    print("RUN_FULL_PIPELINE is False. Skipping execution.")

Step 1/17: 0a_generate_metadata.py
  running... 0s
  completed in 5s
Step 2/17: 0b_generate_blossom_site_metadata.py
  running... 0s
  completed in 5s
Step 3/17: 1a_aggregate_bloom_data.py
  running... 0s
  completed in 5s
Step 4/17: 1b_aggregate_climate.py
  running... 0s
  completed in 5s
Step 5/17: 2_forecast_2026_climate.py
  running... 0s
  completed in 5s
Step 6/17: 3_feature_engineering.py
  running... 0s
  completed in 5s
Step 7/17: 4_arimax_prediction_model.py
  running... 0s
  completed in 5s
Step 8/17: 4_bayseian_ridge_train_and_predict.py
  running... 0s
  completed in 5s
Step 9/17: 4_gradient_boosting_quantile_train_and_predict.py
  running... 0s
  completed in 5s
Step 10/17: 4_lm_train_and_predict.py
  running... 0s
  completed in 5s
Step 11/17: 4_process_based_dts_model.py
  running... 0s
  running... 5s
  completed in 10s
Step 12/17: 4_process_based_thermal_prediction.py
  running... 0s
  running... 5s
  completed in 10s
Step 13/17: 4_random_forest_train_and_predict.py


In [25]:
# Dynamically check for output files based on config and model selection
from phenology_config import USE_CV_FOLDS, HOLDOUT_LAST_N_YEARS

output_checks = {}

# Check model selection outputs
model_selection_summary = ROOT / "data/model_outputs/model_selection_metrics_summary.csv"
model_selection_recommended = ROOT / "data/model_outputs/model_selection_recommended_for_ensemble.csv"

output_checks["Model selection summary"] = "data/model_outputs/model_selection_metrics_summary.csv"
output_checks["Model selection recommended"] = "data/model_outputs/model_selection_recommended_for_ensemble.csv"

# Check recommended model holdouts
if model_selection_recommended.exists():
    recommended_df = pd.read_csv(model_selection_recommended)
    recommended_models = recommended_df["model"].tolist()
    
    # Determine holdout file suffix based on USE_CV_FOLDS
    holdout_type = "CV" if USE_CV_FOLDS else f"last{HOLDOUT_LAST_N_YEARS}y"
    
    # Map model names to holdout file paths
    holdout_mapping = {
        "linear_ols": f"holdout_{'cv' if USE_CV_FOLDS else f'last{HOLDOUT_LAST_N_YEARS}y'}_linear_ols.csv",
        "weighted_lm": f"holdout_{'cv' if USE_CV_FOLDS else f'last{HOLDOUT_LAST_N_YEARS}y'}_weighted_lm.csv",
        "bayesian_ridge": f"holdout_{'cv' if USE_CV_FOLDS else f'last{HOLDOUT_LAST_N_YEARS}y'}_bayesian_ridge.csv",
        "ridge": f"holdout_{'cv' if USE_CV_FOLDS else f'last{HOLDOUT_LAST_N_YEARS}y'}_ridge_lasso.csv",
        "lasso": f"holdout_{'cv' if USE_CV_FOLDS else f'last{HOLDOUT_LAST_N_YEARS}y'}_ridge_lasso.csv",
        "gradient_boosting_quantile": f"holdout_{'cv' if USE_CV_FOLDS else f'last{HOLDOUT_LAST_N_YEARS}y'}_gradient_boosting_quantile.csv",
        "arimax": f"holdout_{'cv' if USE_CV_FOLDS else f'last{HOLDOUT_LAST_N_YEARS}y'}_arimax.csv",
        "process_based_thermal": f"holdout_last{HOLDOUT_LAST_N_YEARS}y_process_based_thermal.csv",
        "dts": f"holdout_last{HOLDOUT_LAST_N_YEARS}y_dts.csv",
        "random_forest": f"holdout_last{HOLDOUT_LAST_N_YEARS}y_random_forest.csv",
    }
    
    for model in recommended_models:
        if model in holdout_mapping:
            filename = holdout_mapping[model]
            output_checks[f"{model} holdout ({holdout_type})"] = f"data/model_outputs/holdout/{filename}"

# Check ensemble outputs
output_checks["Stacked predictions"] = "data/model_outputs/predictions/final_2026_predictions_stacked_ensemble.csv"
output_checks["Stacked weights"] = "data/model_outputs/stacked_ensemble_meta_model_weights.csv"
output_checks["Stacked metrics"] = "data/model_outputs/stacked_ensemble_model_metrics.csv"

# Create status dataframe
status_df = pd.DataFrame(
    [
        {"artifact": name, "path": rel_path, "exists": (ROOT / rel_path).exists()}
        for name, rel_path in output_checks.items()
    ]
)

print(f"Config: USE_CV_FOLDS={USE_CV_FOLDS}, HOLDOUT_LAST_N_YEARS={HOLDOUT_LAST_N_YEARS}")
print(f"Checking {len(output_checks)} output files...\n")

status_df

Config: USE_CV_FOLDS=False, HOLDOUT_LAST_N_YEARS=20
Checking 10 output files...



Unnamed: 0,artifact,path,exists
0,Model selection summary,data/model_outputs/model_selection_metrics_sum...,True
1,Model selection recommended,data/model_outputs/model_selection_recommended...,True
2,weighted_lm holdout (last20y),data/model_outputs/holdout/holdout_last20y_wei...,True
3,linear_ols holdout (last20y),data/model_outputs/holdout/holdout_last20y_lin...,True
4,ridge holdout (last20y),data/model_outputs/holdout/holdout_last20y_rid...,True
5,bayesian_ridge holdout (last20y),data/model_outputs/holdout/holdout_last20y_bay...,True
6,lasso holdout (last20y),data/model_outputs/holdout/holdout_last20y_rid...,True
7,Stacked predictions,data/model_outputs/predictions/final_2026_pred...,True
8,Stacked weights,data/model_outputs/stacked_ensemble_meta_model...,True
9,Stacked metrics,data/model_outputs/stacked_ensemble_model_metr...,True


In [26]:
final_path = ROOT / "data/model_outputs/predictions/final_2026_predictions_stacked_ensemble.csv"
if not final_path.exists():
    raise FileNotFoundError(f"Expected final predictions file not found: {final_path}")

final_pred = pd.read_csv(final_path)
final_pred

Unnamed: 0,location,predicted_date,predicted_doy,90_pi_lower,90_pi_upper,interval_halfwidth_days,90_pi_lower_date,90_pi_upper_date,simple_average,stacked_ensemble,pred_weighted_lm,pred_linear_ols,pred_ridge,pred_bayesian_ridge,pred_lasso
0,kyoto,Apr 05,95.2,87.7,102.7,7.512814,Mar 28,Apr 12,98.48,95.218594,97.1,97.5,99.3,99.6,98.9
1,liestal,Apr 07,97.3,89.8,104.8,7.512814,Mar 30,Apr 14,103.0,97.300222,96.3,97.8,107.1,107.7,106.1
2,newyorkcity,Apr 13,103.0,95.4,110.5,7.512814,Apr 05,Apr 20,107.2,102.954853,106.1,106.2,108.4,108.9,106.4
3,vancouver,Apr 03,93.2,85.7,100.7,7.512814,Mar 26,Apr 10,97.82,93.221003,92.9,93.4,100.5,101.2,101.1
4,washingtondc,Apr 05,95.3,87.8,102.9,7.512814,Mar 28,Apr 12,99.3,95.339245,96.4,96.8,101.1,101.5,100.7


In [27]:
weights_path = ROOT / "data/model_outputs/stacked_ensemble_meta_model_weights.csv"
if not weights_path.exists():
    raise FileNotFoundError(f"Expected weights file not found: {weights_path}")

weights = pd.read_csv(weights_path)
weights

Unnamed: 0,feature,coefficient,weight_percent
0,pred_weighted_lm,0.315251,36.4
1,pred_linear_ols,0.264991,30.6
2,pred_ridge,0.139178,16.1
3,pred_bayesian_ridge,0.127086,14.7
4,pred_lasso,0.019354,2.2


In [28]:
submission_cols = [col for col in [
    "location",
    "predicted_date",
    "predicted_doy",
    "90_pi_lower",
    "90_pi_upper",
] if col in final_pred.columns]

final_pred[submission_cols].sort_values("location").reset_index(drop=True)

Unnamed: 0,location,predicted_date,predicted_doy,90_pi_lower,90_pi_upper
0,kyoto,Apr 05,95.2,87.7,102.7
1,liestal,Apr 07,97.3,89.8,104.8
2,newyorkcity,Apr 13,103.0,95.4,110.5
3,vancouver,Apr 03,93.2,85.7,100.7
4,washingtondc,Apr 05,95.3,87.8,102.9


In [29]:
# Model metrics
metrics_file = "data/model_outputs/stacked_ensemble_model_metrics.csv"

metrics = pd.read_csv(metrics_file)

metrics


Unnamed: 0,model,mae_days,mse_days2,rmse_days,holdout_rows,holdout_start_year,holdout_end_year,pi_alpha,pi_halfwidth_days,empirical_coverage,mae_improvement_pct_vs_simple_avg,mse_improvement_vs_simple_avg,rmse_improvement_vs_simple_avg_days
0,pred_weighted_lm,4.2875,24.74,4.9739,64,2006,2025,0.1,7.5128,0.8906,10.4287,6.483,0.6526
1,pred_linear_ols,4.4688,28.1191,5.3027,64,2006,2025,0.1,7.5128,0.8906,10.4287,6.483,0.6526
2,pred_ridge,4.6266,30.7002,5.5408,64,2006,2025,0.1,7.5128,0.8906,10.4287,6.483,0.6526
3,pred_bayesian_ridge,4.6547,31.0836,5.5753,64,2006,2025,0.1,7.5128,0.8906,10.4287,6.483,0.6526
4,pred_lasso,4.8562,34.2987,5.8565,64,2006,2025,0.1,7.5128,0.8906,10.4287,6.483,0.6526
5,stacked_ensemble,3.996,21.54,4.6411,64,2006,2025,0.1,7.5128,0.8906,10.4287,6.483,0.6526
6,simple_average,4.4613,28.0231,5.2937,64,2006,2025,0.1,7.5128,0.8906,10.4287,6.483,0.6526
