In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from datetime import date
from tqdm import tqdm

# --- File locations ---------------------------------------------------------
CSV_PATH = "cleaned_climate_data.csv"    # input observations; update path if needed
OUT_CSV = "weather_forecast_2026.csv"    # where the forecast table is written

# --- Forecast horizon -------------------------------------------------------
FORECAST_YEAR = 2026
FORECAST_WEEKS = [week for week in range(1, 16)]   # ISO weeks 1..15 inclusive

# --- Model settings ---------------------------------------------------------
LAGS = 4                  # how many previous weeks of each target feed the model
MIN_ROWS_TO_TRAIN = 52    # fewer complete weekly rows than this -> no model is fitted
RANDOM_STATE = 42         # seed handed to the random forest


In [2]:

# Helper: convert (year, iso_week) -> Monday date (ISO calendar)
def iso_week_to_date(y, w):
    """Return the Monday of ISO week ``w`` in ISO year ``y``.

    Both arguments are passed through ``int()``, so integer-valued floats and
    numeric strings are accepted.

    Returns:
        A ``datetime.date`` for the Monday of that week, or ``None`` when the
        inputs cannot be converted to integers (NaN, None, infinity,
        non-numeric text) or do not name a valid ISO week (e.g. week 53 of a
        52-week year).
    """
    try:
        return date.fromisocalendar(int(y), int(w), 1)  # day 1 == Monday
    except (TypeError, ValueError, OverflowError):
        # Only conversion/range failures mean "no such week"; anything else is
        # a genuine bug and should surface instead of being swallowed.
        return None

# Read the weekly observations from disk
df = pd.read_csv(CSV_PATH)

# Abort early when any column used further down is absent
required_cols = {"mount_id", "Year", "week", "max_temp", "min_temp", "rain"}
if required_cols - set(df.columns):
    raise ValueError(f"Input CSV must contain columns: {required_cols}")

# Attach the Monday of each row's ISO week; rows whose (Year, week) pair does
# not resolve to a date are discarded
df["date"] = df.apply(
    lambda row: iso_week_to_date(row["Year"], row["week"]),
    axis=1,
)
df = df[df["date"].notna()].copy()
df["date"] = pd.to_datetime(df["date"])

# Chronological order within each mount, with a fresh 0..n-1 index
df = df.sort_values(by=["mount_id", "date"])
df = df.reset_index(drop=True)

# Quantities that get forecast
targets = ["max_temp", "min_temp", "rain"]

# Historical mean of every target per (mount_id, week) -- used as a fallback
hist_week_mean = df.groupby(["mount_id", "week"])[["max_temp", "min_temp", "rain"]].mean()
hist_week_mean = hist_week_mean.reset_index()

# Mounts to process, in order of first appearance
mount_ids = df["mount_id"].unique().tolist()

# Accumulates one dict per (mount, forecast week)
results = []

def _weekly_climatology(mount_means, week, target, overall_mean):
    """Fallback value for ``target`` in ISO week ``week`` of one mount.

    Uses the mount's historical mean for that week when one exists and is not
    NaN, otherwise the mount's overall mean, otherwise 0.0.
    """
    rows = mount_means[mount_means["week"] == week]
    if rows.shape[0] > 0 and pd.notna(rows[target].iloc[0]):
        return float(rows[target].iloc[0])
    return float(overall_mean) if pd.notna(overall_mean) else 0.0


# Feature layout shared by every mount: calendar position first, then
# lag1..lagLAGS of each target (each model also sees the other targets' lags).
feature_cols = ["week_iso", "year_iso"] + [
    f"{t}_lag{lag}" for t in targets for lag in range(1, LAGS + 1)
]

# For each mount: build a gap-free weekly series, fit one random forest per
# target on lagged features, then roll the models forward over FORECAST_WEEKS,
# feeding each step's predictions back in as the next step's lags.
for mid in tqdm(mount_ids, desc="Mounts"):
    mount_rows = df[df["mount_id"] == mid]
    if mount_rows.shape[0] == 0:
        continue

    # One row per Monday. Averaging duplicate observations of the same week
    # keeps the reindex below from failing on a non-unique index.
    sub = mount_rows.groupby("date")[targets].mean()

    # Reindex to every Monday between first and last observation so that
    # shift(k) always means "k weeks earlier", even across missing weeks.
    weekly_idx = pd.date_range(start=sub.index.min(), end=sub.index.max(), freq="W-MON")
    sub = sub.reindex(weekly_idx)

    # Recompute ISO year/week from the index; cast away the nullable UInt32
    # dtype so the feature matrix is plain numeric rather than object.
    iso = sub.index.isocalendar()
    sub["year_iso"] = iso["year"].astype("int64")
    sub["week_iso"] = iso["week"].astype("int64")
    sub["mount_id"] = mid

    # Lag features for each target
    for t in targets:
        for lag in range(1, LAGS + 1):
            sub[f"{t}_lag{lag}"] = sub[t].shift(lag)

    # Fallback sources for this mount
    mount_means = hist_week_mean[hist_week_mean["mount_id"] == mid]
    overall_mean = {t: sub[t].mean() for t in targets}

    # One model per target, trained on rows where the target and every lag
    # feature are present. None means "too little data -> use climatology".
    trained_models = {}
    for target in targets:
        train_df = sub.dropna(subset=feature_cols + [target])
        if train_df.shape[0] >= MIN_ROWS_TO_TRAIN:
            model = RandomForestRegressor(n_estimators=100, random_state=RANDOM_STATE, n_jobs=-1)
            model.fit(train_df[feature_cols].values, train_df[target].values)
            trained_models[target] = model
        else:
            trained_models[target] = None

    # Seed the lag buffer: lag_buffer[t][0] is lag1 (most recent), [1] is lag2, ...
    # For the first forecast step, lag1 is the LAST observed week (position -1),
    # lag2 the week before it, and so on. (Reading shift(lag).iloc[-1] would
    # skip the newest observation entirely.)
    # NOTE(review): any gap between the last observed week and the first
    # forecast week is not bridged; the buffer is seeded from the most recent
    # observations regardless. Confirm the data ends right before the horizon.
    n_weeks = sub.shape[0]
    last_date = sub.index[-1]
    lag_buffer = {}
    for t in targets:
        buffer_vals = []
        for lag in range(1, LAGS + 1):
            val = sub[t].iloc[-lag] if lag <= n_weeks else np.nan
            if pd.isna(val):
                # Missing week: substitute the climatology of the ISO week that
                # this buffer slot actually refers to (handles 53-week years).
                ref_week = int((last_date - pd.Timedelta(weeks=lag - 1)).isocalendar()[1])
                val = _weekly_climatology(mount_means, ref_week, t, overall_mean[t])
            buffer_vals.append(float(val))
        lag_buffer[t] = buffer_vals

    # Iteratively predict the requested weeks of FORECAST_YEAR
    for wk in FORECAST_WEEKS:
        try:
            wk_date = pd.to_datetime(date.fromisocalendar(FORECAST_YEAR, wk, 1))
        except ValueError:
            # Week number not valid for FORECAST_YEAR: keep the row, no date
            wk_date = pd.NaT

        # The feature vector is the same for all targets, so build it once.
        # Order must match feature_cols.
        feat = [wk, FORECAST_YEAR]
        for t in targets:
            feat.extend(lag_buffer[t])
        feat_arr = np.array(feat, dtype=float).reshape(1, -1)

        preds_step = {}
        for target in targets:
            model = trained_models[target]
            if model is not None:
                preds_step[target] = float(model.predict(feat_arr)[0])
            else:
                preds_step[target] = _weekly_climatology(mount_means, wk, target, overall_mean[target])

        results.append({
            "mount_id": mid,
            "year": FORECAST_YEAR,
            "week": wk,
            "date": wk_date,
            "max_temp": round(preds_step["max_temp"], 3),
            "min_temp": round(preds_step["min_temp"], 3),
            "rain": round(preds_step["rain"], 3)
        })

        # Newest prediction becomes lag1; the oldest lag falls off the end
        for t in targets:
            lag_buffer[t] = [preds_step[t]] + lag_buffer[t][:LAGS - 1]

# Put results into a DataFrame with a fixed column order. Passing the columns
# to the constructor (rather than selecting them afterwards) keeps this working
# when `results` is empty: the outcome is an empty frame with the right header
# instead of a KeyError.
forecast_columns = ["mount_id", "year", "week", "date", "max_temp", "min_temp", "rain"]
forecast_df = pd.DataFrame(results, columns=forecast_columns)
forecast_df = forecast_df.sort_values(["mount_id", "week"]).reset_index(drop=True)

Mounts: 100%|██████████| 7/7 [00:15<00:00,  2.26s/it]


In [3]:
# Raise pandas' displayed-row limit to 300
pd.set_option("display.max_rows", 300)

# Write the forecast table to disk (no index column) and report the destination
forecast_df.to_csv(OUT_CSV, index=False)
print(f"\nSaved forecast to: {OUT_CSV}")


Saved forecast to: forecast_2026_wks1_15_sklearn.csv
