In [None]:
# --- Paths (make meaning explicit) ---
# All locations sit under the Drive mount point used by drive.mount() below.
geo_dir = '/content/gdrive/MyDrive/CYL_geo'
# Per-station "<station>_prep.csv" inputs are read from here.
raw_prep_dir = '/content/gdrive/MyDrive/CYL_GHI/prep_files'
# Cached "*.npz" artifacts are read from / written to here (currently the same folder).
artifacts_dir = '/content/gdrive/MyDrive/CYL_GHI/prep_files'

# --- Colab mount ---
from google.colab import drive
drive.mount('/content/gdrive')

# Use CUDA when torch reports it as available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("device:", device)

# ================================================================
# 1) Stations + static geo features
# ================================================================
# Read the station table; the order of its rows defines the station order.
stations_csv = os.path.join(geo_dir, 'stations.csv')
df_geo = pd.read_csv(stations_csv)
station_files = df_geo['station_code'].to_list()

# Aspect is an angle in degrees, so it is represented by its cosine and sine
# rather than by the raw value.
aspect_rad = np.deg2rad(df_geo["Aspect_DEM2_1"].to_numpy())

# Assemble the static features directly in their final column order:
# height, slope, twi, aspect_cos, aspect_sin.
vars_geo = pd.DataFrame(
    {
        "height": df_geo["height"],
        "slope": df_geo["Slope_DEM2_U1"],
        "twi": df_geo["rastercalc"],
        "aspect_cos": np.cos(aspect_rad),
        "aspect_sin": np.sin(aspect_rad),
    }
)

# Min-max scale every static feature column across stations, then wrap the
# scaled matrix back into a DataFrame carrying the same column names.
geo_scaler = MinMaxScaler()
vars_geo_scaled = geo_scaler.fit_transform(vars_geo.to_numpy())
vars_geo = pd.DataFrame(vars_geo_scaled, columns=vars_geo.columns)

# ================================================================
# 2) Load or build temporal tensors
# ================================================================
# Whichever branch runs, it defines the four names used afterwards:
#   temporal_node_tensor   -- node features; the rebuild branch stacks them as (T, N, F)
#   temporal_target_tensor -- GHI targets; the rebuild branch stacks them as (T, N)
#   df_cols                -- feature names, in the order of the last feature axis
#   masks                  -- dict of boolean feature masks (one entry per mask name)
loading_flag = True  # False => rebuild preprocessing

if loading_flag:
    files_to_load = {
        "node_tensor": os.path.join(artifacts_dir, "node_tensor2.npz"),
        "target_tensor": os.path.join(artifacts_dir, "target_tensor2.npz"),
        "columns": os.path.join(artifacts_dir, "columns2.npz"),
        "masks": os.path.join(artifacts_dir, "masks2.npz"),
    }

    loaded = {}
    for name, path in files_to_load.items():
        if not os.path.exists(path):
            raise FileNotFoundError(f"Missing required artifact: {path}")
        # SECURITY: allow_pickle=True is required because columns2.npz stores an
        # object-dtype array, but unpickling can execute arbitrary code. Only
        # load artifacts that were written by the rebuild branch of this cell.
        #
        # np.load on an .npz returns a lazily-read archive that keeps the file
        # open; the context manager closes it, so every member is copied out
        # into a plain dict before leaving the `with` block.
        with np.load(path, allow_pickle=True) as npz:
            loaded[name] = {key: npz[key] for key in npz.files}
        print(f"✅ Loaded: {name}")

    temporal_node_tensor = loaded["node_tensor"]["data"]
    temporal_target_tensor = loaded["target_tensor"]["data"]
    df_cols = loaded["columns"]["data"]
    masks = dict(loaded["masks"])

else:
    df_list, target_list = [], []
    df_cols_ref = None   # feature column order of the first station
    n_steps_ref = None   # number of rows of the first station

    for station in station_files:
        path = os.path.join(raw_prep_dir, f"{station}_prep.csv")
        df = pd.read_csv(path)

        # --- Time encodings ---
        # The first CSV column is parsed as the timestamp.
        ts = pd.to_datetime(df.iloc[:, 0])
        doy = ts.dt.dayofyear.to_numpy()
        tod = (ts.dt.hour * 3600 + ts.dt.minute * 60 + ts.dt.second).to_numpy()

        # NOTE(review): the day-of-year period is fixed at 365, so day 366 of a
        # leap year lands slightly past one full cycle. Left unchanged so that
        # rebuilt artifacts stay comparable with previously saved ones.
        df = df.assign(
            doy_sin=np.sin(2 * np.pi * doy / 365.0),
            doy_cos=np.cos(2 * np.pi * doy / 365.0),
            tod_sin=np.sin(2 * np.pi * tod / 86400.0),
            tod_cos=np.cos(2 * np.pi * tod / 86400.0),
            wind_dir_sin=lambda x: np.sin(np.radians(x["wind_dir"])),
            wind_dir_cos=lambda x: np.cos(np.radians(x["wind_dir"])),
            sun_azim_sin=lambda x: np.sin(np.radians(x["sun_azim"])),
            sun_azim_cos=lambda x: np.cos(np.radians(x["sun_azim"])),
        )

        # Drop raw azimuth but keep wind_dir last. The assign() above already
        # fails if "sun_azim" is absent, so no existence check is needed here.
        df = df.drop(columns=["sun_azim"])
        cols = [c for c in df.columns if c != "wind_dir"] + ["wind_dir"]
        df = df[cols]

        features = df.iloc[:, 1:]  # drop timestamp
        target = df["GHI"].to_numpy()

        # Column consistency check
        if df_cols_ref is None:
            df_cols_ref = features.columns.to_list()
        elif features.columns.to_list() != df_cols_ref:
            raise ValueError(f"Column mismatch at station {station}. Check *_prep.csv consistency.")

        # Length consistency check: np.stack below needs equally long series,
        # and its own error message would not say which station is off.
        # NOTE(review): only the row count is compared, not the timestamps.
        if n_steps_ref is None:
            n_steps_ref = len(features)
        elif len(features) != n_steps_ref:
            raise ValueError(
                f"Length mismatch at station {station}: "
                f"{len(features)} rows, expected {n_steps_ref}."
            )

        # Force numeric dtype (prevents object arrays)
        df_list.append(features.to_numpy(dtype=np.float32))
        target_list.append(target.astype(np.float32))

    if not df_list:
        raise ValueError("No stations to preprocess: station_files is empty.")

    df_cols = np.array(df_cols_ref, dtype=object)
    temporal_node_tensor = np.stack(df_list, axis=1)         # (T, N, F)
    temporal_target_tensor = np.stack(target_list, axis=1)   # (T, N)

    # --- Normalize selected input features ---
    # NDVI, every "cloud_*" column and the raw wind_dir are left unscaled.
    # The target tensor is not touched here: it keeps the GHI values as read.
    non_norm_features = {
        "NDVI",
        "wind_dir",
        *(c for c in df_cols if str(c).startswith("cloud_")),
    }
    norm_mask = np.array([c not in non_norm_features for c in df_cols], dtype=bool)

    T, N, _ = temporal_node_tensor.shape
    F_norm = int(norm_mask.sum())

    # One MinMaxScaler is fitted over all timesteps and stations together.
    # NOTE(review): the fitted scaler is not saved with the artifacts, so it is
    # only available in the session that ran this rebuild branch.
    scaler = MinMaxScaler()
    norm_2d = temporal_node_tensor[:, :, norm_mask].reshape(-1, F_norm)
    norm_2d_scaled = scaler.fit_transform(norm_2d)
    # Write the scaled columns back in place (np.stack produced a fresh float32
    # array); the remaining columns simply keep their original values.
    temporal_node_tensor[:, :, norm_mask] = norm_2d_scaled.reshape(T, N, F_norm)

    # --- Masks ---
    # Each key list is declared once and the larger groups are composed from
    # the smaller ones, so the groups cannot drift apart.
    cloud_keys = [
        "cloud_Clear", "cloud_Probably_Clear", "cloud_Water", "cloud_Super-Cooled_Water",
        "cloud_Mixed", "cloud_Opaque_Ice", "cloud_Cirrus", "cloud_Overlapping", "cloud_Overshooting",
    ]
    embed_keys = [
        "GHI", "humidity", "precipitation", "air_temp", "sun_elev", "AOD",
        "C_GHI", "Dew_Point", "S_Albedo", "Pressure", "sun_azim_sin", "sun_azim_cos",
        *cloud_keys,
    ]
    gate_keys = [*embed_keys, "NDVI", "toa", "wind_sp"]
    wind_keys = ["wind_dir", "wind_sp"]
    forecast_excluded = ["wind_dir", "toa", "NDVI"]

    def _feature_mask(keys):
        """Return a boolean mask over df_cols that is True for columns named in keys.

        Names in keys that are not present in df_cols are ignored.
        """
        wanted = set(keys)
        return np.array([c in wanted for c in df_cols], dtype=bool)

    masks = {
        "mask_gate": _feature_mask(gate_keys),
        "mask_wind": _feature_mask(wind_keys),
        "mask_embed": _feature_mask(embed_keys),
        # Every column except the excluded ones.
        "mask_forecast": ~_feature_mask(forecast_excluded),
        "mask_cloud": _feature_mask(cloud_keys),
    }

    # --- Save artifacts ---
    # artifacts_dir may differ from raw_prep_dir, so make sure it exists.
    os.makedirs(artifacts_dir, exist_ok=True)
    np.savez_compressed(os.path.join(artifacts_dir, "node_tensor2.npz"), data=temporal_node_tensor)
    np.savez_compressed(os.path.join(artifacts_dir, "target_tensor2.npz"), data=temporal_target_tensor)
    np.savez_compressed(os.path.join(artifacts_dir, "columns2.npz"), data=df_cols)
    np.savez_compressed(os.path.join(artifacts_dir, "masks2.npz"), **masks)
    print("✅ Preprocessing complete and saved.")

# ================================================================
# 3) Wind positions
# ================================================================
# Names of the columns selected by mask_wind, in tensor order. The two
# positions computed below are indices into this selected subset, not into
# the full feature list.
wind_cols = df_cols[masks["mask_wind"]].tolist()

# Both wind columns must be part of the selection before looking them up.
if not all(name in wind_cols for name in ("wind_dir", "wind_sp")):
    raise KeyError("wind_dir and/or wind_sp not found within mask_wind-selected columns.")

wind_dir_pos, wind_sp_pos = (wind_cols.index(name) for name in ("wind_dir", "wind_sp"))
print("wind_dir_pos:", wind_dir_pos, "wind_sp_pos:", wind_sp_pos)
