In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
from torch.optim import Adam
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import pandas as pd
from tqdm import tqdm
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from torch.utils.data import random_split
from torch.amp import autocast, GradScaler

In [None]:
# Location of each raw CSV export, keyed by the dataset name used throughout
# the notebook. Insertion order is the order in which the files are loaded.
# NOTE(review): these are absolute, machine-specific paths, and "Weather" sits
# in a different directory than the other four files - confirm that is intended.
file_paths_2 = dict(
    GreenhouseClimate="/home/cuongdo/AICU/GreenhouseClimate.csv",
    GrodanSens="/home/cuongdo/AICU/GrodanSens.csv",
    Resources="/home/cuongdo/AICU/Resources.csv",
    Weather="/home/cuongdo/Weather/Weather.csv",
    CropParameters="/home/cuongdo/AICU/CropParameters.csv",
)

In [None]:
# Load every CSV listed in `file_paths_2`, normalise its header, derive a
# datetime `Date` column, and coerce all remaining columns to numeric values.
# Numeric time stamps are interpreted as day counts from this origin.
SERIAL_DATE_ORIGIN = '1899-12-30'

dfs_2 = {}
for name, path in file_paths_2.items():
    df_2 = pd.read_csv(path, low_memory=False)
    # Strip stray whitespace / tabs from header cells so the lookups below match.
    df_2.columns = df_2.columns.str.strip().str.replace('\t', '')

    # The first time column present wins, in this priority order. Every check
    # is made against `df_2` (the frame being loaded), never against another
    # frame that may happen to exist in the kernel.
    time_col = next((c for c in ('%time', '%Time', 'time') if c in df_2.columns), None)

    if time_col is not None:
        # Unparseable time stamps become NaN and therefore NaT in `Date`.
        df_2[time_col] = pd.to_numeric(df_2[time_col], errors='coerce')
        df_2['Date'] = pd.to_datetime(df_2[time_col], origin=SERIAL_DATE_ORIGIN, unit='D')
    elif 'Date' in df_2.columns:
        if pd.api.types.is_numeric_dtype(df_2['Date']):
            # A numeric `Date` column is treated as a serial day count as well.
            df_2['Date'] = pd.to_datetime(df_2['Date'], origin=SERIAL_DATE_ORIGIN, unit='D', errors='coerce')
        else:
            df_2['Date'] = pd.to_datetime(df_2['Date'], errors='coerce')
    else:
        # No usable time information in this file.
        df_2['Date'] = pd.NaT

    # Everything except `Date` is expected to be numeric; non-numeric cells become NaN.
    df_2.loc[:, df_2.columns != 'Date'] = df_2.loc[:, df_2.columns != 'Date'].apply(pd.to_numeric, errors='coerce')
    dfs_2[name] = df_2

In [None]:
# Feature groups: each list names the columns of the merged table that feed
# one input branch. Column order inside a list fixes the feature order of the
# arrays built from it.
soil_cols = [
    'EC_slab1', 'EC_slab2',
    'WC_slab1', 'WC_slab2',
    't_slab1', 't_slab2',
]
indoor_cols = [
    'Tair', 'Rhair', 'CO2air', 'HumDef',
    'PipeLow', 'VentLee', 'Ventwind',
    'Tot_PAR', 'Tot_PAR_Lamps', 'EC_drain_PC',
]
weather_cols = [
    'Tout', 'Rhout', 'Iglob', 'PARout',
    'Pyrgeo', 'Rain', 'Winddir', 'Windsp',
]
crop_cols = [
    'Stem_elong', 'Stem_thick', 'Cum_trusses',
    'stem_dens', 'plant_dens',
]

In [None]:
def reshape_sliding(df, cols, steps, stride=1):
    """Cut the selected columns of `df` into overlapping fixed-length windows.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table; rows are taken in their existing order. Not modified.
    cols : list of str
        Columns to extract; their order becomes the feature axis.
    steps : int
        Number of consecutive rows per window (must be >= 1).
    stride : int, default 1
        Row offset between the starts of consecutive windows (must be >= 1).

    Returns
    -------
    numpy.ndarray
        float32 array of shape ``(n_windows, steps, len(cols))``; window ``i``
        holds rows ``i * stride`` to ``i * stride + steps - 1``. When `df` has
        fewer than `steps` rows the result has zero windows.

    Raises
    ------
    ValueError
        If `steps` or `stride` is smaller than 1.
    """
    if steps < 1 or stride < 1:
        raise ValueError(f"steps and stride must be >= 1 (got steps={steps}, stride={stride})")

    # Cast only the requested columns instead of copying the whole frame.
    arr = df[cols].astype(np.float32).to_numpy()
    if len(arr) < steps:
        # Same dtype as the non-empty case so callers can concatenate safely.
        return np.empty((0, steps, len(cols)), dtype=np.float32)

    # sliding_window_view yields (n_windows, n_cols, steps) as a read-only view;
    # sub-sample by stride, move the time axis to the middle, then copy so the
    # caller owns a writable, contiguous array.
    view = np.lib.stride_tricks.sliding_window_view(arr, steps, axis=0)[::stride]
    return view.transpose(0, 2, 1).copy()

In [None]:
def compute_delta(mask):
    """Count, per series, the steps elapsed since the last observed value.

    Parameters
    ----------
    mask : array-like of shape (B, T, D)
        Observation mask; entries equal to 1 mark an observed value, anything
        else marks a missing one.

    Returns
    -------
    numpy.ndarray
        float32 array of shape (B, T, D). An entry is 0 where the value is
        observed; otherwise it is the number of consecutive missing steps up to
        and including the current one, counted from the start of the window or
        from the most recent observation.
    """
    mask = np.asarray(mask)
    B, T, D = mask.shape
    observed = mask == 1
    delta = np.zeros((B, T, D), dtype=np.float32)

    # Running gap for every (batch, feature) pair. Only the time axis needs a
    # Python loop; batch and feature dimensions are handled by NumPy.
    gap = np.zeros((B, D), dtype=np.float32)
    for t in range(T):
        gap += 1
        gap[observed[:, t, :]] = 0
        delta[:, t, :] = gap
    return delta

In [None]:
# Build the modelling table: start from the GrodanSens rows and attach, for
# every row, the nearest-in-time row (within one day) of each partner table.
base_2 = dfs_2['GrodanSens'].copy()
base_2 = base_2.dropna(subset=['Date']).sort_values('Date').reset_index(drop=True)
merge_partners = ['GreenhouseClimate', 'Weather', 'CropParameters']
for name in merge_partners:
    # dropna / sort_values / drop each return a new frame, so the cached
    # `dfs_2[name]` is never modified.
    df = (
        dfs_2[name]
        .dropna(subset=['Date'])
        .sort_values('Date')
        .drop(columns=['%time'], errors='ignore')
    )
    try:
        base_2 = pd.merge_asof(base_2, df, on='Date', direction='nearest', tolerance=pd.Timedelta('1D'))
    except ValueError as e:
        # Best effort: a partner that cannot be merged is reported and skipped.
        print(f"[ERROR] Skipped {name} during merge: {e}")

# Energy per Resources row = heat + both electricity tariffs.
resources_raw = dfs_2['Resources'].dropna(subset=['Date']).sort_values('Date').reset_index(drop=True)
# min_count=1: a row where all three readings are missing yields NaN rather
# than a fabricated 0.0, so it is filtered out by the NaN checks downstream.
energy = resources_raw[['Heat_cons', 'ElecHigh', 'ElecLow']].astype(np.float32).sum(axis=1, min_count=1)
# Build the two-column frame directly instead of assigning into a column
# slice, which would trigger pandas' SettingWithCopy warning.
resources_df = pd.DataFrame({'Date': resources_raw['Date'].dt.floor('D'), 'Energy': energy})

# Attach the energy of the matching (or nearest, within one day) calendar day.
base_2['Date'] = base_2['Date'].dt.floor('D')
# After flooring, every row of a day shares the same key. A stable sort keeps
# the existing intra-day row order, which the sliding windows rely on; the
# default sort algorithm does not guarantee that for tied keys.
base_2 = pd.merge_asof(
    base_2.sort_values('Date', kind='stable'),
    resources_df,
    on='Date',
    direction='nearest',
    tolerance=pd.Timedelta('1D'),
)
base_2 = base_2.dropna(subset=['Energy']).reset_index(drop=True)

In [None]:
# Window length: one full day (5 min x 288 = 24 hours).
steps = 288

# --- Sliding-window time series (stride 1) for each sensor group ---
weather_data_2 = reshape_sliding(base_2, weather_cols, steps=steps, stride=1)
indoor_data_2 = reshape_sliding(base_2, indoor_cols, steps=steps, stride=1)
soil_data_2 = reshape_sliding(base_2, soil_cols, steps=steps, stride=1)

# Soil branch: record which readings were observed (1.0) or missing (0.0) and
# how long each gap has lasted, then replace the NaNs in the data itself.
# NOTE(review): only the soil arrays are NaN-filled here; the indoor, weather
# and crop arrays keep any NaNs they contain - confirm the model handles that.
soil_mask_2 = np.logical_not(np.isnan(soil_data_2)).astype(np.float32)
soil_delta_2 = compute_delta(soil_mask_2)
soil_data_2 = np.nan_to_num(soil_data_2)

# --- Crop features: one row per window, taken at the window's last time step ---
crop_data_raw = base_2[crop_cols].astype(np.float32).to_numpy()
crop_data_2 = crop_data_raw[(steps - 1):]

In [None]:
# Label each window with the energy of the calendar day that follows the
# window's last row. Windows without such a value are left out.
window_end_dates = base_2['Date'].iloc[steps - 1:].reset_index(drop=True)

# One energy value per day (first row wins when a day occurs more than once),
# indexed by date so every window is resolved with a single lookup instead of
# scanning the whole resources table once per window.
energy_by_day = resources_df.drop_duplicates(subset='Date', keep='first').set_index('Date')['Energy']

target_days = window_end_dates.iloc[:len(soil_data_2)].dt.floor('D') + pd.Timedelta(days=1)
next_day_energy = target_days.map(energy_by_day).to_numpy(dtype=np.float32)

# A window is labeled when the next day exists in the table and its energy is not NaN.
has_target = ~np.isnan(next_day_energy)
valid_indices = np.flatnonzero(has_target).tolist()
targets = list(next_day_energy[has_target])

In [None]:
# Assemble the labeled dataset from the windows listed in `valid_indices`.
# NOTE(review): this cell overwrites its own inputs (the *_2 arrays and
# `targets`), so running it a second time without re-running the cells above
# does not reproduce the same result.
if not valid_indices:
    print("No labeled windows found for energy prediction.")
else:
    # Keep only the windows that received a target.
    soil_data_2, soil_mask_2, soil_delta_2, indoor_data_2, weather_data_2, crop_data_2 = (
        arr[valid_indices]
        for arr in (soil_data_2, soil_mask_2, soil_delta_2, indoor_data_2, weather_data_2, crop_data_2)
    )

    # Targets become a float32 column vector with one row per window.
    targets = torch.tensor(np.array(targets), dtype=torch.float32).unsqueeze(1)

    # Tensor order: soil data, soil mask, soil delta, indoor, weather, crop, target.
    dataset_2 = TensorDataset(
        *(
            torch.tensor(arr, dtype=torch.float32)
            for arr in (soil_data_2, soil_mask_2, soil_delta_2, indoor_data_2, weather_data_2, crop_data_2)
        ),
        targets,
    )
    dataloader_2 = DataLoader(
        dataset_2,
        batch_size=512,
        shuffle=True,
        num_workers=40,
        pin_memory=True,
    )
    print(f"Dataloader ready: {len(dataset_2)} labeled windows with next-day energy targets")

In [None]:
# Train / validation / test split (80 / 10 / 10).
#
# The windows in `dataset_2` are in chronological order, overlap heavily
# (stride 1 over 288-step windows), and every window ending on the same day
# carries the same next-day target. Assigning windows to splits at random would
# therefore place near-duplicates of the training windows in the validation and
# test sets. The split is made on contiguous, time-ordered blocks instead, which
# is also deterministic across runs.
# NOTE(review): windows right at a block boundary still share rows with their
# neighbours in the adjacent block; drop a margin of `steps` windows at each
# boundary if strict separation is required.
total_len = len(dataset_2)
train_len = int(0.8 * total_len)
val_len = int(0.1 * total_len)
test_len = total_len - train_len - val_len  # remainder absorbs rounding

train = torch.utils.data.Subset(dataset_2, range(0, train_len))
val = torch.utils.data.Subset(dataset_2, range(train_len, train_len + val_len))
test = torch.utils.data.Subset(dataset_2, range(train_len + val_len, total_len))

# DataLoaders: identical settings for all three; only training batches are shuffled.
batch_size = 512
loader_kwargs = dict(batch_size=batch_size, num_workers=40, pin_memory=True, prefetch_factor=4)
train_loader_2 = DataLoader(train, shuffle=True, **loader_kwargs)
val_loader_2 = DataLoader(val, shuffle=False, **loader_kwargs)
test_loader_2 = DataLoader(test, shuffle=False, **loader_kwargs)
print(f"Split sizes — Train: {len(train)}, Val: {len(val)}, Test: {len(test)}")