In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import math

dataset_path = '/kaggle/input/unipd-deep-learning-2025-challenge-2/train_dataset.csv'

df = pd.read_csv(dataset_path)
feature_cols = [c for c in df.columns if c not in ('station', 'time')]
df_sorted = df.sort_values(['station', 'time'])
values = df_sorted[feature_cols].to_numpy()
n_stations = df['station'].nunique()
n_times = df['time'].nunique()
n_features = len(feature_cols)
dataset_tensor = values.reshape(n_stations, n_times, n_features)
input_window = 60
forecast_horizon = 30
train_split = 0.9999
seed = 42
torch.manual_seed(seed)
train_tensor = dataset_tensor[:, :-input_window]
test_tensor = dataset_tensor[:, -input_window:]

In [2]:
import numpy as np
from scipy.stats import skew
from sklearn.preprocessing import FunctionTransformer
import pandas as pd
from scipy import stats
import sklearn.preprocessing as sk 
from scipy import stats
import sklearn.preprocessing as sk

def transform_weather_data(data_tensor, forward=True, method='box-cox', standardize=True, transformers=None, skew_threshold=1.0):
    transformed_data = data_tensor.copy()
    n_stations, n_days, n_features = transformed_data.shape
    if forward:
        if transformers is not None:
            for i in range(n_features):
                transformer = transformers[i]
                if transformer is None:
                    raise ValueError(f'A provided transformer is missing for feature {i + 1}')
                feature_data = transformed_data[:, :, i].reshape(-1, 1)
                transformed_data[:, :, i] = transformer.transform(feature_data).reshape(n_stations, n_days)
        else:
            transformers = [None] * n_features
            for i in range(n_features):
                feature_data = transformed_data[:, :, i].reshape(-1, 1)
                skew = stats.skew(feature_data.flatten())
                if abs(skew) > skew_threshold:
                    if standardize:
                        transformer = sk.StandardScaler()
                    else:
                        transformer = sk.FunctionTransformer()
                else:
                    # If skew is below threshold, skip power transforms and just standardize or passthrough
                    if standardize:
                        transformer = sk.StandardScaler()
                    else:
                        transformer = sk.FunctionTransformer()

                transformed_data[:, :, i] = transformer.fit_transform(feature_data).reshape(n_stations, n_days)
                transformers[i] = transformer

        return transformed_data, transformers
    else:
        if transformers is None:
            raise ValueError("For backpass transformers must be provided")
        for i in range(n_features):
            feature_data = transformed_data[:, :, i].reshape(-1, 1)
            transformer = transformers[i]
            if transformer is None:
                raise ValueError(f"Transformer missing for feature {i + 1}")
            transformed_data[:, :, i] = transformer.inverse_transform(feature_data).reshape(n_stations, n_days)

        return transformed_data, transformers

transformed_train_tensor, transformers = transform_weather_data(train_tensor, forward=True, method='none', standardize=True, skew_threshold=0)
transformed_test_tensor, _ = transform_weather_data(test_tensor, forward=True, transformers=transformers)

In [3]:
import torch
from torch.utils.data import Dataset, DataLoader

class DatasetW(Dataset):
    def __init__(self, data, train=False, input_window=30, forecast_horizon=30):
        self.data = data
        self.input_window = input_window
        self.forecast_horizon = forecast_horizon
        self.samples = []

        num_stations, num_days, _ = data.shape

        for station in range(num_stations):
            if train:
                windows = range(num_days - input_window - forecast_horizon + 1)
            else:
                windows = range(num_days - input_window + 1)
            for start in windows:
                station_feature = torch.full((input_window, 1), station, dtype=torch.float32)
                day_feature = torch.arange(start, start+input_window, dtype=torch.float32).reshape((input_window, 1))
                weather_features = torch.tensor(data[station, start:start + input_window], dtype=torch.float32)
                input_seq = torch.cat([station_feature, day_feature, weather_features], dim=1)

                if train:
                    station_feature_target = torch.full((forecast_horizon, 1), station, dtype=torch.float32)
                    day_feature_target = torch.arange(start+input_window, start+input_window+forecast_horizon, dtype=torch.float32).reshape((forecast_horizon, 1))
                    weather_features_target = torch.tensor(data[station, start + input_window:start + input_window + forecast_horizon], dtype=torch.float32)
                    target_seq = torch.cat([station_feature_target, day_feature_target, weather_features_target], dim=1)
                
                    self.samples.append((input_seq, target_seq))
                else:
                    self.samples.append(input_seq)

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return self.samples[idx]

In [4]:
from torch.utils.data import DataLoader, random_split

batch_size = 1024

train_dataset, val_dataset = random_split(
  DatasetW(transformed_train_tensor, train=True, input_window=input_window, forecast_horizon=forecast_horizon),
  lengths=[train_split, 1 - train_split]
)
test_dataset = DatasetW(transformed_test_tensor, train=False, input_window=input_window)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

print(f'len(train_loader) = {len(train_loader)}')
print(f'len(val_loader) = {len(val_loader)}')
print(f'len(test_loader) = {len(test_loader)}')

len(train_loader) = 225
len(val_loader) = 1
len(test_loader) = 422


In [5]:
def mase_loss(y_pred, y_true, y_history):
    mae_model = torch.mean(torch.abs(y_true - y_pred))
    last_observed = y_history[:, -1:, :]  # (batch, 1, input_dim)
    naive_forecast = last_observed.repeat(1, y_true.shape[1], 1)  # (not repeat batch, forecast_horizon, not repeat input_dim)
    mae_naive = torch.mean(torch.abs(y_true - naive_forecast))
    return mae_model / (mae_naive + 1e-8)  # avoid division by zero

In [6]:
import torch
import torch.nn as nn
import math

class LSTM(nn.Module):
    def __init__(self,weather_features,num_stations,station_embed_dim,hidden_size,num_layers,dropout,forecast_horizon,):
        super().__init__()

        self.weather_features = weather_features
        self.station_embed_dim = station_embed_dim
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout_p = dropout
        self.forecast_horizon = forecast_horizon
        self.station_embedding = nn.Embedding(num_stations, station_embed_dim)
        self.input_size = station_embed_dim + 2 + weather_features
        self.lstm_cells = nn.ModuleList()
        for layer_idx in range(num_layers):
            input_dim = self.input_size if layer_idx == 0 else hidden_size
            cell = nn.LSTMCell(input_size=input_dim, hidden_size=hidden_size)
            self.lstm_cells.append(cell)
        self.dropout = nn.Dropout(self.dropout_p)
        self.output_projection = nn.Linear(hidden_size, weather_features)

    def _encode(self, station_ids: torch.LongTensor, day_values: torch.Tensor, weather_feats: torch.Tensor):
        station_embeds = self.station_embedding(station_ids)
        day_vals = day_values.float()
        day_rad = 2 * math.pi * day_vals / 365.25
        day_sin = torch.sin(day_rad).unsqueeze(1)
        day_cos = torch.cos(day_rad).unsqueeze(1)
        cyclical_day = torch.cat([day_sin, day_cos], dim=1)
        return torch.cat([station_embeds, cyclical_day, weather_feats], dim=1)

    def forward(self, src: torch.Tensor):
        batch_size, seq_len, input_dim = src.shape
        device = src.device
        station_ids = src[:, 0, 0].long().to(device)
        day_seq = src[:, :, 1].to(device)
        weather_seq = src[:, :, 2:].to(device)
        h_states = [
            torch.zeros(batch_size, self.hidden_size, device=device)
            for _ in range(self.num_layers)
        ]
        c_states = [
            torch.zeros(batch_size, self.hidden_size, device=device)
            for _ in range(self.num_layers)
        ]

        for t in range(seq_len):
            day_t = day_seq[:, t]
            weather_t = weather_seq[:, t, :]
            step_input = self._encode(station_ids, day_t, weather_t)
            for layer_idx, cell in enumerate(self.lstm_cells):
                h_prev, c_prev = h_states[layer_idx], c_states[layer_idx]
                if layer_idx == 0:
                    inp = step_input
                else:
                    inp = self.dropout(h_states[layer_idx - 1])
                h_new, c_new = cell(inp, (h_prev, c_prev))
                h_states[layer_idx], c_states[layer_idx] = h_new, c_new

        predictions = []
        last_day = day_seq[:, -1]
        last_weather = weather_seq[:, -1, :]

        for step in range(self.forecast_horizon):
            next_day = last_day + 1.0 + step
            step_inp = self._encode(station_ids, next_day, last_weather)
            for layer_idx, cell in enumerate(self.lstm_cells):
                h_prev, c_prev = h_states[layer_idx], c_states[layer_idx]
                if layer_idx == 0:
                    inp = step_inp
                else:
                    inp = self.dropout(h_states[layer_idx - 1])
                h_new, c_new = cell(inp, (h_prev, c_prev))
                h_states[layer_idx], c_states[layer_idx] = h_new, c_new
            top_h = h_states[-1]
            pred_weather = self.output_projection(self.dropout(top_h)).unsqueeze(1)
            predictions.append(pred_weather)
            last_weather = pred_weather.squeeze(1)

        return torch.cat(predictions, dim=1)

In [7]:
import os
import math
import torch
import torch.nn as nn
from torch.optim.lr_scheduler import LambdaLR
from torch.nn.utils import clip_grad_norm_ as clip_grad_norm

def lr_lambda(current_step: int, warmup_steps: int, total_steps: int):
    if current_step < warmup_steps:
        return float(current_step) / float(max(1, warmup_steps))
    progress = float(current_step - warmup_steps) / float(max(1, total_steps - warmup_steps))
    return 0.5 * (1.0 + math.cos(math.pi * progress))

def train_model(model: nn.Module,train_loader: torch.utils.data.DataLoader,val_loader: torch.utils.data.DataLoader,model_type: str = 'unk',num_epochs: int = 100,lr: float = 1e-4,warmup_steps: int = 1000,weight_decay: float = 1e-4,patience: int = 5,grad_clip: float = 1.3,device: str = 'cuda',save_path: str = 'models/'):

    model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

    total_steps = len(train_loader) * num_epochs
    scheduler = LambdaLR(
        optimizer=optimizer,
        lr_lambda=lambda step: lr_lambda(step, warmup_steps, total_steps)
    )

    criterion = nn.L1Loss()
    os.makedirs(save_path, exist_ok=True)

    best_val_loss = float("inf")
    epochs_no_improve = 0

    for epoch in range(1, num_epochs + 1):
        model.train()
        total_train_loss = 0.0
        total_train_mase = 0.0

        for batch_idx, (src, tgt) in enumerate(train_loader, start=1):
            src = src.to(device)
            tgt = tgt.to(device)

            optimizer.zero_grad()
            output = model(src)

            loss = criterion(output, tgt[:, :, 2:])
            loss.backward()

            clip_grad_norm(model.parameters(), max_norm=grad_clip)
            optimizer.step()
            scheduler.step()

            total_train_loss += loss.item()
            total_train_mase += mase_loss(output, tgt[:, :, 2:], src[:, :, 2:]).item()

            if batch_idx % 300 == 0:
                print(
                    f"[Epoch {epoch:02d} | Batch {batch_idx:04d}] "
                    f"MAE: {loss.item():.4f} | MASE: {total_train_mase / batch_idx:.4f}"
                )

        mean_train_loss = total_train_loss / len(train_loader)
        mean_train_mase = total_train_mase / len(train_loader)

        model.eval()
        total_val_loss = 0.0
        total_val_mase = 0.0

        with torch.no_grad():
            for src, tgt in val_loader:
                src = src.to(device)
                tgt = tgt.to(device)
                pred = model(src)

                val_loss = criterion(pred, tgt[:, :, 2:]).item()
                val_mase = mase_loss(pred, tgt[:, :, 2:], src[:, :, 2:]).item()

                total_val_loss += val_loss
                total_val_mase += val_mase

        mean_val_loss = total_val_loss / len(val_loader)
        mean_val_mase = total_val_mase / len(val_loader)

        print(
            f"Epoch {epoch:02d}/{num_epochs} | "
            f"Train MAE: {mean_train_loss:.4f} | Train MASE: {mean_train_mase:.4f} | "
            f"Val   MAE: {mean_val_loss:.4f} | Val   MASE: {mean_val_mase:.4f}"
        )

        if mean_train_loss < best_val_loss - 0.004:
            best_val_loss = mean_train_loss
            epochs_no_improve = 0
            checkpoint_path = os.path.join(save_path, f"epoch_{epoch:02d}.pt")
            torch.save(model.state_dict(), checkpoint_path)
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= patience:
                print(f"\nEarly stopping at epoch {epoch} (no improvement for {patience} epochs).")
                break
        print(epochs_no_improve)
    print(f"\nBest validation MAE = {best_val_loss:.4f}")


In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'device={device}')
model = LSTM(
    weather_features=76, 
    num_stations=422,
    station_embed_dim=12,      
    hidden_size=48,           
    num_layers=2,
    dropout=0.18,
    forecast_horizon=forecast_horizon
)
train_model(
    model,
	train_loader, val_loader,
    num_epochs=120, lr=1e-4, weight_decay=5e-2, patience=10, warmup_steps=500,
    device=device,
    save_path='models/'
)

device=cuda
Epoch 01/120 | Train MAE: 0.5840 | Train MASE: 0.9629 | Val   MAE: 0.4770 | Val   MASE: 0.9489
0
Epoch 02/120 | Train MAE: 0.5194 | Train MASE: 0.8564 | Val   MAE: 0.3741 | Val   MASE: 0.7441
0
Epoch 03/120 | Train MAE: 0.4768 | Train MASE: 0.7860 | Val   MAE: 0.3514 | Val   MASE: 0.6990
0
Epoch 04/120 | Train MAE: 0.4557 | Train MASE: 0.7512 | Val   MAE: 0.3369 | Val   MASE: 0.6701
0
Epoch 05/120 | Train MAE: 0.4409 | Train MASE: 0.7269 | Val   MAE: 0.3313 | Val   MASE: 0.6590
0
Epoch 06/120 | Train MAE: 0.4327 | Train MASE: 0.7135 | Val   MAE: 0.3286 | Val   MASE: 0.6537
0
Epoch 07/120 | Train MAE: 0.4274 | Train MASE: 0.7046 | Val   MAE: 0.3252 | Val   MASE: 0.6469
0
Epoch 08/120 | Train MAE: 0.4235 | Train MASE: 0.6983 | Val   MAE: 0.3212 | Val   MASE: 0.6389
1
Epoch 09/120 | Train MAE: 0.4206 | Train MASE: 0.6935 | Val   MAE: 0.3201 | Val   MASE: 0.6367
0
Epoch 10/120 | Train MAE: 0.4184 | Train MASE: 0.6897 | Val   MAE: 0.3190 | Val   MASE: 0.6346
1
Epoch 11/120 | Tra

In [9]:
import torch
import os
import glob
import pandas as pd

models_path = 'models'
model_name = sorted(glob.glob(os.path.join(models_path, '*.pt')), reverse = True)[0]
print(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if model_name is not None:
  model.load_state_dict(torch.load(model_name, weights_only=True))
model.eval()
model.to(device)

results_df = pd.DataFrame(columns=df.columns, index=None)
for station, station_seq in enumerate(test_loader):
  pred = model(station_seq.to(device))
  pred, _ = transform_weather_data(pred.detach().cpu().numpy(), forward=False, transformers=transformers)
  pred = pred[0]
  station_df = pd.DataFrame(pred, columns=results_df.columns[2:])
  station_df.insert(0, 'station', [station]*pred.shape[0])
  station_df.insert(1, 'time', range(df['time'].max(), df['time'].max() + pred.shape[0]))
  results_df = pd.concat([results_df, station_df], ignore_index=True)

print(results_df)
results = results_df.copy()
results['time'] -= results['time'].min()
results.insert(0, 'id', [ f'{station}_{time}' for station, time in zip(results['station'], results['time'])])
results.drop(['station', 'time'], axis=1, inplace=True)
print(results)

results.to_csv('submission.csv', index=False)

models/epoch_42.pt
      station time      var1       var2       var3       var4       var5  \
0           0  694  0.056131  32.286209  27.013058  22.337292  34.203274   
1           0  695  0.035928  32.836243  27.390656  22.546234  31.375746   
2           0  696  0.023281  33.286217  27.721127  22.763315  29.390656   
3           0  697  0.017860  33.656418  28.002705  22.971306  28.194546   
4           0  698  0.016914  33.947033  28.222969  23.146307  27.561415   
...       ...  ...       ...        ...        ...        ...        ...   
12655     421  719  0.098710  18.771652  13.812306   9.137703  30.071703   
12656     421  720  0.098034  18.765875  13.801530   9.117914  29.952831   
12657     421  721  0.097300  18.761576  13.791638   9.098948  29.827784   
12658     421  722  0.096521  18.758709  13.782686   9.080891  29.698051   
12659     421  723  0.095708  18.757233  13.774724   9.063821  29.564892   

            var6      var7      var8  ...     var67     var68     va