In [12]:
import math
import sys

from icecream import ic, install

install()

sys.path.append("..")

In [16]:
import logging
import os

import pandas as pd
import torch
import data.loading as loading
from data.dataset import Forcasting_ERA5Dataset
from data.processing import select_data
from model.velocity import get_kernel, get_velocities
from torch.utils.data import DataLoader
from utils.loss import CustomGaussianNLLLoss

variables_time_dependant = ["t2m", "t", "z", "u10", "v10"]
variables_static = ["lsm", "orography"]

gpu_device = torch.device("cpu")  # fallback to cpu
if torch.cuda.is_available():
    gpu_device = torch.device("cuda")
    torch.cuda.empty_cache()
elif torch.backends.mps.is_available():
    gpu_device = torch.device("mps")
    torch.mps.empty_cache()

config = {
    "data_path_wb1": "../data/era5_data/",
    "data_path_wb2": "../data/1959-2023_01_10-6h-64x32_equiangular_conservative.zarr",
    "freq": "6H",
    "periods": {
        "train": ("2006-01-01", "2015-12-31"),
        "val": ("2016-01-01", "2016-12-31"),
        "test": ("2017-01-01", "2018-12-31"),
    },
    "vel": {
        "rbf_alpha": 1.0,
        "stacking": 3,
        "bs": 50,
        "fitting_epoch": 200,
        "regul_coeff": 1e-7,
        "lr": 2,
        "device": gpu_device,
    },
    "model": {
        "emission_model": {
            "in_channels": 9 + 34,  # err_in
            "layers_length": [3, 2, 2],
            "layers_hidden_size": [
                128,
                64,
                2 * 9,
            ],  # 9 = out_types = len(paths_to_data)
        },
        "norm_type": "batch",
        "n_res_blocks": [3, 2, 2],
        "kernel_size": 3,
        "stride": 1,
        "dropout": 0.1,
    },
    "bs": 8,
    "device": gpu_device,
}

if __name__ == "__main__":
    # check the script is executed within the parent directory

    logging.basicConfig(level=logging.INFO)

    periods = {
        k: pd.date_range(*p, freq=config["freq"])
        for (k, p) in config["periods"].items()
    }
    raw_data = loading.wb1(config["data_path_wb1"], periods)
    # data = loading.wb2(config["data_path_wb2"], periods)

    logging.info("Raw data loaded, merged and normalized")
    logging.info("Raw data disk size: {} MiB".format(raw_data.nbytes / 1e6))

    data_selected = select_data(raw_data, periods)

    kernel = get_kernel(raw_data, config["vel"])
    data_velocities = get_velocities(data_selected, kernel, config)

    criterion = CustomGaussianNLLLoss()
    data = torch.cat([t.unsqueeze(-1) for t in data_selected["train"].values()], dim=-1)
    dataset = Forcasting_ERA5Dataset(data)
    train_loader = DataLoader(dataset, batch_size=config["bs"], shuffle=True)


  k: pd.date_range(*p, freq=config["freq"])
INFO:root:Raw data loaded, merged and normalized
INFO:root:Raw data disk size: 777.712696 MiB
INFO:root:Velocities for train loaded from cache
INFO:root:Velocities for val loaded from cache
INFO:root:Velocities for test loaded from cache


In [7]:
for epoch, (x,y) in enumerate(train_loader):
    ic(x.shape)
    ic(y.shape)

    break

ic| x.shape: torch.Size([8, 32, 64, 5])
ic| y.shape: torch.Size([8, 8, 32, 64, 5])


In [5]:
data_selected

{'train': TensorDict(
     fields={
         t2m: Tensor(shape=torch.Size([14605, 32, 64]), device=cpu, dtype=torch.float32, is_shared=False),
         t: Tensor(shape=torch.Size([14605, 32, 64]), device=cpu, dtype=torch.float32, is_shared=False),
         u10: Tensor(shape=torch.Size([14605, 32, 64]), device=cpu, dtype=torch.float32, is_shared=False),
         v10: Tensor(shape=torch.Size([14605, 32, 64]), device=cpu, dtype=torch.float32, is_shared=False),
         z: Tensor(shape=torch.Size([14605, 32, 64]), device=cpu, dtype=torch.float32, is_shared=False)},
     batch_size=torch.Size([14605]),
     device=None,
     is_shared=False),
 'val': TensorDict(
     fields={
         t2m: Tensor(shape=torch.Size([1461, 32, 64]), device=cpu, dtype=torch.float32, is_shared=False),
         t: Tensor(shape=torch.Size([1461, 32, 64]), device=cpu, dtype=torch.float32, is_shared=False),
         u10: Tensor(shape=torch.Size([1461, 32, 64]), device=cpu, dtype=torch.float32, is_shared=False),
    

In [6]:
data_selected["train"].unsqueeze(2)

RuntimeError: unsqueezing is allowed for dims comprised between `-td.batch_dims` and `td.batch_dims` only. Got dim=2 with a batch size of torch.Size([14605]).

In [None]:
torch.cat(tuple(data_selected["train"].unsqueeze(-1).values()), dim=-1).shape

torch.Size([14605, 1, 32, 320])

In [None]:
from torch.utils.data import Dataset


class CustomDataset(Dataset):
    def __init__(self, dataset, nb_timestep=8):
        """From the weather at one time step, we want to predict the weather at the next `nb_timestep` time steps.

        Parameters
        ----------
        dataset : _type_
            The dataset
        nb_timestep : int
            Number of timestep to predict
        """
        # Load and preprocess your data here
        self.data = dataset
        self.nb_timestep = nb_timestep

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        x = self.data[index]
        y = self.data[index + 1 : index + self.nb_timestep + 1]
        return x, y

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]
[2, 3, 4, 5, 6, 7, 8, 9]


In [None]:
from torch.utils.data import DataLoader

train_loader = DataLoader(data_selected["train"], batch_size=config["bs"])
test_loader = DataLoader(data_selected["test"], batch_size=config["bs"])
val_loader = DataLoader(data_selected["val"], batch_size=config["bs"])

In [None]:
lat = torch.tensor(raw_data.coords["lat"].values)
lon = torch.tensor(raw_data.coords["lon"].values)
lsm = torch.tensor(raw_data.lsm.values)
oro = torch.tensor(raw_data.orography.values)
raw_data

Unnamed: 0,Array,Chunk
Bytes,148.30 MiB,11.44 MiB
Shape,"(18983, 32, 64)","(1464, 32, 64)"
Dask graph,13 chunks in 66 graph layers,13 chunks in 66 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 148.30 MiB 11.44 MiB Shape (18983, 32, 64) (1464, 32, 64) Dask graph 13 chunks in 66 graph layers Data type float32 numpy.ndarray",64  32  18983,

Unnamed: 0,Array,Chunk
Bytes,148.30 MiB,11.44 MiB
Shape,"(18983, 32, 64)","(1464, 32, 64)"
Dask graph,13 chunks in 66 graph layers,13 chunks in 66 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,148.30 MiB,11.44 MiB
Shape,"(18983, 32, 64)","(1464, 32, 64)"
Dask graph,13 chunks in 66 graph layers,13 chunks in 66 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 148.30 MiB 11.44 MiB Shape (18983, 32, 64) (1464, 32, 64) Dask graph 13 chunks in 66 graph layers Data type float32 numpy.ndarray",64  32  18983,

Unnamed: 0,Array,Chunk
Bytes,148.30 MiB,11.44 MiB
Shape,"(18983, 32, 64)","(1464, 32, 64)"
Dask graph,13 chunks in 66 graph layers,13 chunks in 66 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,148.30 MiB,11.44 MiB
Shape,"(18983, 32, 64)","(1464, 32, 64)"
Dask graph,13 chunks in 66 graph layers,13 chunks in 66 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 148.30 MiB 11.44 MiB Shape (18983, 32, 64) (1464, 32, 64) Dask graph 13 chunks in 66 graph layers Data type float32 numpy.ndarray",64  32  18983,

Unnamed: 0,Array,Chunk
Bytes,148.30 MiB,11.44 MiB
Shape,"(18983, 32, 64)","(1464, 32, 64)"
Dask graph,13 chunks in 66 graph layers,13 chunks in 66 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,148.30 MiB,11.44 MiB
Shape,"(18983, 32, 64)","(1464, 32, 64)"
Dask graph,13 chunks in 66 graph layers,13 chunks in 66 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 148.30 MiB 11.44 MiB Shape (18983, 32, 64) (1464, 32, 64) Dask graph 13 chunks in 66 graph layers Data type float32 numpy.ndarray",64  32  18983,

Unnamed: 0,Array,Chunk
Bytes,148.30 MiB,11.44 MiB
Shape,"(18983, 32, 64)","(1464, 32, 64)"
Dask graph,13 chunks in 66 graph layers,13 chunks in 66 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,148.30 MiB,11.44 MiB
Shape,"(18983, 32, 64)","(1464, 32, 64)"
Dask graph,13 chunks in 66 graph layers,13 chunks in 66 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 148.30 MiB 11.44 MiB Shape (18983, 32, 64) (1464, 32, 64) Dask graph 13 chunks in 66 graph layers Data type float32 numpy.ndarray",64  32  18983,

Unnamed: 0,Array,Chunk
Bytes,148.30 MiB,11.44 MiB
Shape,"(18983, 32, 64)","(1464, 32, 64)"
Dask graph,13 chunks in 66 graph layers,13 chunks in 66 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [None]:
# TODO
def print_time(t, paper=False):
    day_in_years = t / 24  # 365 or 366
    hours_of_day = t % 24
    day_of_years = t // 24
    (torch.sin(2 * torch.pi * hours_of_day),)  # sin temporal embedding
    (torch.sin(2 * torch.pi * day_of_years / day_in_years),)  # sin seasonal embedding
    print(f"{t%24}ème heure")
    print(f"{t//24}ème jour")
    print(f"Sinus hour: {math.sin(t%24)}")
    print(f"Sinus day: {math.sin(t//24)}")
    if paper:
        t_papier = t % 24
        print("\npapier")
        print(f"{t_papier}ème heure")
        print(f"{t_papier/24}ème jour")
        print(f"Sinus hour: {math.sin(t_papier%24 - math.pi / 2)}")
        print(f"Sinus day: {math.sin(t_papier/24 - math.pi / 2)}")


feb_28 = (31 + 28) * 24 + 3  # 28 feb 3h
mar_1 = (31 + 28 + 1) * 24 + 3  # 1 mars 3h
mar_1_bi = (31 + 29 + 1) * 24 + 3  # 1 mars 3h
print_time(feb_28)
print("1er mars pas bissextile")
print_time(mar_1)
print("1er mars bissextile")
print_time(mar_1_bi)

3ème heure
59ème jour
Sinus hour: 0.1411200080598672
Sinus day: 0.535635744101569
1er mars pas bissextile
3ème heure
60ème jour
Sinus hour: 0.1411200080598672
Sinus day: -0.421174221610232
1er mars bissextile
3ème heure
61ème jour
Sinus hour: 0.1411200080598672
Sinus day: -0.9907585503180235
