# Pangu mini model demo
---
``python version = 3.11``

> [08/02/2024]: En este notebook se intenta probar el modelo [Pangu Weather mini](https://github.com/rudolfmard/Pangu-Weather-mini/tree/main).

In [1]:
# !pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
# !pip install timm
# !pip install pygrib # En linux
# !conda install conda-forge::pygrib # En windows use -y flag
# !pip install chardet
# !pip install charset-normalizer

In [2]:
import sys
import os
import importlib
# Make the parent of the current working directory importable so that local
# project modules can be imported by later cells. The membership check keeps
# the cell idempotent: re-running it no longer appends duplicate entries.
_parent_dir = os.path.dirname(os.getcwd())
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

In [3]:
import xarray as xr
import torch

In [4]:
from trainer import Trainer
from model import WeatherModel
import data_handler as dh
import torch
from torch.distributed import init_process_group, destroy_process_group

In [5]:
# This should be used to configure GPU usage.
# NOTE: the triple-quoted blocks in this cell are bare string expressions that
# keep the code disabled; nothing inside them is executed.
"""
if execution_mode == "single_gpu":
    print("Training on single GPU.")
elif execution_mode == "multi_gpu":
    print("Training on multiple GPUs.")
    init_process_group(backend="nccl")
else:
    raise ValueError("Invalid execution mode. Valid values are 'single_gpu' or 'multi_gpu'")"""

# This environment variable tells PyTorch CUDA allocator not to split memory blocks larger than certain size.
# Mitigates GPU memory fragmentation and allows the training of the full original model to fit onto one GPU.
"""os.environ["PYTORCH_CUDA_ALLOC_CONF"] = 'max_split_size_mb:1024'"""

# Path to checkpoint.pt to continue training from that checkpoint,
# if checkpoint.pt does not exist training starts from scratch.
# Being the last expression of the cell, the string below is what the notebook
# echoes as the cell output; it does not define checkpoint_path.
"""checkpoint_path="checkpoint.pt" """

'checkpoint_path="checkpoint.pt" '

In [6]:

# Hyper-parameters for training and for the model architecture. The two
# triple-quoted blocks below are bare strings used as inline documentation.
"""
Training parameters:
learning_rate (float):   Learning rate of the training, 5e-4 in original Pangu-Weather.
max_epochs (int):        Maximum number of epochs for training, 100 in original Pangu-Weather.
save_every (int):        Saves a checkpoint every save_every epoch.
batch_size (int):        Batch size of the training data, 1 in original Pangu-Weather.
"""
learning_rate = 5e-4
# 10 epochs here, versus the 100 quoted above for the original model.
max_epochs = 10
save_every = 2
batch_size = 1

"""
Model parameters:
C (int):                Dimensionality of patch embedding of the tokens. 192 in original Pangu-Weather. Make sure C is divisible by n_heads.
depth (list[int]):      List with length of 4, defines the number of transformer blocks in each 4 EarthSpecificLayers. [2,6,6,2] in original Pangu-Weather.
n_heads (list[int]):    List with length of 4, defines the number of heads in transformer blocks of each 4 EarthSpecificLayers. [6, 12, 12, 6] in original Pangu-Weather.
D (int):                Dimensionality multiplier of hidden layer in transformer MLP. 4 in original Pangu-Weather.
"""
C = 192
depth = [2, 6, 6, 2]
n_heads = [6, 12, 12, 6]
D = 4

# Create a model object:
# batch_size is passed to the constructor as well, so changing it above
# requires re-creating the model.
# NOTE(review): log_GPU_mem=False presumably disables GPU memory logging -- confirm in model.py.
model = WeatherModel(C, depth, n_heads, D, batch_size, log_GPU_mem=False)

nW= 832
torch.Size([8, 16, 13])
mask_windows= torch.Size([8, 1, 8, 2, 13, 1])
nW= 832
torch.Size([8, 16, 13])
mask_windows= torch.Size([8, 1, 8, 2, 13, 1])
nW= 288
torch.Size([8, 8, 9])
mask_windows= torch.Size([8, 1, 4, 2, 9, 1])
nW= 288
torch.Size([8, 8, 9])
mask_windows= torch.Size([8, 1, 4, 2, 9, 1])
nW= 288
torch.Size([8, 8, 9])
mask_windows= torch.Size([8, 1, 4, 2, 9, 1])
nW= 288
torch.Size([8, 8, 9])
mask_windows= torch.Size([8, 1, 4, 2, 9, 1])
nW= 288
torch.Size([8, 8, 9])
mask_windows= torch.Size([8, 1, 4, 2, 9, 1])
nW= 288
torch.Size([8, 8, 9])
mask_windows= torch.Size([8, 1, 4, 2, 9, 1])
nW= 288
torch.Size([8, 8, 9])
mask_windows= torch.Size([8, 1, 4, 2, 9, 1])
nW= 288
torch.Size([8, 8, 9])
mask_windows= torch.Size([8, 1, 4, 2, 9, 1])
nW= 288
torch.Size([8, 8, 9])
mask_windows= torch.Size([8, 1, 4, 2, 9, 1])
nW= 288
torch.Size([8, 8, 9])
mask_windows= torch.Size([8, 1, 4, 2, 9, 1])
nW= 288
torch.Size([8, 8, 9])
mask_windows= torch.Size([8, 1, 4, 2, 9, 1])
nW= 288
torch.Size(

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


[09/02/2024]
`C:\Users\gcuervo\miniconda3\envs\art1_pyenv\Lib\site-packages\timm\layers\drop.py`

> **line 170:** 
> ```python
> return f'drop_prob={self.drop_prob}'  ====>  return f'drop_prob={round(self.drop_prob.item(),3):0.3f}'
> ```
> fix: ``.item()`` method added

In [7]:
# Display the repr of the instantiated model to inspect its layer structure.
model

WeatherModel(
  (input_layer): PatchEmbedding(
    (conv_air): Conv3d(5, 192, kernel_size=(2, 4, 4), stride=(2, 4, 4))
    (conv_surface): Conv2d(7, 192, kernel_size=(4, 4), stride=(4, 4))
  )
  (layer1): EarthSpecificLayer(
    (blocks): ModuleList(
      (0): EarthSpecificBlock(
        (attention): EarthAttention3D(
          (linear_qkv): Linear(in_features=192, out_features=576, bias=True)
          (linear): Linear(in_features=192, out_features=192, bias=True)
          (dropout): Dropout(p=0, inplace=False)
        )
        (norm1): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
        (feedforward): MLP(
          (linear1): Linear(in_features=192, out_features=768, bias=True)
          (linear2): Linear(in_features=768, out_features=192, bias=True)
          (drop): Dropout(p=0, inplace=False)
        )
        (norm2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
        (drop_path): DropPath(drop_prob=0.000)
      )
      (1): EarthSpecificBlock(
        (a

In [8]:
# ERA5_air = xr.load_dataset(
#    r"C:\Users\gcuervo\OneDrive - Universidad de Las Palmas de Gran Canaria\Documents\Doctorado\DB\2010-2023_01_10-6h-64x32_vars_filtered.nc")
# ERA5_air

In [9]:
# i_train = round(ERA5_air.time.values.size * 0.7)
# ERA5_air_train = ERA5_air.isel(time=slice(0, i_train))
# ERA5_air_val = ERA5_air.isel(time=slice(i_train, -1))

In [10]:
# ERA5_surf = xr.load_dataset(
#    r"C:\Users\gcuervo\OneDrive - Universidad de Las Palmas de Gran Canaria\Documents\Doctorado\DB\2010-2023_01_10-6h-64x32_surfvars_filtered.nc")
# ERA5_surf

In [11]:
# i_train = round(ERA5_surf.time.values.size * 0.7)
# ERA5_surf_train = ERA5_surf.isel(time=slice(0, i_train))
# ERA5_surf_val = ERA5_surf.isel(time=slice(i_train, -1))

In [12]:
# test

In [13]:
# test = ERA5_air_train.chunk({'time': 14})  # .expand_dims('batch')
# test

In [14]:
# test.chunk.__sizeof__()

In [15]:
# batch_coords = xr.DataArray(
#    range(test.chunk.__sizeof__()), dims=('batch',), name='batch')
# batch_coords

In [16]:
# test.coords['batch'] = batch_coords
# test.dims.mapping

In [17]:
# test.value

In [18]:
# xr.DataArray(range(len(test.batch)), dims=('batch',), name='batch')

In [19]:
# hunk_coords = xr.DataArray(
#    range(len(ERA5_air_train.time)), dims=('chunk',), name='chunk')

In [20]:
# test.as_numpy()

In [21]:
# = list(range(len(ERA5_air_train.batch)))

In [22]:
# Convertir el objeto xarray a un tensor de PyTorch
# ERA5_air_train_tensor = torch.from_numpy(ERA5_air_train.values)
#
# Guardar el tensor en un archivo .pt
# torch.save(ERA5_air_train_tensor,
#           r"C:\Users\gcuervo\OneDrive - Universidad de Las Palmas de Gran Canaria\Documents\Doctorado\DB\Pangu_data\ERA5_air_train.pt")

## Descarga de datos demo
---
La descarga de los datos se realiza utilizando el API  [Climate Data Store (CDS) Application Program Interface (API)](https://cds.climate.copernicus.eu/api-how-to). Para instalar el API en Windows se siguieron las instrucciones de [Use CDS API on Windows](https://confluence.ecmwf.int/display/CKB/How+to+install+and+use+CDS+API+on+Windows). Es necesario abrir una cuenta en ``cds.climate.copernicus.eu``.

Una vez instalado y configurado el CDS API se debe crear un archivo de credenciales en la ruta: ``%USERPROFILE%\.cdsapirc``; en un entorno Windows, ``%USERPROFILE%`` suele estar en la carpeta ``C:\Users\Username``. Dicho archivo tiene que contener las siguientes credenciales:

```Bash
url: https://cds.climate.copernicus.eu/api/v2
key: <UID>:<API-KEY>
```

Posteriormente para definir el dataset junto con los dominios espaciales y temporales se utiliza el script: `..\models\PanguWeather_mini_model\utils\load_data_from_CDS.py`

Al ejecutar este script la petición al servidor queda en una cola de espera. Esta petición se puede monitorear desde la cuenta personal ``cds.climate.copernicus.eu`` en [your requests](https://cds.climate.copernicus.eu/cdsapp#!/yourrequests).

Al ingresar a este panel de control se visualizará la petición de descarga y el tiempo transcurrido desde la petición. Para el primer dataset se tardó ``2.5 H``

![alt text](image.png)

In [23]:
# Directory that holds the demo data files used by the following cells.
# Set the PANGU_DATA_DIR environment variable to point the notebook at another
# location without editing this cell; when it is unset the original path is
# used unchanged. Later cells build paths with `data_dir + name`, so the value
# must end with a path separator: os.path.join(..., "") guarantees that for
# the override.
_data_dir_override = os.environ.get("PANGU_DATA_DIR")
data_dir = (
    os.path.join(_data_dir_override, "")
    if _data_dir_override
    else "C:\\Users\\gcuervo\\OneDrive - Universidad de Las Palmas de Gran Canaria\\Documents\\Doctorado\\DB\\Pangu_data\\"
)

In [24]:
# Path built by plain string concatenation; this only works because data_dir
# ends with a path separator.
data_dir + "air_test_validation.grib"

'C:\\Users\\gcuervo\\OneDrive - Universidad de Las Palmas de Gran Canaria\\Documents\\Doctorado\\DB\\Pangu_data\\air_test_validation.grib'

In [25]:
# Same path built with os.path.join, shown for comparison with the
# concatenated form.
os.path.join(data_dir, "air_test_validation.grib")

'C:\\Users\\gcuervo\\OneDrive - Universidad de Las Palmas de Gran Canaria\\Documents\\Doctorado\\DB\\Pangu_data\\air_test_validation.grib'

In [26]:
file = "air_test_validation.grib"
# When the .grib file is already present in data_dir only a message is
# printed; otherwise the file name and directory are handed to the
# data_handler converter.
# NOTE(review): the guard tests for the .grib input, while a later cell loads
# "air_test_validation.pt" -- confirm whether the .pt file is what should be
# checked here.
if os.path.exists(os.path.join(data_dir, file)):
    print(f"El archivo {file} ya existe en el directorio {data_dir}. No es necesario crearlo.")
else:
    dh.air_grib_to_tensor(file, data_dir)

El archivo air_test_validation.grib ya existe en el directorio C:\Users\gcuervo\OneDrive - Universidad de Las Palmas de Gran Canaria\Documents\Doctorado\DB\Pangu_data\. No es necesario crearlo.


In [27]:
file = "surface_test_validation.grib"
# When the .grib file is already present in data_dir only a message is
# printed; otherwise the file name and directory are handed to the
# data_handler converter.
# NOTE(review): this passes the *surface* file to dh.air_grib_to_tensor, which
# looks like a copy-paste from the air cell -- confirm whether data_handler
# offers a surface-specific converter that should be called instead.
if os.path.exists(os.path.join(data_dir, file)):
    print(f"El archivo {file} ya existe en el directorio {data_dir}. No es necesario crearlo.")
else:
    dh.air_grib_to_tensor(file, data_dir)

El archivo surface_test_validation.grib ya existe en el directorio C:\Users\gcuervo\OneDrive - Universidad de Las Palmas de Gran Canaria\Documents\Doctorado\DB\Pangu_data\. No es necesario crearlo.


In [28]:
# Load the saved air and surface tensors from data_dir for inspection.
# Paths are built by concatenation, so data_dir must end with a separator.
air_data = torch.load(data_dir + "air_test_validation.pt")
surface_data = torch.load(data_dir + "surface_test_validation.pt")

In [29]:
# Dimensions of the loaded air tensor.
air_data.shape

torch.Size([28, 13, 1440, 721, 5])

In [30]:
# Dimensions of the loaded surface tensor. In the recorded run it is 5-D with
# a singleton second axis, not the 4-D layout described in
# calculate_statistics.
surface_data.shape

torch.Size([28, 1, 1440, 721, 4])

In [31]:
def calculate_statistics(air_data_path, surface_data_path):
    """Compute per-variable mean and standard deviation and save them to disk.

    Both tensors are loaded with ``torch.load``. The statistics are reduced
    over the time and the two horizontal axes and written as a dictionary to
    ``statistics.pt`` in the directory of ``air_data_path``.

    Args:
        air_data_path: Path to the air tensor, laid out as
            (Hour, Z, H, W, C).
        surface_data_path: Path to the surface tensor, laid out as
            (Hour, H, W, C). A 5-D tensor with a singleton second axis,
            (Hour, 1, H, W, C), is also accepted and reduced to 4-D first.

    Returns:
        None. The dictionary with keys ``AIR_MEAN``, ``AIR_SD``,
        ``SURFACE_MEAN`` and ``SURFACE_SD`` is saved to ``statistics.pt``.
    """
    air_data = torch.load(air_data_path)
    surface_data = torch.load(surface_data_path)

    # A surface tensor stored with an extra singleton level axis would make
    # dim=(0, 1, 2) reduce (Hour, level, H) and leave W unreduced, producing
    # per-column instead of per-variable statistics. Drop that axis so the
    # reduction below always runs over (Hour, H, W).
    if surface_data.dim() == 5 and surface_data.shape[1] == 1:
        surface_data = surface_data.squeeze(1)

    # Mean and standard deviation; keepdim=True keeps the reduced axes as
    # size-1 dimensions.
    statistics = {
        "AIR_MEAN": air_data.mean(dim=(0, 2, 3), keepdim=True),
        "AIR_SD": air_data.std(dim=(0, 2, 3), keepdim=True),
        "SURFACE_MEAN": surface_data.mean(dim=(0, 1, 2), keepdim=True),
        "SURFACE_SD": surface_data.std(dim=(0, 1, 2), keepdim=True),
    }

    # os.path.join keeps the file next to the air data and falls back to the
    # current directory when air_data_path has no directory component
    # (string concatenation with "/" would have targeted the filesystem root).
    statistics_path = os.path.join(os.path.dirname(air_data_path), "statistics.pt")
    torch.save(statistics, statistics_path)
    print("Training data statistics saved at statistics.pt")

In [32]:
# Define paths to air and surface training data files:
# "../../weather_data/air_test.pt"
air_data_path = data_dir + "air_test_validation.pt"
# "../../weather_data/surface_test.pt"
surface_data_path = data_dir + "surface_test_validation.pt"

# Call calculate_statistics function from the data_handler file:
calculate_statistics(air_data_path, surface_data_path)

Training data statistics saved at statistics.pt


[13/02/2024]

Para correr los data loaders es necesario cambiar la ruta del archivo ``statistics.pt`` en la función ``normalize_data()`` del modulo `.\PanguWeather_mini_model\data_handler.py`:

>**Linea 143:**
>
> ```Python
> statistics_path = "statistics.pt" ===> dir_path = "C:\\Users\\gcuervo\\OneDrive - Universidad de Las Palmas de Gran Canaria\\Documents\\Doctorado\\DB\\Pangu_data\\"
> statistics_path = dir_path + "statistics.pt"
> ```

[13/02/2024]

Para correr los data loaders y que no haya problema con la normalización de los datos de superficie hacer un ``.squeeze()`` del tensor. Por tanto, se modifica la clase ``WeatherDataset`` del modulo `.\PanguWeather_mini_model\data_handler.py`:

>**Linea 169:**
>
> ```Python
> surface_data = normalize_data(torch.load(surface_data_path)) ===> surface_data = normalize_data(torch.load(surface_data_path).squeeze()).squeeze()
> ```
> fix: ``.squeeze()`` method added

In [33]:
import importlib
# Reload the data_handler module so that edits made to data_handler.py on
# disk are picked up without restarting the kernel.
importlib.reload(dh)

<module 'data_handler' from 'c:\\Users\\gcuervo\\OneDrive - Universidad de Las Palmas de Gran Canaria\\Documents\\Doctorado\\PhD_repo\\models\\PanguWeather_mini_model\\data_handler.py'>

In [34]:
# Execution mode handed to the dataloader helper and, later, to the trainer.
execution_mode = "single_gpu"

# Training dataset and dataloader (lead_time=1).
train_dataset = dh.WeatherDataset(
    lead_time=1,
    air_data_path=data_dir + "air_test_validation.pt",
    surface_data_path=data_dir + "surface_test_validation.pt",
)
train_dataloader = dh.prepare_dataloader(train_dataset, batch_size, execution_mode)

# Validation dataset and dataloader. If validation_dataloader is set to None,
# no validation is performed between epochs.
# NOTE(review): both datasets read the same two files, so validation runs on
# the training data -- confirm this is intended for the demo.
validation_dataset = dh.WeatherDataset(
    lead_time=1,
    air_data_path=data_dir + "air_test_validation.pt",
    surface_data_path=data_dir + "surface_test_validation.pt",
)
validation_dataloader = dh.prepare_dataloader(validation_dataset, batch_size, execution_mode)

In [35]:
# Pull the first batch from the training dataloader for inspection.
b1 = next(iter(train_dataloader))

In [36]:
# Number of top-level items in the batch.
len(b1)

2

In [37]:
# Number of entries inside the first item of the batch.
len(b1[0])

2

In [38]:
# Shape of the first tensor inside the first item of the batch.
b1[0][0].shape

torch.Size([1, 13, 64, 32, 5])

In [39]:
# Memory footprint of that tensor in decimal gigabytes:
# bytes per element * number of elements / 1000**3.
b1[0][0].element_size() * b1[0][0].numel() / 1000**3

0.00053248

In [40]:

# Create the optimizer and loss function objects:
# Adam over all model parameters with the configured learning rate and a
# weight decay of 3e-6; the loss is the L1 (mean absolute error) loss.
optimizer = torch.optim.Adam(
    model.parameters(), lr=learning_rate, weight_decay=3e-6)
loss_fn = torch.nn.L1Loss()

 [14/02/2024]
 
 Para hacer un recorte en la latitud y longitud para tener matrices que no desborden la GPU
 data_handler.py: 170
```Python        
# self.x_air = air_data[:-lead_time]
self.x_air = air_data[:-lead_time, :, :64, :32, :]
# self.x_surface = surface_data[:-lead_time]
self.x_surface = surface_data[:-lead_time, :64, :32, :]
# self.y_air = air_data[lead_time:]
self.y_air = air_data[lead_time:, :, :64, :32, :]
# self.y_surface = surface_data[lead_time:]
self.y_surface = surface_data[lead_time:, :64, :32, :]
```
Como se modifico el tamaño de las matrices pues tocó modificar el de las mascaras
model.py: 9
```Python    
# land_mask = torch.ones((batch_size, 1, 1440, 721))
land_mask = torch.ones((batch_size, 1, 64, 32))
# soil_type = torch.ones((batch_size, 1, 1440, 721))
soil_type = torch.ones((batch_size, 1, 64, 32))
#topography = torch.ones((batch_size, 1, 1440, 721))
topography = torch.ones((batch_size, 1, 64, 32))
```
Como se modifico el tamaño de las matrices tocó modificar la resolucion de entrada de las capas del modelo
model.py: 27
```Python
# Resolucion UH
# kernel conv: 4 x 4 con saltos de 4
#              1440 / 4 = 360
#              721 / 4 = 181
# Four main layers:
# self.layer1 = EarthSpecificLayer(depth=depth[0], dim=C, input_resolution=(8, 360, 181),
#                                 heads=n_heads[0], drop_path_ratio_list=drop_path_list[:2], D=D)
# Resolucion UL
# kernel conv: 4 x 4 con saltos de 4
#              64 / 4 = 16
#              32 / 4 = 8
self.layer1 = EarthSpecificLayer(depth=depth[0], dim=C, input_resolution=(8, 16, 8),
                                heads=n_heads[0], drop_path_ratio_list=drop_path_list[:2], D=D)
# Resolucion UH
# EarthBLock: 2
#              360 / 2 = 180
#              181 / 2 = 90.5
#self.middleLayers = nn.Sequential(
# self.layer2 = EarthSpecificLayer(depth=depth[1], dim=2*C, input_resolution=(8, 180, 91), heads=n_heads[1], drop_path_ratio_list=drop_path_list[2:], D=D)
# self.layer3 = EarthSpecificLayer(depth=depth[2], dim=2*C, input_resolution=(8, 180, 91), heads=n_heads[2], drop_path_ratio_list=drop_path_list[2:], D=D)
# Resolucion Ul
# EarthBLock: 2
#              16 / 2 = 8
#              8 / 2 = 4
self.layer2 = EarthSpecificLayer(depth=depth[1], dim=2*C, input_resolution=(8, 8, 4), heads=n_heads[1], drop_path_ratio_list=drop_path_list[2:], D=D)
self.layer3 = EarthSpecificLayer(depth=depth[2], dim=2*C, input_resolution=(8, 8, 4), heads=n_heads[2], drop_path_ratio_list=drop_path_list[2:], D=D)
#)
# Resolucion UH
# kernel conv: 4 x 4 con saltos de 4
#              1440 / 4 = 360
#              721 / 4 = 181
# self.layer4 = EarthSpecificLayer(depth=depth[3], dim=C, input_resolution=(8, 360, 181),
#                                 heads=n_heads[3], drop_path_ratio_list=drop_path_list[:2], D=D)
# Resolucion UL
# kernel conv: 4 x 4 con saltos de 4
#              64 / 4 = 16
#              32 / 4 = 8
self.layer4 = EarthSpecificLayer(depth=depth[3], dim=C, input_resolution=(8, 16, 8),
                                heads=n_heads[3], drop_path_ratio_list=drop_path_list[:2], D=D)
```

In [41]:
# Checkpoint file location inside data_dir, passed to the trainer below.
checkpoint_path = data_dir + "checkpoint.pt"
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = 'max_split_size_mb:1024'

# Create a trainer object and train the model:

trainer = Trainer(model, train_dataloader, validation_dataloader, loss_fn,
                  optimizer, max_epochs, save_every, execution_mode, checkpoint_path)

# NOTE(review): in the recorded run this call stops with a RuntimeError about
# an invalid reshape. The reported input size 221184 equals 1*8*16*9*192,
# matching the printed torch.Size([1, 8, 16, 9, 192]), whereas the target
# shape [1, 8, 8, 2, 4, 2, 192] corresponds to a last spatial size of 8.
# The tensor reaching the layer therefore seems to have 9 columns where 8 are
# expected -- confirm how the patch embedding pads its input in model.py.
trainer.train()

Cuda available:  True




torch.Size([1, 8, 16, 9, 192])


RuntimeError: shape '[1, 8, 8, 2, 4, 2, 192]' is invalid for input of size 221184

In [None]:

# Evaluate the model on one validation batch and compute the RMSE.
calculate_RMSE = True
if calculate_RMSE:
    with torch.no_grad():
        # The model's own parameters tell us which device the inputs must be on.
        device = next(model.parameters()).device
        model.eval()

        # One batch from the validation dataloader: (inputs, targets), each
        # holding an air tensor and a surface tensor.
        data, targets = next(iter(validation_dataloader))

        # Move every tensor to the model's device.
        data_air, data_surface = (tensor.to(device) for tensor in data)
        targets_air, targets_surface = (tensor.to(device) for tensor in targets)

        # Forward pass.
        output_air, output_surface = model((data_air, data_surface))

        # RMSE is computed on unnormalized predictions and targets.
        rmse_values = dh.RMSE(
            (dh.unnormalize_data(output_air), dh.unnormalize_data(output_surface)),
            (dh.unnormalize_data(targets_air), dh.unnormalize_data(targets_surface)),
            save=True,
        )

# Tear down the distributed process group when running in multi-GPU mode.
if execution_mode == "multi_gpu":
    destroy_process_group()