In [1]:
from ipsl_dcpp.model.ipsl_dataset import IPSL_DCPP
import torch
import lightning as pl
from ipsl_dcpp.model.pangu import PanguWeather
from hydra import compose, initialize
from omegaconf import OmegaConf

import hydra
import os
# Process-wide setup: export SLURM_NTASKS_PER_NODE=1 and make float32 the
# default torch dtype.
# NOTE(review): the SLURM variable is presumably read by Lightning's cluster
# detection -- confirm why it is required here.
# (To hide all CUDA devices, set os.environ["CUDA_VISIBLE_DEVICES"] = "" here.)
os.environ.update(SLURM_NTASKS_PER_NODE='1')
torch.set_default_dtype(torch.float32)

# Compose the Hydra configuration named "config" from ../conf (path is
# relative to this notebook).
with initialize(config_path="../conf", version_base=None):
    cfg = compose(config_name="config")

# Seed the RNGs with the experiment seed from the config.
pl.seed_everything(cfg.experiment.seed)
# Keyword overrides shared by the train and validation dataset configs.
dataset_kwargs = dict(
    generate_statistics=False,
    surface_variables=cfg.experiment.surface_variables,
    depth_variables=cfg.experiment.depth_variables,
    plev_variables=cfg.experiment.plev_variables,
    work_path=cfg.environment.work_path,
    scratch_path=cfg.environment.scratch_path,
)

# Training split: shuffled, two samples per batch, one worker process.
train = hydra.utils.instantiate(cfg.experiment.train_dataset, **dataset_kwargs)
train_dataloader = torch.utils.data.DataLoader(
    train, batch_size=2, shuffle=True, num_workers=1
)

# Validation split: deterministic order, one sample per batch, one worker process.
val = hydra.utils.instantiate(cfg.experiment.val_dataset, **dataset_kwargs)
val_dataloader = torch.utils.data.DataLoader(
    val, batch_size=1, shuffle=False, num_workers=1
)

#batch = next(iter(train_dataloader))
# Instantiate the backbone from its config first, then hand it -- together with
# the training dataset object -- to the Lightning module described by
# cfg.experiment.module.
backbone = hydra.utils.instantiate(cfg.experiment.backbone)
model = hydra.utils.instantiate(
    cfg.experiment.module,
    dataset=train_dataloader.dataset,
    backbone=backbone,
)
# Accelerator: Apple 'mps' when the configured environment is named 'local',
# otherwise 'gpu'.
# NOTE(review): an earlier comment here said Conv3D is not supported on MPS and
# that CPU must be used locally, but the code selects 'mps' -- confirm which is
# intended for local runs.
is_local = cfg.environment.name == 'local'
accelerator = 'mps' if is_local else 'gpu'

# The simple profiler is only enabled in debug mode.
profiler = 'simple' if cfg.debug else None

# fast_dev_run=1 makes Lightning run a single batch per loop and suppresses
# logging and checkpointing (see the message in this cell's recorded output).
trainer_kwargs = dict(
    max_epochs=cfg.experiment.max_epochs,
    enable_checkpointing=True,
    log_every_n_steps=1,
    precision="16-mixed",
    profiler=profiler,
    num_sanity_val_steps=1,
    accelerator=accelerator,
    fast_dev_run=1,
    limit_val_batches=0.001,
)
trainer = pl.Trainer(**trainer_kwargs)


Seed set to 0
  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/Users/gclyne/miniforge3/envs/env_dcpp/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:75: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
Running in `fast_dev_run` mode: will run the requested loop using 1 batch(es). Logging and checkpointing is suppressed.


In [2]:
# Run the test loop of `model` over the validation dataloader; the returned
# list of logged-metric dicts is displayed as the cell output.
trainer.test(model=model, dataloaders=val_dataloader)

/Users/gclyne/miniforge3/envs/env_dcpp/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:436: Consider setting `persistent_workers=True` in 'test_dataloader' to speed up the dataloader worker initialization.


Testing: |          | 0/? [00:00<?, ?it/s]



[{}]

In [3]:
# Compute the metrics object held by the module attached to the trainer and
# display the resulting values.
test_metrics = trainer.lightning_module.metrics
test_metrics.compute()

dict_keys(['tas_err', 'tas_var', 'tas_spskr', 'tas_crps', 'gpp_err', 'gpp_var', 'gpp_spskr', 'gpp_crps', 'cVeg_err', 'cVeg_var', 'cVeg_spskr', 'cVeg_crps', 'evspsbl_err', 'evspsbl_var', 'evspsbl_spskr', 'evspsbl_crps', 'ps_err', 'ps_var', 'ps_spskr', 'ps_crps'])


{'tas_err': tensor(12.0633),
 'tas_var': tensor(5.9191e-07),
 'tas_spskr': tensor(0.0002),
 'tas_crps': tensor(2.3939),
 'gpp_err': tensor(2.6501e-17),
 'gpp_var': tensor(6.2836e-24),
 'gpp_spskr': tensor(0.0005),
 'gpp_crps': tensor(2.8582e-09),
 'cVeg_err': tensor(0.0011),
 'cVeg_var': tensor(2.1153e-10),
 'cVeg_spskr': tensor(0.0005),
 'cVeg_crps': tensor(0.0231),
 'evspsbl_err': tensor(2.6537e-11),
 'evspsbl_var': tensor(8.4090e-18),
 'evspsbl_spskr': tensor(0.0006),
 'evspsbl_crps': tensor(3.0864e-06),
 'ps_err': tensor(315111.0312),
 'ps_var': tensor(0.0361),
 'ps_spskr': tensor(0.0004),
 'ps_crps': tensor(343.9454)}

In [7]:
import numpy as np

# Toy 2x10 tensor containing one NaN per row, used to check how NaN values
# affect plain reductions.
x = torch.tensor(
    [
        [1, 2, 3, 4, 5, 6, 7, 8, 9, torch.nan],
        [1, 2, 3, 4, 5, 6, 7, 8, torch.nan, 10],
    ]
)
# Both reductions come back as NaN (see the recorded output of this cell).
(x.var(), x.mean())

(tensor(nan), tensor(nan))

In [None]:
# Fit `model` on the training dataloader, then display whatever metrics the
# trainer has logged.
trainer.fit(model, train_dataloaders=train_dataloader)
trainer.logged_metrics
# checkpoint_path = torch.load(f'../epoch=00.ckpt',map_location=torch.device('cpu'))
# model.load_state_dict(checkpoint_path['state_dict'])
#trainer.test(model, val_dataloader)
# batch = next(iter(val_dataloader))
#history = model.sample_rollout(batch)

In [None]:
# `batch` was never defined on a fresh kernel (every `batch = next(iter(...))`
# line in the notebook is commented out), so fetch one validation batch
# explicitly before sampling. A separate name keeps this cell re-runnable,
# since `model.sample` returns its own `batch`.
# NOTE(review): assumes `model.sample` accepts a batch exactly as yielded by
# the DataLoader (no manual device transfer) -- confirm.
input_batch = next(iter(val_dataloader))

# `model.sample` returns the generated sample, a batch, and the intermediate
# steps that the plotting cells below index into.
sample, batch, steps = model.sample(input_batch)


In [None]:
import xarray as xr
import matplotlib.pyplot as plt
# Use the first time step of the first training file as a "shell": an xarray
# object whose coordinates are kept while its data is swapped for plotting.
ds = xr.open_dataset(train.files[0])
shell = ds.isel(time=0)

# Variable to display and its position in the configured surface-variable list.
# The index was previously computed but ignored in favour of a hard-coded
# channel (4), so the plotted field did not have to match `var_name`.
# NOTE(review): assumes the second axis of `next_state_surface` is ordered like
# cfg.experiment.surface_variables -- confirm against the dataset class.
var_name = 'gpp'
var_index = cfg.experiment.surface_variables.index(var_name)

# Side-by-side lat/lon maps: field from `batch` (left) vs. field from `sample` (right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

shell[var_name].data = batch['next_state_surface'][0][var_index]
shell[var_name].plot.pcolormesh(ax=ax1)
ax1.set_title(f"{var_name}: batch['next_state_surface']")

shell[var_name].data = sample['next_state_surface'][0][var_index]
shell[var_name].plot.pcolormesh(ax=ax2)
ax2.set_title(f"{var_name}: sample['next_state_surface']")

In [None]:
# from celluloid import Camera
# fig, ax1 = plt.subplots(1, figsize=(16, 6))
# camera = Camera(fig)
# ax1.set_title("diffusion steps")
# ds = xr.open_dataset(val.files[0])
# shell = ds.isel(time=0)

# # Animate plot over time
# for time_step in range(len(steps)):
#     shell['tas'].data = steps[time_step][0,0,0]
#     shell['tas'].plot.pcolormesh(ax=ax1,add_colorbar=False)
#     ax1.set_title(f"diffusion step {time_step}")    
#     camera.snap()
# anim = camera.animate()
# anim.save(f"diffusion.gif")

from matplotlib import animation
import matplotlib.pyplot as plt
import numpy as np


# Render one frame per entry of `steps` and save the sequence as a GIF.
# Relies on `shell` and `steps` created in earlier cells.
# NOTE(review): the field is written into the 'tas' variable of the shell but
# is taken from hard-coded channel 4 -- confirm that channel 4 really is 'tas'.
fig, ax1 = plt.subplots(1, figsize=(16, 6))

frames = []
for time_step in range(len(steps)):
    # Swap this step's field into the shell so the plot keeps its coordinates.
    shell['tas'].data = steps[time_step][0, 4]
    mesh = shell['tas'].plot.pcolormesh(ax=ax1, add_colorbar=False)
    # Per-frame caption drawn as a text artist in axes coordinates, so it can
    # be part of the frame's artist list.
    label = ax1.text(
        0.5,
        1.05,
        "Diffusion Step {}".format(time_step),
        size=plt.rcParams["axes.titlesize"],
        ha="center",
        transform=ax1.transAxes,
    )
    frames.append([mesh, label])

# Blank out the axes title so only the per-frame caption is shown.
plt.title('')

ani = animation.ArtistAnimation(fig, frames, interval=100, blit=True)
ani.save("diffusion.gif")

In [None]:
# Compute and display the metrics held by the module currently attached to the
# trainer.
final_metrics = trainer.lightning_module.metrics
final_metrics.compute()