In [2]:
import pandas as pd
import numpy as np
from fco2models.utraining import prep_data
val_df = pd.read_parquet('../data/training_data/valdf_100km_random_reshaped.pq')
vald_df2021 = pd.read_parquet('../data/training_data/df_100km_random_reshaped_2021.pq')

In [4]:
predictors = ['sst_cci', 'sss_cci', 'chl_globcolour']
val_ds = prep_data(val_df, predictors)
vald_ds2021 = prep_data(vald_df2021, predictors)
val_ds = np.concatenate((val_ds, vald_ds2021), axis=0)
print("val_ds shape: ", val_ds.shape)

2025-04-15 15:51:54,837 - INFO - Filling missing sss_cci values with salt_soda values
2025-04-15 15:51:54,911 - INFO - predictors: ['sst_cci', 'sss_cci', 'chl_globcolour']
2025-04-15 15:51:55,038 - INFO - clipping fco2 values to 0-500


Number of samples after filtering:  22441
(3, 22441, 64) (22441, 64)
(3, 22441, 64) (1, 22441, 64)
number of fco2 measurements greater than 500:  7189


2025-04-15 15:51:55,369 - INFO - Filling missing sss_cci values with salt_soda values
2025-04-15 15:51:55,390 - INFO - predictors: ['sst_cci', 'sss_cci', 'chl_globcolour']
2025-04-15 15:51:55,440 - INFO - clipping fco2 values to 0-500


Number of samples after filtering:  11327
(3, 11327, 64) (11327, 64)
(3, 11327, 64) (1, 11327, 64)
number of fco2 measurements greater than 500:  17121
val_ds shape:  (33768, 4, 64)


In [5]:
val_ds.shape

(33768, 4, 64)

In [6]:
from diffusers import DDPMScheduler, UNet1DModel
from fco2models.models import MLP, UNet2DModelWrapper
import torch
import json

def load_model(save_dir, model_path, model_class):
    #read model hyperparameters
    with open(save_dir+'hyperparameters.json', 'r') as f:
        params = json.load(f)

    with open(save_dir+'losses.json', 'r') as f:
        losses = json.load(f)
    
    model_params = params['model_params']
    noise_params = params['noise_params']

    # load the model
    model = model_class(**model_params)
    model.load_state_dict(torch.load(save_dir+model_path))
    model.eval()
    noise_scheduler = DDPMScheduler(**noise_params)

    return model, noise_scheduler, params, losses

# load model
save_path = '../models/renko/unet2d_noattn/'
model_path = 'e_100.pt'
model_class = UNet2DModelWrapper


model, noise_scheduler, params, losses = load_model(save_path, model_path, model_class)
print("Model loaded")

Model loaded


In [7]:
train_losses = losses['train_losses']
val_losses = losses['val_losses']
# print epoch with minimum loss
print("Epoch with minimum training loss: ", np.argmin(train_losses))
print("Minimum training loss: ", np.min(train_losses))

val_losses_mean = np.mean(val_losses, axis=1)
print("Epoch with minimum validation loss: ", np.argmin(val_losses_mean))
print("Minimum validation loss: ", np.min(val_losses_mean))

Epoch with minimum training loss:  99
Minimum training loss:  0.024658476514286366
Epoch with minimum validation loss:  98
Minimum validation loss:  0.0780834779717569


In [8]:
from fco2models.utraining import full_denoise
from torch.utils.data import Dataset, DataLoader


n_rec = 10
np.random.seed(0)
np.random.shuffle(val_ds)
train_maxs = params['train_maxs']
train_mins = params['train_mins']
for i in range(val_ds.shape[1]):
    val_ds[:, i, :] = 2 * (val_ds[:, i, :] - train_mins[i]) / (train_maxs[i] - train_mins[i]) - 1
context = val_ds[:500, 1:, :]
context_ds = torch.from_numpy(np.repeat(context, n_rec, axis=0)).float()
print("context_ds shape: ", context_ds.shape)
context_loader = DataLoader(context_ds, batch_size=128, shuffle=False)

with torch.no_grad():
    # denoise the samples
    print("Denoising samples")
    samples = full_denoise(model, noise_scheduler, context_loader, jump=5)

context_ds shape:  torch.Size([5000, 3, 64])
Denoising samples
Training on cuda


Inference:   0%|          | 0/40 [00:03<?, ?it/s]


KeyboardInterrupt: 

In [28]:
samples.shape

(5000, 1, 64)

In [1]:
import matplotlib.pyplot as plt

plt.plot(samples[:10, 1, :].T, label='sample 1 - 10', color='blue', alpha=0.5)
plt.plot(val_ds[1, 0, :].T, label='original', color='red', alpha=1, linewidth=2)
plt.title('Denoised samples vs original')
plt.xlabel('bins')
plt.ylabel('fCO2')

plt.grid()
plt.show()

NameError: name 'samples' is not defined

In [None]:
preds = samples.cpu().numpy()

analyse baseline model

In [None]:
import pandas as pd
import numpy as np
from fco2models.utraining import prep_data
val_df = pd.read_parquet('../data/training_data/valdf_100km_random_reshaped.pq')
vald_df2021 = pd.read_parquet('../data/training_data/df_100km_random_reshaped_2021.pq')

In [68]:
predictors = ['sst_cci', 'sss_cci', 'chl_globcolour', 'year', 'lon', 'lat']
val_ds_baseline = prep_data(val_df, predictors)
val_ds_2021_baseline = prep_data(vald_df2021, predictors)
val_ds_baseline = np.concatenate((val_ds_baseline, val_ds_2021_baseline), axis=0)
print("val_ds_baseline shape: ", val_ds_baseline.shape)

2025-04-15 19:50:49,816 - INFO - Filling missing sss_cci values with salt_soda values
2025-04-15 19:50:49,851 - INFO - predictors: ['sst_cci', 'sss_cci', 'chl_globcolour', 'year', 'lon', 'lat']
2025-04-15 19:50:49,977 - INFO - clipping fco2 values to 0-500


Number of samples after filtering:  22441
(6, 22441, 64) (22441, 64)
(6, 22441, 64) (1, 22441, 64)
number of fco2 measurements greater than 500:  7189


2025-04-15 19:50:50,193 - INFO - Filling missing sss_cci values with salt_soda values
2025-04-15 19:50:50,213 - INFO - predictors: ['sst_cci', 'sss_cci', 'chl_globcolour', 'year', 'lon', 'lat']
2025-04-15 19:50:50,277 - INFO - clipping fco2 values to 0-500


Number of samples after filtering:  11327
(6, 11327, 64) (11327, 64)
(6, 11327, 64) (1, 11327, 64)
number of fco2 measurements greater than 500:  17121
val_ds_baseline shape:  (33768, 7, 64)


In [69]:
import json
# load baseline model
save_path = '../models/baseline/'
model_path = 'final_model_e_5.pt'
model_class = UNet2DModelWrapper

params = json.load(open(save_path + 'hyperparameters.json', 'r'))
checkpoint = torch.load(save_path + model_path)
model = UNet2DModelWrapper(**params['model_params'])
model.load_state_dict(checkpoint['model_state_dict'])

<All keys matched successfully>

In [70]:
# normalize the data
train_maxs = params['train_maxs']
train_mins = params['train_mins']

for i in range(val_ds_baseline.shape[1]):
    val_ds_baseline[:, i, :] = 2 * (val_ds_baseline[:, i, :] - train_mins[i]) / (train_maxs[i] - train_mins[i]) - 1
    # print max and min values
    print("Max value of predictor ", i, ": ", np.nanmax(val_ds_baseline[:, i, :]))
    print("Min value of predictor ", i, ": ", np.nanmin(val_ds_baseline[:, i, :]))

Max value of predictor  0 :  1.0
Min value of predictor  0 :  -0.9042751149016895
Max value of predictor  1 :  0.9907780542065303
Min value of predictor  1 :  -0.994580936660723
Max value of predictor  2 :  0.9894330785391408
Min value of predictor  2 :  -1.2438073092147035
Max value of predictor  3 :  0.6568312421999736
Min value of predictor  3 :  -0.9999623124466104
Max value of predictor  4 :  1.0
Min value of predictor  4 :  -1.0
Max value of predictor  5 :  0.999994405110677
Min value of predictor  5 :  -0.999987777777844
Max value of predictor  6 :  0.9898566590539928
Min value of predictor  6 :  -0.9961489627443083


In [None]:
import tqdm


def baseline_eval_loop(model, val_dataloader, device, random_model=None):
    loss_fn = torch.nn.MSELoss(reduction='none')
    losses = []
    preds = []
    random_preds = []
    with torch.no_grad():
        for batch in tqdm(val_dataloader):
            batch = batch.to(device)
            target = batch[:, 0:1, :]
            context = batch[:, 1:, :]
            nan_mask = torch.isnan(target)
            # replace nan with zeros
            target = torch.where(nan_mask, torch.zeros_like(target), target).float()
            #concatenate the noisy target with the context and the mask
            input = torch.cat([context, (~nan_mask).float()], dim=1)
            input = input.to(device).float()
            mean_pred = model(input, torch.zeros(batch.shape[0], ).to(device), return_dict=False)[0]
            # Calculate the loss
            loss = loss_fn(mean_pred[~nan_mask], target[~nan_mask])
            #val_loss += loss
            losses.append(loss.cpu().numpy())
            preds.append(mean_pred.cpu().numpy())
            if random_model is not None:
                # calculate random model prediction
                random_pred = random_model(input, torch.zeros(batch.shape[0], ).to(device), return_dict=False)[0]
                random_preds.append(random_pred.cpu().numpy())
    
    preds = np.concatenate(preds, axis=0)
    print("preds shape: ", preds.shape)
    losses = np.concatenate(losses, axis=0)
    print("losses shape: ", losses.shape)
    random_preds = np.concatenate(random_preds, axis=0)
    print("random_preds shape: ", random_preds.shape)
    return preds, losses, random_preds

In [None]:
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch.nn as nn

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
val_dataloader = DataLoader(val_ds_baseline, batch_size=64, shuffle=False)

model.to(device)
model.eval()
preds = []
losses = []

random_model = UNet2DModelWrapper(**params['model_params'])
random_model.to(device)
random_model.eval()
random_preds = []
preds, losses, random_preds = baseline_eval_loop(model, val_dataloader, device, random_model=random_model)

100%|██████████| 528/528 [00:29<00:00, 17.90it/s]

preds shape:  (33768, 1, 64)
losses shape:  (1751727,)
random_preds shape:  (33768, 1, 64)





In [77]:
losses.mean()

np.float32(0.030027466)

In [1]:
# plot a prediction and the original data
import matplotlib.pyplot as plt
import matplotlib


ix = 158
plt.plot(preds[ix, 0, :].T, 'o-', label='prediction', color='blue', alpha=0.5)
plt.plot(val_ds_baseline[ix, 0, :].T, 'o-', label='original', color='red', alpha=1, linewidth=2)
plt.plot(random_preds[ix, 0, :].T, 'o-', label='random prediction', color='green', alpha=0.5)
plt.title('Prediction vs original')
plt.xlabel('bins')
plt.ylabel('fCO2')
plt.legend()
plt.grid()
# set figure size

plt.gcf().set_size_inches(10, 5)
plt.show()

NameError: name 'preds' is not defined

In [96]:
preds[:, 0, :].shape, val_ds_baseline[:, 0, :].shape

((33768, 64), (33768, 64))

In [104]:
# calculate average correlation coefficient
from scipy.stats import pearsonr

def calculate_pearsonr(preds, targets):
    preds = preds.reshape(preds.shape[0], -1)
    targets = targets.reshape(targets.shape[0], -1)
    corr = np.zeros(preds.shape[0])
    for i in range(preds.shape[0]):
        corr[i], _ = pearsonr(preds[i, :], targets[i, :])
    return np.nanmean(corr)

corr = calculate_pearsonr(preds, val_ds_baseline[:, 0, :])
print("Average correlation coefficient: ", corr)

  corr[i], _ = pearsonr(preds[i, :], targets[i, :])


Average correlation coefficient:  0.2158618129307957


In [106]:
random_corr = calculate_pearsonr(random_preds, val_ds_baseline[:, 0, :])
print("Average correlation coefficient of random model: ", random_corr)

  corr[i], _ = pearsonr(preds[i, :], targets[i, :])


Average correlation coefficient of random model:  0.022956432073763855


In [111]:
scaled_preds =  (preds[:, 0, :] + 1) * (train_maxs[0] - train_mins[0]) / 2 + train_mins[0]
scaled_true = (val_ds_baseline[:, 0, :] + 1) * (train_maxs[0] - train_mins[0]) / 2 + train_mins[0]

# calculate RMSE
from sklearn.metrics import mean_squared_error

rmses = []
for i in range(scaled_preds.shape[0]):
    nan_mask = np.isnan(scaled_true[i, :])
    rmses.append(np.sqrt(mean_squared_error(scaled_preds[i, :][~nan_mask], scaled_true[i, :][~nan_mask])))
print("RMSE: ", np.mean(rmses))
rmses = np.array(rmses)
    

RMSE:  24.511798163255413
