In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from dataset_create.ucruise import divide_cruise, df_to_numpy

# Segmentation settings: NUM_BINS is handed to divide_cruise as num_windows and
# to df_to_numpy as the bin count; WINDOW_LEN is handed to divide_cruise as
# len_window (units are not visible in this notebook — TODO confirm).
NUM_BINS = 64
WINDOW_LEN = 5

# Full column set every yearly frame is reindexed to. The order is significant
# and is kept exactly; the comments only group the names by their suffix
# (presumably one group per collocated product — confirm against the data docs).
COLS = [
    # un-suffixed track columns
    'sal', 'sst_deg_c', 'pppp_hpa', 'woa_sss', 'ncep_slp_hpa',
    'dist_to_land_km', 'fco2rec_uatm', 'fco2rec_flag', 'time', 'year',
    # chl_globcolour* / *_chlgc
    'chl_globcolour', 'chl_globcolour_uncert', 'chl_globcolour_flags',
    'time_chlgc', 'lat_chlgc', 'lon_chlgc',
    # *_soda
    'temp_soda', 'salt_soda', 'mld_dens_soda',
    'lon_soda', 'lat_soda', 'depth_soda', 'time_soda',
    # ssh_* / *_sshcm
    'ssh_adt', 'ssh_sla', 'lat_sshcm', 'lon_sshcm', 'time_sshcm',
    # sst_cci* / ice_cci / *_sstcci
    'sst_cci', 'sst_cci_uncertainty', 'ice_cci',
    'time_sstcci', 'lat_sstcci', 'lon_sstcci',
    # sss_cci* / *_ssscci
    'sss_cci', 'sss_cci_random_error',
    'lat_ssscci', 'lon_ssscci', 'time_ssscci',
    # set to False on the raw SOCAT rows when the yearly frames are built
    'interpolated',
]

# Feature columns passed to df_to_numpy and filter_nans.
predictors = [
    'sst_cci',
    'sss_cci',
    'pppp_hpa',
    'woa_sss',
    'ncep_slp_hpa',
    'dist_to_land_km',
    'chl_globcolour',
    'temp_soda',
    'salt_soda',
    'mld_dens_soda',
    'ssh_adt',
    'ssh_sla',
    'ice_cci',
]

# Row key shared by the raw and the interpolated SOCAT tables.
track_index = ['expocode', 'time_1d', 'lat', 'lon']


def _segment_cruise(cruise):
    """Run divide_cruise on one cruise's rows with the notebook-wide settings."""
    return divide_cruise(
        cruise,
        num_windows=NUM_BINS,
        len_window=WINDOW_LEN,
        max_time_delta=pd.Timedelta(days=2),
    )


# One array per year; they are joined along axis 1 once the loop has finished.
dss = []
for year in range(1982, 2022):
    # Interpolated tracks: rows that share the full
    # (expocode, time_1d, lat, lon) key are collapsed to their mean.
    fname_interp_socat = f'../data/SOCATv2024_interpolated2/.gridded_2d_ocean_data_for_ML/SOCATv2024-cruise_tracks_interp-collocated/socat2024-interp_tracks-collocated-{year}.pq'
    interp_tracks = (
        pd.read_parquet(fname_interp_socat)
        .set_index(track_index)
        .groupby(level=[0, 1, 2, 3])
        .mean()
    )

    # Raw SOCAT rows: derive the daily key from `time`, index on the same key,
    # and flag every row as not interpolated.
    fname_socat = f'../data/SOCATv2024_raw-collocated-1982_2021/SOCATv2024_raw_r20250307-1982_2021/SOCATv2024v_collocated-{year}.pq'
    raw_tracks = (
        pd.read_parquet(fname_socat)
        .assign(time_1d=lambda frame: frame['time'].dt.round('D'))
        .set_index(track_index)
        .assign(interpolated=False)
    )

    # Stack raw on top of interpolated and force the fixed column layout;
    # columns missing from either table are filled with NaN.
    year_tracks = pd.concat([raw_tracks, interp_tracks], axis=0)
    year_tracks = year_tracks.reindex(columns=COLS, fill_value=np.nan)
    print(year_tracks.shape)

    # Level 0 of the index is the expocode, so each group is one cruise.
    cruise_windows = year_tracks.groupby(level=0).apply(_segment_cruise)

    year_ds, expomap = df_to_numpy(cruise_windows, NUM_BINS, predictors)
    dss.append(year_ds)
    print(f"finished year {year}")

ds = np.concatenate(dss, axis=1)
ds.shape

(11391, 40)
finished year 1982
(15142, 40)
finished year 1983
(13119, 40)
finished year 1984
(21576, 40)
finished year 1985
(28840, 40)


KeyboardInterrupt: 

In [None]:
# save as numpy array
np.save('../data/training_data/trainds_100km.npy', ds)

# save as dataframe: one row per (segment, bin), one column per feature
columns = ['fco2rec_uatm', 'sst_cci', 'sss_cci', 'pppp_hpa', 'woa_sss', 'ncep_slp_hpa', 'dist_to_land_km', 
           'chl_globcolour', 'temp_soda', 'salt_soda', 'mld_dens_soda', 
           'ssh_adt', 'ssh_sla', 'ice_cci']

# ds is laid out as (feature, segment, bin): the yearly arrays are concatenated
# along axis 1, and axes 1 and 2 are used below as the segment and bin counts.
# Fail loudly if the feature axis does not match the column names, because a
# mismatch would otherwise produce a silently mislabeled table.
if ds.ndim != 3 or ds.shape[0] != len(columns):
    raise ValueError(
        f"expected ds with shape ({len(columns)}, n_segments, n_bins), got {ds.shape}"
    )

# Row labels in segment-major, bin-minor order.
segment_ix = np.repeat(np.arange(0, ds.shape[1]), ds.shape[2])
bin_ix = np.tile(np.arange(0, ds.shape[2]), ds.shape[1])
index = pd.MultiIndex.from_arrays([segment_ix, bin_ix], names=['segment', 'bin'])

# Flatten (segment, bin) per feature, then transpose so features become the
# columns. A plain ds.reshape(-1, len(columns)) would instead cut each
# feature's bin series into chunks of len(columns) values and scatter them
# across the columns, because the feature axis comes first in memory order.
df = pd.DataFrame(ds.reshape(len(columns), -1).T, columns=columns, index=index)
df.to_parquet('../data/training_data/traindf_100km.pq')

In [None]:

import numpy as np
# Reload the saved training array from disk.
ds = np.load('../data/training_data/trainds_100km.npy')
print(ds.shape)

# Column name -> position along the first axis of ds.
col_map = {name: position for position, name in enumerate(columns)}

# ESA-CCI > SODA
# Gap-fill salinity: sss_cci keeps priority, salt_soda is used only where
# sss_cci is NaN. sss_cci is a view into ds, so the copy below updates ds
# in place.
salt_soda = ds[col_map['salt_soda']]
sss_cci = ds[col_map['sss_cci']]
mask = np.isnan(sss_cci)
np.copyto(sss_cci, salt_soda, where=mask)

(14, 117337, 65)


In [None]:
from dataset_create.ucruise import filter_nans
# Target variable: the fco2rec_uatm slice of ds, selected through col_map.
fco2_target = ds[col_map['fco2rec_uatm']]
# filter_nans is defined in dataset_create.ucruise; presumably it drops samples
# containing NaNs and returns predictors X and target y — confirm there.
X, y = filter_nans(ds, fco2_target, predictors, col_map)

In [None]:
# One panel for the target plus one per row of X, stacked vertically.
n_panels = X.shape[0] + 1

# figsize is (width, height) in inches: the panels are stacked in a single
# column, so the height (not the width) has to grow with the panel count.
# squeeze=False always returns a 2-D axes array, so indexing also works when
# there is only one panel.
fig, axs_grid = plt.subplots(n_panels, 1, figsize=(10, 5 * n_panels), sharex=True, squeeze=False)
axs = axs_grid[:, 0]

# With sharex=True this limit is propagated to every panel.
axs[-1].set_xlim((0, NUM_BINS))

# Keep the original labels for the first three panels and name the remaining
# ones after their predictor column.
# Assumes the first axis of X follows the order of `predictors` — confirm
# against filter_nans.
titles = ['fco2', 'temperature', 'salinity'] + list(predictors[2:])

seg = 20  # index of the segment to display
axs[0].plot(y[seg])
print(y[seg])
axs[0].set_title(titles[0])
for i in range(X.shape[0]):
    axs[i + 1].plot(X[i, seg])
    # Fall back to a generic label instead of raising IndexError if X has
    # more rows than there are names.
    panel_title = titles[i + 1] if i + 1 < len(titles) else f'predictor {i}'
    axs[i + 1].set_title(panel_title)
plt.show()

NameError: name 'plt' is not defined