In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from fco2dataset.ucruise import divide_cruise, df_to_numpy, divide_cruise_random

# Segmentation settings passed to divide_cruise_random / df_to_numpy further down.
NUM_BINS = 64
WINDOW_LEN = 5

# Column layout every per-year frame is reindexed to; the order is significant.
COLS = [
    'sal',
    'sst_deg_c',
    'pppp_hpa',
    'woa_sss',
    'ncep_slp_hpa',
    'dist_to_land_km',
    'fco2rec_uatm',
    'fco2rec_flag',
    'time',
    'year',
    'chl_globcolour',
    'chl_globcolour_uncert',
    'chl_globcolour_flags',
    'time_chlgc',
    'lat_chlgc',
    'lon_chlgc',
    'temp_soda',
    'salt_soda',
    'mld_dens_soda',
    'lon_soda',
    'lat_soda',
    'depth_soda',
    'time_soda',
    'ssh_adt',
    'ssh_sla',
    'lat_sshcm',
    'lon_sshcm',
    'time_sshcm',
    'sst_cci',
    'sst_cci_uncertainty',
    'ice_cci',
    'time_sstcci',
    'lat_sstcci',
    'lon_sstcci',
    'sss_cci',
    'sss_cci_random_error',
    'lat_ssscci',
    'lon_ssscci',
    'time_ssscci',
    'interpolated',
]

# Feature columns handed to df_to_numpy, in the order they are requested.
predictors = [
    'sst_cci',
    'sss_cci',
    'pppp_hpa',
    'woa_sss',
    'ncep_slp_hpa',
    'dist_to_land_km',
    'chl_globcolour',
    'temp_soda',
    'salt_soda',
    'mld_dens_soda',
    'ssh_adt',
    'ssh_sla',
    'ice_cci',
    'day_of_year',
    'year',
    'lon',
    'lat',
    'expocode',
]
# Build one array per year and stitch them together along the segment axis.
dss = []
total_cruises = 0
total_segments = 0
expcode_offset = 0  # next free integer cruise id; advanced after every year
for year in range(1982, 2021):
    # Load interpolated data and average rows sharing the same
    # (expocode, time_1d, lat, lon) key.
    fname_interp_socat = f'../data/SOCATv2024_interpolated2/.gridded_2d_ocean_data_for_ML/SOCATv2024-cruise_tracks_interp-collocated/socat2024-interp_tracks-collocated-{year}.pq'
    df_interp_socat = (
        pd.read_parquet(fname_interp_socat)
        .set_index(['expocode', 'time_1d', 'lat', 'lon'])
        .groupby(level=[0, 1, 2, 3])
        .mean()
    )

    cruises = len(df_interp_socat.index.unique(level=0))
    total_cruises += cruises
    print(f"number of cruises in {year}: {cruises}")

    # read raw socat data and give it the same 4-level index as the interpolated frame
    fname_socat = f'../data/SOCATv2024_raw-collocated-1982_2021/SOCATv2024_raw_r20250307-1982_2021/SOCATv2024v_collocated-{year}.pq'
    df_socat = pd.read_parquet(fname_socat)
    df_socat['time_1d'] = df_socat['time'].dt.round('D')
    df_socat = df_socat.set_index(['expocode', 'time_1d', 'lat', 'lon'])
    df_socat['interpolated'] = False

    # Stack raw and interpolated rows and force the common column layout;
    # columns missing from either source become NaN.
    df = pd.concat([df_socat, df_interp_socat], axis=0)
    df = df.reindex(columns=COLS, fill_value=np.nan)

    # Cut every cruise (index level 0 = expocode) into segments.
    df2 = df.groupby(level=0, group_keys=False).apply(
        lambda cruise: divide_cruise_random(
            cruise,
            num_windows=NUM_BINS,
            len_window=WINDOW_LEN,
            max_time_delta=pd.Timedelta(days=2),
            rep_ratio=3,
        )
    )

    # add time, location and expocode features
    dates = df2.index.get_level_values('time_1d')
    df2['day_of_year'] = dates.dayofyear.astype(np.int32)
    df2['year'] = dates.year.astype(np.int32)
    lons = df2.index.get_level_values('lon')
    lats = df2.index.get_level_values('lat')
    df2['lon'] = lons.astype(np.float32)
    df2['lat'] = lats.astype(np.float32)

    # Keep the expocodes in ROW order: they are written back positionally, so
    # sorting them here would attach the wrong cruise id to rows whenever df2
    # is not already ordered by expocode.
    expocodes = df2.index.get_level_values('expocode')
    # Check for NaNs BEFORE casting to str; the cast turns NaN into the string
    # 'nan', which would make this check pass unconditionally.
    assert expocodes.notna().all(), "expocode contains nans"
    expocodes = expocodes.astype(str)
    # map expocode to a unique integer id (ids follow sorted expocode order)
    expocode_map = {
        expocode: expcode_offset + i
        for i, expocode in enumerate(expocodes.unique().sort_values())
    }
    df2['expocode'] = np.asarray(expocodes.map(expocode_map), dtype=np.int32)
    expcode_offset += len(expocode_map)

    # A segment is identified by (expocode, segment_id).
    segments = df2.groupby([pd.Grouper(level=0), 'segment_id']).size().size
    total_segments += segments
    print(f"number of segments in {year}: {segments}")

    # Separate name for the per-year array so it is not confused with the
    # concatenated `ds` built after the loop.
    ds_year, expomap = df_to_numpy(df2, NUM_BINS - 1, predictors)
    dss.append(ds_year)
    print(f"finished year {year}")

# Axis 1 is the segment axis, so yearly arrays are joined along it.
ds = np.concatenate(dss, axis=1)
print(f"total number of cruises: {total_cruises}")
print(f"total number of segments: {total_segments}")
print(f"total number of samples: {ds.shape[1]}")
ds.shape

number of cruises in 1982: 8
number of segments in 1982: 336
binned shape:  (19113, 45)
finished year 1982
number of cruises in 1983: 7
number of segments in 1983: 495
binned shape:  (29546, 45)
finished year 1983
number of cruises in 1984: 5
number of segments in 1984: 423
binned shape:  (25366, 45)
finished year 1984
number of cruises in 1985: 9
number of segments in 1985: 650
binned shape:  (38276, 45)
finished year 1985
number of cruises in 1986: 11
number of segments in 1986: 1073
binned shape:  (61575, 45)
finished year 1986
number of cruises in 1987: 19
number of segments in 1987: 1575
binned shape:  (93866, 45)
finished year 1987
number of cruises in 1988: 13
number of segments in 1988: 1009
binned shape:  (62225, 45)
finished year 1988
number of cruises in 1989: 15
number of segments in 1989: 938
binned shape:  (55645, 45)
finished year 1989
number of cruises in 1990: 15
number of segments in 1990: 982
binned shape:  (55205, 45)
finished year 1990
number of cruises in 1991: 28

In [7]:
# There are 4710 segments in 2021; remove those from the dataset to use them for testing.
# ds_1982_2020 = ds[:, :-4710, :]
# ds_2021 = ds[:, -4710:, :]

# np.save('../data/training_data/ds_1982_2020.npy', ds_1982_2020)
# np.save('../data/training_data/ds_2021.npy', ds_2021)

In [98]:
# Optional export of the raw array:
# np.save('../data/training_data/trainds_100km.npy', ds)

# Flatten ds (variable, segment, bin) into a long table with one row per
# (segment, bin) pair and one column per variable.
columns = [
    'fco2rec_uatm',
    'sst_cci', 'sss_cci', 'pppp_hpa', 'woa_sss', 'ncep_slp_hpa', 'dist_to_land_km',
    'chl_globcolour', 'temp_soda', 'salt_soda', 'mld_dens_soda',
    'ssh_adt', 'ssh_sla', 'ice_cci',
    'day_of_year', 'year', 'lon', 'lat', 'expocode',
]
# Segment-major ordering: the bin index cycles fastest, matching reshape(-1)
# applied to a (segment, bin) slice.
index = pd.MultiIndex.from_product(
    [np.arange(0, ds.shape[1]), np.arange(0, ds.shape[2])],
    names=['segment', 'bin'],
)
df = pd.DataFrame(
    {col: ds[i, :, :].reshape(-1) for i, col in enumerate(columns)},
    index=index,
)
# df.to_parquet('../data/training_data/df_100km_random.pq')

In [117]:
def df_to_ds(df):
    """Rebuild the (variable, segment, bin) array from a long-format frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by a MultiIndex with levels named 'segment' and 'bin'.
        Rows must be ordered segment-major (all bins of one segment are
        contiguous) and every segment must have the same number of bins.

    Returns
    -------
    numpy.ndarray
        float64 array of shape (len(df.columns), num_segments, num_bins) whose
        slice ``[i]`` holds column ``i`` of ``df``.

    Raises
    ------
    ValueError
        If the number of rows is not num_segments * num_bins, i.e. the frame
        is not a complete rectangular segment x bin grid.
    """
    num_bins = df.index.get_level_values('bin').unique().shape[0]
    num_segments = df.index.get_level_values('segment').unique().shape[0]
    if len(df) != num_segments * num_bins:
        raise ValueError(
            f"expected {num_segments} segments x {num_bins} bins = "
            f"{num_segments * num_bins} rows, got {len(df)}"
        )
    # Fill a fresh array column by column; the result never aliases df's data.
    ds_reconstructed = np.zeros((len(df.columns), num_segments, num_bins))
    for i, col in enumerate(df.columns):
        ds_reconstructed[i, :, :] = df[col].values.reshape(num_segments, num_bins)
    return ds_reconstructed

In [None]:
# Round-trip check: rebuilding the array from the long-format frame must give
# back ds (NaNs in matching positions are treated as equal).
ds_reconstructed = df_to_ds(df)
if not np.allclose(ds, ds_reconstructed, equal_nan=True):
    raise AssertionError("Reconstruction failed!")

In [106]:
# Spot check one segment: its rows in df, transposed to (variable, bin),
# must match the corresponding slice of ds.
i = 1001
if not np.allclose(ds[:, i, :], df.loc[i].to_numpy().T, equal_nan=True):
    raise AssertionError("Reconstruction failed!")

In [None]:
# Keep only the segments whose 'expocode' is present in every bin.

# One boolean per segment: True when NO bin of that segment has a missing
# expocode. Computed on whole Series so no scalar bool is ever inverted with
# `~` (which yields -2 for a plain Python bool).
segment_is_valid = df['expocode'].notna().groupby(level='segment').all()
# Legacy name kept for any later cell that still refers to it. Despite the
# name, True means the segment has no NaN expocode.
nan_in_expocode = segment_is_valid

# Broadcast the per-segment flag back onto every (segment, bin) row.
df_indices = pd.DataFrame(
    {'valid': segment_is_valid.loc[df.index.get_level_values('segment')].to_numpy()},
    index=df.index,
)
df_filtered = df[df_indices['valid']]

Unnamed: 0_level_0,Unnamed: 1_level_0,valid
segment,bin,Unnamed: 2_level_1
0,0,False
0,1,False
0,2,False
0,3,False
0,4,False


In [110]:
# split df in train and validation sets, holding out whole cruises so that no
# cruise contributes rows to both sets
SPLIT_SEED = 42  # fixed seed so the split is identical on every run
split_rng = np.random.default_rng(SPLIT_SEED)

# NOTE: despite the "*_segments" names, these hold cruise ids (values of the
# 'expocode' column), not segment indices.
unique_segments = df_filtered['expocode'].unique()
print(f"number of unique cruises: {len(unique_segments)}")
# select 10% of the cruises for validation
valid_segments = split_rng.choice(
    unique_segments, size=int(len(unique_segments) * 0.1), replace=False
)

# Compute the membership mask once and use it for both halves.
in_validation = df_filtered['expocode'].isin(valid_segments)
df_valid = df_filtered[in_validation]
df_train = df_filtered[~in_validation]
print(df_valid.shape, df_train.shape)
# df_valid.to_parquet('../data/training_data/valdf_100km_random.pq')
# df_train.to_parquet('../data/training_data/traindf_100km_random.pq')

number of unique cruises: 121
(42880, 19) (341184, 19)
