This notebook gathers all the sea data processed by BasicPreprocessing-HourslySeaDataV3.py (which was run eight times, in half-year increments covering 2018 to 2021) and combines it into one coherent data set that occupies minimal memory.

BasicPreprocessing-HourslySeaDataV3.py created the files sea_data_hourly_2018H1.pickle through sea_data_hourly_2021H2.pickle, which contain sea height measured at various latitudes and longitudes, with six months' worth of data stored in each file.

# Import Libraries

In [1]:
import pickle
import pandas as pd
import os
from glob import glob
import numpy as np

# Define paths

In [2]:
#Define paths to data and find the files

data_path = "Processed Data"
file_pattern = "sea_data_hourly*.pickle"

#Collect every pickle matching the pattern, then order the paths alphabetically
file_list = glob(os.path.join(data_path, file_pattern))
file_list.sort()

In [3]:
#Module-level containers:
#  combined_data    - list for data (reassigned when the files are combined)
#  processed_frames - dict for batches of processed data
#  time_orig_dict   - empty dict; not referenced by the other cells shown in this notebook
combined_data, processed_frames, time_orig_dict = [], {}, {}

# Clean Data

In [4]:
#26 January 2025: I realised I entered the wrong dates for 2019H2, 2020H2 and 2021H2 when running the code to generate the raw data
#When getting H2 data, I started from June instead of July for three years.
#To save time on re-running the scripts from scratch for these dates, I am cropping them at this stage.
#This function would not be required if the right dates are entered in the script.

#02 February 2025: I had to fix the region of interest in the scripts generating the data, so this function is no longer technically required.

def filtertimerange(df_time, period):
    if period == '2019H2':
        return df_time[(df_time >= pd.Timestamp('2019-07-01'))]
    elif period == '2020H2':
        return df_time[(df_time >= pd.Timestamp('2020-07-01'))]
    elif period == '2021H2':
        return df_time[(df_time >= pd.Timestamp('2021-07-01'))]
    else:
        return df_time

In [5]:
#Process each pickle file. Y, lat and lon are ndarrays, time is a pd series

def get_time_and_Y(file_path):
    """Load one half-year pickle and return its cropped 'time' and flattened 'Y'.

    A new dict is built on every call instead of writing into a module-level
    dict, so results from different files never alias each other and a file
    with a missing or malformed entry cannot silently inherit the previous
    file's data.

    Parameters
    ----------
    file_path : str
        Path to a pickle named like ``sea_data_hourly_<period>.pickle``. The
        period tag (e.g. '2019H2') is read from the fourth underscore-separated
        field of the file name.

    Returns
    -------
    dict
        'time' : tz-naive datetime Series, cropped by ``filtertimerange``
                 (present when the file's 'time' entry is a pd.Series).
        'Y'    : DataFrame with one row per kept time step and the remaining
                 axes flattened into columns (present when the file's 'Y'
                 entry is a 3-D ndarray).

    Raises
    ------
    ValueError
        If the file has a usable 'Y' array but no usable 'time' Series, because
        'Y' cannot be cropped without it.
    """
    #NOTE: pickle.load can execute arbitrary code - only open files you generated yourself
    with open(file_path, 'rb') as file:
        data = pickle.load(file)

    #Extract the period from the file name to decide whether cropping is needed
    #(wrong start dates were used for some H2 runs). os.path.basename strips the
    #folder using the platform's own separator, so folders containing '_' and
    #backslash-separated paths are handled correctly.
    period = os.path.basename(file_path).split('_')[3].replace('.pickle', '')

    result = {}
    time_orig = None

    #Time has to be handled first because the cropping of Y depends on it
    raw_time = data.get('time')
    if isinstance(raw_time, pd.Series):
        #Remove timezone information to facilitate further processing
        time_orig = pd.to_datetime(raw_time.apply(lambda x: x.replace(tzinfo=None)))
        #Crop extra time periods
        result['time'] = filtertimerange(time_orig, period)

    raw_Y = data.get('Y')
    if isinstance(raw_Y, np.ndarray) and raw_Y.ndim == 3:
        if time_orig is None:
            raise ValueError(f"{file_path}: 'Y' cannot be cropped because no usable 'time' Series was found")
        #Keep only the rows of Y whose timestamp survived the cropping
        time_mask = time_orig.isin(result['time']).to_numpy()
        kept = raw_Y[time_mask]
        #flatten grid: one row per time step
        result['Y'] = pd.DataFrame(kept.reshape(kept.shape[0], -1))

    return result

In [6]:
#Process files and combine data

def combine_pickle_files(file_list):
    """Run get_time_and_Y over every file and collect the results.

    Parameters
    ----------
    file_list : iterable of str
        Paths of the pickle files to process, in the order they should be combined.

    Returns
    -------
    tuple of three lists, each with one entry per file:
        combined_data : a snapshot (shallow copy) of the dict returned for the file
        combined_time : that dict's 'time' entry
        combined_Y    : that dict's 'Y' entry

    Raises
    ------
    KeyError
        If the dict returned for a file has no 'time' or no 'Y' entry.
    """
    combined_data = []
    combined_time = []
    combined_Y = []
    for file in file_list:
        #extract all the 'Y's and 'time's from each file
        print(f"Processing: {file}")
        file_data = get_time_and_Y(file)
        #Store a copy: if the loader hands back the same dict object on every
        #call, appending it directly would make all entries show the last file.
        combined_data.append(dict(file_data))
        combined_time.append(file_data['time'])
        combined_Y.append(file_data['Y'])

    return combined_data, combined_time, combined_Y

In [7]:
#Process every file found in file_list; the three returned lists hold one entry per file
#(the returned dict, its 'time' entry and its 'Y' entry respectively)
combined_data, combined_time, combined_Y = combine_pickle_files(file_list)

Processing: Processed Data\sea_data_hourly_2018H1.pickle
Processing: Processed Data\sea_data_hourly_2018H2.pickle
Processing: Processed Data\sea_data_hourly_2019H1.pickle
Processing: Processed Data\sea_data_hourly_2019H2.pickle
Processing: Processed Data\sea_data_hourly_2020H1.pickle
Processing: Processed Data\sea_data_hourly_2020H2.pickle
Processing: Processed Data\sea_data_hourly_2021H1.pickle
Processing: Processed Data\sea_data_hourly_2021H2.pickle


In [8]:
#Stitch the per-file time series end to end with a fresh 0..N-1 index,
#then expose the result as a one-column frame named 'time'
combined_time_series = pd.concat(combined_time, ignore_index=True)
df_time = combined_time_series.to_frame(name='time')
print(df_time)

                     time
0     2018-01-01 00:00:00
1     2018-01-01 01:00:00
2     2018-01-01 02:00:00
3     2018-01-01 03:00:00
4     2018-01-01 04:00:00
...                   ...
35059 2021-12-31 19:00:00
35060 2021-12-31 20:00:00
35061 2021-12-31 21:00:00
35062 2021-12-31 22:00:00
35063 2021-12-31 23:00:00

[35064 rows x 1 columns]


In [9]:
#Each entry of combined_Y is already a DataFrame (one row per time step), so the
#per-file frames are stacked directly and given a fresh 0..N-1 row index
df_Y = pd.concat(combined_Y, ignore_index=True)
print(df_Y)

        0      1      2      3      4      5      6      7      8      9     \
0      0.637  0.650  0.665  0.681  0.694  0.703  0.708  0.710  0.711  0.713   
1      0.637  0.655  0.671  0.689  0.704  0.715  0.722  0.724  0.726  0.729   
2      0.623  0.643  0.663  0.682  0.699  0.712  0.720  0.725  0.728  0.733   
3      0.602  0.623  0.645  0.667  0.686  0.702  0.712  0.719  0.724  0.729   
4      0.567  0.590  0.613  0.638  0.660  0.679  0.692  0.701  0.708  0.715   
...      ...    ...    ...    ...    ...    ...    ...    ...    ...    ...   
35059  1.151  1.177  1.198  1.222  1.241  1.258  1.272  1.282  1.287  1.289   
35060  1.127  1.149  1.169  1.191  1.209  1.224  1.238  1.247  1.251  1.254   
35061  1.056  1.079  1.097  1.118  1.134  1.149  1.161  1.169  1.172  1.174   
35062  1.038  1.059  1.076  1.096  1.111  1.125  1.136  1.143  1.146  1.148   
35063  1.015  1.035  1.051  1.070  1.085  1.097  1.108  1.115  1.117  1.119   

       ...   9954   9955   9956   9957   9958   995

In [10]:
# Read lat and lon from one file. These are the same for all the files, so we only need to extract them once (they are the 'map')

#Build the path from data_path so it stays in sync with the folder configured above
file_path = os.path.join(data_path, "sea_data_hourly_2018H1.pickle")
#NOTE: pickle.load can execute arbitrary code - only open files you generated yourself
with open(file_path, 'rb') as file:
    data = pickle.load(file)

#Fail here with a clear message instead of leaving lat/lon undefined until the save step
for key in ('lat', 'lon'):
    if key not in data or getattr(data[key], 'ndim', None) != 2:
        raise ValueError(f"{file_path}: expected a 2-D '{key}' entry")

#extract lat and lon as distinct data frames
lat = pd.DataFrame(data['lat'])
lon = pd.DataFrame(data['lon'])

# Save Data

In [11]:
#Export dataset to parquet format - one file for each dataframe

#Each frame is written to <data_path>/SEA_data_<name>.parquet; using data_path keeps the
#output folder in sync with the folder the input files were read from
frames_to_save = {'time': df_time, 'Y': df_Y, 'lat': lat, 'lon': lon}
for name, frame in frames_to_save.items():
    out_path = os.path.join(data_path, f'SEA_data_{name}.parquet')
    frame.to_parquet(out_path, engine='pyarrow', compression='snappy')

print(f"Data saved to folder {data_path}")

Data saved to folder Processed Data
