In [2]:
# system management
import glob
import os
import importlib

# Array
import xarray as xr
import dask
import numpy as np
import pandas as pd

# See time evolution
from tqdm import tqdm

# Goal: combine the individual data files into daily datasets

In [3]:
# Folder that is scanned for the individual input '*.nc' files.
data_folder = r"C:\Users\gezas\kDrive\SIE-Project\CL61\Cl61_module\temp"
# Folder that receives one combined NetCDF file per day.
output_folder = r"C:\Users\gezas\kDrive\SIE-Project\CL61\Data_daily"

In [22]:
# Build an index of every NetCDF file in `data_folder`, keyed by the timestamp
# encoded in the file name.
#
# The file names are derived from the globbed paths themselves. Listing the
# directory a second time (os.listdir) gives no guarantee of returning entries
# in the same order, or even the same number of entries, as glob, which would
# silently pair a path with another file's name and timestamp.
filepaths = sorted(glob.glob(os.path.join(data_folder, '*.nc')))
filenames = [os.path.basename(fpath) for fpath in filepaths]

df_data_files = pd.DataFrame({'file_path': filepaths, 'filename': filenames})
# expand=False returns a Series (one capture group), which is what a single column expects.
df_data_files['timestamp'] = df_data_files['filename'].str.extract(
    r'^live_(\d+_\d*)\.nc$', expand=False
)

# Names that do not match the pattern (or carry an incomplete timestamp) become
# NaT instead of raising; they are reported and left out of the index.
parsed_times = pd.to_datetime(
    df_data_files['timestamp'], format='%Y%m%d_%H%M%S', errors='coerce'
)
unparsed = parsed_times.isna()
if unparsed.any():
    print(f"Skipping {int(unparsed.sum())} file(s) without a parsable timestamp in their name")

# Index by acquisition time, in chronological order (no in-place mutation so
# the cell can be re-run safely).
df_data_files = (
    df_data_files.loc[~unparsed]
    .set_index(parsed_times[~unparsed])
    .sort_index()
)
df_data_files.head(5)

Unnamed: 0_level_0,file_path,filename,timestamp
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2023-02-22 00:04:10,C:\Users\gezas\kDrive\SIE-Project\CL61\Cl61_mo...,live_20230222_000410.nc,20230222_000410
2023-02-22 00:09:10,C:\Users\gezas\kDrive\SIE-Project\CL61\Cl61_mo...,live_20230222_000910.nc,20230222_000910
2023-02-22 00:14:10,C:\Users\gezas\kDrive\SIE-Project\CL61\Cl61_mo...,live_20230222_001410.nc,20230222_001410
2023-02-22 00:19:10,C:\Users\gezas\kDrive\SIE-Project\CL61\Cl61_mo...,live_20230222_001910.nc,20230222_001910
2023-02-22 00:24:11,C:\Users\gezas\kDrive\SIE-Project\CL61\Cl61_mo...,live_20230222_002411.nc,20230222_002411


In [38]:
# Inclusive bounds of the period to process (one output file per calendar day).
first_date, last_date = "2023-02-22 00:00:00", "2023-02-23 23:59:00"

In [60]:
# One entry per day, stepping daily from first_date up to (at most) last_date.
selected_dates = pd.date_range(
    start=first_date,
    end=last_date,
    freq='D',
)
selected_dates

DatetimeIndex(['2023-02-22', '2023-02-23'], dtype='datetime64[ns]', freq='D')

# Combine data into daily datasets

In [40]:
# Combine all files of each requested day into a single NetCDF file.

# Make sure the destination exists before the first write.
os.makedirs(output_folder, exist_ok=True)

# Calendar dates present in the file index, computed once instead of once per day.
available_dates = set(df_data_files.index.date)

for selected_date in selected_dates:
    # check date validity
    if selected_date.date() not in available_dates:
        print(f"Date {selected_date} not found in data files..")
        continue

    # Get rows with corresponding date (partial-string match on the DatetimeIndex)
    day_timestamp = selected_date.strftime('%Y-%m-%d')
    print(day_timestamp)
    selected_rows = df_data_files.loc[day_timestamp]

    # Collect the opened datasets and concatenate them in ONE call: calling
    # xr.concat on a growing dataset inside the loop redoes the work for every
    # file (quadratic) and builds a needlessly deep lazy graph.
    day_datasets = []
    try:
        for file_path in tqdm(selected_rows['file_path'], total=selected_rows.shape[0]):
            # chunks='auto' keeps the data lazy, so the source files must stay
            # open until the combined dataset has been written.
            day_datasets.append(xr.open_dataset(file_path, chunks='auto'))

        combined_dataset = xr.concat(day_datasets, dim='time')

        # Define the output filename
        output_filename = f"cl61_{day_timestamp}.nc"

        # Define the output file path
        output_filepath = os.path.join(output_folder, output_filename)
        print(f"Day : {day_timestamp} --> writing to : {output_filepath}")

        combined_dataset.to_netcdf(output_filepath)
    finally:
        # Release the file handles even if opening, combining or writing failed.
        for opened_dataset in day_datasets:
            opened_dataset.close()


2023-02-22
                                                             file_path  \
timestamp                                                                
2023-02-22 00:04:10  C:\Users\gezas\kDrive\SIE-Project\CL61\Cl61_mo...   
2023-02-22 00:09:10  C:\Users\gezas\kDrive\SIE-Project\CL61\Cl61_mo...   
2023-02-22 00:14:10  C:\Users\gezas\kDrive\SIE-Project\CL61\Cl61_mo...   
2023-02-22 00:19:10  C:\Users\gezas\kDrive\SIE-Project\CL61\Cl61_mo...   
2023-02-22 00:24:11  C:\Users\gezas\kDrive\SIE-Project\CL61\Cl61_mo...   
...                                                                ...   
2023-02-22 23:35:11  C:\Users\gezas\kDrive\SIE-Project\CL61\Cl61_mo...   
2023-02-22 23:40:11  C:\Users\gezas\kDrive\SIE-Project\CL61\Cl61_mo...   
2023-02-22 23:45:11  C:\Users\gezas\kDrive\SIE-Project\CL61\Cl61_mo...   
2023-02-22 23:50:11  C:\Users\gezas\kDrive\SIE-Project\CL61\Cl61_mo...   
2023-02-22 23:55:11  C:\Users\gezas\kDrive\SIE-Project\CL61\Cl61_mo...   

                          

  0%|          | 0/287 [00:00<?, ?it/s]

100%|██████████| 287/287 [00:30<00:00,  9.42it/s]
