In [None]:
import xarray as xr
import pandas as pd
import os
import tempfile
import rasterio
import rioxarray as rio
from netCDF4 import Dataset
import cdsapi
import requests
import numpy as np

In [None]:
# Function that slices the ERA5 files in time so they match the IFS-HRES files and the AIFS/GraphCast files

In [None]:
def filter_era5_time_range(input_file_path, output_file_path, start_time, end_time):
    """
    Filters an ERA5 NetCDF dataset (time steps are saved as "valid_time")
    based on a specified time range and saves it to a new file.

    Errors raised while opening the input file propagate to the caller.
    Errors raised while selecting or writing are reported with a printed
    message and the function returns normally.

    Parameters:
        input_file_path (str): Path to the original NetCDF file.
        output_file_path (str): Path where the filtered NetCDF file will be saved.
            Must differ from input_file_path: the input is still open while the
            output is written, so writing onto it would destroy the source data.
        start_time (str): Start time for filtering (ISO 8601 format, e.g., "2023-12-01T06:00:00").
        end_time (str): End time for filtering (ISO 8601 format, e.g., "2023-12-11T06:00:00").

    Returns:
        None
    """
    # Refuse to overwrite the file we are about to read from. This is checked
    # before anything is opened so the source file is never touched.
    path_types = (str, os.PathLike)
    if (
        isinstance(input_file_path, path_types)
        and isinstance(output_file_path, path_types)
        and os.path.abspath(input_file_path) == os.path.abspath(output_file_path)
    ):
        print(
            "An error occurred: output_file_path must differ from input_file_path "
            f"({input_file_path})"
        )
        return

    # The context manager closes the input file on every exit path.
    with xr.open_dataset(input_file_path) as era5ds:
        try:
            # Filter the dataset for the specified time range (both ends inclusive)
            filtered_era5ds = era5ds.sel(valid_time=slice(start_time, end_time))

            try:
                # Save the filtered dataset to the specified output path
                filtered_era5ds.to_netcdf(output_file_path)
            finally:
                filtered_era5ds.close()

            print(f"Filtered data saved to {output_file_path}")

        except Exception as e:
            print(f"An error occurred: {e}")

In [None]:
# Slice the time range of every ERA5 file to match the IFS-HRES time range
# Time range: 2024-XX-01T00:00:00 to 2024-XX-11T00:00:00 (XX = month)
# Apply the function to all monthly ERA5 files in both the "Pressure Variables" folder and the "Surface Variables" folder

In [None]:
# IFS-HRES window for the 20240301 "q" file: 00:00 on the 1st to 00:00 on the 11th
start_time, end_time = "2024-03-01T00:00:00", "2024-03-11T00:00:00"

# Source file and the "_fc_" output written next to it
input_file_path = "Pressure Variables/20240301/20240301_era5_q.nc"
output_file_path = "Pressure Variables/20240301/20240301_era5_fc_q.nc"

filter_era5_time_range(
    input_file_path=input_file_path,
    output_file_path=output_file_path,
    start_time=start_time,
    end_time=end_time,
)

In [None]:
# Slice the time range of every ERA5 file to match the AIFS and GraphCast time range
# Time range: 2024-XX-01T06:00:00 to 2024-XX-11T06:00:00 (XX = month)
# Apply the function to all monthly ERA5 files in both the "Pressure Variables" folder and the "Surface Variables" folder

In [None]:
# AIFS / GraphCast window for the 20240301 "q" file: 06:00 on the 1st to 06:00 on the 11th
start_time, end_time = "2024-03-01T06:00:00", "2024-03-11T06:00:00"

# Source file and the "_gcai_" output written next to it
input_file_path = "Pressure Variables/20240301/20240301_era5_q.nc"
output_file_path = "Pressure Variables/20240301/20240301_era5_gcai_q.nc"

filter_era5_time_range(
    input_file_path=input_file_path,
    output_file_path=output_file_path,
    start_time=start_time,
    end_time=end_time,
)

In [None]:
# Fix the longitude of the data files
# Change the longitude range from [0, 360] to [-180, 180].
# Make sure the new NetCDF file is georeferenced to the same CRS as the original one (WGS84, EPSG:4326)

In [None]:
# Function to fix the longitude range

In [None]:
def long_fix(input_file_path):
    """
    Rewrites a NetCDF file in place so its longitudes run from -180 to 180
    instead of 0 to 360, and tags it with the EPSG:4326 CRS.

    The converted dataset is first written to "<input_file_path>.tmp" and then
    moved over the original with os.replace, so the original is only replaced
    once the new file has been written completely. If the longitudes already
    lie within [-180, 180] the file is left untouched.

    Errors raised while opening the input file propagate to the caller.
    Errors raised during conversion, writing or replacing are reported with a
    printed message; any partially written temporary file is removed.

    Parameters:
        input_file_path (str): Path to the NetCDF file to fix in place.

    Returns:
        None
    """
    temp_file_path = os.fspath(input_file_path) + ".tmp"
    cleanup_temp = False  # True while a temp file of ours may exist on disk

    # Keep a reference to the dataset that owns the open file handle: `ds` is
    # rebound to derived datasets below, and the handle must be closed before
    # the original file is replaced.
    source_ds = xr.open_dataset(input_file_path)

    try:
        with source_ds:
            ds = source_ds

            # Ensure longitude and latitude are correctly assigned as coordinates
            if "longitude" not in ds.coords:
                ds = ds.assign_coords(longitude=ds["longitude"])
            if "latitude" not in ds.coords:
                ds = ds.assign_coords(latitude=ds["latitude"])

            # Check if longitude is already in the -180 to 180 range
            lon_min, lon_max = ds.longitude.min().item(), ds.longitude.max().item()
            if lon_min >= -180 and lon_max <= 180:
                print("Longitude is already in the correct range. No conversion needed.")
                return  # Exit without modifying the file

            # Convert longitudes from 0-360 to -180 to 180 (applied exactly once)
            ds.coords['longitude'] = ((ds.coords['longitude'] + 180) % 360) - 180
            ds = ds.sortby(ds.longitude)  # Sort longitudes after conversion

            # Ensure CRS is assigned
            ds.rio.write_crs("epsg:4326", inplace=True)

            # Set explicit encoding (GDAL needs this - so that QGIS visualises the extent correctly)
            ds['longitude'].attrs['standard_name'] = 'longitude'
            ds['latitude'].attrs['standard_name'] = 'latitude'
            ds['longitude'].attrs['units'] = 'degrees_east'
            ds['latitude'].attrs['units'] = 'degrees_north'

            # Write to the temporary file while the source is still open,
            # because the data may be read lazily from it.
            cleanup_temp = True
            ds.to_netcdf(temp_file_path)

        # The source handle is closed here; replace the original file safely
        os.replace(temp_file_path, input_file_path)
        cleanup_temp = False

        print(f"Processed file saved as: {input_file_path}")

    except Exception as e:
        print(f"An error occurred: {e}")

    finally:
        # Do not leave a half-written temp file behind after a failure
        if cleanup_temp and os.path.exists(temp_file_path):
            os.remove(temp_file_path)

In [None]:
# Fix the longitude of all files in the Surface Variable folder

In [None]:
# Root folder whose subfolders hold the surface-variable NetCDF files
base_dir = "Surface Variables"

# Visit the subfolders in sorted order and run long_fix on every ".nc" file inside
for subfolder in sorted(os.listdir(base_dir)):
    subfolder_path = os.path.join(base_dir, subfolder)
    if not os.path.isdir(subfolder_path):
        continue  # only directories are descended into

    for file_name in os.listdir(subfolder_path):
        if not file_name.endswith(".nc"):
            continue  # ignore anything that is not a NetCDF file

        input_file_path = os.path.join(subfolder_path, file_name)
        long_fix(input_file_path)

print("All surface variable files have been processed.")

In [None]:
# Fix the longitude of all files in the Pressure Variable folder

In [None]:
# Root folder whose subfolders hold the pressure-variable NetCDF files
base_dir = "Pressure Variables"

# Visit the subfolders in sorted order and run long_fix on every ".nc" file inside
for subfolder in sorted(os.listdir(base_dir)):
    subfolder_path = os.path.join(base_dir, subfolder)
    if not os.path.isdir(subfolder_path):
        continue  # only directories are descended into

    for file_name in os.listdir(subfolder_path):
        if not file_name.endswith(".nc"):
            continue  # ignore anything that is not a NetCDF file

        input_file_path = os.path.join(subfolder_path, file_name)
        long_fix(input_file_path)

print("All pressure variable files have been processed.")