This script downloads yesterday's CFS GRIB2 files from AWS, converts them to NetCDF files using the wgrib2 command, drops irrelevant variables to reduce file size, merges the monthly NetCDF files for each run into one file, and then cuts the result to the Great Lakes domain and remaps it onto the grid of the mask file.

The following are required to run this script:
- wgrib2 (https://www.ftp.cpc.ncep.noaa.gov/wd51we/wgrib2/Windows10/v3.1.3/wgrib2.exe)
- The Python packages xarray and boto3 (boto3 also provides botocore)

Example output files:
- flxf.01.2024072100.allmonths.remapped.nc
- flxf.01.2024072106.allmonths.remapped.nc
- flxf.01.2024072112.allmonths.remapped.nc
- flxf.01.2024072118.allmonths.remapped.nc
- pgbf.01.2024072100.allmonths.remapped.nc
- pgbf.01.2024072106.allmonths.remapped.nc
- pgbf.01.2024072112.allmonths.remapped.nc
- pgbf.01.2024072118.allmonths.remapped.nc

In [1]:
from datetime import datetime, timedelta
import os
import xarray as xr
import boto3
from botocore import UNSIGNED
from botocore.config import Config
from collections import defaultdict

In [2]:
## USER INPUTS ##

# Root folder for everything this notebook downloads and produces.
# Later cells build sub-paths by plain string concatenation, so keep the trailing slash.
dir = "C:/Users/fitzpatrick/Desktop/Data/"

# NetCDF mask file; processed files are interpolated onto its grid.
mask = "C:/Users/fitzpatrick/Desktop/Data/Input/GL_mask.nc"

In [3]:
## Presets ##
# Product name fragments; a file is downloaded when one of these appears in its S3 key.
products = ['pgb', 'flx']
# Sub-folder names under each date folder in the bucket
# (presumably the 00/06/12/18 UTC forecast cycles - confirm).
utc = ['00', '06', '12', '18']
# Bounding box as [lon_min, lon_max, lat_min, lat_max].
# NOTE(review): longitudes look like the 0-360 convention - confirm against the data.
lonlatbox = [250, 295, 30, 70]

# Read the clock once so both date stamps come from the same instant. Two
# separate datetime.today() calls could straddle midnight, which would make
# `today` and `yesterday` the same date.
_now = datetime.today()
today = _now.strftime('%Y%m%d')
yesterday = (_now - timedelta(days=1)).strftime('%Y%m%d')

In [4]:
def download_grb2_aws(product, bucket_name, folder_path, download_dir):
    """
    Download the GRIB2 objects for one product from an S3 prefix.

    Parameters:
    - product: Text that must appear somewhere in the object key (e.g. 'pgb').
    - bucket_name: Name of the S3 bucket to read from.
    - folder_path: Key prefix to list.
    - download_dir: Local directory to write to; each key's path relative to
      folder_path is recreated underneath it.
    Returns:
    - None. Each downloaded key and the final count are printed.
    """
    # Unsigned requests: no AWS credentials are sent
    client = boto3.client('s3', config=Config(signature_version=UNSIGNED))

    # list_objects_v2 returns results in pages, so gather the whole listing first
    listing = []
    next_token = None
    more_pages = True
    while more_pages:
        request = {'Bucket': bucket_name, 'Prefix': folder_path}
        if next_token:
            request['ContinuationToken'] = next_token

        page = client.list_objects_v2(**request)
        listing.extend(page.get('Contents', []))

        more_pages = bool(page.get('IsTruncated', False))
        if more_pages:
            next_token = page.get('NextContinuationToken')

    # Keep only this product's keys that end in 'grib.grb2'
    wanted = [
        entry['Key']
        for entry in listing
        if product in entry['Key'] and entry['Key'].endswith('grib.grb2')
    ]

    for key in wanted:
        target = os.path.join(download_dir, os.path.relpath(key, folder_path))

        # Create any missing parent folders before writing the file
        os.makedirs(os.path.dirname(target), exist_ok=True)

        client.download_file(bucket_name, key, target)
        print(f"Downloaded: {key}")

    print(f'Total number of CFS files downloaded from AWS: {len(wanted)}')

In [5]:
def get_files(directory, where, format):
    """
    List the entries of a directory whose names start or end with a given text.

    Parameters:
    - directory: Path to the directory to search (sub-directories are not searched).
    - where: 'starts' to match the beginning of the name, 'ends' to match its end.
    - format: Text to match, e.g. '.grb2' or '.nc' together with where='ends'.
    Returns:
    - List of matching paths (directory joined with the entry name), sorted by
      name so the order does not depend on the file system.
    Raises:
    - ValueError: if where is neither 'starts' nor 'ends'.
    """
    # Fail loudly on a mistyped mode instead of silently returning an empty list
    if where not in ('starts', 'ends'):
        raise ValueError(f"where must be 'starts' or 'ends', got {where!r}")

    # os.listdir returns entries in arbitrary order; sort for reproducible results
    names = sorted(os.listdir(directory))

    if where == 'ends':
        selected = [name for name in names if name.endswith(format)]
    else:
        selected = [name for name in names if name.startswith(format)]

    return [os.path.join(directory, name) for name in selected]

In [6]:
## In order to convert grb2 files to netcdf on a windows machine, you need to download wgrib2.exe
## https://www.ftp.cpc.ncep.noaa.gov/wd51we/wgrib2/Windows10/v3.1.3/wgrib2.exe

import subprocess

def grb2_to_netcdf(input_file, output_file, wgrib2_path="C:/Users/fitzpatrick/Downloads/wgrib2"):
    """
    Convert one GRIB2 file to NetCDF with wgrib2 and delete the GRIB2 file on success.

    Parameters:
    - input_file: Path to the GRIB2 file to convert.
    - output_file: Path of the NetCDF file to create.
    - wgrib2_path: The wgrib2 executable to run. Defaults to the location used
      by the original notebook; pass another full path, or just 'wgrib2' if it
      is on your PATH.
    Returns:
    - None. A success or failure message is printed.

    A non-zero exit status from wgrib2 is reported and the GRIB2 file is kept.
    Errors from starting the executable or from deleting input_file are not
    caught here.
    """
    command = [wgrib2_path, input_file, "-netcdf", output_file]

    # Only the external command is guarded; check=True turns a non-zero exit
    # status into CalledProcessError
    try:
        subprocess.run(command, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Conversion failed with error: {e}")
        return

    # Remove the grb2 file only once the conversion has succeeded
    os.remove(input_file)
    print(f"Conversion successful. NetCDF file saved as {output_file}")

In [7]:
def merge_netcdf_files(dir_save, input_files):
    """
    Merge NetCDF files that share a name prefix into one file per prefix.

    Files are grouped by the first three dot-separated fields of their base
    name (e.g. 'flxf.01.2024081300'). Each file is reduced to the variables
    kept for its type, the group is concatenated along 'time', and the result
    is written to '<prefix>.allmonths.nc' in dir_save.

    Parameters:
    - dir_save: Directory the merged files are written to.
    - input_files: Paths of the NetCDF files to merge.
    Returns:
    - None. The path of every file written is printed.
    """
    # Variables to keep, keyed by the first field of the file name
    keep_vars = {
        'pgbf': ['APCP_surface'],
        'flxf': ['LHTFL_surface', 'TMP_2maboveground'],
    }

    # Group input files by common prefix
    file_groups = defaultdict(list)
    for file in input_files:
        prefix = '.'.join(os.path.basename(file).split('.')[:3])
        file_groups[prefix].append(file)

    for prefix, files in file_groups.items():
        datasets = []

        # Sort so the concatenation order does not depend on the order of
        # input_files; assumes the remaining name fields sort chronologically
        # (e.g. a YYYYMM field) - TODO confirm
        for file in sorted(files):
            variables = keep_vars.get(os.path.basename(file).split('.')[0])
            if variables is None:
                print('File not compatible')
                continue

            # Load the kept variables into memory while the file is open, so
            # the handle can be released before the data is used
            with xr.open_dataset(file) as ds:
                datasets.append(ds[variables].load())

        # Nothing usable in this group; concatenating an empty list would fail
        if not datasets:
            continue

        # Merge all datasets along time dimension
        combined = xr.concat(datasets, dim='time')

        # Save to a new NetCDF file
        output_filename = os.path.join(dir_save, f"{prefix}.allmonths.nc")
        combined.to_netcdf(output_filename)

        print(f'Complete: {output_filename}')
            

In [8]:
def cut_remap_netcdf(input_file, reference_file, output_file, lonlatbox):
    """
    Cut a NetCDF file to a lon-lat bounding box, interpolate it onto the grid
    of a reference file, save the result and delete the input file.

    Parameters:
        input_file (str): Path to the input NetCDF file. It is removed after
            the output has been written.
        reference_file (str): Path to the NetCDF file whose grid the data is
            interpolated onto.
        output_file (str): Path and filename of the output file.
        lonlatbox (list): Bounding box in the format [lon_min, lon_max, lat_min, lat_max].

    Returns:
        The cut and remapped dataset that was written to output_file.
    """
    # Unpack the lon-lat bounding box
    lon_min, lon_max, lat_min, lat_max = lonlatbox

    # Open both files under `with` so they are closed even if a step below
    # raises; otherwise they would stay open
    with xr.open_dataset(input_file) as ds, xr.open_dataset(reference_file) as ds_mask:
        # Select lat/lon range.
        # NOTE(review): the slices run low-to-high, which assumes both
        # coordinates are stored in ascending order - confirm for the input data.
        ds_cut = ds.sel(longitude=slice(lon_min, lon_max), latitude=slice(lat_min, lat_max))

        # Interpolate the input data onto the grid of the reference data
        ds_remapped = ds_cut.interp_like(ds_mask)

        # Save the remapped data to a new NetCDF file
        ds_remapped.to_netcdf(output_file)

    # Only reached when the output was written successfully
    os.remove(input_file)
    print('Processing complete: ',output_file)

    return ds_remapped

In [9]:
# Folder for the raw downloads, named after today's date
download_dir = dir + today + '/CFS/downloaded/'
if os.path.exists(download_dir):
    print("Directory already exists.")
else:
    os.makedirs(download_dir)
    print("Directory created.")

Directory created.


In [10]:
# Download yesterday's grib2 files from the NOAA CFS bucket on AWS
bucket_name = 'noaa-cfs-pds'

# The loop variable must not be named `utc`: that would replace the list of
# cycles with its last element, and re-running this cell would then iterate
# over the characters of '18'.
for cycle in utc:
    # The prefix depends only on the cycle, so build it once per cycle
    folder_path = f'cfs.{yesterday}/{cycle}/monthly_grib_01/'
    for product in products:
        download_grb2_aws(product, bucket_name, folder_path, download_dir)

Downloaded: cfs.20240813/00/monthly_grib_01/pgbf.01.2024081300.202408.avrg.grib.grb2
Downloaded: cfs.20240813/00/monthly_grib_01/pgbf.01.2024081300.202409.avrg.grib.grb2
Downloaded: cfs.20240813/00/monthly_grib_01/pgbf.01.2024081300.202410.avrg.grib.grb2
Downloaded: cfs.20240813/00/monthly_grib_01/pgbf.01.2024081300.202411.avrg.grib.grb2
Downloaded: cfs.20240813/00/monthly_grib_01/pgbf.01.2024081300.202412.avrg.grib.grb2
Downloaded: cfs.20240813/00/monthly_grib_01/pgbf.01.2024081300.202501.avrg.grib.grb2
Downloaded: cfs.20240813/00/monthly_grib_01/pgbf.01.2024081300.202502.avrg.grib.grb2
Downloaded: cfs.20240813/00/monthly_grib_01/pgbf.01.2024081300.202503.avrg.grib.grb2
Downloaded: cfs.20240813/00/monthly_grib_01/pgbf.01.2024081300.202504.avrg.grib.grb2
Downloaded: cfs.20240813/00/monthly_grib_01/pgbf.01.2024081300.202505.avrg.grib.grb2
Total number of CFS files downloaded from AWS: 10
Downloaded: cfs.20240813/00/monthly_grib_01/flxf.01.2024081300.202408.avrg.grib.grb2
Downloaded: cfs

In [11]:
# Convert every downloaded grib2 file in the download directory to netcdf
grb2_files = get_files(directory=download_dir, where='ends', format='.grb2')

for grib2_file in grb2_files:
    # Same path with the trailing '.grb2' swapped for '.nc'
    output_netcdf_file = f"{grib2_file[:-len('.grb2')]}.nc"
    grb2_to_netcdf(input_file=grib2_file, output_file=output_netcdf_file)

C:/Users/fitzpatrick/Desktop/Data/20240814/CFS/downloaded/
['C:/Users/fitzpatrick/Desktop/Data/20240814/CFS/downloaded/flxf.01.2024081300.202408.avrg.grib.grb2', 'C:/Users/fitzpatrick/Desktop/Data/20240814/CFS/downloaded/flxf.01.2024081300.202409.avrg.grib.grb2', 'C:/Users/fitzpatrick/Desktop/Data/20240814/CFS/downloaded/flxf.01.2024081300.202410.avrg.grib.grb2', 'C:/Users/fitzpatrick/Desktop/Data/20240814/CFS/downloaded/flxf.01.2024081300.202411.avrg.grib.grb2', 'C:/Users/fitzpatrick/Desktop/Data/20240814/CFS/downloaded/flxf.01.2024081300.202412.avrg.grib.grb2', 'C:/Users/fitzpatrick/Desktop/Data/20240814/CFS/downloaded/flxf.01.2024081300.202501.avrg.grib.grb2', 'C:/Users/fitzpatrick/Desktop/Data/20240814/CFS/downloaded/flxf.01.2024081300.202502.avrg.grib.grb2', 'C:/Users/fitzpatrick/Desktop/Data/20240814/CFS/downloaded/flxf.01.2024081300.202503.avrg.grib.grb2', 'C:/Users/fitzpatrick/Desktop/Data/20240814/CFS/downloaded/flxf.01.2024081300.202504.avrg.grib.grb2', 'C:/Users/fitzpatrick/

In [13]:
# Folder for the merged and remapped files, named after today's date
process_dir = dir + today + '/CFS/processed/'
if os.path.exists(process_dir):
    print("Directory already exists.")
else:
    os.makedirs(process_dir)
    print("Directory created.")

Directory created.


In [14]:
# Drop the unused variables and merge the monthly files into one netcdf file per prefix
nc_files = get_files(directory=download_dir, where='ends', format='.nc')
merge_netcdf_files(dir_save=process_dir, input_files=nc_files)

Cannot find the ecCodes library


Complete: C:/Users/fitzpatrick/Desktop/Data/20240814/CFS/processed/flxf.01.2024081300.allmonths.nc
Complete: C:/Users/fitzpatrick/Desktop/Data/20240814/CFS/processed/flxf.01.2024081306.allmonths.nc
Complete: C:/Users/fitzpatrick/Desktop/Data/20240814/CFS/processed/flxf.01.2024081312.allmonths.nc
Complete: C:/Users/fitzpatrick/Desktop/Data/20240814/CFS/processed/flxf.01.2024081318.allmonths.nc
Complete: C:/Users/fitzpatrick/Desktop/Data/20240814/CFS/processed/pgbf.01.2024081300.allmonths.nc
Complete: C:/Users/fitzpatrick/Desktop/Data/20240814/CFS/processed/pgbf.01.2024081306.allmonths.nc
Complete: C:/Users/fitzpatrick/Desktop/Data/20240814/CFS/processed/pgbf.01.2024081312.allmonths.nc
Complete: C:/Users/fitzpatrick/Desktop/Data/20240814/CFS/processed/pgbf.01.2024081318.allmonths.nc


In [15]:
file_list = get_files(process_dir, 'ends', '.nc')

# Cut each merged netcdf file to the lon-lat box and remap it onto the mask grid
# (0.25 degrees according to the original note - depends on the mask file)
for file in file_list:
    # Split the filename
    filename = os.path.basename(file)  # pulls the filename from the entire path
    name, ext = os.path.splitext(filename)  # splits the filename at the '.nc' so we can change the filename

    # Skip files that an earlier run of this cell already produced. Without
    # this, re-running would remap them a second time into
    # '*.remapped.remapped.nc' and delete the finished output.
    if name.endswith('.remapped'):
        continue

    # Create new file names for the new files
    new_filename = name + '.remapped' + ext
    new_netcdf = cut_remap_netcdf(file, mask, os.path.join(process_dir, new_filename), lonlatbox)

Processing complete:  C:/Users/fitzpatrick/Desktop/Data/20240814/CFS/processed/flxf.01.2024081300.allmonths.remapped.nc
Processing complete:  C:/Users/fitzpatrick/Desktop/Data/20240814/CFS/processed/flxf.01.2024081306.allmonths.remapped.nc
Processing complete:  C:/Users/fitzpatrick/Desktop/Data/20240814/CFS/processed/flxf.01.2024081312.allmonths.remapped.nc
Processing complete:  C:/Users/fitzpatrick/Desktop/Data/20240814/CFS/processed/flxf.01.2024081318.allmonths.remapped.nc
Processing complete:  C:/Users/fitzpatrick/Desktop/Data/20240814/CFS/processed/pgbf.01.2024081300.allmonths.remapped.nc
Processing complete:  C:/Users/fitzpatrick/Desktop/Data/20240814/CFS/processed/pgbf.01.2024081306.allmonths.remapped.nc
Processing complete:  C:/Users/fitzpatrick/Desktop/Data/20240814/CFS/processed/pgbf.01.2024081312.allmonths.remapped.nc
Processing complete:  C:/Users/fitzpatrick/Desktop/Data/20240814/CFS/processed/pgbf.01.2024081318.allmonths.remapped.nc
