# Download/Pre-process CFS forecast data
Lindsay Fitzpatrick
ljob@umich.edu
08/28/2024
Updated: 03/14/2024

This script downloads CFS forecast data from AWS (or, optionally, NCEI) as grib2 files. It then opens the grib2 files and calculates total basin, lake, and land precipitation and evaporation, as well as average 2m air temperature. These calculations are then added to the CFS forecast database. This script needs the following files:

- GL_mask.nc
- cfs_forecast_data.db

In [1]:
import os
import shutil
import sys

import netCDF4 as nc
import pandas as pd

In [2]:
# Add the path to the src directory (two levels up)
sys.path.append(os.path.abspath('../../'))

# reload() re-executes src.misc, so edits made to that module while the
# notebook kernel is running are picked up by the imports below.
from importlib import reload
import src.misc
reload(src.misc)

# NOTE(review): the wildcard import below is presumably what brings open_cfs_db,
# get_next_cfs_run, check_url_exists, download_grb2_aws, download_grb2_ncei,
# datetime and timedelta into scope for the later cells -- none of them is
# imported explicitly in this notebook. Confirm against src/data_retrieve.py.
from src.data_retrieve import *
from src.data_processing import process_grib_files
from src.calculations import calculate_grid_cell_areas
from src.misc import create_directory

## User Inputs

In [12]:
# Folder that contains your clone of the repo (keep the trailing slash)
path_to_repo = '/Users/ljob/Desktop/'

# Folder the grib2 files are downloaded into
download_dir = f'{path_to_repo}cnbs-predictor/data/CFS/'

# Folder holding the input files listed at the top of this notebook
input_dir = f'{path_to_repo}cnbs-predictor/data/input/'

# Great Lakes mask file
mask_file = f'{input_dir}GL_mask.nc'

# Database holding the CFS forecast data
database = f'{input_dir}cfs_forecast_data.db'

# Data source to pull from: 'aws' or 'ncei'
source = 'aws'

# 'yes' to download the CFS grib2 files
download_cfs = 'yes'

# 'yes' to run process_grib_files on the downloaded data (it is given the database above)
process_cfs = 'yes'

# 'yes' to delete the grib files once they have been processed
delete_files = 'no'

# With auto = 'yes' the date range is worked out for you: it starts at the next CFS run
# reported for the database and ends with yesterday, so the stored data is brought up-to-date.
# With any other value the start_date / end_date below are used instead.
auto = 'yes'

# Manual date range, 'MM-DD-YYYY' (an hour may be appended as ' HH')
start_date = '03-15-2025'
end_date = '03-16-2025'

Presets

These shouldn't change unless the location of the CFS data changes, the user wants different files (`products` specifies the file prefixes; different files contain different variables), or the user wants a specific forecast (`utc` specifies the forecast times).

In [13]:
## Presets ##
# File-name prefixes of the CFS products to fetch
products = ['pgb', 'flx']

# Forecast times (UTC), one every six hours
utc = [f'{hour:02d}' for hour in range(0, 24, 6)]

# Define mask variables: a lake and a land entry for each lake code
mask_variables = [f'{lake}_{surface}'
                  for lake in ('eri', 'ont', 'mih', 'sup')
                  for surface in ('lake', 'land')]

# AWS bucket name to locate the CFS forecast
bucket_name = 'noaa-cfs-pds'

## Begin Script

Check the download directory to see if it exists or create it if it does not.

In [6]:
# Make sure the download directory exists before anything is written to it
# (create_directory is imported from src.misc).
create_directory(download_dir)

Directory '/Users/ljob/Desktop/cnbs-predictor/data/CFS/' created.


Open existing Database or create new one if it does not already exist.

In [None]:
# Open the CFS forecast database at the path set in the user inputs.
# NOTE(review): the return value is discarded, so this call is presumably made for
# its side effect (creating the database when it does not exist yet) -- confirm.
open_cfs_db(database)

This section allows the user to set the script to auto. When auto = 'yes', the script looks up the next CFS run to process in the database and uses it as the start date. It then runs through yesterday's date in order to be caught up. If auto = 'no', the user can input a date range. This option is convenient for testing or for starting a new database.

In [14]:
# Work out the first and last CFS run to process. start_date_i and end_date_i
# feed pd.date_range at the bottom, which yields one entry per 6-hourly run.
if auto == 'yes':
    # Fetch next cfs_run date and use yesterday's 18Z run for the end date
    start_date_i = get_next_cfs_run(database, 'cfs_forecast_data')
    end_date_i = (datetime.now() - timedelta(days=1)).strftime("%m-%d-%Y") + " 18"

    # Validate dates. Compare as timestamps rather than as text: "MM-DD-YYYY HH"
    # strings sort by month first, so "12-31-2024 18" > "01-01-2025 00" as text
    # and the check would give the wrong answer across a year boundary.
    # pd.Timestamp is the same conversion pd.date_range applies to its endpoints.
    if pd.Timestamp(start_date_i) >= pd.Timestamp(end_date_i):
        print("The csv files are up-to-date.")
    else:
        print(f"Starting from: {start_date_i}Z and continuing through: {end_date_i}Z")

else:
    # Ensure both start_date and end_date have hour info
    # (a bare 'MM-DD-YYYY' is 10 characters long)
    start_date = (start_date + " 00") if len(start_date) == 10 else start_date
    end_date = (end_date + " 18") if len(end_date) == 10 else end_date

    # Convert to datetime objects for comparison
    start_date_i = datetime.strptime(start_date, "%m-%d-%Y %H")
    end_date_i = datetime.strptime(end_date, "%m-%d-%Y %H")

    # Validate dates
    if start_date_i == end_date_i:
        print(start_date_i)
        print("The csv files are up-to-date.")
    elif start_date_i > end_date_i:
        print(start_date_i)
        print("There is an error in the input dates. Please try again.")
    else:
        print(f"Starting from: {start_date_i.strftime('%m-%d-%Y %H')}Z and continuing through: {end_date_i.strftime('%m-%d-%Y %H')}Z")

# Both endpoints are inclusive, so an end before the start gives an empty range.
# NOTE(review): when start and end are equal the range still holds that one run,
# so it is processed even though "up-to-date" was printed -- confirm this is intended.
date_array = pd.date_range(start=start_date_i, end=end_date_i, freq='6h')

Starting from: 03-17-2025 00Z and continuing through: 03-19-2025 18Z


Open the mask file. Pull the latitude and longitude, which are used to cut the global variables down to just the Great Lakes domain and upscale them. Also calculate the area of each grid cell.

In [15]:
# Open the mask file and calculate the grid cell areas
mask_ds = nc.Dataset(mask_file)
mask_lat = mask_ds.variables['latitude'][:]
mask_lon = mask_ds.variables['longitude'][:]
area = calculate_grid_cell_areas(mask_lon, mask_lat)

Begin the loop through the requested dates. For each date, the loop creates a directory for the CFS grib files, calls the download function for the selected source (download_grb2_aws or download_grb2_ncei), and then calls process_grib_files to do the calculations and save them to the database. It then deletes the grib2 files (if requested) and moves on to the next date.

In [None]:
# Loop over every 6-hourly CFS run in date_array. For each run: optionally
# download the grib2 files, optionally process them with process_grib_files,
# and optionally delete the downloaded files afterwards.
for date in date_array:
    print(f"Beginning Files for {date}.")

    YYYY = date.strftime("%Y")
    MM = date.strftime("%m")
    DD = date.strftime("%d")
    HH = date.strftime("%H")

    # The path contains the day but not the hour, so all runs of one
    # calendar day use the same download directory.
    download_path = f'{download_dir}{YYYY}{MM}{DD}/CFS/'
    os.makedirs(download_path, exist_ok=True)

    # Download the grib2 files using AWS or NCEI
    if download_cfs == 'yes':
        for product in products:
            if source == 'aws':
                url_path = f'cfs.{YYYY}{MM}{DD}/{HH}/monthly_grib_01/'
                download_grb2_aws(product, bucket_name, url_path, download_path)
            elif source == 'ncei':
                base_url = 'https://www.ncei.noaa.gov/data/climate-forecast-system/access/operational-9-month-forecast/monthly-means/'
                # base_url already ends in '/', so no extra separator is added
                # (the previous form produced '.../monthly-means//YYYY/...').
                url_path = f'{base_url}{YYYY}/{YYYY}{MM}/{YYYY}{MM}{DD}/{YYYY}{MM}{DD}{HH}/'
                if not check_url_exists(url_path):
                    print(f"No files available for {date}.")
                else:
                    download_grb2_ncei(product, url_path, download_path)
            else:
                print('Input source does not exist. Source must be aws or ncei.')

    if process_cfs == 'yes':

        process_grib_files(download_path, database, 'cfs_forecast_data', f'{YYYY}{MM}{DD}{HH}', mask_lat, mask_lon, mask_ds, mask_variables, area)

        if delete_files == 'yes':
            # os.rmdir only removes empty directories and raises OSError while the
            # downloaded grib2 files are still inside; rmtree removes the directory
            # together with its contents.
            shutil.rmtree(download_path)

    print(f'Done with {date}.')
print("Process Complete")

Beginning Files for 2025-03-17 00:00:00.
Downloaded: cfs.20250317/00/monthly_grib_01/pgbf.01.2025031700.202503.avrg.grib.grb2
Downloaded: cfs.20250317/00/monthly_grib_01/pgbf.01.2025031700.202504.avrg.grib.grb2
Downloaded: cfs.20250317/00/monthly_grib_01/pgbf.01.2025031700.202505.avrg.grib.grb2
Downloaded: cfs.20250317/00/monthly_grib_01/pgbf.01.2025031700.202506.avrg.grib.grb2
Downloaded: cfs.20250317/00/monthly_grib_01/pgbf.01.2025031700.202507.avrg.grib.grb2
Downloaded: cfs.20250317/00/monthly_grib_01/pgbf.01.2025031700.202508.avrg.grib.grb2
Downloaded: cfs.20250317/00/monthly_grib_01/pgbf.01.2025031700.202509.avrg.grib.grb2
Downloaded: cfs.20250317/00/monthly_grib_01/pgbf.01.2025031700.202510.avrg.grib.grb2
Downloaded: cfs.20250317/00/monthly_grib_01/pgbf.01.2025031700.202511.avrg.grib.grb2
Downloaded: cfs.20250317/00/monthly_grib_01/pgbf.01.2025031700.202512.avrg.grib.grb2
Downloaded: cfs.20250317/00/monthly_grib_01/flxf.01.2025031700.202503.avrg.grib.grb2
Downloaded: cfs.20250317

Close any open files before finishing script.

In [17]:
# Close the mask dataset that was opened with nc.Dataset before the processing loop.
mask_ds.close()