## Import libraries

These are the required libraries to run this script. Libraries can be individually
installed using pip/conda (ie. conda install xarray, conda install -c conda-forge xarray
pip install xarray).

In [1]:
from datetime import datetime, timedelta
import os
import xarray as xr
import boto3
from botocore import UNSIGNED
from botocore.config import Config
from collections import defaultdict
import cfgrib
import pandas as pd
import netCDF4 as nc
import numpy as np
import calendar
import shutil

## User Inputs

In [52]:
# Path to download data to
dir = 'C:/Users/fitzpatrick/Desktop/Data/'
# Location of the mask file
mask_file = dir + 'Input/GL_mask.nc'

tmp_csv = dir + 'CFS_TMP_forecasts_Avg.csv'
evap_csv = dir + 'CFS_EVAP_forecasts_Sums.csv'
pcp_csv = dir + 'CFS_PCP_forecasts_Sums.csv'


## Pre-defined Constants

In [53]:
## Presets ##
products = ['pgb','flx']
utc = ['00','06','12','18']

today = datetime.today().strftime('%Y%m%d')
yesterday = (datetime.today() - timedelta(days=1)).strftime('%Y%m%d')

date = 20240802

## Define Functions

This function downloads the CFS forecast data from AWS

In [32]:
def download_grb2_aws(product, bucket_name, folder_path, download_dir):
    """
    Download the CFS forecast from AWS

    Parameters:
    - product: 'flx' or 'pgb'
    - bucket_name: for CFS data it is 'noaa-cfs-pds'
    - folder_path: the url path to data
    - download_dir: location to download data to
    """
    num_files_downloaded = 0

    # Create a boto3 client for S3
    s3_config = Config(signature_version=UNSIGNED)
    s3 = boto3.client('s3', config=s3_config)

    # List all objects in the specified folder path
    continuation_token = None
    objects = []

    # Use a loop to handle pagination
    while True:
        list_objects_args = {'Bucket': bucket_name, 'Prefix': folder_path}
        if continuation_token:
            list_objects_args['ContinuationToken'] = continuation_token

        list_objects_response = s3.list_objects_v2(**list_objects_args)

        objects.extend(list_objects_response.get('Contents', []))

        if not list_objects_response.get('IsTruncated', False):
            break

        continuation_token = list_objects_response.get('NextContinuationToken')

    # Iterate over each object and download if it ends with '.grb2'
    for obj in objects:
        key = obj['Key']
        if product in key and key.endswith('grib.grb2'): #if key.endswith('.grb2'):
            local_file_path = os.path.join(download_dir, os.path.relpath(key, folder_path))

            # Ensure the directory structure exists
            os.makedirs(os.path.dirname(local_file_path), exist_ok=True)

            # Download the file
            s3.download_file(bucket_name, key, local_file_path)
            num_files_downloaded += 1

            print(f"Downloaded: {key}")

    print(f'Total number of CFS files downloaded from AWS: {num_files_downloaded}')

In [33]:
def get_files(directory, where, format):
    """
    Get a list of all GRIB2 files in the specified directory.

    Parameters:
    - directory: Path to the directory containing the GRIB2 files.
    - where: 'starts' or 'ends'
    - format: either '.grb2' or '.nc'
    Returns:
    - List of file paths to the GRIB2 files.
    """
    files = []
    for file_name in os.listdir(directory):
        if where == 'ends':
            if file_name.endswith(format):
                file_path = os.path.join(directory, file_name)
                files.append(file_path)
        elif where == 'starts':
            if file_name.startswith(format):
                file_path = os.path.join(directory, file_name)
                files.append(file_path)
    return files

Once the calculations for precip, evap, and temperature are added to the csv files, the original grib2 files do not need to be kept and can be deleted.

In [35]:
def delete_directory(directory_path):
    # Check if the directory exists
    if not os.path.isdir(directory_path):
        print(f"The directory {directory_path} does not exist.")
        return
    
    try:
        # Remove the entire directory tree
        shutil.rmtree(directory_path)
        print(f"Successfully deleted the directory and all its contents: {directory_path}")
    except Exception as e:
        print(f"Error deleting {directory_path}: {e}")

This function calculates the area of each grid cell. The surface area is needed to calculate
total precipitation because the unit is per meter.

In [36]:
def calculate_grid_cell_areas(lon, lat):
    # Calculate grid cell areas
    # Assuming lat and lon are 1D arrays
    # Convert latitude to radians

    R = 6371000.0  # Radius of Earth in meters
    lat_rad = np.radians(lat)

    # Calculate grid cell width in radians
    dlat = np.radians(lat[1] - lat[0])
    dlon = np.radians(lon[1] - lon[0])

    # Calculate area of each grid cell in square kilometers
    area = np.zeros((len(lat), len(lon)))
    for i in range(len(lat)):
        for j in range(len(lon)):
            area[i, j] = R**2 * dlat * dlon * np.cos(lat_rad[i])

    return area

This function calculated evaporation rate using latent heat flux and aire temperature.

In [37]:
# ET = kg/(m^2*time^1) or 1 mm
# LE = MJ/(M^2*time^1)
# λ  = MJ/kg

# Latent heat of vaporization varies slightly with temperature. Allen et al. (1998) provides an equation 
# for calculating λ with air  temperature variation. Temperature in this case must be in degrees Celcius.

# λ=2.501−(2.361×10−3)×Temp Celcius

# so for our data with Temp in Kelvin...

# λ=2.501−((2.361×10−3)×(Temp-273.15))

# Our variable_lhf is in W/m^2 or J/(m^2*time^1). In order to convert to MJ we must multiply by 10^-6 or 
# 0.000001. Now we have lamba and variable_lhf both in terms of MJ.

# Boiling all this down we get these final equations below that provide us with a final evaporation for 
# the mean daily variable_lfh values in kg/m^2 or millimeters of water. 

def calculate_evaporation(temperature_K, latent_heat):
    lamda=(2.501-(0.002361*(temperature_K-273.15)))
    evaporation_rate=((latent_heat)*0.000001)/lamda

    return evaporation_rate

## Begin Script

Creates a folder to download the data into.

In [54]:
download_dir = f'{dir}{date}/CFS/'
if not os.path.exists(download_dir):
    os.makedirs(download_dir)
    print("Directory created.")
else:
    print("Directory already exists.")

Directory created.


Downloads the CFS forecast data

In [55]:
# Uses the AWS to download the grib2 files
bucket_name = 'noaa-cfs-pds'

for utc in utc:
    for product in products:
        folder_path = f'cfs.{date}/{utc}/monthly_grib_01/'
        download_grb2_aws(product, bucket_name, folder_path, download_dir)

Downloaded: cfs.20240802/00/monthly_grib_01/pgbf.01.2024080200.202408.avrg.grib.grb2
Downloaded: cfs.20240802/00/monthly_grib_01/pgbf.01.2024080200.202409.avrg.grib.grb2
Downloaded: cfs.20240802/00/monthly_grib_01/pgbf.01.2024080200.202410.avrg.grib.grb2
Downloaded: cfs.20240802/00/monthly_grib_01/pgbf.01.2024080200.202411.avrg.grib.grb2
Downloaded: cfs.20240802/00/monthly_grib_01/pgbf.01.2024080200.202412.avrg.grib.grb2
Downloaded: cfs.20240802/00/monthly_grib_01/pgbf.01.2024080200.202501.avrg.grib.grb2
Downloaded: cfs.20240802/00/monthly_grib_01/pgbf.01.2024080200.202502.avrg.grib.grb2
Downloaded: cfs.20240802/00/monthly_grib_01/pgbf.01.2024080200.202503.avrg.grib.grb2
Downloaded: cfs.20240802/00/monthly_grib_01/pgbf.01.2024080200.202504.avrg.grib.grb2
Downloaded: cfs.20240802/00/monthly_grib_01/pgbf.01.2024080200.202505.avrg.grib.grb2
Total number of CFS files downloaded from AWS: 10
Downloaded: cfs.20240802/00/monthly_grib_01/flxf.01.2024080200.202408.avrg.grib.grb2
Downloaded: cfs

Open existing CSV files..

In [56]:
if os.path.exists(tmp_csv):
    df_tmp_forecasts = pd.read_csv(tmp_csv)
else: print('Error opening Temp file')

if os.path.exists(evap_csv):
    df_evap_forecasts = pd.read_csv(evap_csv)
else: print('Error opening Evap file')

if os.path.exists(pcp_csv):
    df_pcp_forecasts = pd.read_csv(pcp_csv)
else: print('Error opening Precip file')

Open the mask file and pull the lat/lon data from it. This will be needed to cut the CFS domain
to the Great Lakes and regrid to match the mask file. It also calculates the area of each of the 
grid cells using the defined function above.

In [57]:
mask_ds = nc.Dataset(mask_file)
mask_lat = mask_ds.variables['latitude'][:]
mask_lon = mask_ds.variables['longitude'][:]
area = calculate_grid_cell_areas(mask_lon, mask_lat)

In [58]:
mask_variables = ['eri_basin','eri_lake','eri_land',
                 'hur_basin','hur_lake','hur_land',
                 'ont_basin','ont_lake','ont_land',
                 'mic_basin','mic_lake','mic_land',
                 'sup_basin','sup_lake','sup_land']

#df_apcp_forecasts = pd.DataFrame(columns=['cfs_run', 'forecast_year', 'forecast_month'] + mask_variables)
#df_tmp_forecasts = pd.DataFrame(columns=['cfs_run', 'forecast_year', 'forecast_month'] + mask_variables)
#df_evap_forecasts = pd.DataFrame(columns=['cfs_run', 'forecast_year', 'forecast_month'] + mask_variables)

In [59]:
# Find all the files the pgb files in the directory
file_list = get_files(download_dir, 'ends', '.grb2')
index = len(df_tmp_forecasts) # Picks up on the last line of the CSV

for grib2_file in file_list:

    filename = os.path.basename(grib2_file)
    print(filename)
    parts = filename.split('.')
    cfs_run = parts[2]
    print(cfs_run)
    date_part = parts[3]  # Assuming parts[2] is in the format YYYYMM
    print(date_part)
    forecast_year = date_part[:4]
    forecast_month = date_part[4:6] 
    days_in_month = calendar.monthrange(int(forecast_year), int(forecast_month))[1]

    if filename.startswith('flxf'):
        #open the flx file at the 2m level to pull the 2m air temperature
        print('This is a flx file')
        flx_2mabove = cfgrib.open_dataset(grib2_file, engine='cfgrib', filter_by_keys={'typeOfLevel': 'heightAboveGround', 'level': 2})
        df_tmp_forecasts.loc[index, 'cfs_run'] = cfs_run
        df_tmp_forecasts.loc[index, 'forecast_year'] = forecast_year
        df_tmp_forecasts.loc[index, 'forecast_month'] = forecast_month
        print('Pulling the 2m air temperature')
        mean2t = flx_2mabove['mean2t']
        # cut the variable to the mask domain
        mean2t_cut = mean2t.sel(
                                latitude=slice(mask_lat.max(), mask_lat.min()),
                                longitude=slice(mask_lon.min(), mask_lon.max())
                            )
        # remap and upscale the variable to match the mask domain 
        mean2t_remap = mean2t_cut.interp(latitude=mask_lat, longitude=mask_lon, method='linear')
        # calculate mean2t for each of the mask variables (ie. eri_lake, eri_basin, etc.)
        for mask_var in mask_variables:
            mask = mask_ds.variables[mask_var][:]
            tmp_avg = np.mean(mean2t_remap*mask)
            df_tmp_forecasts.loc[index, mask_var] = tmp_avg.data
            print(f'Calculating air temperature for {mask_var}')
        print(f'Air temperature complete for {filename}')

        ###############################################################################

        # open the flx file again but at the surface level to pull the latent heat flux
        flx_surface = cfgrib.open_dataset(grib2_file, engine='cfgrib', filter_by_keys={'typeOfLevel': 'surface'})
        df_evap_forecasts.loc[index, 'cfs_run'] = cfs_run
        df_evap_forecasts.loc[index, 'forecast_year'] = forecast_year
        df_evap_forecasts.loc[index, 'forecast_month'] = forecast_month
        print('Pulling the latent heat flux')
        mslhf = flx_surface['mslhf']
        # cut the variable to the mask domain
        mslhf_cut = mslhf.sel(
                                latitude=slice(mask_lat.max(), mask_lat.min()),
                                longitude=slice(mask_lon.min(), mask_lon.max())
                            )
        # remap and upscale the variable to match the mask domain 
        mslhf_remap = mslhf_cut.interp(latitude=mask_lat, longitude=mask_lon, method='linear')
        # calculate evaporation across the entire domain using air temp and latent heat flux
        print(f'Calculating evaporation rate')
        evap = calculate_evaporation(mean2t_remap, mslhf_remap)
        # calculate evaporation for each of the mask variables (ie. eri_lake, eri_basin, etc.)
        for mask_var in mask_variables:
            mask = mask_ds.variables[mask_var][:]
            total_evap = (np.sum(evap*area*mask) * 86400 * days_in_month)
            df_evap_forecasts.loc[index, mask_var] = total_evap.data
            print(f'Calculating evaporation rate for {mask_var}')
        print(f'Evaporation complete for {filename}')

        ###############################################################################

    elif filename.startswith('pgbf'):
        # open the pgb file at the surface level to pull the precipitation
        print('This is a pgb file')
        pgb_surface = cfgrib.open_dataset(grib2_file, engine='cfgrib', filter_by_keys={'typeOfLevel': 'surface'})
        df_pcp_forecasts.loc[index, 'cfs_run'] = cfs_run
        df_pcp_forecasts.loc[index, 'forecast_year'] = forecast_year
        df_pcp_forecasts.loc[index, 'forecast_month'] = forecast_month
        print('Pulling the precipitation')
        pcp = pgb_surface['tp'] # total precip
        # cut the variable to the mask domain
        pcp_cut = pcp.sel(
                            latitude=slice(mask_lat.max(), mask_lat.min()),
                            longitude=slice(mask_lon.min(), mask_lon.max())
                            )
        # remap and upscale the variable to match the mask domain 
        pcp_remap = pcp_cut.interp(latitude=mask_lat, longitude=mask_lon, method='linear')
        # Precip is the mean precip over a 6hr period per meter.
        # To get the total, you have to multiple by 4 (4 6hr periods in a day) * number of days
        # in the month * area of each grid cell in the masked area (over lake/land/erie/ontario/etc)
        for mask_var in mask_variables:
            mask = mask_ds.variables[mask_var][:]
            total_pcp = (np.sum(pcp_remap*area*mask*4*days_in_month))
            df_pcp_forecasts.loc[index, mask_var] = total_pcp.data
            print(f'Calculating total precipitation for {mask_var}')
        print(f'Precipitation complete for {filename}')

    index += 1


flxf.01.2024080200.202408.avrg.grib.grb2
2024080200
202408
This is a flx file
Pulling the 2m air temperature
Calculating air temperature for eri_basin
Calculating air temperature for eri_lake
Calculating air temperature for eri_land
Calculating air temperature for hur_basin
Calculating air temperature for hur_lake
Calculating air temperature for hur_land
Calculating air temperature for ont_basin
Calculating air temperature for ont_lake
Calculating air temperature for ont_land
Calculating air temperature for mic_basin
Calculating air temperature for mic_lake
Calculating air temperature for mic_land
Calculating air temperature for sup_basin
Calculating air temperature for sup_lake
Calculating air temperature for sup_land
Air temperature complete for flxf.01.2024080200.202408.avrg.grib.grb2


  df_tmp_forecasts.loc[index, 'cfs_run'] = cfs_run
  df_tmp_forecasts.loc[index, 'forecast_year'] = forecast_year
  df_tmp_forecasts.loc[index, 'forecast_month'] = forecast_month
  df_evap_forecasts.loc[index, 'cfs_run'] = cfs_run
  df_evap_forecasts.loc[index, 'forecast_year'] = forecast_year
  df_evap_forecasts.loc[index, 'forecast_month'] = forecast_month


Pulling the latent heat flux
Calculating evaporation rate
Calculating evaporation rate for eri_basin
Calculating evaporation rate for eri_lake
Calculating evaporation rate for eri_land
Calculating evaporation rate for hur_basin
Calculating evaporation rate for hur_lake
Calculating evaporation rate for hur_land
Calculating evaporation rate for ont_basin
Calculating evaporation rate for ont_lake
Calculating evaporation rate for ont_land
Calculating evaporation rate for mic_basin
Calculating evaporation rate for mic_lake
Calculating evaporation rate for mic_land
Calculating evaporation rate for sup_basin
Calculating evaporation rate for sup_lake
Calculating evaporation rate for sup_land
Evaporation complete for flxf.01.2024080200.202408.avrg.grib.grb2
flxf.01.2024080200.202409.avrg.grib.grb2
2024080200
202409
This is a flx file
Pulling the 2m air temperature
Calculating air temperature for eri_basin
Calculating air temperature for eri_lake
Calculating air temperature for eri_land
Calculat

  df_pcp_forecasts.loc[index, 'cfs_run'] = cfs_run
  df_pcp_forecasts.loc[index, 'forecast_year'] = forecast_year
  df_pcp_forecasts.loc[index, 'forecast_month'] = forecast_month


Pulling the precipitation
Calculating total precipitation for eri_basin
Calculating total precipitation for eri_lake
Calculating total precipitation for eri_land
Calculating total precipitation for hur_basin
Calculating total precipitation for hur_lake
Calculating total precipitation for hur_land
Calculating total precipitation for ont_basin
Calculating total precipitation for ont_lake
Calculating total precipitation for ont_land
Calculating total precipitation for mic_basin
Calculating total precipitation for mic_lake
Calculating total precipitation for mic_land
Calculating total precipitation for sup_basin
Calculating total precipitation for sup_lake
Calculating total precipitation for sup_land
Precipitation complete for pgbf.01.2024080200.202408.avrg.grib.grb2
pgbf.01.2024080200.202409.avrg.grib.grb2
2024080200
202409
This is a pgb file
Pulling the precipitation
Calculating total precipitation for eri_basin
Calculating total precipitation for eri_lake
Calculating total precipitation

In [60]:
# Save the updated DataFrames to CSV files
df_tmp_forecasts.to_csv(tmp_csv, sep=',', index=False)
print("CSV updated with air temperature data.")

df_evap_forecasts.to_csv(evap_csv, sep=',', index=False)
print("CSV updated with evaporation data.")

df_pcp_forecasts.to_csv(pcp_csv, sep=',', index=False)
print("CSV updated with precipitation data.")

CSV updated with air temperature data.
CSV updated with evaporation data.
CSV updated with precipitation data.


Close the mask. Also, delete grib2 files if they are no longer needed.

In [51]:
mask_ds.close()
#delete_directory(download_dir)