This script downloads all of the long-range National Water Model forecasts, from TODAY going out 30 days, and calculates the total runoff into each of the lakes.

The following files are required to run this script:
- ER_link.csv
- ON_link.csv
- SU_link.csv
- MIHU_link.csv

The following is an example of the output. Output will be total runoff (cms) for each lake for each of the 4 ensemble NWM members.

              Superior          Erie       Ontario      MichHuron
       1  85160.098097  32324.209277  34796.599222  111990.107497
       2  85504.198089  32454.819275  34937.199219  112442.617487
       3  85113.598098  32306.559278  34777.599223  111928.957498
       4  87987.298033  33397.329254  35951.799196  115708.027414



In [1]:
# Install needed libraries
!pip install netCDF4

Collecting netCDF4
  Downloading netCDF4-1.6.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m52.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cftime (from netCDF4)
  Downloading cftime-1.6.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m63.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: cftime, netCDF4
Successfully installed cftime-1.6.3 netCDF4-1.6.5


In [2]:
# Import libraries
import os
import urllib.request
import urllib.error
import netCDF4
from netCDF4 import Dataset
from datetime import datetime
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import calendar

In [3]:
# Mount my google drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Defined Functions

Function to download the NWM forecasts.

In [4]:
def download_nwm_forecast(forecast, ens_members, download_dir):
    # Initialize a file counter
    num_files_downloaded = 0

    base_url = 'https://nomads.ncep.noaa.gov/pub/data/nccf/com/nwm/v3.0/'
    # This will need to be updated in the function to open using ftplib
    base_url_fallback = 'ftp://ftpprd.ncep.noaa.gov/pub/data/nccf/com/nwm/'

    dir = f'{download_dir}{YYYYMMDD}/'
    # Create the download directory if it doesn't exist
    if not os.path.exists(dir):
        os.makedirs(dir)

    for ens in range(1,ens_members+1):

      # Retrieve HTML content from the URL
        path = base_url+f'nwm.{YYYYMMDD}/{forecast}_mem{ens}/'
        files = f'nwm.t00z.{forecast}.channel_rt_{ens}'

        response = urllib.request.urlopen(path)
        html_content = response.read().decode('utf-8')
        soup = BeautifulSoup(html_content, 'html.parser')
        links = soup.find_all('a', href=lambda href: href and href.startswith(files))
        for link in links:
            file_url = path + link['href']
            filename = link['href'].split('/')[-1]
            file_path = os.path.join(dir, filename)
            print("Downloading", filename)
            urllib.request.urlretrieve(file_url, file_path)
            num_files_downloaded += 1
    print(f'Total number of NWM files downloaded: {num_files_downloaded}')

In [5]:
# Function currently being worked on that would use the FTP address as a backup if Nomads were to fail.
import requests
from ftplib import FTP

def download_nwm_forecast_backup(forecast, ens_members, download_dir):
    # Initialize a file counter
    num_files_downloaded = 0

    base_url = 'https://nomads.ncep.noaa.gov/pub/data/nccf/com/nwm/v3.0/'
    # This will need to be updated in the function to open using ftplib
    ftp_address = 'ftp://ftpprd.ncep.noaa.gov/pub/data/nccf/com/nwm/'
    ftp_path = '...'

    # Create the download directory if it doesn't exist
    if not os.path.exists(download_dir):
        os.makedirs(download_dir)

    for ens in range(1,ens_members+1):

      # Retrieve HTML content from the URL
        path = base_url+f'nwm.{YYYYMMDD}/{forecast}_mem{ens}/'
        files = f'nwm.t00z.{forecast}.channel_rt_{ens}'

        try:
        # Try to open the URL
            response = requests.get(path)
            response.raise_for_status()
            return response.content
        except requests.exceptions.RequestException:
            # If URL access fails, switch to FTP
            print("URL access failed, switching to FTP...")
            try:
                ftp = FTP(ftp_address)
                ftp.login()  # Log in to the FTP server
                ftp.cwd(ftp_path)  # Change to the specified path
                data = []
                ftp.retrlines('RETR ' + ftp_path, data.append)  # Retrieve the file data
                ftp.quit()  # Quit FTP connection
                return '\n'.join(data)
            except Exception as e:
                print("Failed to access FTP. Try again later.")
                return None

        soup = BeautifulSoup(html_content, 'html.parser')
        links = soup.find_all('a', href=lambda href: href and href.startswith(files))
        for link in links:
            file_url = path + link['href']
            filename = link['href'].split('/')[-1]
            file_path = os.path.join(download_dir, filename)
            print("Downloading", filename)
            urllib.request.urlretrieve(file_url, file_path)
            num_files_downloaded += 1
    print(f'Total number of NWM files downloaded: {num_files_downloaded}')

Function to calculate RUNOFF into each lake from the NWM long range forecast.

In [6]:
def calculate_runoff(ens_member, download_dir):

    runoff_su, runoff_er, runoff_on, runoff_mh = 0, 0, 0, 0

    # Loop through all files in the directory
    for filename in os.listdir(download_dir):
        if f'channel_rt_{ens_member}' in filename:

            file = Dataset(download_dir+filename, mode='r', format="NETCDF4")
            feature_id = file.variables['feature_id']
            streamflow = file.variables['streamflow'] #streamflow is in m3/s

            for f_su in id_su:
                runoff_su += streamflow[feature_id == f_su]
            for f_er in id_er:
                runoff_er += streamflow[feature_id == f_er]
            for f_on in id_on:
                runoff_on += streamflow[feature_id == f_on]
            for f_mh in id_mh:
                runoff_mh += streamflow[feature_id == f_mh]

    return runoff_su, runoff_er, runoff_on, runoff_mh

This function loops through the above function using all the ensemble members and then puts the total runoff for each lake into an easy to read dataframe. It could probably be combined later with the above function.

In [7]:
def calculate_runoff_all(ens_members, directory):
    ens = np.arange(1, ens_members + 1)
    df_lakes = pd.DataFrame(index=ens, columns=lakes)
    for i in range(1, ens_members+1):  # Assuming 4 columns for runoff values
        runoff_values = calculate_runoff(i, directory)
        for j, lake in enumerate(lakes):
            df_lakes[lake][i] = runoff_values[j]
    return df_lakes

This function is not used in this script anymore but wanted to leave it in here for future use in the src folder.

In [None]:
def cms_to_mm(flow_rate_cms, lake, month, year):
    # Conversion factor from cubic meters per second to millimeters per month
    # 1 m³/s * (60 * 60 * 24 * days_in_month) seconds/month = cubic meters per month
    # To convert to millimeters per month, we divide by the lake area in square meters

    # Determine lake area of the input lake
    great_lakes_areas = {
        "Superior": 82100,
        "Mich-Huron": 57800+59600,
        "Erie": 25700,
        "Ontario": 19300
    }
    # Convert square kilometers to square meters
    conversion_factor = 1000000

    if lake in great_lakes_areas:
        lake_area_m2 = great_lakes_areas[lake] * conversion_factor
    else:
        print("Lake input should be Superior, Erie, Ontario, or Mich-Huron.")

    # Determine number of days in the month using input month/year.
    year = int(year)
    month = int(month)
    if month < 1 or month > 12:
        raise ValueError("Month should be between 1 and 12.")
    # Get the number of days in the month
    days_in_month = calendar.monthrange(year, month)[1]

    conversion_factor = (60 * 60 * 24 * days_in_month) / lake_area_m2

    # Convert flow rate from m³/s to mm/month
    total_mm = flow_rate_cms * conversion_factor

    return total_mm

# Begin Script
Preset Variables

In [8]:
# Grab today's date for pulling the most current NWM forecast
YYYYMMDD = datetime.today().strftime('%Y%m%d')

# This will allow functions to be used to pull short_range, medium_range or long_range forecasts.
forecast = 'long_range'

# There are 4 ensemble members. If ens_members = 1, it will only pull the first ensember member.
# If ens_members = 4, it will pull all 4 members.
ens_members = 4

# This allows you to specify a specific lake if not all are needed.
lakes = ['Superior','Erie','Ontario','MichHuron']

input_dir = '/content/drive/MyDrive/BIL SA Project/Modeling/Inventories/Data/'

# This is where you want to download the NWM files to.
save_dir = '/content/drive/MyDrive/BIL SA Project/Modeling/Data-driven Modeling/Input datasets/Downloaded Data/'

# Path you want it to save the output csv file to.
output_path = '/content/drive/MyDrive/BIL SA Project/Modeling/Data-driven Modeling/Input datasets/'

# Output filename name
outfile = "total_nwm_runoff_cms_"+YYYYMMDD+".csv"

This section reads in each of Yi's csv files that contain all of the feature_ids for the streams that discharge into each lake.

In [9]:
# Lake Superior
su_links = pd.read_csv(input_dir+'NWM_runoff/SU_link.csv')
id_su= su_links['ID']

# Lake Erie
er_links = pd.read_csv(input_dir+'NWM_runoff/ER_link.csv')
id_er = er_links['ID']

# Lake Ontario
on_links = pd.read_csv(input_dir+'NWM_runoff/ON_link.csv')
id_on = on_links['ID']

# Lake Erie
mh_links = pd.read_csv(input_dir+'NWM_runoff/MIHU_link.csv')
id_mh = mh_links['ID']

This downloads the NWM files

In [None]:
download_nwm_forecast(forecast, ens_members, save_dir)

This opens each file and calculates the total runoff into each lake for each ensemble member (1-4).

In [11]:
# Example if you only wanted the runoff into each lake seperately.
# su_runoff_1, er_runoff_1, on_runoff_1, mh_runoff_1 = calculate_runoff(1, save_dir)

# This creates the dataframe with runoff into all the lakes for all the ensemble members.
df_lakes = calculate_runoff_all(ens_members,save_dir)
print(df_lakes)

        Superior           Erie        Ontario      MichHuron
1  439564.490175  166845.446271  179606.995985   578050.93708
2  436820.990236  165804.096294  178485.996011   574443.08716
3   441136.19014  167442.016257  180249.195971  580117.807033
4  433231.190317  164441.516324  177019.196043  569722.307266


In [None]:
# Saves a csv file with total flow into each lake [cms] for that time period
df_lakes.to_csv(output_path+outfile, sep='\t')

# Ignore All Below
Below sections can be ignored. These sections were first drafted but accurately calculate runoff into each lake. I have kept them in here in order to compare to the output from the above functions.

In [None]:
# Currently ignore. This was a working section that I am writing into a function.
# Keeping to test the function.
hour = np.arange(6,721,6)
members = 1,2,3,4

# Set up empty data frames where the feature ids, pulled from the csvs above, are
# the indices and there's an empty column to insert the streamflow for each file.
df_su_hr = pd.DataFrame(index=id_su,columns=['streamflow'])
df_er_hr = pd.DataFrame(index=id_er,columns=['streamflow'])
df_on_hr = pd.DataFrame(index=id_on,columns=['streamflow'])
df_mh_hr = pd.DataFrame(index=id_mh,columns=['streamflow'])

# Empty data frame to insert the total runoff for each hour.
df_su = pd.DataFrame(index=hour,columns=['streamflow'])
df_er = pd.DataFrame(index=hour,columns=['streamflow'])
df_on = pd.DataFrame(index=hour,columns=['streamflow'])
df_mh = pd.DataFrame(index=hour,columns=['streamflow'])

# Save each ensemble total streamflows for each lake.
df_lakes = pd.DataFrame(index=members,columns=lakes)

In [None]:
# Currently ignore. This was a working section that I am writing into a function.
# Keeping to test the function.
for ens in range(1,5):
  #for hr in hour:
  for hr in range(6,721,6):
    filename = "nwm.t00z.long_range.channel_rt_"+str(ens)+".f"+str(hr).zfill(3)+".conus.nc"
    file = Dataset(directory+filename, mode='r', format="NETCDF4")
    feature_id = file.variables['feature_id']
    streamflow = file.variables['streamflow'] #streamflow is in m3/s

    for f_su in id_su:
      df_su_hr['streamflow'][f_su]= streamflow[feature_id == f_su]
    for f_er in id_er:
      df_er_hr['streamflow'][f_er]= streamflow[feature_id == f_er]
    for f_on in id_on:
      df_on_hr['streamflow'][f_on]= streamflow[feature_id == f_on]
    for f_mh in id_mh:
      df_mh_hr['streamflow'][f_mh]= streamflow[feature_id == f_mh]
    file.close()

    df_su['streamflow'][hr] = df_su_hr['streamflow'].sum()
    df_er['streamflow'][hr] = df_er_hr['streamflow'].sum()
    df_on['streamflow'][hr] = df_on_hr['streamflow'].sum()
    df_mh['streamflow'][hr] = df_mh_hr['streamflow'].sum()
    print(ens,hr,df_su['streamflow'][hr])

  df_lakes[lakes[0]][ens] = df_su['streamflow'].sum()
  df_lakes[lakes[1]][ens] = df_er['streamflow'].sum()
  df_lakes[lakes[2]][ens] = df_on['streamflow'].sum()
  df_lakes[lakes[3]][ens] = df_mh['streamflow'].sum()
  print(ens,df_lakes[lakes[0]][ens])

In [None]:
print(df_lakes)

        Superior           Erie        Ontario       MichHuron
1  797205.282181  302595.123236  325739.792719  1048367.806567
2  789941.982343  299838.193298  322771.992785  1038816.176781
3  769454.082801  292061.603472  314400.592973  1011873.487383
4  800469.582108  303834.153209  327073.592689  1052660.536471
