In [None]:
import xarray as xr
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import requests
from tqdm import tqdm
from urllib.parse import urlparse, unquote

## I. Downloading MERRA-2 data

This notebook contains the code for the first step of preparing MERRA-2 data for the PEBSI model. It walks through creating the download files for the user-specified region and dates of interest.

Hopefully this all works right out of the box and is super streamlined, but feel free to contact the author (cvwilson@cmu.edu) if you run into any issues you can't solve.

First, specify data_fp which should be some folder where you want everything to be stored; recommended to name it MERRA-2.

In [None]:
data_fp = '../MERRA-2/'

# =============================================================================================================================
# Do not touch this code, it is needed later
# Template for the name each daily file is saved under; the VERSION, DATASET
# and DATE placeholders are substituted when checking for existing files.
filename = 'MERRA2_VERSION.tavg1_2d_DATASET_Nx.DATE.nc4.nc4'


def version(year, new_version=False):
    """Return the three-character version string used in MERRA-2 file names.

    Parameters
    ----------
    year : int
        Calendar year of the file.
    new_version : bool, optional
        If truthy, the string ends in '01' instead of '00'
        (e.g. '401' rather than '400').

    Returns
    -------
    str
        '1xx' for years before 1992, '2xx' through 2000, '3xx' through 2010
        and '4xx' afterwards, where 'xx' is '00' or '01' as described above.
    """
    if year < 1992:
        leading_digit = '1'
    else:
        # Anything later than the last listed year falls through to '4'
        leading_digit = '4'
        for last_year, digit in ((2000, '2'), (2010, '3')):
            if year <= last_year:
                leading_digit = digit
                break

    suffix = '01' if new_version else '00'
    return leading_digit + suffix

### 1. Get a global sample file. (Only need to do this once.)

Download a reference file which contains the geopotential for every grid cell here:
https://opendap.earthdata.nasa.gov/collections/C1276812819-GES_DISC/granules/M2C0NXASM.5.12.4%3AMERRA2_101.const_2d_asm_Nx.00000000.nc4.dap.nc4?dap4.ce=/PHIS;/time;/lat;/lon

You will need to create or log in with your NASA EarthData login to get this file. While you're at it, set up your .netrc file which stores your username and password so you can download the rest of the MERRA-2 data. Check out documentation online for more information on this: https://nsidc.org/data/user-resources/help-center/creating-netrc-file-earthdata-login

### 2. Specify the latitude and longitude of a bounding box.

The following code will extract the integers which will be filled into each URL to fetch the correct file.

Specify fn_gp and the lat/lon min/max in degrees. (Use negatives for west longitudes / south latitudes)

In [None]:
# path to the global sample file saved in step 1
fn_gp = data_fp + 'MERRA2constants.nc4'
# this cell only reads the lat/lon coordinates, so the time dimension is dropped
ds_gp = xr.open_dataset(fn_gp).drop_dims('time')

# bounding box in degrees (negative values for west longitudes / south latitudes)
lat_min, lat_max = 50, 72           # ALASKA
lon_min, lon_max = -180, -133.25

# positions along each axis of the first grid point at or above the minimum
# and the last grid point at or below the maximum
grid_lat = ds_gp.lat.values
grid_lon = ds_gp.lon.values
lat_min_idx = np.nonzero(grid_lat >= lat_min)[0][0]
lat_max_idx = np.nonzero(grid_lat <= lat_max)[0][-1]
lon_min_idx = np.nonzero(grid_lon >= lon_min)[0][0]
lon_max_idx = np.nonzero(grid_lon <= lon_max)[0][-1]
print(f'latitude bounded by {lat_min_idx}:{lat_max_idx}')
print(f'longitude bounded by {lon_min_idx}:{lon_max_idx}')

# Sanity check: the coordinates printed here should match the box specified above
lat_selection = np.arange(lat_min_idx, lat_max_idx + 1)
lon_selection = np.arange(lon_min_idx, lon_max_idx + 1)
print(ds_gp.isel(lat=lat_selection, lon=lon_selection).coords)

### 3. Specify the time bounds and dataset to download.

Specify start and end time and the dataset to download.

In [None]:
dataset = 'slv'             # slv, flx, adg, or rad
start_time = '2000-01-01'   # defaults to 00:00 hrs
end_time = '2025-10-01'     # data will be downloaded up to and not including this date

# default configuration is to save the urls to a folder named with the dataset.
# This cell is often re-run (e.g. to switch `dataset`), so first strip a dataset
# folder that a previous run already put on the end of data_fp; otherwise the
# folders would stack up as '.../slv/slv/' or '.../slv/rad/'.
# NOTE: this assumes the top-level data folder itself is not named after a dataset.
root_fp = data_fp.rstrip('/') or data_fp
if os.path.basename(root_fp) in ('slv', 'flx', 'adg', 'rad'):
    root_fp = os.path.dirname(root_fp)
if root_fp and not root_fp.endswith('/'):
    root_fp += '/'
data_fp = root_fp + dataset + '/'

# create the folder (and any missing parent folders); no error if it already exists
os.makedirs(data_fp, exist_ok=True)

### 4. Check which files need to be downloaded.

The following cell will check if any of the files already exist and compile a list of the dates still needed.

When you download the files, you might get an issue where some of them randomly fail. I made this code to check which files are missing, so you will need to iterate through this and the next step until you get the "Got all files!" print statement.

In [None]:
# pd.date_range includes both endpoints, but end_time is documented as
# exclusive, so keep only the days strictly before it.
all_days = pd.date_range(start_time, end_time)
all_days = all_days[all_days < pd.Timestamp(end_time)]

missing_days = []
for date in all_days:
    # fill the date and dataset into the file name template
    date_fmtd = date.strftime('%Y%m%d')
    date_fn = filename.replace('DATE', date_fmtd).replace('DATASET', dataset)

    # version numbers change with the year, and a file can exist under two
    # different version types (e.g., 400 and 401); either one counts as present
    candidate_fps = [data_fp + date_fn.replace('VERSION', version(date.year, new_version))
                     for new_version in (False, True)]
    if not any(os.path.exists(fp) for fp in candidate_fps):
        missing_days.append(date_fmtd)

# print out which days are missing for a sanity check
if len(missing_days) > 0:
    print(f'Need to download {len(missing_days)} missing files:',missing_days)
else:
    print('Got all files!')

### 5. Create the new urls file with missing dates

The next block will write a new `urls.txt` file, which you can then execute from the command line using `download_urls.py`.

In [None]:
# Filepath to the .txt file you will use to download
fn_urls = data_fp + 'urls.txt'

# If True, version() is asked for the alternate version number ending in 1
# (e.g. 401 instead of 400) when building the URLs
NEW_VERSION = False

# Integers you determined above from lat/lon bounding box
X1 = lon_min_idx
X2 = lon_max_idx
Y1 = lat_min_idx
Y2 = lat_max_idx

# Variables requested from each dataset
dataset_variables = {
    'slv': ['PS', 'QV2M', 'T2M', 'U2M', 'V2M'],
    'rad': ['CLDTOT', 'LWGAB', 'SWGDN'],
    'flx': ['PRECTOTCORR'],
    'adg': ['OCDP002', 'OCWT002', 'BCDP002', 'BCWT002', 'DUDP003', 'DUWT003'],
}


def _url_template(ds, variables):
    """Build the URL template for one dataset.

    Every variable is requested for hours 0-23 over the [Y1:Y2] x [X1:X2]
    window, followed by the time/lat/lon coordinates. The YEAR, MONTH, DATE,
    VERSION, X1, X2, Y1 and Y2 placeholders are left in place to be filled
    in per day.
    """
    subset = ','.join(f'{var}[0:23][Y1:Y2][X1:X2]' for var in variables)
    return ('https://goldsmr4.gesdisc.eosdis.nasa.gov/opendap/MERRA2/'
            f'M2T1NX{ds.upper()}.5.12.4/YEAR/MONTH/'
            f'MERRA2_VERSION.tavg1_2d_{ds}_Nx.DATE.nc4.nc4'
            f'?{subset},time,lat[Y1:Y2],lon[X1:X2]')


# URL templates for each dataset. Building them from the variable lists keeps
# the four datasets consistent: each variable is requested exactly once and
# every dataset also fetches its time/lat/lon coordinates.
newlines = {ds: _url_template(ds, variables)
            for ds, variables in dataset_variables.items()}

# Loop through missing days and add a new line to the urls.txt file for each
if len(missing_days) > 0:
    # the bounding-box substitutions are the same for every day
    grid_substitutions = [('X1', str(X1)), ('X2', str(X2)),
                          ('Y1', str(Y1)), ('Y2', str(Y2))]
    with open(fn_urls, 'w') as f:
        for date in missing_days:
            # date is formatted YYYYMMDD
            date_substitutions = [
                ('YEAR', date[:4]),
                ('MONTH', date[4:6]),
                ('DATE', date),
                ('VERSION', version(int(date[:4]), NEW_VERSION)),
            ]
            newline = newlines[dataset]
            for placeholder, value in date_substitutions + grid_substitutions:
                newline = newline.replace(placeholder, value)
            f.write(newline + '\n')
    n_missing = len(missing_days)
    print(f'Wrote {fn_urls} with {n_missing} urls')

### 6. Download files.

The following code will loop through each URL and download the data if you just have a few things to grab.

Note: the same code is available in script format (`python download_urls.py`) for large downloads run from the command line instead of in a notebook. Pass the `-url_name` flag with a string, or a list of strings, containing the filepaths of the URL text files generated in this notebook.

In [None]:
# Any combination of ['adg','rad','slv','flx']; the datasets are downloaded in series
list_datasets = ['adg', 'rad', 'slv', 'flx']


def safe_filename(url):
    """Return the percent-decoded file name at the end of the URL's path.

    Only the path component of the URL is used, so the query string
    (everything after '?') does not end up in the name.
    """
    url_path = urlparse(url).path
    decoded_path = unquote(url_path)
    _, name = os.path.split(decoded_path)
    return name

# data_fp points at the folder of the dataset selected in step 3, so the
# folders of the other datasets sit next to it, one level up.
merra_root = os.path.dirname(data_fp.rstrip('/'))

# Loop variables are deliberately NOT called `dataset` / `filename`: those
# names hold the selection and the file name template that step 4 relies on
# when it is re-run after a download.
for ds_name in list_datasets:
    ds_dir = os.path.join(merra_root, ds_name)
    ds_urls_fn = os.path.join(ds_dir, 'urls.txt')

    # Only datasets for which a urls file has been written can be downloaded
    if not os.path.exists(ds_urls_fn):
        print(f'No urls file found at {ds_urls_fn}, skipping {ds_name}')
        continue

    # Open url file
    with open(ds_urls_fn, 'r') as f:
        urls = [line.strip() for line in f if line.strip()]

    # Download with progress bar
    failed_urls = []
    for url in tqdm(urls, desc=f'Downloading files from {ds_name}', unit="file"):
        out_fn = os.path.join(ds_dir, safe_filename(url))
        try:
            response = requests.get(url, timeout=120)
            # Never save an HTTP error page under the data file's name: the
            # missing-file check only tests whether the file exists.
            response.raise_for_status()
        except requests.RequestException as err:
            failed_urls.append(url)
            tqdm.write(f'Failed to download {url}: {err}')
            continue
        with open(out_fn, 'wb') as out_file:
            out_file.write(response.content)

    if failed_urls:
        print(f'{len(failed_urls)} of {len(urls)} {ds_name} downloads failed; '
              'rerun steps 4-6 to retry them')

### Congratulations, you have all the data!