# To Handle New Data Requests Automatically
- beginning of notebook is assumed to be interactive until the requests have been checked
- all progress and exception logging is done only for main loop
- copy and paste the e-mail response and send from gcs.cmip6.ldeo@gmail.com account

In [1]:
import numpy as np
import pandas as pd
import os
import gcsfs
import xarray as xr
from functools import partial

### Local modules

In [2]:
from request import requests, set_request_id
from search import search, esgf_search_sites
from netcdf import get_ncfiles, concatenate
from identify import needed
from response import response, dict_to_dfcat, get_details

In [None]:
def getFolderSize(p):
    """Return the total size in bytes of every regular file beneath directory `p`.

    Entries that are not regular files are treated as sub-directories and
    descended into recursively.
    """
    total_bytes = 0
    for entry_name in os.listdir(p):
        entry_path = os.path.join(p, entry_name)
        if os.path.isfile(entry_path):
            total_bytes += os.path.getsize(entry_path)
        else:
            total_bytes += getFolderSize(entry_path)
    return total_bytes

### Initialization

In [3]:
# Where to write NEW zarr stores:
# (local root directory; the store path from the catalog is appended to it in the main loop)
zarr_local = '/h63/naomi/zarr-minimal'

# Anonymous, read-only handle on Google Cloud Storage; used later to list
# existing stores and to measure the size of uploaded ones.
fs = gcsfs.GCSFileSystem(token='anon', access='read_only')

### Choose basic configuration parameters

In [4]:
# Mapping of short node name -> ESGF search API URL (despite the name, `dtype` is that mapping).
dtype = esgf_search_sites()
print('possible ESGF API search nodes: ',list(dtype.keys()))
# Pick the search node to query; uncomment one of the alternatives to switch.
ESGF_site = dtype['llnl']
#ESGF_site = dtype['gfdl']
#ESGF_site = dtype['ipsl']

# List sites to skip for acquiring new netcdf files: broken or slow sites
skip_sites = ['esg.lasg.ac.cn','esgf-data2.diasjp.net']  # other candidates: 'esgf-data1.llnl.gov', 'dist.nmlab.snu.ac.kr'

possible ESGF API search nodes:  ['llnl', 'ipsl', 'nci', 'ceda', 'jpl', 'gfdl', 'dkrz']


### Get prior Google Sheet requests

In [5]:
# Requests recorded by earlier runs; passed to requests() below so that
# only rows not seen before are treated as new.
df_prior = pd.read_csv('csv/requests.csv')
df_prior

Unnamed: 0,Timestamp,E-mail,response status,members,experiments,models,variables,table,requester,science,comments
0,9/9/2019 14:13:56,rpa@ldeo.columbia.edu,once,[''],"['1pctCO2', '1pctCO2-4xext', 'esm-ssp585', 'hi...",['All'],[''],Oclim,Test,Trends in ocean heat budget under global warming,
1,9/10/2019 16:51:18,naomi@ldeo.columbia.edu,once,[''],"['ssp585', 'historical']","['BCC-CSM2-MR', 'BNU-ESM-1-1', 'CAMS-CSM1-0', ...","['ps', 'ua', 'va', 'hus']",Amon,Test,Compute atmospheric moisture budgets in future...,
2,9/11/2019 15:21:35,ryan.abernathey@gmail.com,once,[''],"['esm-ssp585', 'historical', 'piControl']",['All'],"['bigthetao', 'hfx', 'hfy', 'umo', 'uo', 'vmo'...",Omon,Test,Ocean heat budget,Thanks!
3,9/11/2019 18:25:47,nhn2@columbia.edu,once,[''],['historical'],['All'],"['ps', 'hus', 'ua', 'va', 'ta']",6hrLev,Test,Use the atmospheric moisture budget eddy terms...,
4,9/12/2019 17:38:54,mclong@ucar.edu,done,[''],"['ssp126', 'ssp245', 'ssp585']",['All'],"['o2', 'o2sat']",Oyr,Matt Long,Ocean oxygen in a warming world,
5,9/12/2019 17:44:16,mclong@ucar.edu,done,[''],"['ssp245', 'ssp585']",['All'],['thetao'],Omon,Matthew Long,Ocean oxygen in a warming world,Filter to include only models that have 'o2' a...
6,9/13/2019 12:18:31,islas@ucar.edu,done,[''],"['historical', 'ssp370', 'ssp585']",['All'],"['ua', 'va', 'ts', 'pr']",Amon,Isla Simpson,Test whether emergent constraints on the large...,
7,9/16/2019 13:36:55,afahad@gmu.edu,done,[''],"['abrupt-4xCO2', 'historical']",['All'],"['psl', 'ts', 'ua', 'va', 'pr', 'hfls', 'rlds'...",Amon,Abdullah al Fahad,large scale circulation change in CMIP5 and CM...,is it possible to get observed data of these v...
8,9/16/2019 14:29:37,sridge@ldeo.columbia.edu,done,[''],['historical'],['All'],"['thetao', 'so', 'dissic', 'no3', 'cfc11', 'sf...",Omon,Sean Ridge,CMIP6 Ocean Atlas,
9,9/16/2019 16:28:13,jhamman@ucar.edu,done,[''],"['historical', 'ssp245', 'ssp585']",['All'],"['pr', 'prc', 'rlds', 'rsds', 'sfcWind', 'hur'...",day,Joe Hamman,changes in hydrologic cycle and met extremes,


### Get new Google Sheet requests
- by default, only the new rows from the sheet are considered
- specifying a list of rows or emails will add older entries 

In [None]:
# Optional overrides: leave both empty to process only new sheet rows;
# list row numbers and/or e-mail addresses to re-include older entries.
rows = []   
emails = []

# modify here:

#rows = [97]

#emails = ['duncan.watson-parris@physics.ox.ac.uk']

# df_request_new: the requests to process; dtrouble: problems found while parsing them.
df_request_new, dtrouble = requests(df_prior,rows=rows,emails=emails)

# Identifier used further down to name this run's catalog and log files.
request_id = set_request_id()

# Check for mal-formed requests (non-existent variables, etc)
if len(dtrouble)>=1:
    print(dtrouble)

df_request_new

In [7]:
# choose a new request to process:
# list the distinct submission timestamps so one can be selected by hand
timestamps = df_request_new.Timestamp.unique()
print(timestamps)
# uncomment to keep only the most recently listed request
#df_request_new = df_request_new[df_request_new.Timestamp == timestamps[-1]]
df_request_new

['1/29/2020 17:24:30']


Unnamed: 0,Timestamp,E-mail,response status,members,experiments,models,variables,table,requester,science,comments
97,1/29/2020 17:24:30,orianac@uw.edu,,[All],"[historical, ssp126, ssp245, ssp370, ssp585]",[All],"[mrro, hfls, snw, mrso, mrsos]",day,Oriana Chegwidden,"How will hydrologic droughts, particularly tho...",Thanks so much for your help! Please let me kn...


### Search ESGF for the availability of requested data

In [8]:
# Query the chosen ESGF search node for everything matching the new requests.
print(ESGF_site)
df_ESGF = search(ESGF_site,df_request_new)

https://esgf-node.llnl.gov/esg-search/search

orianac@uw.edu
day mrro ['All'] historical
day hfls ['All'] historical
day snw ['All'] historical
day mrso ['All'] historical
day mrsos ['All'] historical
day mrro ['All'] ssp126
day hfls ['All'] ssp126
day snw ['All'] ssp126
day mrso ['All'] ssp126
day mrsos ['All'] ssp126
day mrro ['All'] ssp245
day hfls ['All'] ssp245
day snw ['All'] ssp245
day mrso ['All'] ssp245
day mrsos ['All'] ssp245
day mrro ['All'] ssp370
day hfls ['All'] ssp370
day snw ['All'] ssp370
day mrso ['All'] ssp370
day mrsos ['All'] ssp370
day mrro ['All'] ssp585
day hfls ['All'] ssp585
day snw ['All'] ssp585
day mrso ['All'] ssp585
day mrsos ['All'] ssp585


In [9]:
#import qgrid
#qgrid.show_grid(df_ESGF)

### Get the master list of existing zarr stores
- df_master includes all curated stores
- df_avail includes all stores, even those with known ES-DOC issues 

In [10]:
# Curated catalog of existing cloud zarr stores.
df_master = pd.read_csv('https://cmip6.storage.googleapis.com/cmip6-zarr-consolidated-stores.csv')
# "noQC" catalog: all stores, including those with known ES-DOC issues.
df_avail = pd.read_csv('https://cmip6.storage.googleapis.com/cmip6-zarr-consolidated-stores-noQC.csv')
# Quick size check of the three tables.
len(df_master),len(df_avail),len(df_ESGF)

(238543, 239476, 14283)

### Check the new requests:
- already exists in df_avail (what we have, including stores with known ES-DOC issues) - not needed
- exists in df_ESGF (what is available) - if not available, then not needed

In [11]:
# Rows of df_ESGF that were requested but are not yet present in df_avail.
df_needed = needed(df_avail, df_request_new, df_ESGF)

if len(df_needed) == 0:
    print('no new data available')
    exit
else:
    # num_stores is reused by the main loop for its progress counter
    num_stores = df_needed.zstore.nunique()
    print(f'needed: nfiles={len(df_needed)}, nstores={num_stores}')
    #print(df_needed.zstore.unique())


needed: nfiles=1549, nstores=37


In [12]:
# Interactive grid view of the files still needed, for manual inspection.
import qgrid
qgrid.show_grid(df_needed)
# example store path:
#/DCPP/BCC/BCC-CSM2-MR/dcppA-hindcast/s1961-r1i1p1f1/Amon/tas/gn/

QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': True, 'defau…

In [13]:
# Distinct zarr store paths that would be created by the main loop.
df_needed.zstore.unique()

array(['/CMIP/EC-Earth-Consortium/EC-Earth3/historical/r2i1p1f1/day/mrro/gr/',
       '/CMIP/CAS/FGOALS-f3-L/historical/r1i1p1f1/day/hfls/gr/',
       '/CMIP/CAS/FGOALS-f3-L/historical/r2i1p1f1/day/hfls/gr/',
       '/CMIP/CAS/FGOALS-f3-L/historical/r3i1p1f1/day/hfls/gr/',
       '/CMIP/CAS/FGOALS-g3/historical/r1i1p1f1/day/hfls/gn/',
       '/CMIP/CAS/FGOALS-g3/historical/r3i1p1f1/day/hfls/gn/',
       '/CMIP/EC-Earth-Consortium/EC-Earth3/historical/r2i1p1f1/day/hfls/gr/',
       '/CMIP/NUIST/NESM3/historical/r1i1p1f1/day/hfls/gn/',
       '/CMIP/NUIST/NESM3/historical/r2i1p1f1/day/hfls/gn/',
       '/CMIP/NUIST/NESM3/historical/r3i1p1f1/day/hfls/gn/',
       '/CMIP/NUIST/NESM3/historical/r4i1p1f1/day/hfls/gn/',
       '/CMIP/NUIST/NESM3/historical/r5i1p1f1/day/hfls/gn/',
       '/ScenarioMIP/CAS/FGOALS-g3/ssp126/r1i1p1f1/day/hfls/gn/',
       '/ScenarioMIP/NUIST/NESM3/ssp126/r1i1p1f1/day/hfls/gn/',
       '/ScenarioMIP/NUIST/NESM3/ssp126/r2i1p1f1/day/hfls/gn/',
       '/ScenarioMIP/C

In [14]:
# Always raises AssertionError; presumably a deliberate stop so that "Run All"
# halts here until the requests above have been checked by hand.
assert False

AssertionError: 

### Start logging the progress and exceptions

In [15]:
# Per-request output files, both named after request_id.
cat_file = f'csv/cmip6_{request_id}.csv'
log_file = f'txt/request_{request_id}.log'

In [16]:
# open and close for each write in case of kernel interrupt
def write_log(file,str,verbose=True):
    """Append one line of text to a log file, optionally echoing it to stdout.

    Parameters
    ----------
    file : path of the log file; it is opened in append mode on every call.
    str : text to record (a newline is added when writing to the file).
        The parameter name shadows the builtin but is kept for existing callers.
    verbose : when true, the text is also printed.

    Returns
    -------
    None
    """
    # `with` guarantees the handle is closed even if print() or write() raises.
    with open(file,'a') as f:
        if verbose:
            print(str)
        f.write(str+'\n')
    return

### The real work is done in this next loop 
- could be done in parallel except for the writing to the log file

In [17]:
new_zarrs = df_needed.zstore.unique()
verbose = True

# Local disk roots /h1 ... /h64 that are searched for a previously written copy.
zbdirs = ['/h'+str(i) for i in range(1,65)]

zdict = {}  # construct dictionary for new rows to add to master catalog
# NOTE(review): nothing in this loop ever stores into zdict (vlist below is
# computed but not used), so the later cells always see it empty — confirm
# whether an assignment such as zdict[...] = vlist was intended.
for item,zarr in enumerate(new_zarrs):
    #if item < 642:
    #    continue

    # Local destination: store path appended to the local zarr root.
    zbdir  = zarr_local  + zarr

    write_log(log_file,f"\n>>{item+1}/{num_stores}:<< local file: {zbdir}",verbose=verbose)

    # is zarr already in cloud?
    gsurl = 'gs://cmip6' + zarr
    contents = fs.ls(gsurl+'/.zmetadata')
    if any("zmetadata" in s for s in contents):
        write_log(log_file,'store already in cloud',verbose=verbose)
        continue

    # does zarr exist locally?  Won't need this once all uploads happen immediately
    iexist = False
    for zbdirr in zbdirs:
        zfile = zbdirr + '/naomi/zarr-minimal' + zarr + '.zmetadata'
        if os.path.isfile(zfile):
            write_log(log_file,f'already exists: {zfile}',verbose=verbose)
            iexist = True

    if iexist:
        continue

    # Fetch the netcdf files for this store; `troubles` is a text report that is
    # logged as-is, `codes` is passed through to concatenate().
    gfiles,troubles,codes = get_ncfiles(zarr,df_needed,skip_sites)

    write_log(log_file,troubles,verbose=verbose)

    if len(gfiles) == 0:
        write_log(log_file,'no files available',verbose=verbose)
        continue

    # Store paths end with '/', so the third-from-last component is the variable name.
    variable_id = zarr.split('/')[-3]
    for gfile in gfiles:   # changes file sizes!!
        # Edit the variable's missing_value attribute in place with NCO ncatted
        # (',d,,' is ncatted's delete mode — confirm against the NCO docs).
        # '\\,' yields the same backslash-comma as before without relying on an
        # invalid escape sequence.
        command = '/usr/bin/ncatted -h -O -a missing_value\\,'+variable_id+',d,, '+gfile
        os.system(command)

    # concatenate in time with mfdataset
    #print(gfiles)
    status, ds, dstr = concatenate(zarr,gfiles,codes)

    if status == 'concatenation failure: ':
        write_log(log_file,status+dstr,verbose=verbose)
        continue
    else:
        write_log(log_file,dstr)

    # Drop a leftover 'missing_value' encoding when there is one.  Only the
    # lookup/delete is guarded; a genuine to_zarr failure now surfaces instead
    # of being swallowed and blindly retried.
    try:
        del ds[variable_id].encoding['missing_value']
    except KeyError:
        pass
    ds.to_zarr(zbdir, consolidated=True, mode='w')

    # Consolidated metadata is written last, so its absence means the write failed.
    if not os.path.isfile(zbdir+'/.zmetadata'):
        write_log(log_file,'to_zarr failure: ',verbose=verbose)
        continue

    vlist = get_details(ds,zbdir,zarr)

    # upload to cloud
    # ([:-1] strips the trailing '/' from both paths before handing them to gsutil)
    command = '/usr/bin/gsutil -m cp -r '+ zbdir[:-1] + ' ' + gsurl[:-1]
    write_log(log_file,command,verbose=verbose)
    os.system(command)

    # Verify the upload: sizes must agree to within 100 bytes in either
    # direction (an incomplete upload makes the remote copy *smaller*).
    size_remote = fs.du(gsurl)
    size_local = getFolderSize(zbdir)
    assert abs(size_remote - size_local) < 100, \
        f'size mismatch for {gsurl}: remote={size_remote}, local={size_local}'


>>1/37:<< local file: /h63/naomi/zarr-minimal/CMIP/EC-Earth-Consortium/EC-Earth3/historical/r2i1p1f1/day/mrro/gr/
noUse in codes
no files available

>>2/37:<< local file: /h63/naomi/zarr-minimal/CMIP/CAS/FGOALS-f3-L/historical/r1i1p1f1/day/hfls/gr/
	skipping esg.lasg.ac.cn domain
no files available

>>3/37:<< local file: /h63/naomi/zarr-minimal/CMIP/CAS/FGOALS-f3-L/historical/r2i1p1f1/day/hfls/gr/
	skipping esg.lasg.ac.cn domain
no files available

>>4/37:<< local file: /h63/naomi/zarr-minimal/CMIP/CAS/FGOALS-f3-L/historical/r3i1p1f1/day/hfls/gr/
	skipping esg.lasg.ac.cn domain
no files available

>>5/37:<< local file: /h63/naomi/zarr-minimal/CMIP/CAS/FGOALS-g3/historical/r1i1p1f1/day/hfls/gn/
	skipping esg.lasg.ac.cn domain
no files available

>>6/37:<< local file: /h63/naomi/zarr-minimal/CMIP/CAS/FGOALS-g3/historical/r3i1p1f1/day/hfls/gn/
	skipping esg.lasg.ac.cn domain
no files available

>>7/37:<< local file: /h63/naomi/zarr-minimal/CMIP/EC-Earth-Consortium/EC-Earth3/historical/r2

### Make a table of acquired data to send in email to requester

In [None]:
# Turn the collected store details into a catalog-style table; dz is only
# defined when zdict is non-empty.
if zdict:
    dz = dict_to_dfcat(zdict)
else:
    print('nothing else to do')
    exit

In [None]:
# Always raises AssertionError; presumably a deliberate stop so the cells
# below are only run by hand.
assert False

In [None]:
# Extend the curated catalog with the newly created stores.  dz only exists
# when the main loop produced something, so fall back to the unchanged catalog
# in that case; any other failure of the concatenation is allowed to surface.
try:
    df_master_new = pd.concat([df_master, dz])
except NameError:
    df_master_new = df_master

In [None]:
#df_master_new = df_master

In [None]:
from response import response, dict_to_dfcat, get_details
from IPython.display import display

# Print one ready-to-paste e-mail reply per row of df_request_new.
ldict = []   # request dicts, one per row (with 'response status' removed)
names = ""   # requester names concatenated in row order
print('Re: CMIP6 GCS Data Request (Responses)')
for row in df_request_new.values:
    # Rebuild the row as a column-name -> value dict.
    rdict = dict(zip(df_request_new.keys(),row))
    #print(rdict)
    name = rdict['requester']
    timestamp = rdict['Timestamp']
    names += name
    del rdict['response status']
    ldict += [rdict]
    # All request rows sharing this row's timestamp.
    dfr = df_request_new[df_request_new.Timestamp == timestamp]
    
    print('Dear',name+':')
    print('\n  Here are the results from your recent CMIP6 data request(s).  The master catalog will be updated with the nightly build.')
    #if len(dtrouble)>=1:
    #    print('\n '+dtrouble)
    print('\n  Please note: ')
    print('      - Data for some models (e.g., CAS/FGOALS-f3-L and NUIST/NESM3) must be obtained directly from servers which are too slow or unresponsive. ')
    print('      - We exclude data with known errors (as reported at ES-DOC) from the official data catalog, https://errata.es-doc.org/ . ')
    print('      - Some data we have not been able to clean up enough to get it concatenated and save to zarr. Other datasets are only available for disjointed time periods.')
    print('\n  See the sample Jupyter Notebook at https://gist.github.com/naomi-henderson/ed1801d8ee8b992dda252f8b126876a5 for a quick introduction to accessing the data.')
    print('\nFrom the folks at:\n  The Climate Data Science Lab\n  Division of Ocean and Climate Physics\n  LDEO/Columbia University')
    print('\n--------------------------')

    print('\nrequest:')
    display(rdict)

    print('\nresponse:')
    # dz is undefined when no new stores were created; only that case is
    # treated as "no new data" — other errors are no longer hidden.
    try:
        print('new stores added:\n',len(dz),'\n')
    except NameError:
        print(f'no new data available at ESGF API search node {ESGF_site}')

    #print('\n',dfr,len(df_master_new))
    table = response(dfr,df_master_new)

    print("\navailable data:\n  this includes your new stores but does not include datasets marked 'onhold', 'wontfix' or 'new' in the ES-DOC ERRATA")
    display(table)
    print('\n\n')

In [None]:
# Replace the prior-requests file (read at the top of the notebook) with csv/request_new.csv.
! mv csv/request_new.csv csv/requests.csv

In [None]:
# Scratch check: open a single downloaded netcdf file for manual inspection.
file = 'nctemp/mrso_Lmon_SAM0-UNICON_piControl_r1i1p1f1_gn_028101-029112.nc'
ds = xr.open_dataset(file)

In [None]:
# Scratch check: year of each time step in the file opened above.
ds.time.dt.year

In [None]:
# Scratch check: global attributes of the file opened above.
ds.attrs