# To Handle New Data Requests Automatically
- beginning of notebook is assumed to be interactive until the requests have been checked
- all progress and exception logging is done only for main loop
- copy and paste the e-mail response and send from gcs.cmip6.ldeo@gmail.com account

In [1]:
import numpy as np
import pandas as pd
import os
import gcsfs
import xarray as xr

### Local modules

In [2]:
from request import requests, set_request_id
from search import search, esgf_search_sites
from netcdf import get_ncfiles, concatenate
from identify import needed
from response import response, dict_to_dfcat, get_details

### Initialization

In [3]:
# Local root directory for NEW zarr stores; the catalog-relative store path
# is appended to it in the main loop.
zarr_local = '/h62/naomi/zarr-minimal'

# Anonymous, read-only handle on Google Cloud Storage.
fs = gcsfs.GCSFileSystem(access='read_only', token='anon')

### Choose basic configuration parameters

In [4]:
# Known ESGF API search nodes, keyed by a short node name.
dtype = esgf_search_sites()
node_names = list(dtype.keys())
print('possible ESGF API search nodes: ', node_names)

# Search node used for this session (alternatives kept for quick switching):
ESGF_site = dtype['llnl']
#ESGF_site = dtype['gfdl']
#ESGF_site = dtype['ipsl']

# Sites to skip when acquiring new netcdf files: broken or slow sites.
# Other hosts kept here, commented out, for quick re-enabling:
#   'esgf-data1.llnl.gov', 'dist.nmlab.snu.ac.kr', 'aims3.llnl.gov',
#   'esgf-data2.diasjp.net'
skip_sites = ['esg.lasg.ac.cn']

possible ESGF API search nodes:  ['llnl', 'ipsl', 'nci', 'ceda', 'jpl', 'gfdl', 'dkrz']


### Get prior Google Sheet requests

In [5]:
# Requests saved by earlier sessions; passed to requests() in the next section.
prior_requests_path = 'csv/requests.csv'
df_prior = pd.read_csv(prior_requests_path)
# df_prior   # uncomment to inspect

### Get new Google Sheet requests
- by default, only the new rows from the sheet are considered
- specifying a list of rows or emails will add older entries 

In [6]:
# Defaults: no extra sheet rows and no extra e-mail addresses.
rows, emails = [], []

# modify here:
rows = [104]
#emails = ['orianac@uw.edu']

# New requests plus a report of any problems found while reading them.
df_request_new, dtrouble = requests(df_prior, rows=rows, emails=emails)

# Identifier used below to name this request's catalog and log files.
request_id = set_request_id()

# Check for mal-formed requests (non-existent variables, etc)
if len(dtrouble) > 0:
    print(dtrouble)

df_request_new

Unnamed: 0,Timestamp,E-mail,response status,members,experiments,models,variables,table,requester,science,comments
104,2/7/2020 6:05:06,mpb20@st-andrews.ac.uk,,[One],"[historical, ssp126, ssp245, ssp370, ssp585]",[All],"[tas, huss]",day,Michael Byrne,"Hi there,\n\nI'm working on a new paper lookin...","That's it, thanks a lot for setting up this aw..."
14,9/17/2019 15:17:17,riley.brady@colorado.edu,once,[],"[dcppA-hindcast, dcppA-assim]",[MIROC6],[tos],Omon,Riley Brady,Providing analysis support for initialized dec...,It would be good to also get the available dcp...
15,9/17/2019 19:10:50,riley.brady@colorado.edu,once,[],"[dcppA-assim, dcppA-hindcast, dcppC-hindcast-n...",[CanESM5],[tas],Amon,Riley Brady,Providing analysis support for initialized dec...,
16,9/17/2019 19:16:38,riley.brady@colorado.edu,once,[],"[dcppA-assim, dcppA-hindcast]",[CanESM5],[tas],day,Riley Brady,Providing analysis support for initialized dec...,only for a few initialization years (check mem...
46,10/4/2019 0:40:58,stephens@ucar.edu,added hist-bgc to list of experiments,[],"[historical, hist-bgc]",[All],"[fgo2, fgco2, fgco2abio, fgco2nat, tos, sos, h...",Omon,Britt Stephens,Testing model representation of seasonal air-s...,"These should all be 2d monthly fields, so hope..."
90,1/8/2020 4:07:13,aaron.spring@mpimet.mpg.de,once,[All],"[dcppA-hindcast, dcppB-forecast, esm-piControl...",[All],"[fgco2, intpp, tos, sos]",Omon,Aaron Spring,PhD carbon cycle predictability,
104,2/7/2020 6:05:06,mpb20@st-andrews.ac.uk,,[One],"[historical, ssp126, ssp245, ssp370, ssp585]",[All],"[tas, huss]",day,Michael Byrne,"Hi there,I'm working on a new paper looking at...","That's it, thanks a lot for setting up this aw..."
106,2/8/2020 0:11:43,nhn2@columbia.edu,,[All],[dcppA-hindcast],[All],[tas],Amon,Naomi,,


In [10]:
# Choose a single request to process: the rows carrying the last of the
# distinct timestamps (in order of appearance in the table).
timestamps = df_request_new['Timestamp'].unique()
print(timestamps)
chosen_timestamp = timestamps[-1]
is_chosen = df_request_new['Timestamp'] == chosen_timestamp
df_request_new = df_request_new.loc[is_chosen]
df_request_new

['2/8/2020 0:11:43']


Unnamed: 0,Timestamp,E-mail,response status,members,experiments,models,variables,table,requester,science,comments
106,2/8/2020 0:11:43,nhn2@columbia.edu,,[All],[dcppA-hindcast],[All],[tas],Amon,Naomi,,


### Search ESGF for the availability of requested data

In [11]:
# Show which search node is being queried, then look up the requested data there.
print(ESGF_site)
df_ESGF = search(ESGF_site, df_request_new)

https://esgf-node.llnl.gov/esg-search/search

nhn2@columbia.edu
Amon tas ['All'] dcppA-hindcast


In [12]:
#import qgrid
#qgrid.show_grid(df_ESGF)

### Get the master list of existing zarr stores
- df_master includes all curated stores
- df_avail includes all stores, even those with known ES-DOC issues 

In [13]:
# Curated catalog and the catalog without quality control (noQC).
master_url = 'https://cmip6.storage.googleapis.com/cmip6-zarr-consolidated-stores.csv'
avail_url = 'https://cmip6.storage.googleapis.com/cmip6-zarr-consolidated-stores-noQC.csv'

df_master = pd.read_csv(master_url)
df_avail = pd.read_csv(avail_url)

# Row counts of both catalogs and of the ESGF search result, for a quick sanity check.
tuple(len(frame) for frame in (df_master, df_avail, df_ESGF))

(224788, 225537, 5118)

### Check the new requests:
- already exists in df_avail (what we have, including stores with known ES-DOC issues) - not needed
- exists in df_ESGF (what is available at ESGF) - if not available there, then not needed

In [14]:
# Work out which datasets still have to be acquired, given the stores already
# available (df_avail), the request, and what the ESGF search returned.
df_needed = needed(df_avail, df_request_new, df_ESGF)

if len(df_needed) > 0:
    num_stores = df_needed.zstore.nunique()
    print(f'needed: nfiles={len(df_needed)}, nstores={num_stores}')
    #print(df_needed.zstore.unique())
else:
    # Define num_stores on this branch too, so later cells never hit a
    # NameError or pick up a value left over from a previous request.
    # No early exit is attempted: a bare `exit` expression does nothing.
    num_stores = 0
    print('no new data available')


needed: nfiles=777, nstores=777


In [17]:
# Interactive grid view of the datasets that still need to be fetched.
import qgrid

needed_grid = qgrid.show_grid(df_needed)
needed_grid
# e.g. zstore: /DCPP/BCC/BCC-CSM2-MR/dcppA-hindcast/s1961-r1i1p1f1/Amon/tas/gn/

QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': True, 'defau…

### Start logging the progress and exceptions

In [18]:
# Per-request output files, named after the request id.
cat_file = f'csv/cmip6_{request_id}.csv'
log_file = f'txt/request_{request_id}.log'

In [19]:
# The log file is opened and closed again on every call, so nothing is left
# sitting in an open handle if the kernel is interrupted between writes.
def write_log(file,str,verbose=True):
    """Append one message line to a log file, optionally echoing it.

    Parameters
    ----------
    file : path of the log file; opened in append mode.
    str : the message.  Non-string values are written using their default
        string formatting instead of raising a TypeError.  (The name shadows
        the builtin; it is kept so existing callers are unaffected.)
    verbose : when true, the message is also printed to stdout.

    Returns
    -------
    None
    """
    # `with` guarantees the handle is closed even if print/write raises.
    with open(file,'a') as f:
        if verbose:
            print(str)
        f.write(f'{str}\n')
    return

### The real work is done in this next loop 
- could be done in parallel except for the writing to the log file

In [20]:
# Main acquisition loop.  For every needed store:
#   1. skip it if it is already in the cloud bucket or already exists locally
#   2. fetch the netcdf files
#   3. strip the `missing_value` attribute from the files with ncatted
#   4. concatenate the files and write a consolidated zarr store locally
#   5. upload the store with gsutil and verify it by opening it from the bucket
# Every step is logged; a failure in one store is logged and the loop moves on.
new_zarrs = df_needed.zstore.unique()
num_new = len(new_zarrs)   # computed here so the cell does not rely on another cell's variable
verbose = True

zdict = {}  # construct dictionary for new rows to add to master catalog
for item,zarr in enumerate(new_zarrs):
    #if item >10:
    #    continue

    zbdir  = zarr_local  + zarr

    write_log(log_file,f"\n>>{item+1}/{num_new}:<< local file: {zbdir}",verbose=verbose)

    # is zarr already in cloud?
    gsurl = 'gs://cmip6' + zarr
    try:
        contents = fs.ls(gsurl+'/.zmetadata')
    except FileNotFoundError:
        # Depending on the gcsfs version a missing path gives either an empty
        # listing or FileNotFoundError; both mean "not in the cloud yet".
        contents = []
    if any("zmetadata" in s for s in contents):
        write_log(log_file,'store already in cloud',verbose=verbose)
        continue

    # does zarr exist locally?
    if os.path.isfile(zbdir+'/.zmetadata'):
        write_log(log_file,f'already exists: {zbdir}',verbose=verbose)
        continue

    gfiles,troubles,codes = get_ncfiles(zarr,df_needed,skip_sites)

    write_log(log_file,troubles,verbose=verbose)

    if len(gfiles) == 0:
        write_log(log_file,'no files available',verbose=verbose)
        continue

    # The variable name is the third-from-last component of the store path
    # (the path ends with '/', so the last component is empty).
    variable_id = zarr.split('/')[-3]
    for gfile in gfiles:   # changes file sizes!!
        # Delete the missing_value attribute of the variable, in place.
        # '\\,' keeps the literal backslash-comma the shell receives while
        # avoiding Python's invalid-escape-sequence warning.
        command = '/usr/bin/ncatted -h -O -a missing_value\\,'+variable_id+',d,, '+gfile
        os.system(command)

    # concatenate in time with mfdataset
    #print(gfiles)
    status, ds, dstr = concatenate(zarr,gfiles,codes)

    if status == 'concatenation failure: ':
        write_log(log_file,status+dstr,verbose=verbose)
        continue
    write_log(log_file,dstr)

    # Drop a leftover missing_value encoding if there is one, then write the
    # store exactly once.  Failures are logged instead of triggering a second
    # write attempt (which a bare `except:` would do even on a kernel interrupt).
    if variable_id in ds:
        ds[variable_id].encoding.pop('missing_value', None)
    try:
        ds.to_zarr(zbdir, consolidated=True, mode='w')
    except Exception as err:
        write_log(log_file,f'to_zarr failure: {err!r}',verbose=verbose)
        continue

    if not os.path.isfile(zbdir+'/.zmetadata'):
        write_log(log_file,'to_zarr failure: ',verbose=verbose)
        continue

    vlist = get_details(ds,zbdir,zarr)

    # upload to cloud ([:-1] drops the trailing '/' from both paths)
    command = '/usr/bin/gsutil -m cp -r '+ zbdir[:-1] + ' ' + gsurl[:-1]
    write_log(log_file,command,verbose=verbose)
    exit_status = os.system(command)
    if exit_status != 0:
        write_log(log_file,f'gsutil exited with status {exit_status}',verbose=verbose)

    # The upload only counts as successful if the store opens from the bucket.
    try:
        ds = xr.open_zarr(fs.get_mapper(gsurl), consolidated=True)
    except Exception as err:
        write_log(log_file,'store did not get saved to GCS properly',verbose=verbose)
        write_log(log_file,f'  reason: {err!r}',verbose=verbose)
    else:
        zdict[item] = vlist
        write_log(log_file,f'successfully saved as {zbdir}',verbose=verbose)
        # the downloaded netcdf files are no longer needed
        for gfile in gfiles:
            os.system('rm -f '+ gfile)


>>1/777:<< local file: /h62/naomi/zarr-minimal/DCPP/BCC/BCC-CSM2-MR/dcppA-hindcast/s1961-r1i1p1f1/Amon/tas/gn/
curl http://cmip.bcc.cma.cn/thredds/fileServer/cmip6_data/DCPP/BCC/BCC-CSM2-MR/dcppA-hindcast/s1961-r1i1p1f1/Amon/tas/gn/v20191126/tas_Amon_BCC-CSM2-MR_dcppA-hindcast_s1961-r1i1p1f1_gn_196101-197012.nc -o nctemp/tas_Amon_BCC-CSM2-MR_dcppA-hindcast_s1961-r1i1p1f1_gn_196101-197012.nc


/usr/bin/gsutil -m cp -r /h62/naomi/zarr-minimal/DCPP/BCC/BCC-CSM2-MR/dcppA-hindcast/s1961-r1i1p1f1/Amon/tas/gn gs://cmip6/DCPP/BCC/BCC-CSM2-MR/dcppA-hindcast/s1961-r1i1p1f1/Amon/tas/gn
successfully saved as /h62/naomi/zarr-minimal/DCPP/BCC/BCC-CSM2-MR/dcppA-hindcast/s1961-r1i1p1f1/Amon/tas/gn/

>>2/777:<< local file: /h62/naomi/zarr-minimal/DCPP/BCC/BCC-CSM2-MR/dcppA-hindcast/s1961-r2i1p1f1/Amon/tas/gn/
curl http://cmip.bcc.cma.cn/thredds/fileServer/cmip6_data/DCPP/BCC/BCC-CSM2-MR/dcppA-hindcast/s1961-r2i1p1f1/Amon/tas/gn/v20191204/tas_Amon_BCC-CSM2-MR_dcppA-hindcast_s1961-r2i1p1f1_gn_196101-197

### Make a table of acquired data to send in email to requester

In [21]:
# Turn the details collected for each uploaded store into a catalog-style table.
if len(zdict) == 0:
    print('nothing else to do')
    # No early exit here (a bare `exit` expression does nothing).
    # Reset dz explicitly so a table left over from an earlier request in the
    # same kernel session can never be reported as new data; None is dropped
    # by pd.concat and has no len().
    dz = None
else:
    dz = dict_to_dfcat(zdict)

In [16]:
# Always raises AssertionError (see the output below).
# NOTE(review): presumably a deliberate stop so that "Run All" halts here for a
# manual check; the execution count is out of sequence -- confirm it still belongs here.
assert False

AssertionError: 

In [22]:
# Master catalog extended with the newly added stores (if there are any).
try:
    # sort=False keeps the existing column order and silences the pandas
    # FutureWarning about sorting the non-concatenation axis.
    # A dz of None is silently dropped by pd.concat.
    df_master_new = pd.concat([df_master, dz], sort=False)
except NameError:
    # dz was never created: no new stores, so the catalog is unchanged.
    # Any other error is a real problem and is no longer hidden.
    df_master_new = df_master

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [23]:
#df_master_new = df_master

In [28]:
from response import response, dict_to_dfcat, get_details
from IPython.display import display

# Number of newly added stores, or None when nothing was added.  Only the two
# "no dz" cases are caught (dz undefined, or dz is None); other errors surface.
try:
    n_new_stores = len(dz)
except (NameError, TypeError):
    n_new_stores = None

# Print one e-mail body per request row; the text is meant to be copied and
# pasted into the reply.
ldict = []
names = ""
print('Re: CMIP6 GCS Data Request (Responses)')
for row in df_request_new.values:
    rdict = dict(zip(df_request_new.keys(),row))
    #print(rdict)
    name = rdict['requester']
    timestamp = rdict['Timestamp']
    names += name
    # the internal bookkeeping column is not shown to the requester
    del rdict['response status']
    ldict += [rdict]
    dfr = df_request_new[df_request_new.Timestamp == timestamp]

    print('Dear',name+':')
    print('\n  Here are the results from your recent CMIP6 data request(s).  The master catalog will be updated with the nightly build.')
    #if len(dtrouble)>=1:
    #    print('\n '+dtrouble)
    print('\n  Please note: ')
    print('      - Data for some models (e.g., CAS/FGOALS-f3-L and NUIST/NESM3) must be obtained directly from servers which are too slow or unresponsive. ')
    print('      - We exclude data with known errors (as reported at ES-DOC) from the official data catalog, https://errata.es-doc.org/ . ')
    print('      - Some data we have not been able to clean up enough to get it concatenated and save to zarr. Other datasets are only available for disjointed time periods.')
    print('\n  See the sample Jupyter Notebook at https://gist.github.com/naomi-henderson/ed1801d8ee8b992dda252f8b126876a5 for a quick introduction to accessing the data.')
    print('\nFrom the folks at:\n  The Climate Data Science Lab\n  Division of Ocean and Climate Physics\n  LDEO/Columbia University')
    print('\n--------------------------')

    print('\nrequest:')
    display(rdict)

    print('\nresponse:')
    if n_new_stores is None:
        print(f'no new data available at ESGF API search node {ESGF_site}')
    else:
        print('new stores added:\n',n_new_stores,'\n')

    #print('\n',dfr,len(df_master_new))
    # Summary table of what the (updated) catalog holds for this request.
    table = response(dfr,df_master_new)

    print("\navailable data:\n  this includes your new stores but does not include datasets marked 'onhold', 'wontfix' or 'new' in the ES-DOC ERRATA")
    display(table)
    print('\n\n')

Re: CMIP6 GCS Data Request (Responses)
Dear Naomi:

  Here are the results from your recent CMIP6 data request(s).  The master catalog will be updated with the nightly build.

  Please note: 
      - Data for some models (e.g., CAS/FGOALS-f3-L and NUIST/NESM3) must be obtained directly from servers which are too slow or unresponsive. 
      - We exclude data with known errors (as reported at ES-DOC) from the official data catalog, https://errata.es-doc.org/. 
      - Some data we have not been able to clean up enough to get it concatenated and save to zarr. Other datasets are only available for disjointed time periods.

  See the sample Jupyter Notebook at https://gist.github.com/naomi-henderson/ed1801d8ee8b992dda252f8b126876a5 for a quick introduction to accessing the data.

From the folks at:
  The Climate Data Science Lab
  Division of Ocean and Climate Physics
  LDEO/Columbia University

--------------------------

request:


{'Timestamp': '2/8/2020 0:11:43',
 'E-mail': 'nhn2@columbia.edu',
 'members': ['All'],
 'experiments': ['dcppA-hindcast'],
 'models': ['All'],
 'variables': ['tas'],
 'table': 'Amon',
 'requester': 'Naomi',
 'science': nan,
 'comments': nan}


response:
new stores added:
 508 

165618 Amon
119788 ['dcppA-hindcast']
4849 ['tas']

available data:
  this includes your new stores but does not include datasets marked 'onhold', 'wontfix' or 'new' in the ES-DOC ERRATA


Unnamed: 0_level_0,variable_id,tas
experiment_id,source_id,Unnamed: 2_level_1
dcppA-hindcast,BCC-CSM2-MR,8
dcppA-hindcast,CESM1-1-CAM5-CMIP5,40
dcppA-hindcast,CanESM5,20
dcppA-hindcast,MIROC6,10
dcppA-hindcast,MPI-ESM1-2-HR,5
dcppA-hindcast,NorCPM1,20







In [26]:
# Shell command: rename csv/request_new.csv to csv/requests.csv, i.e. replace the
# prior-requests file that is read into df_prior near the top of the notebook.
# NOTE(review): csv/request_new.csv is presumably written by requests() -- confirm.
! mv csv/request_new.csv csv/requests.csv