## Find start/stop times of Google Cloud (GC) data
- For the stores matching a specific request, so that newly added data can be checked for correctness

In [1]:
import xarray as xr
import pandas as pd
import qgrid

In [2]:
# define a simple search on keywords
def search_df(df, verbose= False, **search):
    """Filter a catalog DataFrame by column keywords.

    Each keyword names a column of ``df``. A plain string value keeps the
    rows whose column value contains it (``Series.str.contains``, so the
    string is interpreted as a regular expression). Any other iterable keeps
    the rows equal to one of its values, grouped in the order the values
    are given.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to filter; it is not modified.
    verbose : bool
        If True, print the remaining unique values of every default key
        column that was not constrained by an exact-match list.
    **search
        ``column=value`` pairs as described above.

    Returns
    -------
    pandas.DataFrame
        The matching rows (empty if nothing matches).
    """
    keys = ['activity_id','institution_id','source_id','experiment_id','member_id', 'table_id', 'variable_id', 'grid_label']
    dz = df.copy()
    for skey, wanted in search.items():
        if isinstance(wanted, str):  # match a string as a substring
            # na=False: a missing cell simply does not match instead of
            # producing an NaN mask that cannot be used for indexing.
            dz = dz[dz[skey].str.contains(wanted, na=False)]
        else:                        # match a list of strings exactly
            parts = [dz[dz[skey] == value] for value in wanted]
            # pd.concat refuses an empty list; no values means no rows.
            dz = pd.concat(parts) if parts else dz.iloc[0:0]
            # Only the default keys are tracked for the verbose report, so
            # searching on any other column must not fail here.
            if skey in keys:
                keys.remove(skey)
    if verbose:
        for key in keys:
            print(key,' = ',list(dz[key].unique()))

    return dz

import fsspec
def add_time_info(df, verbose= False):
    """Return a copy of ``df`` with the time extent of each store appended.

    Every row's ``zstore`` is opened as a consolidated zarr store. For
    stores that have a ``time`` coordinate the following are recorded:

    * ``start`` / ``stop`` - first 10 characters of the string form of the
      first / last time value
    * ``nt`` - number of time values

    Stores without a ``time`` coordinate get ``'NA'``, ``'NA'`` and ``'1'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a ``zstore`` column; it is not modified.
    verbose : bool
        If True, print ``zstore start stop nt`` for each store that has a
        time coordinate.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``start``, ``stop`` and ``nt`` columns added.
    """
    # check key='zstore'
    starts = []
    stops = []
    nts = []
    dz = df.copy()
    for index, row in df.iterrows():
        zstore = row.zstore
        ds = xr.open_zarr(fsspec.get_mapper(zstore),consolidated=True)
        # Reset BOTH bounds for every store; otherwise a store without a
        # time coordinate would reuse the previous row's stop (or fail with
        # NameError on the first row).
        start = 'NA'
        stop = 'NA'
        nt = '1'
        if 'time' in ds.coords:
            dstime = ds.time.values
            start = str(dstime[0])[:10]
            stop = str(dstime[-1])[:10]
            nt = len(dstime)
            if verbose:
                print(zstore,start,stop,nt)
        starts.append(start)
        stops.append(stop)
        nts.append(nt)

    dz['start'] = starts
    dz['stop'] = stops
    dz['nt'] = nts
    return dz

import requests
def _handle_value(client, url):
    """GET ``url`` and return the data value of the first record in the JSON reply."""
    r = client.get(url)
    r.raise_for_status()
    return r.json()['values'][0]['data']['value']


def get_version(zstore):
    """Look up the dataset tracking id and version number for a zarr store.

    The store's ``tracking_id`` attribute is read and only its first line
    (the first file tracking id) is used. That id is resolved through the
    hdl.handle.net API to the dataset it is part of (``IS_PART_OF``), and
    the dataset is then resolved to its ``VERSION_NUMBER``.

    Parameters
    ----------
    zstore : str
        Location of a consolidated zarr store, passed to ``fsspec.get_mapper``.

    Returns
    -------
    tuple
        ``(dataset_tracking_id, version)``.

    Raises
    ------
    requests.HTTPError
        If any handle lookup returns an error status.
    AssertionError
        If more than one distinct version is found.
    """
    baseurl =  'http://hdl.handle.net/api/handles/'
    query1 = '?type=IS_PART_OF'
    query2 = '?type=VERSION_NUMBER'

    tracking_ids = xr.open_zarr(fsspec.get_mapper(zstore),consolidated=True).attrs['tracking_id']

    versions = []
    datasets = []
    # The context manager closes the session's pooled connections on exit.
    with requests.Session() as client:
        for file_tracking_id in tracking_ids.split('\n')[0:1]:
            # [4:] drops the first four characters of the id
            # (presumably an 'hdl:' prefix - TODO confirm).
            dataset_tracking_id = _handle_value(
                client, baseurl + file_tracking_id[4:] + query1)
            datasets.append(dataset_tracking_id)
            # Multiple dataset ids are sometimes erroneously reported,
            # separated by ';'. Resolve each and keep the highest version.
            # A value without ';' splits into a single id, so one code path
            # covers both cases.
            vs = [_handle_value(client, baseurl + dtrack[4:] + query2)
                  for dtrack in dataset_tracking_id.split(';')]
            versions.append(max(vs))

    version_id = list(set(versions))
    dataset_id = list(set(datasets))

    assert len(version_id)==1, 'expected exactly one version, got %r' % (version_id,)

    return dataset_id[0], version_id[0]

In [4]:
# Alternative: take the request from the ESGF listing written by
# EvaluateResponse.ipynb (list of available datasets matching the request).
#dESGF = pd.read_csv('dESGF-CEDA.csv')
#exps = dESGF.experiment_id.unique()
#variables = dESGF.variable_id.unique()  #[0]
#tables = dESGF.table_id.unique()

# Request being checked. Lists are matched exactly by search_df; `tables`
# is a plain string and is therefore matched as a substring.
variables = ['pr', 'tasmax', 'tasmin']
exps = ['historical', 'ssp126', 'ssp245']
tables = 'day'

# Catalog of the consolidated zarr stores, narrowed down to the request.
d2 = pd.read_csv(
    'https://cmip6.storage.googleapis.com/cmip6-zarr-consolidated-stores.csv'
)
dGC = search_df(
    d2,
    variable_id=variables,
    experiment_id=exps,
    table_id=tables,
)

In [53]:
# Open every matched store and add its 'start', 'stop' and 'nt' columns.
# verbose=True prints one line per store that has a time coordinate.
dGCp = add_time_info(dGC,verbose=True)

gs://cmip6/CMIP/AS-RCEC/TaiESM1/historical/r1i1p1f1/day/tasmax/gn/ 1850-01-01 2014-12-31 60225
gs://cmip6/CMIP/AWI/AWI-CM-1-1-MR/historical/r1i1p1f1/day/tasmax/gn/ 1850-01-01 2014-12-31 60265
gs://cmip6/CMIP/AWI/AWI-CM-1-1-MR/historical/r2i1p1f1/day/tasmax/gn/ 1850-01-01 2014-12-31 60265
gs://cmip6/CMIP/AWI/AWI-CM-1-1-MR/historical/r3i1p1f1/day/tasmax/gn/ 1850-01-01 2014-12-31 60265
gs://cmip6/CMIP/AWI/AWI-CM-1-1-MR/historical/r4i1p1f1/day/tasmax/gn/ 1850-01-01 2014-12-31 60265
gs://cmip6/CMIP/AWI/AWI-ESM-1-1-LR/historical/r1i1p1f1/day/tasmax/gn/ 1850-01-01 2014-12-31 60265
gs://cmip6/CMIP/BCC/BCC-CSM2-MR/historical/r1i1p1f1/day/tasmax/gn/ 1850-01-01 2014-12-31 60225
gs://cmip6/CMIP/BCC/BCC-CSM2-MR/historical/r2i1p1f1/day/tasmax/gn/ 1850-01-01 2014-12-31 60225
gs://cmip6/CMIP/BCC/BCC-CSM2-MR/historical/r3i1p1f1/day/tasmax/gn/ 1850-01-01 2014-12-31 60225
gs://cmip6/CMIP/BCC/BCC-ESM1/historical/r1i1p1f1/day/tasmax/gn/ 1850-01-01 2014-12-31 60225
gs://cmip6/CMIP/BCC/BCC-ESM1/historical/r2

  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  return array(a, dtype, copy=False, order=order)


gs://cmip6/ScenarioMIP/MRI/MRI-ESM2-0/ssp126/r1i1p1f1/day/tasmax/gn/ 2015-01-01 2300-12-31 104459
gs://cmip6/ScenarioMIP/NCC/NorESM2-LM/ssp126/r1i1p1f1/day/tasmax/gn/ 2015-01-01 2100-12-31 31390
gs://cmip6/ScenarioMIP/NCC/NorESM2-MM/ssp126/r1i1p1f1/day/tasmax/gn/ 2015-01-01 2100-12-31 31390
gs://cmip6/ScenarioMIP/NIMS-KMA/KACE-1-0-G/ssp126/r1i1p1f1/day/tasmax/gr/ 2015-01-01 2100-12-30 30960
gs://cmip6/ScenarioMIP/NOAA-GFDL/GFDL-ESM4/ssp126/r1i1p1f1/day/tasmax/gr1/ 2015-01-01 2100-12-31 31390
gs://cmip6/ScenarioMIP/NUIST/NESM3/ssp126/r1i1p1f1/day/tasmax/gn/ 2015-01-01 2100-12-31 31411
gs://cmip6/ScenarioMIP/AWI/AWI-CM-1-1-MR/ssp126/r1i1p1f1/day/tasmin/gn/ 2015-01-01 2100-12-31 31411
gs://cmip6/ScenarioMIP/BCC/BCC-CSM2-MR/ssp126/r1i1p1f1/day/tasmin/gn/ 2015-01-01 2100-12-31 31390
gs://cmip6/ScenarioMIP/CCCma/CanESM5/ssp126/r10i1p1f1/day/tasmin/gn/ 2015-01-01 2100-12-31 31390
gs://cmip6/ScenarioMIP/CCCma/CanESM5/ssp126/r10i1p2f1/day/tasmin/gn/ 2015-01-01 2100-12-31 31390
gs://cmip6/Scenar

  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  return array(a, dtype, copy=False, order=order)


gs://cmip6/ScenarioMIP/MRI/MRI-ESM2-0/ssp126/r1i1p1f1/day/tasmin/gn/ 2015-01-01 2300-12-31 104459
gs://cmip6/ScenarioMIP/NCC/NorESM2-LM/ssp126/r1i1p1f1/day/tasmin/gn/ 2015-01-01 2100-12-31 31390
gs://cmip6/ScenarioMIP/NCC/NorESM2-MM/ssp126/r1i1p1f1/day/tasmin/gn/ 2015-01-01 2100-12-31 31390
gs://cmip6/ScenarioMIP/NIMS-KMA/KACE-1-0-G/ssp126/r1i1p1f1/day/tasmin/gr/ 2015-01-01 2100-12-30 30960
gs://cmip6/ScenarioMIP/NOAA-GFDL/GFDL-ESM4/ssp126/r1i1p1f1/day/tasmin/gr1/ 2015-01-01 2100-12-31 31390
gs://cmip6/ScenarioMIP/NUIST/NESM3/ssp126/r1i1p1f1/day/tasmin/gn/ 2015-01-01 2100-12-31 31411
gs://cmip6/ScenarioMIP/AWI/AWI-CM-1-1-MR/ssp126/r1i1p1f1/day/pr/gn/ 2015-01-01 2100-12-31 31411
gs://cmip6/ScenarioMIP/BCC/BCC-CSM2-MR/ssp126/r1i1p1f1/day/pr/gn/ 2015-01-01 2100-12-31 31390
gs://cmip6/ScenarioMIP/CAS/FGOALS-g3/ssp126/r1i1p1f1/day/pr/gn/ 2015-01-01 2100-12-31 31390
gs://cmip6/ScenarioMIP/CCCma/CanESM5/ssp126/r10i1p1f1/day/pr/gn/ 2015-01-01 2100-12-31 31390
gs://cmip6/ScenarioMIP/CCCma/CanES

  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  return array(a, dtype, copy=False, order=order)


gs://cmip6/ScenarioMIP/MRI/MRI-ESM2-0/ssp126/r1i1p1f1/day/pr/gn/ 2015-01-01 2300-12-31 104459
gs://cmip6/ScenarioMIP/NCAR/CESM2-WACCM/ssp126/r1i1p1f1/day/pr/gn/ 2015-01-01 2101-01-01 31391
gs://cmip6/ScenarioMIP/NCAR/CESM2/ssp126/r1i1p1f1/day/pr/gn/ 2015-01-01 2101-01-01 31391
gs://cmip6/ScenarioMIP/NCC/NorESM2-LM/ssp126/r1i1p1f1/day/pr/gn/ 2015-01-01 2100-12-31 31390
gs://cmip6/ScenarioMIP/NCC/NorESM2-MM/ssp126/r1i1p1f1/day/pr/gn/ 2015-01-01 2100-12-31 31390
gs://cmip6/ScenarioMIP/NIMS-KMA/KACE-1-0-G/ssp126/r1i1p1f1/day/pr/gr/ 2015-01-01 2100-12-30 30960
gs://cmip6/ScenarioMIP/NOAA-GFDL/GFDL-ESM4/ssp126/r1i1p1f1/day/pr/gr1/ 2015-01-01 2100-12-31 31390
gs://cmip6/ScenarioMIP/NUIST/NESM3/ssp126/r1i1p1f1/day/pr/gn/ 2015-01-01 2100-12-31 31411
gs://cmip6/ScenarioMIP/AWI/AWI-CM-1-1-MR/ssp245/r1i1p1f1/day/tasmax/gn/ 2015-01-01 2100-12-31 31411
gs://cmip6/ScenarioMIP/BCC/BCC-CSM2-MR/ssp245/r1i1p1f1/day/tasmax/gn/ 2015-01-01 2100-12-31 31390
gs://cmip6/ScenarioMIP/CCCma/CanESM5/ssp245/r10i1p1

In [54]:
# Display the time-annotated table in a qgrid widget.
qgrid.show_grid(dGCp)

QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': True, 'defau…

In [21]:
import numpy as np
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# For each experiment, print how many distinct members exist for every
# (model, variable, start, stop, nt) combination. Models whose members
# disagree on the time extent show up as several rows.
exps = dGCp.experiment_id.unique()
for exp in exps:
    dGCpe = dGCp[dGCp.experiment_id==exp]
    # Count the members directly. This gives the same numbers as grouping by
    # member first and summing ones in a pivot table, but it does not depend
    # on nunique() echoing a grouping column (NOTE(review): pandas-version
    # dependent - confirm) or on the deprecated aggfunc=np.sum callable.
    table = dGCpe.groupby(
        ['source_id','variable_id','start','stop','nt'])[['member_id']].nunique()
    print(exp,'\n',table)

historical 
                                                           member_id
source_id        variable_id start      stop       nt              
ACCESS-CM2       pr          1850-01-01 2014-12-31 60265          2
                 tasmax      1850-01-01 2014-12-31 60265          1
                 tasmin      1850-01-01 2014-12-31 60265          1
ACCESS-ESM1-5    pr          1850-01-01 2014-12-31 60265          3
                 tasmax      1850-01-01 2014-12-31 60265          1
                 tasmin      1850-01-01 2014-12-31 60265          1
AWI-CM-1-1-MR    tasmax      1850-01-01 2014-12-31 60265          4
                 tasmin      1850-01-01 2014-12-31 60265          4
AWI-ESM-1-1-LR   pr          1850-01-01 2014-12-31 60265          1
                 tasmax      1850-01-01 2014-12-31 60265          1
                 tasmin      1850-01-01 2014-12-31 60265          1
BCC-CSM2-MR      pr          1850-01-01 2014-12-31 60225          3
                 tasmax      1850-0

In [46]:
# Open a single store (NorESM2-LM, ssp126, tasmax) for a closer look.
ds = xr.open_zarr(fsspec.get_mapper('gs://cmip6/ScenarioMIP/NCC/NorESM2-LM/ssp126/r1i1p1f1/day/tasmax/gn/'),consolidated=True)

In [50]:
http://noresg.nird.sigma2.no/thredds/fileServer/esg_dataroot/cmor/CMIP6/ScenarioMIP/NCC/NorESM2-LM/ssp126/r1i1p1f1/day/pr/gn/v20191108/pr_day_NorESM2-LM_ssp126_r1i1p1f1_gn_20310101-20401230.nc
http://noresg.nird.sigma2.no/thredds/fileServer/esg_dataroot/cmor/CMIP6/ScenarioMIP/NCC/NorESM2-LM/ssp126/r1i1p1f1/day/pr/gn/v20191108/pr_day_NorESM2-LM_ssp126_r1i1p1f1_gn_20310101-20401231.nc

42337

In [51]:
# Open the ssp245 store of the same model, member and variable.
ds2 = xr.open_zarr(fsspec.get_mapper('gs://cmip6/ScenarioMIP/NCC/NorESM2-LM/ssp245/r1i1p1f1/day/tasmax/gn/'),consolidated=True)