In [1]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import xarray as xr
import zarr
import gcsfs
from tqdm.autonotebook import tqdm
import os
import cftime
import json
from dask import array

%matplotlib inline
plt.rcParams['figure.figsize'] = 12, 6
%config InlineBackend.figure_format = 'retina' 

  import sys


In [None]:
# Start a Dask cluster on Kubernetes and attach a client to it.
# NOTE(review): these imports sit outside the notebook's main import cell.
from dask.distributed import Client
from dask_kubernetes import KubeCluster

# The worker specification is read from a local YAML file.
cluster = KubeCluster.from_yaml('worker-spec.yml')
# Adaptive scaling between 1 and 20 workers, with a '2s' interval.
cluster.adapt(minimum=1, maximum=20, interval='2s')
client = Client(cluster)
# Bare expression so the notebook displays the client.
client

In [4]:
# Read the CSV catalog of consolidated CMIP6 Zarr stores; each row carries a
# `zstore` URI plus identifying columns (source_id, experiment_id, ...).
df = pd.read_csv('https://storage.googleapis.com/cmip6/cmip6-zarr-consolidated-stores.csv')
# Anonymous GCS filesystem handle; the loader functions in this notebook use
# it to turn a `zstore` URI into a mapper for xarray.
gcs = gcsfs.GCSFileSystem(token='anon')
df.head()

Unnamed: 0,activity_id,institution_id,source_id,experiment_id,member_id,table_id,variable_id,grid_label,zstore,dcpp_init_year
0,AerChemMIP,BCC,BCC-ESM1,histSST,r1i1p1f1,AERmon,mmrbc,gn,gs://cmip6/AerChemMIP/BCC/BCC-ESM1/histSST/r1i...,
1,AerChemMIP,BCC,BCC-ESM1,histSST,r1i1p1f1,AERmon,mmrdust,gn,gs://cmip6/AerChemMIP/BCC/BCC-ESM1/histSST/r1i...,
2,AerChemMIP,BCC,BCC-ESM1,histSST,r1i1p1f1,AERmon,mmroa,gn,gs://cmip6/AerChemMIP/BCC/BCC-ESM1/histSST/r1i...,
3,AerChemMIP,BCC,BCC-ESM1,histSST,r1i1p1f1,AERmon,mmrso4,gn,gs://cmip6/AerChemMIP/BCC/BCC-ESM1/histSST/r1i...,
4,AerChemMIP,BCC,BCC-ESM1,histSST,r1i1p1f1,AERmon,mmrss,gn,gs://cmip6/AerChemMIP/BCC/BCC-ESM1/histSST/r1i...,


In [5]:
# Rows to process. The processing loop reads source_id, experiment_id,
# member_id, variable_id and zstore from each row of this frame.
# NOTE(review): the commented-out cell that follows writes 'pangeo_loads.csv',
# not 'pangeo.csv' -- confirm this is the intended file.
dfs = pd.read_csv('pangeo.csv')

In [6]:
# file_attrs = json.load(open('file_attrs.txt','r'))
# all_attrs = set(['_'.join([a['source_id'],a['member_id'],a['experiment_id'],a['table_id'],a['variable_id']]) for a in file_attrs])

# def want(s, m, e, t, v):
#     key = s+'_'+m+'_'+e+'_'+t+'_'+v
#     return s+'_'+m+'_'+e+'_'+t+'_'+v

# dfs = df[df[['source_id','member_id','experiment_id','table_id','variable_id']].apply(lambda x: want(*x) in all_attrs, axis=1)]

# pangeo_attrs = set(dfs[['source_id','member_id','experiment_id','table_id','variable_id']].apply(lambda x: want(*x), axis=1).values)
# manual_attrs = [a for a in all_attrs if a not in pangeo_attrs]
# json.dump(list(pangeo_attrs), open('pangeo_loads.txt', 'w'))
# json.dump(list(manual_attrs), open('manual_loads.txt', 'w'))

# dfs.to_csv('pangeo_loads.csv')

In [7]:
def load_srch_data(df, source_id, expt_id):
    """Open the first catalog store matching a model and an experiment.

    Parameters
    ----------
    df : catalog frame with ``source_id``, ``experiment_id`` and ``zstore`` columns.
    source_id : value to match against ``df.source_id``.
    expt_id : value to match against ``df.experiment_id``.

    Returns
    -------
    The dataset returned by ``xr.open_zarr`` for the first matching row.

    Raises
    ------
    IndexError
        If no row matches both identifiers.
    """
    is_match = (df.source_id == source_id) & (df.experiment_id == expt_id)
    matching_stores = df[is_match].zstore.values
    # Only the first match is used; any further matches are ignored.
    first_store = matching_stores[0]
    mapper = gcs.get_mapper(first_store)
    return xr.open_zarr(mapper, consolidated=True)

def load_data(series):
    """Open the Zarr store referenced by ``series.zstore``.

    ``series`` is any object exposing a ``zstore`` attribute (for example one
    row of the catalog frame). The store is opened through the module-level
    ``gcs`` filesystem with consolidated metadata.
    """
    mapper = gcs.get_mapper(series.zstore)
    return xr.open_zarr(mapper, consolidated=True)

def get_dims(ds):
    """Locate the latitude and longitude coordinates of ``ds``.

    Coordinate names containing ``'bnds'`` or ``'vert'`` are ignored. Of the
    remaining names, the first containing ``'lat'`` and the first containing
    ``'lon'`` are selected.

    Returns
    -------
    (lat, lon, dims)
        ``lat`` and ``lon`` are the ``.data`` of the selected coordinates and
        ``dims`` is the two-element list ``[lat_name, lon_name]``.

    Raises
    ------
    IndexError
        If no coordinate name matches ``'lat'`` or ``'lon'``.
    """
    candidates = []
    for coord_name in list(ds.coords.keys()):
        if 'bnds' in coord_name or 'vert' in coord_name:
            continue
        candidates.append(coord_name)

    lat_name = [c for c in candidates if 'lat' in c][0]
    lon_name = [c for c in candidates if 'lon' in c][0]
    dims = [lat_name, lon_name]

    lat = ds.coords.get(lat_name).data
    lon = ds.coords.get(lon_name).data
    return lat, lon, dims

def get_area(ds, df):
    """Return area weights for the grid of ``ds``.

    The catalog ``df`` is searched for an ``areacell<realm>`` variable from the
    same ``source_id``, where ``<realm>`` is the lower-cased first letter of
    ``ds.table_id``. If one exists it is opened and used directly. Otherwise
    weights are derived from the latitude coordinate.

    Parameters
    ----------
    ds : dataset exposing ``table_id``, ``source_id``, ``time`` and lat/lon
        coordinates discoverable by ``get_dims``.
    df : catalog frame with ``variable_id``, ``source_id`` and ``zstore`` columns.

    Returns
    -------
    (area, dims, total_area)
        ``area`` holds the weights, ``dims`` the dimensions to sum over, and
        ``total_area`` the sum of the weights over one time step.
    """
    realm = ds.table_id[0].lower()
    lat, lon, dims = get_dims(ds)

    df_area = df.query("variable_id == 'areacell"+realm+"' & source_id == '"+ds.source_id+"'")
    if len(df_area.zstore.values) == 0:
        if len(lat) > 2000:
            # NOTE(review): this branch uses the raw latitude values as weights
            # and sums over a dimension named "ncells". Latitudes are signed
            # degrees, so this is unlikely to be a valid area weight -- confirm
            # the intended weighting for these grids before relying on it.
            area = lat
            dims = ["ncells"]
            total_area = lat.sum()
        else:
            # With indexing='ij' np.meshgrid returns one mesh per input, in
            # input order: (time, cos(lat), lon). The weights are the middle
            # one. Latitudes are in degrees, so convert before taking the cosine.
            cos_lat = np.cos(np.deg2rad(lat))
            _, area, _ = np.meshgrid(ds.time, cos_lat, lon, indexing='ij')
            total_area = area[0,:,:].sum()
    else:
        ds_area = xr.open_zarr(gcs.get_mapper(df_area.zstore.values[0]), consolidated=True)
        area = ds_area.get("areacell"+realm)
        total_area = area.sum(area.dims)
        dims = area.dims

    return area, dims, total_area

def avg_var(ds, df):
    """Area-weighted mean of the dataset's main variable at every time step.

    The variable is looked up as ``ds.get(ds.variable_id)`` and the weights
    come from ``get_area``.

    Returns
    -------
    The weighted-mean series, or ``None`` (after printing ``'failed'``) when
    the dataset has no variable named ``ds.variable_id``.
    """
    var = ds.get(ds.variable_id)
    # Check for a missing variable before doing arithmetic: multiplying None
    # by the weights would raise, so a check placed afterwards can never fire.
    if var is None:
        print('failed')
        return None

    area, dims, total_area = get_area(ds, df)

    # NOTE(review): total_area is the sum over every cell. If the variable is
    # missing on part of the grid (e.g. ocean fields over land), presumably
    # those cells drop out of the numerator but not the denominator -- confirm
    # whether the mean should be normalised by the valid area only.
    return (var * area).sum(dim=dims) / total_area



In [8]:
# Empty dict; no cell visible in this notebook reads or writes it afterwards.
results = {}

In [None]:
# Compute the area-weighted mean series for every row of `dfs` and cache each
# one as data/<name>.npy. Rows whose file already exists are skipped, so the
# cell can be re-run after an interruption.

# Number of leading time steps kept per experiment type.
# NOTE(review): presumably 150 and 200 years of monthly data -- confirm.
MAX_STEPS_CONTROL = 1800   # piControl and 1pctCO2
MAX_STEPS_DEFAULT = 2400   # every other experiment

os.makedirs('data', exist_ok=True)

for num in tqdm(range(len(dfs))):
    s = dfs.iloc[num]
    name = '_'.join([s.source_id, s.experiment_id, s.member_id, s.variable_id])

    # Skip cached series and every model whose id contains 'EC-Earth3'.
    if os.path.exists(os.path.join('data', name + '.npy')) or 'EC-Earth3' in s.source_id:
        continue

    try:
        print(str(num),':',name)
        # Opening the store is inside the try so one unreadable dataset does
        # not abort the whole run.
        ds = load_data(s)

        if ds.experiment_id == 'piControl' or ds.experiment_id == '1pctCO2':
            n_keep = MAX_STEPS_CONTROL
        else:
            n_keep = MAX_STEPS_DEFAULT
        # Positional slice of the first n_keep steps; a no-op for shorter runs.
        ds = ds.isel(time=slice(0, n_keep))

        m = avg_var(ds, df)

        if m is not None:
            times = np.array([np.datetime64(t) for t in m.time.values])
            # Row 0 holds the values and row 1 the timestamps. The two rows
            # have different dtypes, so request an object array explicitly.
            np.save('data/'+name, np.array([m.values, times], dtype=object))
    except Exception as err:
        # Catch Exception rather than everything so Ctrl-C still stops the
        # loop, and report why the series failed.
        print('FAILED on '+str(num)+' : '+name)
        print('    ' + type(err).__name__ + ': ' + str(err))

HBox(children=(FloatProgress(value=0.0, max=431.0), HTML(value='')))

163 : E3SM-1-1_piControl_r1i1p1f1_tos


  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  return array(a, dtype, copy=False, order=order)


175 : MPI-ESM-1-2-HAM_piControl_r1i1p1f1_ts


  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  return array(a, dtype, copy=False, order=order)


176 : MPI-ESM-1-2-HAM_piControl_r1i1p1f1_mlotst
177 : IPSL-CM6A-LR_1pctCO2_r1i1p1f1_ts
178 : IPSL-CM6A-LR_1pctCO2_r1i1p1f1_tos
179 : IPSL-CM6A-LR_abrupt-4xCO2_r10i1p1f1_ts
180 : IPSL-CM6A-LR_abrupt-4xCO2_r10i1p1f1_tos
181 : IPSL-CM6A-LR_abrupt-4xCO2_r11i1p1f1_ts
182 : IPSL-CM6A-LR_abrupt-4xCO2_r11i1p1f1_tos
183 : IPSL-CM6A-LR_abrupt-4xCO2_r12i1p1f1_ts
184 : IPSL-CM6A-LR_abrupt-4xCO2_r12i1p1f1_tos
185 : IPSL-CM6A-LR_abrupt-4xCO2_r2i1p1f1_ts
186 : IPSL-CM6A-LR_abrupt-4xCO2_r2i1p1f1_tos
187 : IPSL-CM6A-LR_abrupt-4xCO2_r3i1p1f1_ts
188 : IPSL-CM6A-LR_abrupt-4xCO2_r3i1p1f1_tos
189 : IPSL-CM6A-LR_abrupt-4xCO2_r4i1p1f1_ts
190 : IPSL-CM6A-LR_abrupt-4xCO2_r4i1p1f1_tos
191 : IPSL-CM6A-LR_abrupt-4xCO2_r5i1p1f1_ts
192 : IPSL-CM6A-LR_abrupt-4xCO2_r5i1p1f1_tos
193 : IPSL-CM6A-LR_abrupt-4xCO2_r6i1p1f1_ts
194 : IPSL-CM6A-LR_abrupt-4xCO2_r6i1p1f1_tos
195 : IPSL-CM6A-LR_abrupt-4xCO2_r7i1p1f1_ts
196 : IPSL-CM6A-LR_abrupt-4xCO2_r7i1p1f1_tos
197 : IPSL-CM6A-LR_abrupt-4xCO2_r8i1p1f1_ts
198 : IPSL-CM6A-LR_abr

  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  return array(a, dtype, copy=False, order=order)


201 : IPSL-CM6A-LR_piControl_r1i1p1f1_ts


  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  return array(a, dtype, copy=False, order=order)


202 : IPSL-CM6A-LR_piControl_r1i1p1f1_tos
203 : IPSL-CM6A-LR_piControl_r1i2p1f1_ts
204 : IPSL-CM6A-LR_piControl_r1i2p1f1_mlotst
205 : IPSL-CM6A-LR_piControl_r1i2p1f1_tos
206 : HadGEM3-GC31-LL_1pctCO2_r1i1p1f3_ts
207 : HadGEM3-GC31-LL_1pctCO2_r1i1p1f3_mlotst
208 : HadGEM3-GC31-LL_1pctCO2_r1i1p1f3_tos
209 : HadGEM3-GC31-LL_1pctCO2_r2i1p1f3_ts
210 : HadGEM3-GC31-LL_1pctCO2_r2i1p1f3_tos
211 : HadGEM3-GC31-LL_1pctCO2_r3i1p1f3_ts
212 : HadGEM3-GC31-LL_1pctCO2_r3i1p1f3_tos
213 : HadGEM3-GC31-LL_1pctCO2_r4i1p1f3_ts
214 : HadGEM3-GC31-LL_1pctCO2_r4i1p1f3_tos
215 : HadGEM3-GC31-LL_abrupt-4xCO2_r1i1p1f3_ts
216 : HadGEM3-GC31-LL_abrupt-4xCO2_r1i1p1f3_mlotst
217 : HadGEM3-GC31-LL_abrupt-4xCO2_r1i1p1f3_tos
218 : HadGEM3-GC31-LL_piControl_r1i1p1f1_ts
219 : HadGEM3-GC31-LL_piControl_r1i1p1f1_mlotst
220 : HadGEM3-GC31-LL_piControl_r1i1p1f1_tos
221 : HadGEM3-GC31-MM_piControl_r1i1p1f1_ts
222 : HadGEM3-GC31-MM_piControl_r1i1p1f1_mlotst
FAILED on 222 : HadGEM3-GC31-MM_piControl_r1i1p1f1_mlotst
223 : HadGE

  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  return array(a, dtype, copy=False, order=order)


246 : MPI-ESM1-2-HR_piControl_r1i1p1f1_ts


  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  return array(a, dtype, copy=False, order=order)


247 : MPI-ESM1-2-HR_piControl_r1i1p1f1_mlotst
248 : MPI-ESM1-2-LR_1pctCO2_r1i1p1f1_rlut
249 : MPI-ESM1-2-LR_1pctCO2_r1i1p1f1_rsdt
250 : MPI-ESM1-2-LR_1pctCO2_r1i1p1f1_rsut
251 : MPI-ESM1-2-LR_1pctCO2_r1i1p1f1_ts
252 : MPI-ESM1-2-LR_1pctCO2_r1i1p1f1_mlotst


  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  return array(a, dtype, copy=False, order=order)


253 : MPI-ESM1-2-LR_piControl_r1i1p1f1_ts
254 : MRI-ESM2-0_1pctCO2_r1i1p1f1_rlut
255 : MRI-ESM2-0_1pctCO2_r1i1p1f1_rsdt
256 : MRI-ESM2-0_1pctCO2_r1i1p1f1_rsut
257 : MRI-ESM2-0_1pctCO2_r1i1p1f1_ts
258 : MRI-ESM2-0_1pctCO2_r1i1p1f1_tos
260 : MRI-ESM2-0_1pctCO2_r1i2p1f1_rlut
261 : MRI-ESM2-0_1pctCO2_r1i2p1f1_rsdt
262 : MRI-ESM2-0_1pctCO2_r1i2p1f1_rsut
263 : MRI-ESM2-0_1pctCO2_r1i2p1f1_ts
264 : MRI-ESM2-0_abrupt-4xCO2_r10i1p1f1_rlut
265 : MRI-ESM2-0_abrupt-4xCO2_r10i1p1f1_rsdt
266 : MRI-ESM2-0_abrupt-4xCO2_r10i1p1f1_rsut
267 : MRI-ESM2-0_abrupt-4xCO2_r10i1p1f1_ts
268 : MRI-ESM2-0_abrupt-4xCO2_r10i1p1f1_tos
270 : MRI-ESM2-0_abrupt-4xCO2_r11i1p1f1_rlut
271 : MRI-ESM2-0_abrupt-4xCO2_r11i1p1f1_rsdt
272 : MRI-ESM2-0_abrupt-4xCO2_r11i1p1f1_rsut
273 : MRI-ESM2-0_abrupt-4xCO2_r11i1p1f1_ts
274 : MRI-ESM2-0_abrupt-4xCO2_r12i1p1f1_rlut
275 : MRI-ESM2-0_abrupt-4xCO2_r12i1p1f1_rsdt
276 : MRI-ESM2-0_abrupt-4xCO2_r12i1p1f1_rsut
277 : MRI-ESM2-0_abrupt-4xCO2_r12i1p1f1_ts
278 : MRI-ESM2-0_abrupt-4xCO2_r13i

tornado.application - ERROR - Exception in callback functools.partial(<bound method IOLoop._discard_future_result of <zmq.eventloop.ioloop.ZMQIOLoop object at 0x7fa54393e4e0>>, <Task finished coro=<SpecCluster._correct_state_internal() done, defined at /srv/conda/envs/notebook/lib/python3.6/site-packages/distributed/deploy/spec.py:300> exception=AttributeError("'KubeCluster' object has no attribute '_lock'",)>)
Traceback (most recent call last):
  File "/srv/conda/envs/notebook/lib/python3.6/site-packages/tornado/ioloop.py", line 743, in _run_callback
    ret = callback()
  File "/srv/conda/envs/notebook/lib/python3.6/site-packages/tornado/ioloop.py", line 767, in _discard_future_result
    future.result()
  File "/srv/conda/envs/notebook/lib/python3.6/site-packages/distributed/deploy/spec.py", line 381, in _close
    await self._correct_state()
  File "/srv/conda/envs/notebook/lib/python3.6/site-packages/distributed/deploy/spec.py", line 301, in _correct_state_internal
    async with 

300 : MRI-ESM2-0_abrupt-4xCO2_r4i1p1f1_rsut
301 : MRI-ESM2-0_abrupt-4xCO2_r4i1p1f1_ts
302 : MRI-ESM2-0_abrupt-4xCO2_r5i1p1f1_rlut
303 : MRI-ESM2-0_abrupt-4xCO2_r5i1p1f1_rsdt
304 : MRI-ESM2-0_abrupt-4xCO2_r5i1p1f1_rsut
305 : MRI-ESM2-0_abrupt-4xCO2_r5i1p1f1_ts
306 : MRI-ESM2-0_abrupt-4xCO2_r6i1p1f1_rlut
307 : MRI-ESM2-0_abrupt-4xCO2_r6i1p1f1_rsdt
308 : MRI-ESM2-0_abrupt-4xCO2_r6i1p1f1_rsut
309 : MRI-ESM2-0_abrupt-4xCO2_r6i1p1f1_ts
310 : MRI-ESM2-0_abrupt-4xCO2_r7i1p1f1_rlut
311 : MRI-ESM2-0_abrupt-4xCO2_r7i1p1f1_rsdt
312 : MRI-ESM2-0_abrupt-4xCO2_r7i1p1f1_rsut
313 : MRI-ESM2-0_abrupt-4xCO2_r7i1p1f1_ts
314 : MRI-ESM2-0_abrupt-4xCO2_r8i1p1f1_rlut
315 : MRI-ESM2-0_abrupt-4xCO2_r8i1p1f1_rsdt
316 : MRI-ESM2-0_abrupt-4xCO2_r8i1p1f1_rsut
317 : MRI-ESM2-0_abrupt-4xCO2_r8i1p1f1_ts
318 : MRI-ESM2-0_abrupt-4xCO2_r9i1p1f1_rlut
319 : MRI-ESM2-0_abrupt-4xCO2_r9i1p1f1_rsdt
320 : MRI-ESM2-0_abrupt-4xCO2_r9i1p1f1_rsut
321 : MRI-ESM2-0_abrupt-4xCO2_r9i1p1f1_ts
322 : MRI-ESM2-0_piControl_r1i1p1f1_ts


  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  return array(a, dtype, copy=False, order=order)


323 : MRI-ESM2-0_piControl_r1i1p1f1_mlotst


  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  return array(a, dtype, copy=False, order=order)


324 : MRI-ESM2-0_piControl_r1i1p1f1_tos
326 : MRI-ESM2-0_piControl_r1i2p1f1_ts
327 : GISS-E2-1-G-CC_piControl_r1i1p1f1_ts
328 : GISS-E2-1-G-CC_piControl_r1i1p1f1_mlotst
329 : GISS-E2-1-G-CC_piControl_r1i1p1f1_tos
330 : GISS-E2-1-G_1pctCO2_r102i1p1f1_ts
331 : GISS-E2-1-G_1pctCO2_r102i1p1f1_tos
332 : GISS-E2-1-G_1pctCO2_r1i1p1f1_ts
333 : GISS-E2-1-G_1pctCO2_r1i1p1f1_tos
334 : GISS-E2-1-G_1pctCO2_r1i1p3f1_ts
335 : GISS-E2-1-G_1pctCO2_r1i1p3f1_tos
336 : GISS-E2-1-G_abrupt-4xCO2_r102i1p1f1_ts
337 : GISS-E2-1-G_abrupt-4xCO2_r1i1p1f1_ts
338 : GISS-E2-1-G_abrupt-4xCO2_r1i1p1f1_tos
339 : GISS-E2-1-G_abrupt-4xCO2_r1i1p1f3_ts
340 : GISS-E2-1-G_abrupt-4xCO2_r1i1p3f1_ts
341 : GISS-E2-1-G_piControl_r101i1p1f1_ts
342 : GISS-E2-1-G_piControl_r101i1p1f1_mlotst
343 : GISS-E2-1-G_piControl_r101i1p1f1_tos
344 : GISS-E2-1-G_piControl_r102i1p1f1_ts
345 : GISS-E2-1-G_piControl_r102i1p1f1_mlotst
346 : GISS-E2-1-G_piControl_r102i1p1f1_tos
347 : GISS-E2-1-G_piControl_r1i1p1f1_ts
348 : GISS-E2-1-G_piControl_r1i1

In [59]:
# Scratch cell: inspects the grid of whatever dataset the processing loop
# last left bound to `ds`. The commented-out lines repeat the area lookup
# performed inside get_area.
# realm = ds.table_id[0].lower()
# df_area = df.query("variable_id == 'areacell"+realm+"' & source_id == '"+ds.source_id+"'")
# ds_area = xr.open_zarr(gcs.get_mapper(df_area.zstore.values[0]), consolidated=True)
# area = ds_area.get("areacell"+realm)
lat, lon, dims = get_dims(ds)
# Print row 324 of the 'latitude' coordinate; .compute() evaluates it eagerly.
print(ds.coords.get('latitude').latitude.data[324].compute())

[-66.41343 -66.41343 -66.41343 ... -66.41343 -66.41343 -66.41343]


Plot 30-year moving average $\Delta T$ for 1pctCO2 runs
=======

In [None]:
# Plot a moving average of the saved `ts` series for every 1pctCO2 run in data/.
# NOTE(review): `moving_average` is not defined in any cell visible in this
# notebook; it must already exist in the kernel. The date slicing below
# assumes it returns len(x) - w + 1 samples -- confirm.
# NOTE(review): w counts samples, not years; check it against the "30-year"
# heading for the time resolution of the saved series.
w = 30

# (file name, array) pairs; row 0 of each array holds values, row 1 the dates.
ns = [(f, np.load('data/'+f, allow_pickle = True))
      for f in os.listdir('data') if "1pctCO2" in f and '_ts.npy' in f]

# Names of series whose mean is below 200; these are not plotted.
# Storing names (not rows) keeps the membership test valid when none qualify.
below_ts = [f for f, n in ns if np.mean(n[0]) < 200]

# One parsed date index per series, in the same order as `ns`.
dates = [pd.to_datetime(n[1], errors='coerce') for _, n in ns]

plt.figure("1pctCO2 runs")
for f, n in ns:
    if f in below_ts:
        continue
    # Use the first date index with the same length as this series, so the
    # x and y arrays always match in size.
    d = next(d for d in dates if len(d) == len(n[0]))
    plt.plot_date(d[w//2 - 1:len(n[0]) - w//2],
                  moving_average(n[0], w),
                  xdate=True)

Plot 30-year moving average $\Delta T$ for abrupt-4xCO2 runs
=======

In [None]:
# Plot a moving average of the saved `ts` series for every abrupt-4xCO2 run in data/.
# NOTE(review): `moving_average` is not defined in any cell visible in this
# notebook; it must already exist in the kernel. The date slicing below
# assumes it returns len(x) - w + 1 samples -- confirm.
# NOTE(review): w counts samples, not years; check it against the "30-year"
# heading for the time resolution of the saved series.
w = 30

# (file name, array) pairs; row 0 of each array holds values, row 1 the dates.
ns = [(f, np.load('data/'+f, allow_pickle = True))
      for f in os.listdir('data') if "abrupt" in f and '_ts.npy' in f]

# Series excluded from the plot: mean below 200, or fewer than 1700 steps.
below_ts = [f for f, n in ns if np.mean(n[0]) < 200]
short_ts = [f for f, n in ns if len(n[1]) < 1700]

# Keep only date indexes whose first entry parsed. errors='coerce' yields NaT
# for unparsable input, and NaT never compares equal to the string 'NaT', so
# it has to be detected with pd.isna.
parsed = [pd.to_datetime(n[1], errors='coerce') for _, n in ns]
dates = [d for d in parsed if len(d) > 0 and not pd.isna(d[0])]

plt.figure("abrupt-4xCO2 runs")
for f, n in ns:
    if f in below_ts or f in short_ts:
        continue
    # Borrow the first usable date index of matching length; skip the series
    # when there is none instead of raising StopIteration.
    d = next((d for d in dates if len(d) == len(n[0])), None)
    if d is None:
        print('no usable dates for ' + f)
        continue
    plt.plot_date(d[w//2 - 1:len(n[0]) - w//2],
                  moving_average(n[0], w),
                  xdate=True, fmt='-')

In [None]:
# Scratch check: shape of the first entry of `dates` left by the previous plotting cell.
np.shape(dates[0])

In [None]:
def _weighted_percentile_values(vals, weights, pcts):
    """Return ``[[p, value], ...]`` for each fraction ``p`` in ``pcts``.

    ``vals`` and ``weights`` are 1-D float arrays of equal length. Entries
    where either is NaN are dropped. Values are sorted and the cumulative
    weight fraction is scanned. For each ``p`` the reported value is the one
    immediately before the cumulative fraction first reaches ``p``, or the
    smallest value when the first cell already reaches it.
    """
    keep = ~(np.isnan(vals) | np.isnan(weights))
    vals, weights = vals[keep], weights[keep]
    total = weights.sum()
    if vals.size == 0 or not total > 0:
        return []

    order = np.argsort(vals)
    vals = vals[order]
    cum_frac = np.cumsum(weights[order]) / total

    # Index of the first cell whose cumulative fraction is >= p ...
    first_reaching = np.searchsorted(cum_frac, pcts, side='left')
    # ... and the cell just before it, clipped so the index stays valid.
    picks = np.clip(first_reaching - 1, 0, vals.size - 1)
    return [[p, vals[k]] for p, k in zip(pcts, picks)]


def percentiles(ds, df):
    """Area-weighted percentiles of the dataset's main variable per time step.

    For every index along the first axis of ``ds.get(ds.variable_id)`` the
    0.05, 0.17, 0.5, 0.83 and 0.95 weighted quantiles are computed, using the
    weights returned by ``get_area``.

    Returns
    -------
    A flat list alternating ``time, pct_vals, time, pct_vals, ...`` where
    ``pct_vals`` is a list of ``[fraction, value]`` pairs.
    NOTE(review): the flat layout is kept as originally written; a list of
    ``[time, pct_vals]`` pairs may have been intended -- confirm.
    """
    # get_area returns (area, dims, total_area); only the weights are needed.
    area, _, _ = get_area(ds, df)
    Var = ds.get(ds.variable_id)
    pcts = np.array([0.05, 0.17, 0.5, 0.83, 0.95])

    area = np.asarray(area, dtype=float)
    # Weights with as many axes as the variable are taken to carry a leading
    # time axis and are indexed per step; otherwise the same weights are
    # reused for every step.
    # NOTE(review): assumes the weight axes are ordered like the variable's
    # non-time axes -- confirm.
    area_has_time = area.ndim == Var.ndim

    result = []
    for t in range(len(Var.time)):
        var = Var[t]
        time = Var.time[t]

        step_area = area[t] if area_has_time else area
        weights = np.broadcast_to(step_area, var.shape).ravel()
        vals = np.asarray(var.values, dtype=float).ravel()

        pct_vals = _weighted_percentile_values(vals, weights, pcts)
        result += [time, pct_vals]

    return result


In [None]:
# Scratch call: display (lat, lon, dims) for the dataset currently bound to `ds`.
get_dims(ds)