In [1]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import xarray as xr
import zarr
import gcsfs
from tqdm.autonotebook import tqdm
import os
import cftime
import json
from dask import array

%matplotlib inline
plt.rcParams['figure.figsize'] = 12, 6
%config InlineBackend.figure_format = 'retina' 

  import sys


In [2]:
from dask.distributed import Client
from dask_kubernetes import KubeCluster

# Create a Dask cluster on Kubernetes using KubeCluster's default configuration.
cluster = KubeCluster()
# Adaptive scaling between 1 and 20 workers, with interval='2s'.
cluster.adapt(minimum=1, maximum=20, interval='2s')
# Attach a client to the cluster so later computations are sent to it.
client = Client(cluster)
# Last expression of the cell: displays the client summary.
client

distributed.dashboard.proxy - INFO - To route to workers diagnostics web server please install jupyter-server-proxy: python -m pip install jupyter-server-proxy
distributed.scheduler - INFO - Clear task state
distributed.scheduler - INFO -   Scheduler at:   tcp://10.48.179.2:38967
distributed.scheduler - INFO -   dashboard at:                     :8787
distributed.scheduler - INFO - Receive client connection: Client-91ef2dc0-6ef7-11ea-a5d0-cecc0997d3e4
distributed.core - INFO - Starting established connection


0,1
Client  Scheduler: tcp://10.48.179.2:38967  Dashboard: /user/ghall3-pangeo_tests-aajlpvy0/proxy/8787/status,Cluster  Workers: 0  Cores: 0  Memory: 0 B


In [3]:
# Catalogue of consolidated CMIP6 Zarr stores, read from the public CSV on
# Google Cloud Storage. The helpers below look up rows by source_id,
# experiment_id, variable_id and grid_label and read the store path from `zstore`.
df = pd.read_csv('https://storage.googleapis.com/cmip6/cmip6-zarr-consolidated-stores.csv')
# Anonymous GCS filesystem handle; used below to build mappers for the Zarr stores.
gcs = gcsfs.GCSFileSystem(token='anon')

In [4]:
# Local CSV listing the datasets to process. The processing loop reads
# source_id, experiment_id, member_id, variable_id and zstore from each row.
# NOTE(review): presumably a pre-filtered subset of the catalogue in `df` — confirm.
dfs = pd.read_csv('pangeo.csv')

In [5]:
def load_srch_data(df, source_id, expt_id):
    """Open the first catalogue entry matching a model and an experiment.

    Parameters
    ----------
    df : pandas.DataFrame
        Catalogue with ``source_id``, ``experiment_id`` and ``zstore`` columns.
    source_id : str
        Value to match against the ``source_id`` column.
    expt_id : str
        Value to match against the ``experiment_id`` column.

    Returns
    -------
    The dataset returned by ``xr.open_zarr`` for the first matching ``zstore``.

    Raises
    ------
    IndexError
        If no catalogue row matches the requested model and experiment.
    """
    matches = df[(df.source_id == source_id) & (df.experiment_id == expt_id)]
    if len(matches) == 0:
        # Same exception type as the bare ``.values[0]`` lookup would raise,
        # but with a message that says what was being looked for.
        raise IndexError(
            "no catalogue entry for source_id={!r}, experiment_id={!r}".format(
                source_id, expt_id))

    # When several rows match, the first one is used.
    uri = matches.zstore.values[0]
    return xr.open_zarr(gcs.get_mapper(uri), consolidated=True)

def load_data(series):
    """Open the Zarr store referenced by one catalogue row.

    ``series`` must expose a ``zstore`` attribute holding the store path; the
    store is opened through the module-level ``gcs`` filesystem using
    consolidated metadata.
    """
    store = gcs.get_mapper(series.zstore)
    return xr.open_zarr(store, consolidated=True)

def get_dims(ds):
    """Find the latitude and longitude coordinates of ``ds`` by name.

    Coordinate names containing 'bnds' or 'vert' are ignored. Of the remaining
    names, the first one containing 'lat' and the first one containing 'lon'
    are selected.

    Returns
    -------
    lat, lon : the two coordinate objects looked up in ``ds.coords``
    dims : list
        ``[lat_name, lon_name]``

    Raises
    ------
    IndexError
        If no remaining coordinate name contains 'lat' (or 'lon').
    """
    candidates = []
    for coord_name in ds.coords.keys():
        if 'bnds' in coord_name or 'vert' in coord_name:
            continue
        candidates.append(coord_name)

    lat_name = [coord_name for coord_name in candidates if 'lat' in coord_name][0]
    lon_name = [coord_name for coord_name in candidates if 'lon' in coord_name][0]

    dims = [lat_name, lon_name]
    return ds.coords.get(lat_name), ds.coords.get(lon_name), dims

def get_area(ds, df):
    """Return the horizontal weights used to area-average the dataset's variable.

    If the catalogue ``df`` lists an ``areacell<realm>`` store with the same
    ``source_id`` and ``grid_label`` as ``ds``, that field is opened and used.
    Otherwise cos(latitude) weights are built from the dataset's own latitude
    coordinate.

    Parameters
    ----------
    ds : dataset exposing ``variable_id``, ``table_id``, ``source_id``,
        ``grid_label`` and ``time``, plus ``get`` for variable lookup.
    df : pandas.DataFrame
        Catalogue with ``variable_id``, ``source_id``, ``grid_label`` and
        ``zstore`` columns.

    Returns
    -------
    area : weights to multiply the variable by
    dims : dimension names to sum the weighted variable over
    total_area : sum of the weights over one horizontal field
    """
    var = ds.get(ds.variable_id)
    # The first letter of the table id selects the area variable,
    # e.g. 'Amon' -> 'areacella', 'Omon' -> 'areacello'.
    realm = ds.table_id[0].lower()
    area_name = "areacell" + realm
    lat, lon, dims = get_dims(ds)

    df_area = df[(df.variable_id == area_name) &
                 (df.source_id == ds.source_id) &
                 (df.grid_label == ds.grid_label)]

    if len(df_area) == 0:
        if len(lat.data) > 2000:
            # NOTE(review): presumably an unstructured grid whose single
            # horizontal dimension is named "ncells" — confirm.
            area = np.cos(lat * np.pi / 180)
            dims = ["ncells"]
            # Normalise by the sum of the weights (previously this summed the
            # latitudes themselves, which is not a weight total).
            total_area = area.sum()
        elif np.shape(lat) == np.shape(var)[1:]:
            # Latitude already has the shape of one horizontal field of `var`.
            area = np.cos(lat.data * np.pi / 180)
            total_area = area.sum()
            dims = ds.get(dims[0]).dims
        else:
            # 1-D latitude and longitude: expand cos(lat) to (time, lat, lon).
            # A broadcast view has the same shape and values as a meshgrid
            # result without allocating the full array (and without building
            # meshgrids of the time and longitude axes, which were never used).
            cos_lat = np.cos(np.asarray(lat.data).ravel() * np.pi / 180)
            shape = (np.size(ds.time), cos_lat.size, np.size(lon))
            area = np.broadcast_to(cos_lat[np.newaxis, :, np.newaxis], shape)
            total_area = area[0, :, :].sum()
    else:
        ds_area = xr.open_zarr(gcs.get_mapper(df_area.zstore.values[0]), consolidated=True)
        area = ds_area.get(area_name)
        total_area = area.sum(area.dims)
        dims = area.dims

    return area, dims, total_area

def avg_var(ds, df):
    """Area-weighted mean of the dataset's variable over its horizontal dimensions.

    The weights, the dimensions to reduce over and the weight total come from
    ``get_area(ds, df)``. The variable named by ``ds.variable_id`` is multiplied
    by the weights, summed over those dimensions and divided by the total.
    Prints 'failed' if the result is None; the result is returned either way.
    """
    weights, reduce_dims, weight_total = get_area(ds, df)
    field = ds.get(ds.variable_id)

    weighted_mean = (field * weights).sum(dim=reduce_dims) / weight_total

    if weighted_mean is None:
        print('failed')
    return weighted_mean



distributed.scheduler - INFO - Register worker <Worker 'tcp://10.48.180.57:33321', name: 0, memory: 0, processing: 0>
distributed.scheduler - INFO - Starting worker compute stream, tcp://10.48.180.57:33321
distributed.core - INFO - Starting established connection


Load all files available on Pangeo servers
=======

In [None]:
# Maximum number of time steps kept per run.
# NOTE(review): presumably monthly data, i.e. 150 and 200 years — confirm.
MAX_STEPS_LONG_RUNS = 1800   # piControl and 1pctCO2
MAX_STEPS_DEFAULT = 2400     # every other experiment
OUTPUT_DIR = 'data'

# np.save does not create missing directories.
os.makedirs(OUTPUT_DIR, exist_ok=True)

for num in tqdm(range(len(dfs))):
    s = dfs.iloc[num]
    name = '_'.join([s.source_id, s.experiment_id, s.member_id, s.variable_id])

    ds = load_data(s)
    area_name = 'areacell' + ds.table_id[0].lower()
    df_area = df[(df.variable_id == area_name) &
                 (df.source_id == ds.source_id) &
                 (df.grid_label == ds.grid_label)]
    # Skip datasets for which the catalogue has a matching cell-area store.
    # (Previously written as `len(values != 0)`, which only worked because the
    # length of the comparison array equals the number of matches.)
    # NOTE(review): this means only datasets WITHOUT a published area field are
    # processed here — confirm that is intended.
    if len(df_area) != 0:
        continue

    print(str(num)+" : "+name)

    # Truncate by position; slicing past the end of a shorter run keeps all of it.
    if ds.experiment_id in ('piControl', '1pctCO2'):
        ds = ds.isel(time=slice(0, MAX_STEPS_LONG_RUNS))
    else:
        ds = ds.isel(time=slice(0, MAX_STEPS_DEFAULT))
    m = avg_var(ds, df)

    if m is not None:
        # Row 0: averaged values; row 1: the matching time stamps as datetime64.
        values = m.values[:]
        times = np.array([np.datetime64(t) for t in m.time.values])[:]
        np.save(os.path.join(OUTPUT_DIR, name), np.array([values, times]))

Get all files not available on Pangeo's servers
=========

In [6]:
from collections import defaultdict

# Read both JSON inputs with context managers so the file handles are closed.
with open('manual_loads.txt', 'r') as fh:
    manual = json.load(fh)
with open('allfiles.txt', 'r') as fh:
    allfiles = json.load(fh)

# Group the file paths by a dataset key built from path components
# 9, 11, 10, 12 and 13 (in that order), joined with '_'.
mapping = defaultdict(list)
for a in allfiles:
    parts = a.split('/')
    key = '_'.join([parts[9], parts[11], parts[10]] + parts[12:14])
    mapping[key].append(a)

# Pair every manually-loaded dataset with its files, fewest files first.
# Datasets with no matching files get an empty list, so the sort key cannot
# fail on None.
to_load = [(m, mapping.get(m, [])) for m in manual]
to_load = sorted(to_load, key=lambda x: len(x[1]))

Get all info I need to download these by hand.
=======

In [24]:
# One row per manually-loaded dataset: split each '_'-joined key into its fields.
esgf = pd.DataFrame([m.split('_') for m in manual])
esgf = esgf.rename(columns={0:'source_id',1:'member_id',2:'experiment_id',3:'table_id',4:'variable_id'})
# Display the frame (`esgf.t` is not a DataFrame attribute and raises AttributeError).
esgf

Unnamed: 0,source_id,member_id,experiment_id,table_id,variable_id
0,FGOALS-f3-L,r1i1p1f1,piControl,Omon,tos
1,FGOALS-f3-L,r1i1p1f1,piControl,Amon,ts
2,CNRM-ESM2-1,r2i1p1f2,abrupt-4xCO2,Omon,mlotst
3,IPSL-CM6A-LR,r12i1p1f1,abrupt-4xCO2,Omon,mlotst
4,MRI-ESM2-0,r1i1p1f1,abrupt-4xCO2,Omon,mlotst
...,...,...,...,...,...
244,CNRM-ESM2-1,r1i1p1f2,1pctCO2,Omon,mlotst
245,CNRM-CM6-1,r3i1p1f2,abrupt-4xCO2,Omon,mlotst
246,AWI-ESM-1-1-LR,r1i1p1f1,piControl,Amon,ts
247,NorESM2-MM,r1i1p1f1,1pctCO2,Omon,mlotst


In [20]:
# Number of dataset keys loaded from manual_loads.txt.
len(manual)

249