# Access to data in the cloud (GCS)

## Import modules and libraries

*First, let's make sure the Python env is correct to run this notebook*:

In [None]:
# Download the course helper module into a temporary directory, import it and
# run its environment check.
import os, sys, tempfile
import urllib.request  # `import urllib` alone does not guarantee that the `request` submodule is loaded

with tempfile.TemporaryDirectory() as tmpdirname:
    # Make the temporary directory importable so that `utils` can be found
    sys.path.append(tmpdirname)
    repo = "https://raw.githubusercontent.com/obidam/ds2-2026/main/"
    # URLs always use "/" separators: concatenate rather than os.path.join,
    # which would insert "\" on Windows. os.path.join stays for the local path.
    urllib.request.urlretrieve(repo + "utils.py",
                               os.path.join(tmpdirname, "utils.py"))
    from utils import check_up_env
    check_up_env()

*Then, import the usual suspects*:

In [None]:
import xarray as xr
from intake import open_catalog

import sys
import gcsfs
import xarray as xr
import intake
import pandas as pd

## Read data from Google Cloud Storage (gcsfs)

### Access and listing

In [None]:
# Anonymous, read-only access point to the cloud file system:
fs = gcsfs.GCSFileSystem(
    project="alert-ground-261008",
    token="anon",
    access="read_only",
)

# Show what the 'opendata_bdo2020' bucket contains:
fs.ls("opendata_bdo2020")

But data access with ``gcsfs`` is critically dependent on the GCS set-up. For instance, the following bucket does not allow its content to be listed:

In [None]:
# Try to list a bucket and, if the listing fails, show the type of the error
# instead of stopping the notebook.
fs2 = gcsfs.GCSFileSystem(project='alert-ground-261008', token='anon', access='read_only')
try:
    fs2.ls('data_bdo2020')
except Exception as error:
    # Catch Exception rather than using a bare `except:` so that
    # KeyboardInterrupt / SystemExit can still interrupt the cell.
    print(type(error))

On the other hand, some datasets may not be free and may use a requester-pays model. 
In this case, you would have to properly manage authentication:

In [None]:
# Anonymous access point on another project; a ValueError raised while
# listing the bucket is printed instead of stopping the notebook.
fs3 = gcsfs.GCSFileSystem(project="poised-honor-358", token="anon")
try:
    fs3.ls("sonific01")
except ValueError as err:
    print(err)

### Load data

In [None]:
# Open a Zarr store from the 'opendata_bdo2020' bucket, anonymously and read-only.
ds = xr.open_dataset(
    "gcs://opendata_bdo2020/EN.4.2.1.f.analysis.g10.zarr",
    engine="zarr",
    backend_kwargs=dict(
        storage_options=dict(
            project="alert-ground-261008",
            token="anon",
            access="read_only",
        )
    ),
)
print(ds)

In [None]:
# Open a second store from the same bucket; this one is opened with
# consolidated=False.
ds = xr.open_dataset(
    "gcs://opendata_bdo2020/GLOBAL_ARGO_SDL2000",
    engine="zarr",
    consolidated=False,
    backend_kwargs=dict(
        storage_options=dict(
            project="alert-ground-261008",
            token="anon",
            access="read_only",
        )
    ),
)

# print("Size of the dataset:", ds.nbytes/1e9,"Gb")
print(ds)

## Use intake catalog of data

The catalog also uses the gcsfs entry point, but with intake it's transparent to the user:

### Access and listing of the catalog

In [None]:
from intake import open_catalog

In [None]:
# Location of the course data catalog (a YAML file hosted on GitHub)
catalog_url = "https://raw.githubusercontent.com/obidam/ds2-2026/main/ds2_data_catalog.yml"
cat = open_catalog(catalog_url)

# Names of the entries found in the catalog
list(cat)

### Load data

In [None]:
# Load the 'en4' catalog entry through its read_chunked() method and print
# a text summary of the result.
# NOTE(review): the next cell uses to_dask() instead; presumably both return a
# lazily-loaded dataset, but confirm against the intake driver in use.
ds = cat['en4'].read_chunked()
print(ds)

In [None]:
# Load the 'sea_surface_height' catalog entry through its to_dask() method
# and print a text summary of the result (this rebinds `ds`).
ds  = cat["sea_surface_height"].to_dask()
print(ds)

# Pangeo data

https://github.com/pangeo-data/pangeo-datastore

https://catalog.pangeo.io/

## Explore catalog

In [None]:
from intake import open_catalog

# Open the master catalog of the pangeo-datastore repository
pangeo_cat = open_catalog(
    "https://raw.githubusercontent.com/pangeo-data/pangeo-datastore/master/intake-catalogs/master.yaml"
)

# Names of its top-level entries
list(pangeo_cat)

In [None]:
# List the entries of the `ocean` sub-catalog.
list(pangeo_cat.ocean)
# Other things to try (left commented out):
# print(list(pangeo_cat.atmosphere))
# print(list(pangeo_cat.hydro))
# pangeo_cat.walk(depth=5)

# CMIP6 data

In [None]:
# Let's open the CMIP catalogue (a CSV index of the consolidated Zarr stores):
df_full = pd.read_csv('https://storage.googleapis.com/cmip6/cmip6-zarr-consolidated-stores.csv')

# Preview 10 random rows; the fixed seed makes the preview identical on every
# "Restart & Run All".
df_full.sample(10, random_state=0)

In [None]:
# And make a simulation selection, written as one chain:
#   1. first query on df_full,
#   2. second query narrowing source_id / variable_id,
#   3. sort by member_id.
#
# Alternative first queries:
# df = df_full.query("activity_id=='CMIP' & table_id == 'Omon' & variable_id == 'thetao' & experiment_id == 'historical' & member_id == 'r1i1p1f1'")
# df = df_full.query('institution_id == "CNRM-CERFACS" & member_id=="r1i1p1f2" & source_id=="CNRM-CM6-1"')
# df = df_full.query("activity_id=='CMIP' & table_id == 'Omon' & variable_id == 'thetao' & experiment_id == 'abrupt-4xCO2'")
#
# Alternative second queries:
# df = df.query("source_id=='CNRM-CM6-1-HR' & variable_id=='thetao'") # Horizontal resolution up to 1/4 deg
# df = df.query("source_id=='CNRM-ESM2-1' & variable_id=='thetao'") # Horizontal resolution up to 1deg
#
# Alternative sort key:
# df = df.sort_values('version')
df = (
    df_full
    .query("activity_id=='CMIP' & table_id == 'Omon' & institution_id == 'CNRM-CERFACS' & experiment_id == 'historical'")
    .query("source_id=='CNRM-ESM2-1' & (variable_id=='thetao' | variable_id=='so')")  # Horizontal resolution up to 1deg
    .sort_values('member_id')
)
df

In [None]:
# get the path to a specific zarr store (the last one, index -1, from the dataframe above)
zstore = df.zstore.values[-1]
print(zstore)

# open it using xarray and zarr (consolidated metadata, anonymous read-only access)
ds = xr.open_dataset(zstore, consolidated=True, engine='zarr', 
                     backend_kwargs={"storage_options": { "token": 'anon',  'access':'read_only'}})
print(ds)

In [None]:
# Select the 'thetao' variable at the `lev` value nearest to 0.
# NOTE(review): named `sst`, so presumably the surface temperature; confirm
# the meaning and units of `lev` in `ds`.
sst = ds['thetao'].sel(lev=0, method='nearest')
sst

In [None]:
def open_cmip6(df_row):
    """Open the Zarr store referenced by one row of the CMIP6 catalogue.

    Parameters
    ----------
    df_row : pandas.Series or mapping
        One row of the catalogue dataframe; its ``'zstore'`` entry is the
        path of the store to open.

    Returns
    -------
    The object returned by ``xr.open_dataset`` for that store, opened with
    the zarr engine, consolidated metadata and anonymous read-only access.
    """
    # Read the path from the row that was passed in, not from the
    # notebook-level `df`, so that each call opens its own row's store.
    zstore = df_row["zstore"]

    # open it using xarray and zarr
    return xr.open_dataset(
        zstore,
        consolidated=True,
        engine='zarr',
        backend_kwargs={"storage_options": {"token": 'anon', 'access': 'read_only'}},
    )

# Call open_cmip6 with the first row of the selection, then report the size
# of the returned dataset (nbytes / 1e9) and display it.
ds = open_cmip6(df.iloc[0])
print("Size of the dataset:", ds.nbytes/1e9,"Gb")
ds

In [None]:
# Compute size of the df selection:
# Each row's dataset is opened inside the generator, so the notebook-level
# `ds` used by earlier cells is not overwritten. Byte counts are summed
# first and converted once.
total_size = sum(open_cmip6(row).nbytes for _, row in df.iterrows()) / 1e9  # Gb
print("Size of the selection of datasets:", total_size, "Gb")