# Build intake catalog for ATLAS v2 datasets

In [1]:
import pandas as pd
import numpy as np

In [2]:
## load manifest

df = pd.read_csv("../atlas_v2/manifest-atlas-v2_20250317.txt", names=["url"])
df

Unnamed: 0,url
0,https://data.mips.copernicus-climate.eu/thredd...
1,https://data.mips.copernicus-climate.eu/thredd...
2,https://data.mips.copernicus-climate.eu/thredd...
3,https://data.mips.copernicus-climate.eu/thredd...
4,https://data.mips.copernicus-climate.eu/thredd...
...,...
1013,https://data.mips.copernicus-climate.eu/thredd...
1014,https://data.mips.copernicus-climate.eu/thredd...
1015,https://data.mips.copernicus-climate.eu/thredd...
1016,https://data.mips.copernicus-climate.eu/thredd...


In [3]:
df.url[0]

'https://data.mips.copernicus-climate.eu/thredds/fileServer/esg_c3s-cica-atlas/v02/CMIP6/historical/tr_CMIP6_historical_mon_185001-201412.nc'

In [4]:
## build new dataframe

df_new = pd.DataFrame(columns = [
    'ds_id',
    'version',
    'path',
    'size',
    'project',
    'domain',
    'experiment',
    'time_frequency',
    'variable',
    'start_time',
    'end_time',
    'bbox',
    'level',
    'url',
])
df_new

Unnamed: 0,ds_id,version,path,size,project,domain,experiment,time_frequency,variable,start_time,end_time,bbox,level,url


In [5]:
df_new["url"] = df["url"].str.replace("/esg_c3s-cica-atlas/v02", "/esg_c3s-cica-atlas/v02", regex=False)
df_new.head()

Unnamed: 0,ds_id,version,path,size,project,domain,experiment,time_frequency,variable,start_time,end_time,bbox,level,url
0,,,,,,,,,,,,,,https://data.mips.copernicus-climate.eu/thredd...
1,,,,,,,,,,,,,,https://data.mips.copernicus-climate.eu/thredd...
2,,,,,,,,,,,,,,https://data.mips.copernicus-climate.eu/thredd...
3,,,,,,,,,,,,,,https://data.mips.copernicus-climate.eu/thredd...
4,,,,,,,,,,,,,,https://data.mips.copernicus-climate.eu/thredd...


In [6]:
df_new["path"] = df_new["url"].str.replace("https://data.mips.copernicus-climate.eu/thredds/fileServer/esg_c3s-cica-atlas/v02/", "", regex=False)
df_new.head()

Unnamed: 0,ds_id,version,path,size,project,domain,experiment,time_frequency,variable,start_time,end_time,bbox,level,url
0,,,CMIP6/historical/tr_CMIP6_historical_mon_18500...,,,,,,,,,,,https://data.mips.copernicus-climate.eu/thredd...
1,,,CMIP6/historical/r01_CMIP6_historical_mon_1850...,,,,,,,,,,,https://data.mips.copernicus-climate.eu/thredd...
2,,,CMIP6/historical/fdbals_CMIP6_historical_mon_1...,,,,,,,,,,,https://data.mips.copernicus-climate.eu/thredd...
3,,,CMIP6/historical/sst_CMIP6_historical_mon_1850...,,,,,,,,,,,https://data.mips.copernicus-climate.eu/thredd...
4,,,CMIP6/historical/spei6_CMIP6_historical_mon_18...,,,,,,,,,,,https://data.mips.copernicus-climate.eu/thredd...


In [7]:
version = "v2"

df_new["ds_id"] = df_new["path"].apply(lambda x: 'c3s-cica-atlas.' + '.'.join(x.split('/')[-1].split('_')[0:-1]) + '.' + version)
df_new.head()

Unnamed: 0,ds_id,version,path,size,project,domain,experiment,time_frequency,variable,start_time,end_time,bbox,level,url
0,c3s-cica-atlas.tr.CMIP6.historical.mon.v2,,CMIP6/historical/tr_CMIP6_historical_mon_18500...,,,,,,,,,,,https://data.mips.copernicus-climate.eu/thredd...
1,c3s-cica-atlas.r01.CMIP6.historical.mon.v2,,CMIP6/historical/r01_CMIP6_historical_mon_1850...,,,,,,,,,,,https://data.mips.copernicus-climate.eu/thredd...
2,c3s-cica-atlas.fdbals.CMIP6.historical.mon.v2,,CMIP6/historical/fdbals_CMIP6_historical_mon_1...,,,,,,,,,,,https://data.mips.copernicus-climate.eu/thredd...
3,c3s-cica-atlas.sst.CMIP6.historical.mon.v2,,CMIP6/historical/sst_CMIP6_historical_mon_1850...,,,,,,,,,,,https://data.mips.copernicus-climate.eu/thredd...
4,c3s-cica-atlas.spei6.CMIP6.historical.mon.v2,,CMIP6/historical/spei6_CMIP6_historical_mon_18...,,,,,,,,,,,https://data.mips.copernicus-climate.eu/thredd...


In [8]:
df_new.ds_id[0]

'c3s-cica-atlas.tr.CMIP6.historical.mon.v2'

In [9]:
# Let’s agree on “Europe” for E-OBS, “Europe (EURO-CORDEX)” for CORDEX-EUR-11 and “Global (mosaic)” for CODEX-CORE (the others are all “Global”).

def get_project(ds_id):
    project = ds_id.split(".")[2]
    # project = project.split("-")[0]
    return project

def get_domain(ds_id):
    project = ds_id.split(".")[2]
    if project == "E-OBS":
        domain = "Europe"
    elif project == "CORDEX-EUR-11":
        domain = "Europe (EURO-CORDEX)"
    elif project == "CORDEX-CORE":
        domain = "Global (mosaic)"
    else:
        domain = "Global"
    return domain

def get_experiment(ds_id):
    parts = ds_id.split('.')
    if len(parts) == 6:
        exp = parts[3]
    else:
        exp = np.nan
    return exp
    

def get_start_time(path):
    parts = path.split("_")
    part = parts[-1]
    start = part.split("-")[0]
    if "mon" in parts:
        start_time = f"{start[0:4]}-{start[4:6]}-01T00:00:00"
    else:
        start_time = f"{start[0:4]}-01-01T00:00:00"
    return start_time


def get_end_time(path):
    parts = path.split("_")
    part = parts[-1]
    end = part.split("-")[1]
    if "mon" in parts:
        end_time = f"{end[0:4]}-{end[4:6]}-31T00:00:00"
    else:
        end_time = f"{end[0:4]}-12-31T00:00:00"
    return end_time

def get_version(ds_id):
    return "v1"

In [10]:
df_new['version'] = df_new['ds_id'].apply(get_version)
df_new['project'] = df_new["ds_id"].apply(get_project)
df_new['domain'] = df_new["ds_id"].apply(get_domain)
df_new['experiment'] = df_new["ds_id"].apply(get_experiment)
df_new['time_frequency'] = df_new["ds_id"].apply(lambda x: x.split('.')[-2])
df_new['variable'] = df_new["ds_id"].apply(lambda x: x.split('.')[1])
df_new['start_time'] = df_new["path"].apply(get_start_time)
df_new['end_time'] = df_new["path"].apply(get_end_time)
df_new

Unnamed: 0,ds_id,version,path,size,project,domain,experiment,time_frequency,variable,start_time,end_time,bbox,level,url
0,c3s-cica-atlas.tr.CMIP6.historical.mon.v2,v1,CMIP6/historical/tr_CMIP6_historical_mon_18500...,,CMIP6,Global,historical,mon,tr,1850-01-01T00:00:00,2014-12-31T00:00:00,,,https://data.mips.copernicus-climate.eu/thredd...
1,c3s-cica-atlas.r01.CMIP6.historical.mon.v2,v1,CMIP6/historical/r01_CMIP6_historical_mon_1850...,,CMIP6,Global,historical,mon,r01,1850-01-01T00:00:00,2014-12-31T00:00:00,,,https://data.mips.copernicus-climate.eu/thredd...
2,c3s-cica-atlas.fdbals.CMIP6.historical.mon.v2,v1,CMIP6/historical/fdbals_CMIP6_historical_mon_1...,,CMIP6,Global,historical,mon,fdbals,1850-01-01T00:00:00,2014-12-31T00:00:00,,,https://data.mips.copernicus-climate.eu/thredd...
3,c3s-cica-atlas.sst.CMIP6.historical.mon.v2,v1,CMIP6/historical/sst_CMIP6_historical_mon_1850...,,CMIP6,Global,historical,mon,sst,1850-01-01T00:00:00,2014-12-31T00:00:00,,,https://data.mips.copernicus-climate.eu/thredd...
4,c3s-cica-atlas.spei6.CMIP6.historical.mon.v2,v1,CMIP6/historical/spei6_CMIP6_historical_mon_18...,,CMIP6,Global,historical,mon,spei6,1850-01-01T00:00:00,2014-12-31T00:00:00,,,https://data.mips.copernicus-climate.eu/thredd...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1013,c3s-cica-atlas.tnn.BERKELEY.mon.v2,v1,BERKELEY/tnn_BERKELEY_mon_188101-201712.nc,,BERKELEY,Global,,mon,tnn,1881-01-01T00:00:00,2017-12-31T00:00:00,,,https://data.mips.copernicus-climate.eu/thredd...
1014,c3s-cica-atlas.t.BERKELEY.mon.v2,v1,BERKELEY/t_BERKELEY_mon_188101-201712.nc,,BERKELEY,Global,,mon,t,1881-01-01T00:00:00,2017-12-31T00:00:00,,,https://data.mips.copernicus-climate.eu/thredd...
1015,c3s-cica-atlas.tx35.BERKELEY.mon.v2,v1,BERKELEY/tx35_BERKELEY_mon_188101-201712.nc,,BERKELEY,Global,,mon,tx35,1881-01-01T00:00:00,2017-12-31T00:00:00,,,https://data.mips.copernicus-climate.eu/thredd...
1016,c3s-cica-atlas.dtr.BERKELEY.mon.v2,v1,BERKELEY/dtr_BERKELEY_mon_188101-201712.nc,,BERKELEY,Global,,mon,dtr,1881-01-01T00:00:00,2017-12-31T00:00:00,,,https://data.mips.copernicus-climate.eu/thredd...


## show values

In [11]:
df_new.nunique()

ds_id             1018
version              1
path              1018
size                 0
project              9
domain               4
experiment           9
time_frequency       2
variable            56
start_time           8
end_time             7
bbox                 0
level                0
url               1018
dtype: int64

In [12]:
list(df_new.version.unique())

['v1']

In [13]:
list(df_new.project.unique())

['CMIP6',
 'E-OBS',
 'CMIP5',
 'CERRA',
 'ERA5-Land',
 'CORDEX-CORE',
 'CORDEX-EUR-11',
 'CPC',
 'BERKELEY']

In [14]:
list(df_new.domain.unique())

['Global', 'Europe', 'Global (mosaic)', 'Europe (EURO-CORDEX)']

In [15]:
list(df_new.experiment.unique())

['historical',
 'ssp126',
 'ssp119',
 'ssp245',
 'ssp585',
 'ssp370',
 nan,
 'rcp45',
 'rcp85',
 'rcp26']

In [16]:
list(df_new.time_frequency.unique())

['mon', 'yr']

In [17]:
list(df_new.variable.unique())

['tr',
 'r01',
 'fdbals',
 'sst',
 'spei6',
 'rlds',
 'spi6',
 'rsds',
 'sdiibaisimip',
 'cddbaisimip',
 'sdii',
 'trbals',
 'dtr',
 'fd',
 'rx1day',
 'tx35',
 'clt',
 'tx35baisimip',
 'tx40bals',
 'dtrbals',
 't',
 'tx35bals',
 'cdd',
 'siconc',
 'psl',
 'tx40baisimip',
 'tx',
 'fdbaisimip',
 'mrsos',
 'prsn',
 'tn',
 'tx40',
 'huss',
 'r20baisimip',
 'hdbaisimip',
 'r10',
 'txx',
 'dtrbaisimip',
 'evspsbl',
 'trbaisimip',
 'sfcwind',
 'hdbals',
 'r01baisimip',
 'rx5day',
 'pet',
 'mrro',
 'r',
 'cdbaisimip',
 'tnn',
 'r10baisimip',
 'hd',
 'r20',
 'cdbals',
 'cd',
 'spei6fullperiod',
 'spi6fullperiod']

In [18]:
list(df_new.start_time.unique())

['1850-01-01T00:00:00',
 '2015-01-01T00:00:00',
 '1950-01-01T00:00:00',
 '2006-01-01T00:00:00',
 '1985-01-01T00:00:00',
 '1970-01-01T00:00:00',
 '1979-01-01T00:00:00',
 '1881-01-01T00:00:00']

In [19]:
list(df_new.end_time.unique())

['2014-12-31T00:00:00',
 '2100-12-31T00:00:00',
 '2021-12-31T00:00:00',
 '2005-12-31T00:00:00',
 '2023-12-31T00:00:00',
 '2020-12-31T00:00:00',
 '2017-12-31T00:00:00']

## Write new catalog

In [20]:
from datetime import datetime

last_updated = datetime.now().utcnow()
version = last_updated.strftime('v%Y%m%d')
cat_name = f"c3s-cica-atlas_v02_{version}.csv.gz"
cat_path = f"../intake/catalogs/c3s-atlas/{cat_name}"

df_new.to_csv(cat_path, index=False, compression="gzip")

  last_updated = datetime.now().utcnow()
