# Build intake catalog for CORDEX datasets

date: 2025.07.12

In [1]:
import pandas as pd
import numpy as np

In [2]:
## load manifest

df = pd.read_csv("../cordex/IPSL_manifest_C3S_CORDEX_20250709.txt.gz", names=["url"])
df

Unnamed: 0,url
0,https://data.mips.climate.copernicus.eu/thredd...
1,https://data.mips.climate.copernicus.eu/thredd...
2,https://data.mips.climate.copernicus.eu/thredd...
3,https://data.mips.climate.copernicus.eu/thredd...
4,https://data.mips.climate.copernicus.eu/thredd...
...,...
568400,https://data.mips.climate.copernicus.eu/thredd...
568401,https://data.mips.climate.copernicus.eu/thredd...
568402,https://data.mips.climate.copernicus.eu/thredd...
568403,https://data.mips.climate.copernicus.eu/thredd...


In [3]:
df.url[0]

'https://data.mips.climate.copernicus.eu/thredds/fileServer/esg_c3s-cordex/output/ARC-44/UQAM/ECMWF-ERAINT/evaluation/r1i1p1/UQAM-CRCM5/v1/fx/sftlf/v20200915/sftlf_ARC-44_ECMWF-ERAINT_evaluation_r1i1p1_UQAM-CRCM5_v1_fx.nc'

In [4]:
## build new dataframe

df_new = pd.DataFrame(columns = [
    'ds_id',
    'path',
    'size',
    'project',
    'domain',
    'institute',
    'driving_model',
    'experiment_id',
    'ensemble',
    'rcm_name',
    'rcm_version',
    'time_frequency',
    'variable',
    'version',   
    'start_time',
    'end_time',
    'bbox',
    'level',
    'url'
])
df_new

Unnamed: 0,ds_id,path,size,project,domain,institute,driving_model,experiment_id,ensemble,rcm_name,rcm_version,time_frequency,variable,version,start_time,end_time,bbox,level,url


In [5]:
df_new["url"] = df["url"]
df_new.head()

Unnamed: 0,ds_id,path,size,project,domain,institute,driving_model,experiment_id,ensemble,rcm_name,rcm_version,time_frequency,variable,version,start_time,end_time,bbox,level,url
0,,,,,,,,,,,,,,,,,,,https://data.mips.climate.copernicus.eu/thredd...
1,,,,,,,,,,,,,,,,,,,https://data.mips.climate.copernicus.eu/thredd...
2,,,,,,,,,,,,,,,,,,,https://data.mips.climate.copernicus.eu/thredd...
3,,,,,,,,,,,,,,,,,,,https://data.mips.climate.copernicus.eu/thredd...
4,,,,,,,,,,,,,,,,,,,https://data.mips.climate.copernicus.eu/thredd...


In [6]:
df_new["path"] = df_new["url"].str.replace("https://data.mips.climate.copernicus.eu/thredds/fileServer/esg_c3s-cordex/", "", regex=False)
df_new.head()

Unnamed: 0,ds_id,path,size,project,domain,institute,driving_model,experiment_id,ensemble,rcm_name,rcm_version,time_frequency,variable,version,start_time,end_time,bbox,level,url
0,,output/ARC-44/UQAM/ECMWF-ERAINT/evaluation/r1i...,,,,,,,,,,,,,,,,,https://data.mips.climate.copernicus.eu/thredd...
1,,output/ARC-44/UQAM/ECMWF-ERAINT/evaluation/r1i...,,,,,,,,,,,,,,,,,https://data.mips.climate.copernicus.eu/thredd...
2,,output/ARC-44/UQAM/ECMWF-ERAINT/evaluation/r1i...,,,,,,,,,,,,,,,,,https://data.mips.climate.copernicus.eu/thredd...
3,,output/ARC-44/UQAM/ECMWF-ERAINT/evaluation/r1i...,,,,,,,,,,,,,,,,,https://data.mips.climate.copernicus.eu/thredd...
4,,output/ARC-44/UQAM/ECMWF-ERAINT/evaluation/r1i...,,,,,,,,,,,,,,,,,https://data.mips.climate.copernicus.eu/thredd...


In [7]:
df_new.path[0]

'output/ARC-44/UQAM/ECMWF-ERAINT/evaluation/r1i1p1/UQAM-CRCM5/v1/fx/sftlf/v20200915/sftlf_ARC-44_ECMWF-ERAINT_evaluation_r1i1p1_UQAM-CRCM5_v1_fx.nc'

In [8]:
# 'c3s-cordex.output.EUR-11.MOHC.MOHC-HadGEM2-ES.rcp85.r1i1p1.MOHC-HadREM3-GA7-05.v1.mon.tas.v20200330'


df_new["ds_id"] = df_new["path"].apply(lambda x: 'c3s-cordex.' + '.'.join(x.split('/')[0:-1]))
df_new.head()

Unnamed: 0,ds_id,path,size,project,domain,institute,driving_model,experiment_id,ensemble,rcm_name,rcm_version,time_frequency,variable,version,start_time,end_time,bbox,level,url
0,c3s-cordex.output.ARC-44.UQAM.ECMWF-ERAINT.eva...,output/ARC-44/UQAM/ECMWF-ERAINT/evaluation/r1i...,,,,,,,,,,,,,,,,,https://data.mips.climate.copernicus.eu/thredd...
1,c3s-cordex.output.ARC-44.UQAM.ECMWF-ERAINT.eva...,output/ARC-44/UQAM/ECMWF-ERAINT/evaluation/r1i...,,,,,,,,,,,,,,,,,https://data.mips.climate.copernicus.eu/thredd...
2,c3s-cordex.output.ARC-44.UQAM.ECMWF-ERAINT.eva...,output/ARC-44/UQAM/ECMWF-ERAINT/evaluation/r1i...,,,,,,,,,,,,,,,,,https://data.mips.climate.copernicus.eu/thredd...
3,c3s-cordex.output.ARC-44.UQAM.ECMWF-ERAINT.eva...,output/ARC-44/UQAM/ECMWF-ERAINT/evaluation/r1i...,,,,,,,,,,,,,,,,,https://data.mips.climate.copernicus.eu/thredd...
4,c3s-cordex.output.ARC-44.UQAM.ECMWF-ERAINT.eva...,output/ARC-44/UQAM/ECMWF-ERAINT/evaluation/r1i...,,,,,,,,,,,,,,,,,https://data.mips.climate.copernicus.eu/thredd...


In [9]:
df_new.ds_id[0]

'c3s-cordex.output.ARC-44.UQAM.ECMWF-ERAINT.evaluation.r1i1p1.UQAM-CRCM5.v1.fx.sftlf.v20200915'

In [10]:

def get_project(ds_id):
    value = ds_id.split('.')[0]
    return value

def get_domain(ds_id):
    value = ds_id.split(".")[2]
    return value

def get_institute(ds_id):
    value = ds_id.split(".")[3]
    return value

def get_driving_model(ds_id):
    value = ds_id.split(".")[4]
    return value

def get_experiment_id(ds_id):
    value = ds_id.split(".")[5]
    return value

def get_ensemble(ds_id):
    value = ds_id.split(".")[6]
    return value

def get_rcm_name(ds_id):
    value = ds_id.split(".")[7]
    return value

def get_rcm_version(ds_id):
    value = ds_id.split(".")[8]
    return value

def get_time_frequency(ds_id):
    value = ds_id.split(".")[9]
    return value

def get_variable(ds_id):
    value = ds_id.split(".")[10]
    return value

def get_version(ds_id):
    value = ds_id.split('.')[11]
    return value
    
def get_start_time(path):
    parts = path.split("_")
    part = parts[-2]
    start = part.split("-")[0]
    if "mon" in parts:
        start_time = f"{start[0:4]}-{start[4:6]}-01T00:00:00"
    else:
        start_time = f"{start[0:4]}-01-01T00:00:00"
    return start_time

def get_end_time(path):
    # ['fx', 'day', 'mon', '6hr', '3hr', 'sem']
    # print(path)
    filename = path.split("/")[-1]
    # print(filename)
    name = filename.split(".nc")[0]
    # print(name)
    parts = name.split("_")
    # print(parts)
    
    if "fx" in parts:
        end_time = ""
        return end_time
    
    date_part = parts[-1]
    end = date_part.split("-")[-1]
    if "day" in parts:
        end_time = f"{end[0:4]}-{end[4:6]}-{end[6:7]}T00:00:00"
    elif "mon" in parts:
        end_time = f"{end[0:4]}-{end[4:6]}-31T00:00:00"
    else:
        end_time = f"{end[0:4]}-12-31T00:00:00"
    return end_time



In [11]:
df_new['project'] = df_new["ds_id"].apply(get_project)
df_new['domain'] = df_new["ds_id"].apply(get_domain)
df_new['institute'] = df_new["ds_id"].apply(get_institute)
df_new['driving_model'] = df_new["ds_id"].apply(get_driving_model)
df_new['experiment_id'] = df_new["ds_id"].apply(get_experiment_id)
df_new['ensemble'] = df_new["ds_id"].apply(get_ensemble)
df_new['rcm_name'] = df_new["ds_id"].apply(get_rcm_name)
df_new['rcm_version'] = df_new["ds_id"].apply(get_rcm_version)
df_new['time_frequency'] = df_new["ds_id"].apply(get_time_frequency)
df_new['variable'] = df_new["ds_id"].apply(get_variable)
df_new['version'] = df_new['ds_id'].apply(get_version)
#df_new['start_time'] = df_new["path"].apply(get_start_time)
df_new['end_time'] = df_new["path"].apply(get_end_time)
df_new.head()

Unnamed: 0,ds_id,path,size,project,domain,institute,driving_model,experiment_id,ensemble,rcm_name,rcm_version,time_frequency,variable,version,start_time,end_time,bbox,level,url
0,c3s-cordex.output.ARC-44.UQAM.ECMWF-ERAINT.eva...,output/ARC-44/UQAM/ECMWF-ERAINT/evaluation/r1i...,,c3s-cordex,ARC-44,UQAM,ECMWF-ERAINT,evaluation,r1i1p1,UQAM-CRCM5,v1,fx,sftlf,v20200915,,,,,https://data.mips.climate.copernicus.eu/thredd...
1,c3s-cordex.output.ARC-44.UQAM.ECMWF-ERAINT.eva...,output/ARC-44/UQAM/ECMWF-ERAINT/evaluation/r1i...,,c3s-cordex,ARC-44,UQAM,ECMWF-ERAINT,evaluation,r1i1p1,UQAM-CRCM5,v1,fx,orog,v20200915,,,,,https://data.mips.climate.copernicus.eu/thredd...
2,c3s-cordex.output.ARC-44.UQAM.ECMWF-ERAINT.eva...,output/ARC-44/UQAM/ECMWF-ERAINT/evaluation/r1i...,,c3s-cordex,ARC-44,UQAM,ECMWF-ERAINT,evaluation,r1i1p1,UQAM-CRCM5,v1,day,clt,v20200915,,1985-12-3T00:00:00,,,https://data.mips.climate.copernicus.eu/thredd...
3,c3s-cordex.output.ARC-44.UQAM.ECMWF-ERAINT.eva...,output/ARC-44/UQAM/ECMWF-ERAINT/evaluation/r1i...,,c3s-cordex,ARC-44,UQAM,ECMWF-ERAINT,evaluation,r1i1p1,UQAM-CRCM5,v1,day,clt,v20200915,,1995-12-3T00:00:00,,,https://data.mips.climate.copernicus.eu/thredd...
4,c3s-cordex.output.ARC-44.UQAM.ECMWF-ERAINT.eva...,output/ARC-44/UQAM/ECMWF-ERAINT/evaluation/r1i...,,c3s-cordex,ARC-44,UQAM,ECMWF-ERAINT,evaluation,r1i1p1,UQAM-CRCM5,v1,day,clt,v20200915,,2005-12-3T00:00:00,,,https://data.mips.climate.copernicus.eu/thredd...


## show values

In [12]:
df_new.nunique()

ds_id              23488
path              568405
size                   0
project                1
domain                27
institute             39
driving_model         17
experiment_id          5
ensemble               6
rcm_name              56
rcm_version           11
time_frequency         6
variable              25
version              144
start_time             0
end_time             470
bbox                   0
level                  0
url               568405
dtype: int64

In [13]:
versions = list(df_new["version"].unique())
versions.sort()
versions

['v20131026',
 'v20131119',
 'v20140218',
 'v20140313',
 'v20140319',
 'v20140324',
 'v20140402',
 'v20140515',
 'v20140620',
 'v20140826',
 'v20140917',
 'v20141024',
 'v20141216',
 'v20150114',
 'v20150127',
 'v20150320',
 'v20150421',
 'v20160419',
 'v20160525',
 'v20160620',
 'v20160704',
 'v20160705',
 'v20160803',
 'v20160830',
 'v20170206',
 'v20170208',
 'v20170329',
 'v20170410',
 'v20170412',
 'v20170504',
 'v20170523',
 'v20170524',
 'v20170606',
 'v20170713',
 'v20171121',
 'v20180212',
 'v20180220',
 'v20180226',
 'v20180509',
 'v20180707',
 'v20180710',
 'v20180717',
 'v20180813',
 'v20180820',
 'v20181107',
 'v20181126',
 'v20181203',
 'v20181212',
 'v20190103',
 'v20190108',
 'v20190115',
 'v20190131',
 'v20190208',
 'v20190212',
 'v20190304',
 'v20190412',
 'v20190415',
 'v20190418',
 'v20190419',
 'v20190502',
 'v20190509',
 'v20190510',
 'v20190512',
 'v20190522',
 'v20190619',
 'v20190620',
 'v20190625',
 'v20190702',
 'v20190805',
 'v20190806',
 'v20190809',
 'v201

In [14]:
list(df_new.project.unique())

['c3s-cordex']

In [15]:
list(df_new.domain.unique())

['ARC-44',
 'ARC-22',
 'SAM-22',
 'NAM-44',
 'AUS-44',
 'ANT-44',
 'MED-44',
 'AFR-44',
 'MNA-44',
 'EAS-22',
 'MED-11',
 'AUS-44i',
 'WAS-22',
 'NAM-22',
 'CAS-44',
 'WAS-44',
 'EUR-11',
 'CAS-22',
 'SAM-20',
 'SEA-22',
 'AUS-22',
 'CAM-44',
 'CAM-22',
 'SAM-44',
 'MNA-22',
 'EAS-44',
 'AFR-22']

In [16]:
list(df_new.driving_model.unique())

['ECMWF-ERAINT',
 'CCCma-CanESM2',
 'MPI-M-MPI-ESM-MR',
 'MPI-M-MPI-ESM-LR',
 'ICHEC-EC-EARTH',
 'NCC-NorESM1-M',
 'MOHC-HadGEM2-ES',
 'NOAA-GFDL-GFDL-ESM2M',
 'CSIRO-BOM-ACCESS1-0',
 'CSIRO-BOM-ACCESS1-3',
 'IPSL-IPSL-CM5A-MR',
 'CNRM-CERFACS-CNRM-CM5',
 'MIROC-MIROC5',
 'IPSL-IPSL-CM5A-LR',
 'NOAA-GFDL-GFDL-ESM2G',
 'CSIRO-QCCCE-CSIRO-Mk3-6-0',
 'NCAR-CCSM4']

In [17]:
list(df_new.experiment_id.unique())

['evaluation', 'rcp85', 'historical', 'rcp45', 'rcp26']

In [18]:
list(df_new.ensemble.unique())

['r1i1p1', 'r0i0p0', 'r12i1p1', 'r3i1p1', 'r2i1p1', 'r6i1p1']

In [19]:
list(df_new.rcm_name.unique())

['UQAM-CRCM5',
 'UQAM-CRCM5-SN',
 'BCCR-WRF331',
 'CCCma-CanRCM4',
 'MGO-RRCM',
 'ULg-MAR36',
 'SMHI-RCA4',
 'SMHI-RCA4-SN',
 'AWI-HIRHAM5',
 'DMI-HIRHAM5',
 'ICTP-RegCM4-7',
 'GERICS-REMO2015',
 'ISU-RegCM4',
 'NCAR-RegCM4',
 'NCAR-WRF',
 'UA-WRF',
 'UNSW-WRF360K',
 'UNSW-WRF360L',
 'UNSW-WRF360J',
 'CLMcom-CCLM4-8-17-CLM3-5',
 'KNMI-RACMO21P',
 'ULg-MAR311',
 'GUF-CCLM4-8-18',
 'LMD-LMDZ4NEMOMED8',
 'ICTP-RegCM4-3',
 'ELU-RegCM4-3',
 'CNRM-ALADIN52',
 'CMCC-CCLM4-8-19',
 'MPI-CSC-REMO2009',
 'GERICS-REMO2009',
 'CLMcom-CCLM4-8-17',
 'KNMI-RACMO22T',
 'BOUN-RegCM4-3',
 'CYI-WRF351',
 'ICTP-RegCM4-4',
 'CSIRO-CCAM-2008',
 'ORNL-RegCM4-7',
 'CLMcom-ETH-COSMO-crCLIM-v1-1',
 'OURANOS-CRCM5',
 'IITM-RegCM4-4',
 'MOHC-HadRM3P',
 'ICTP-RegCM4-6',
 'CLMcom-BTU-CCLM4-8-17',
 'IPSL-WRF381P',
 'CNRM-ALADIN63',
 'CNRM-ALADIN53',
 'UHOH-WRF361H',
 'KNMI-RACMO22E',
 'MOHC-HadREM3-GA7-05',
 'RMIB-UGent-ALARO-0',
 'INPE-Eta',
 'RU-CORE-RegCM4-3',
 'CLMcom-HZG-CCLM5-0-15',
 'UCAN-WRF341I',
 'CLMcom-CC

In [20]:
list(df_new.rcm_version.unique())

['v1', 'r2', 'v2', 'v0', 'v4-4-rc8', 'v3-5-1', 'v7', 'v4', 'v5', 'v1a', 'v3']

In [21]:
list(df_new.time_frequency.unique())

['fx', 'day', 'mon', '6hr', '3hr', 'sem']

In [22]:
list(df_new.variable.unique())

['sftlf',
 'orog',
 'clt',
 'uas',
 'rlds',
 'pr',
 'psl',
 'ps',
 'rsds',
 'tasmax',
 'huss',
 'vas',
 'tas',
 'tasmin',
 'sfcWind',
 'evspsbl',
 'hurs',
 'ua850',
 'rsus',
 'va850',
 'zg500',
 'mrro',
 'ua200',
 'va200',
 'ta200']

In [23]:
list(df_new.start_time.unique())

[nan]

In [24]:
list(df_new.end_time.unique())

['',
 '1985-12-3T00:00:00',
 '1995-12-3T00:00:00',
 '2005-12-3T00:00:00',
 '2010-12-3T00:00:00',
 '1980-12-3T00:00:00',
 '2000-12-3T00:00:00',
 '1990-12-3T00:00:00',
 '2014-12-3T00:00:00',
 '2015-12-3T00:00:00',
 '2070-12-3T00:00:00',
 '2045-12-3T00:00:00',
 '2085-12-3T00:00:00',
 '2060-12-3T00:00:00',
 '2025-12-3T00:00:00',
 '2100-12-3T00:00:00',
 '2040-12-3T00:00:00',
 '2075-12-3T00:00:00',
 '2055-12-3T00:00:00',
 '2050-12-3T00:00:00',
 '2095-12-3T00:00:00',
 '2020-12-3T00:00:00',
 '2030-12-3T00:00:00',
 '2065-12-3T00:00:00',
 '2090-12-3T00:00:00',
 '2035-12-3T00:00:00',
 '2080-12-3T00:00:00',
 '1960-12-3T00:00:00',
 '1965-12-3T00:00:00',
 '1950-12-3T00:00:00',
 '1970-12-3T00:00:00',
 '1955-12-3T00:00:00',
 '1975-12-3T00:00:00',
 '2009-12-3T00:00:00',
 '1987-12-3T00:00:00',
 '2008-12-3T00:00:00',
 '2006-12-3T00:00:00',
 '1981-12-3T00:00:00',
 '1997-12-3T00:00:00',
 '1984-12-3T00:00:00',
 '1991-12-3T00:00:00',
 '1996-12-3T00:00:00',
 '2003-12-3T00:00:00',
 '1993-12-3T00:00:00',
 '2001

## Write new catalog

In [25]:
from datetime import datetime

last_updated = datetime.now().utcnow()
version = last_updated.strftime('v%Y%m%d')
cat_name = f"c3s-cica-atlas-v02_{version}.csv.gz"
cat_path = f"../intake/catalogs/c3s-atlas/{cat_name}"

df_new.to_csv(cat_path, index=False, compression="gzip")

  last_updated = datetime.now().utcnow()
