# Build intake catalog for ATLAS v1 datasets

In [1]:
import calendar
from datetime import datetime, timezone

import numpy as np
import pandas as pd

In [2]:
## load manifest

# The manifest is read with a single explicitly named column, "url".
manifest_file = "../atlas_v1/manifest_cica_20231214.txt"
df = pd.read_csv(manifest_file, names=["url"])
df

Unnamed: 0,url
0,https://data.mips.copernicus-climate.eu/thredd...
1,https://data.mips.copernicus-climate.eu/thredd...
2,https://data.mips.copernicus-climate.eu/thredd...
3,https://data.mips.copernicus-climate.eu/thredd...
4,https://data.mips.copernicus-climate.eu/thredd...
...,...
407,https://data.mips.copernicus-climate.eu/thredd...
408,https://data.mips.copernicus-climate.eu/thredd...
409,https://data.mips.copernicus-climate.eu/thredd...
410,https://data.mips.copernicus-climate.eu/thredd...


In [3]:
# Show the first manifest URL in full (the table view above truncates it).
df["url"][0]

'https://data.mips.copernicus-climate.eu/thredds/fileServer/esg_c3s-ipcc-atlas/CMIP5/historical/pr_CMIP5_historical_mon_185001-200512.nc'

In [4]:
## build new dataframe

# Column layout of the catalog; the frame starts out with no rows.
catalog_columns = [
    'ds_id', 'path', 'size',
    'project', 'domain', 'experiment',
    'time_frequency', 'variable',
    'start_time', 'end_time',
    'bbox', 'level', 'url',
]
df_new = pd.DataFrame(columns=catalog_columns)
df_new

Unnamed: 0,ds_id,path,size,project,domain,experiment,time_frequency,variable,start_time,end_time,bbox,level,url


In [5]:
# Rewrite the dataset root in every manifest URL (ipcc-atlas -> cica-atlas).
old_root, new_root = "/esg_c3s-ipcc-atlas/", "/esg_c3s-cica-atlas/"
df_new["url"] = df["url"].str.replace(old_root, new_root, regex=False)
df_new.head()

Unnamed: 0,ds_id,path,size,project,domain,experiment,time_frequency,variable,start_time,end_time,bbox,level,url
0,,,,,,,,,,,,,https://data.mips.copernicus-climate.eu/thredd...
1,,,,,,,,,,,,,https://data.mips.copernicus-climate.eu/thredd...
2,,,,,,,,,,,,,https://data.mips.copernicus-climate.eu/thredd...
3,,,,,,,,,,,,,https://data.mips.copernicus-climate.eu/thredd...
4,,,,,,,,,,,,,https://data.mips.copernicus-climate.eu/thredd...


In [6]:
# "path" is the URL with the file-server prefix removed, i.e. relative to the dataset root.
url_prefix = "https://data.mips.copernicus-climate.eu/thredds/fileServer/esg_c3s-cica-atlas/"
df_new["path"] = df_new["url"].str.replace(url_prefix, "", regex=False)
df_new.head()

Unnamed: 0,ds_id,path,size,project,domain,experiment,time_frequency,variable,start_time,end_time,bbox,level,url
0,,CMIP5/historical/pr_CMIP5_historical_mon_18500...,,,,,,,,,,,https://data.mips.copernicus-climate.eu/thredd...
1,,CMIP5/historical/prsn_CMIP5_historical_mon_185...,,,,,,,,,,,https://data.mips.copernicus-climate.eu/thredd...
2,,CMIP5/historical/rx1day_CMIP5_historical_mon_1...,,,,,,,,,,,https://data.mips.copernicus-climate.eu/thredd...
3,,CMIP5/historical/rx5day_CMIP5_historical_mon_1...,,,,,,,,,,,https://data.mips.copernicus-climate.eu/thredd...
4,,CMIP5/historical/sfcwind_CMIP5_historical_mon_...,,,,,,,,,,,https://data.mips.copernicus-climate.eu/thredd...


In [7]:
# ds_id = "c3s-cica-atlas." followed by the file name's underscore-separated
# fields (all except the trailing time-range field), joined with dots.
df_new["ds_id"] = df_new["path"].apply(
    lambda p: 'c3s-cica-atlas.' + '.'.join(p.rsplit('/', 1)[-1].split('_')[:-1])
)
df_new.head()

Unnamed: 0,ds_id,path,size,project,domain,experiment,time_frequency,variable,start_time,end_time,bbox,level,url
0,c3s-cica-atlas.pr.CMIP5.historical.mon,CMIP5/historical/pr_CMIP5_historical_mon_18500...,,,,,,,,,,,https://data.mips.copernicus-climate.eu/thredd...
1,c3s-cica-atlas.prsn.CMIP5.historical.mon,CMIP5/historical/prsn_CMIP5_historical_mon_185...,,,,,,,,,,,https://data.mips.copernicus-climate.eu/thredd...
2,c3s-cica-atlas.rx1day.CMIP5.historical.mon,CMIP5/historical/rx1day_CMIP5_historical_mon_1...,,,,,,,,,,,https://data.mips.copernicus-climate.eu/thredd...
3,c3s-cica-atlas.rx5day.CMIP5.historical.mon,CMIP5/historical/rx5day_CMIP5_historical_mon_1...,,,,,,,,,,,https://data.mips.copernicus-climate.eu/thredd...
4,c3s-cica-atlas.sfcwind.CMIP5.historical.mon,CMIP5/historical/sfcwind_CMIP5_historical_mon_...,,,,,,,,,,,https://data.mips.copernicus-climate.eu/thredd...


In [8]:
# Show the first dataset id in full.
df_new["ds_id"][0]

'c3s-cica-atlas.pr.CMIP5.historical.mon'

In [9]:
# Domain naming convention: "Europe" for E-OBS, "Europe (EURO-CORDEX)" for CORDEX-EUR-11 and "Global (mosaic)" for CORDEX-CORE; all other projects are "Global".

def get_project(ds_id):
    """Return the project name, i.e. the third dot-separated field of ``ds_id``."""
    fields = ds_id.split(".")
    return fields[2]

def get_domain(ds_id):
    """Return the spatial-domain label for the project named in ``ds_id``.

    The project is the third dot-separated field of ``ds_id``. Projects
    without an explicit entry in the table below are labelled "Global".
    """
    domain_by_project = {
        "E-OBS": "Europe",
        "CORDEX-EUR-11": "Europe (EURO-CORDEX)",
        "CORDEX-CORE": "Global (mosaic)",
    }
    return domain_by_project.get(ds_id.split(".")[2], "Global")

def get_experiment(ds_id):
    """Return the experiment field of ``ds_id``, or NaN when there is none.

    Only ids with exactly five dot-separated fields carry an experiment
    (the fourth field); any other field count yields ``np.nan``.
    """
    fields = ds_id.split('.')
    return fields[3] if len(fields) == 5 else np.nan
    

def get_start_time(path):
    """Return the start of the file's time range as an ISO-like timestamp.

    The time range is the last underscore-separated field of ``path``
    (``<start>-<end>...``). When one of the fields is exactly "mon", the
    month is taken from characters 5-6 of the start stamp; otherwise the
    month is fixed to January. The day is always the 1st.
    """
    fields = path.split("_")
    stamp = fields[-1].split("-")[0]
    year = stamp[0:4]
    month = stamp[4:6] if "mon" in fields else "01"
    return f"{year}-{month}-01T00:00:00"


def get_end_time(path):
    """Return the end of the file's time range as an ISO-like timestamp.

    The time range is the last underscore-separated field of ``path``
    (``<start>-<end>...``). For monthly files (one of the fields is exactly
    "mon") the result is the last calendar day of the end month; otherwise
    it is 31 December of the end year. The time of day is always 00:00:00.

    The day used to be hard-coded to 31 for monthly files, which produced
    invalid dates (e.g. "2020-06-31") for end months shorter than 31 days.

    Raises:
        ValueError: if a monthly file's end stamp does not contain a
            numeric year and a month in 1-12.
    """
    parts = path.split("_")
    end = parts[-1].split("-")[1]
    year = end[0:4]
    if "mon" in parts:
        month = end[4:6]
        # monthrange() returns (weekday of first day, number of days in month)
        # for the Gregorian calendar, including leap-year Februaries.
        last_day = calendar.monthrange(int(year), int(month))[1]
        end_time = f"{year}-{month}-{last_day:02d}T00:00:00"
    else:
        end_time = f"{year}-12-31T00:00:00"
    return end_time


In [10]:
# Fill the metadata columns: each entry is (target column, source column, extractor).
# Most fields are parsed from the dotted ds_id; the time range comes from the file path.
derived_columns = [
    ('project', "ds_id", get_project),
    ('domain', "ds_id", get_domain),
    ('experiment', "ds_id", get_experiment),
    ('time_frequency', "ds_id", lambda x: x.split('.')[-1]),
    ('variable', "ds_id", lambda x: x.split('.')[1]),
    ('start_time', "path", get_start_time),
    ('end_time', "path", get_end_time),
]
for target, source, extract in derived_columns:
    df_new[target] = df_new[source].apply(extract)
df_new

Unnamed: 0,ds_id,path,size,project,domain,experiment,time_frequency,variable,start_time,end_time,bbox,level,url
0,c3s-cica-atlas.pr.CMIP5.historical.mon,CMIP5/historical/pr_CMIP5_historical_mon_18500...,,CMIP5,Global,historical,mon,pr,1850-01-01T00:00:00,2005-12-31T00:00:00,,,https://data.mips.copernicus-climate.eu/thredd...
1,c3s-cica-atlas.prsn.CMIP5.historical.mon,CMIP5/historical/prsn_CMIP5_historical_mon_185...,,CMIP5,Global,historical,mon,prsn,1850-01-01T00:00:00,2005-12-31T00:00:00,,,https://data.mips.copernicus-climate.eu/thredd...
2,c3s-cica-atlas.rx1day.CMIP5.historical.mon,CMIP5/historical/rx1day_CMIP5_historical_mon_1...,,CMIP5,Global,historical,mon,rx1day,1850-01-01T00:00:00,2005-12-31T00:00:00,,,https://data.mips.copernicus-climate.eu/thredd...
3,c3s-cica-atlas.rx5day.CMIP5.historical.mon,CMIP5/historical/rx5day_CMIP5_historical_mon_1...,,CMIP5,Global,historical,mon,rx5day,1850-01-01T00:00:00,2005-12-31T00:00:00,,,https://data.mips.copernicus-climate.eu/thredd...
4,c3s-cica-atlas.sfcwind.CMIP5.historical.mon,CMIP5/historical/sfcwind_CMIP5_historical_mon_...,,CMIP5,Global,historical,mon,sfcwind,1850-01-01T00:00:00,2005-12-31T00:00:00,,,https://data.mips.copernicus-climate.eu/thredd...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
407,c3s-cica-atlas.tx40.ERA5.mon,ERA5/tx40_ERA5_mon_194001-202212.nc,,ERA5,Global,,mon,tx40,1940-01-01T00:00:00,2022-12-31T00:00:00,,,https://data.mips.copernicus-climate.eu/thredd...
408,c3s-cica-atlas.txx.ERA5.mon,ERA5/txx_ERA5_mon_194001-202212.nc,,ERA5,Global,,mon,txx,1940-01-01T00:00:00,2022-12-31T00:00:00,,,https://data.mips.copernicus-climate.eu/thredd...
409,c3s-cica-atlas.tx.ERA5.mon,ERA5/tx_ERA5_mon_194001-202212.nc,,ERA5,Global,,mon,tx,1940-01-01T00:00:00,2022-12-31T00:00:00,,,https://data.mips.copernicus-climate.eu/thredd...
410,c3s-cica-atlas.siconc.ORAS5.mon,ORAS5/siconc_ORAS5_mon_195801-201412.nc,,ORAS5,Global,,mon,siconc,1958-01-01T00:00:00,2014-12-31T00:00:00,,,https://data.mips.copernicus-climate.eu/thredd...


## Show values

In [11]:
# Count distinct non-null values per column; columns that are still empty (all NaN) report 0.
df_new.nunique()

ds_id             412
path              412
size                0
project             8
domain              4
experiment          8
time_frequency      2
variable           30
start_time          7
end_time            5
bbox                0
level               0
url               412
dtype: int64

In [12]:
# Distinct project names, in order of first appearance.
df_new["project"].unique().tolist()

['CMIP5',
 'CMIP6',
 'CORDEX-CORE',
 'CORDEX-EUR-11',
 'E-OBS',
 'ERA5',
 'ERA5-Land',
 'ORAS5']

In [13]:
# Distinct domain labels, in order of first appearance.
df_new["domain"].unique().tolist()

['Global', 'Global (mosaic)', 'Europe (EURO-CORDEX)', 'Europe']

In [14]:
# Distinct experiments, in order of first appearance (NaN marks datasets without one).
df_new["experiment"].unique().tolist()

['historical',
 'rcp26',
 'rcp45',
 'rcp85',
 'ssp126',
 'ssp245',
 'ssp370',
 'ssp585',
 nan]

In [15]:
# Distinct time frequencies, in order of first appearance.
df_new["time_frequency"].unique().tolist()

['mon', 'yr']

In [16]:
# Distinct variable names, in order of first appearance.
df_new["variable"].unique().tolist()

['pr',
 'prsn',
 'rx1day',
 'rx5day',
 'sfcwind',
 't',
 'tn',
 'tnn',
 'tx35',
 'tx',
 'tx40',
 'txx',
 'cd',
 'cdd',
 'clt',
 'evspsbl',
 'fd',
 'hd',
 'huss',
 'mrro',
 'mrsos',
 'psl',
 'rlds',
 'rsds',
 'siconc',
 'spei6',
 'spi6',
 'sst',
 'tx35bals',
 'tx40bals']

In [17]:
# Distinct start timestamps, in order of first appearance.
df_new["start_time"].unique().tolist()

['1850-01-01T00:00:00',
 '2006-01-01T00:00:00',
 '2015-01-01T00:00:00',
 '1970-01-01T00:00:00',
 '1950-01-01T00:00:00',
 '1940-01-01T00:00:00',
 '1958-01-01T00:00:00']

In [18]:
# Distinct end timestamps, in order of first appearance.
df_new["end_time"].unique().tolist()

['2005-12-31T00:00:00',
 '2100-12-31T00:00:00',
 '2014-12-31T00:00:00',
 '2021-12-31T00:00:00',
 '2022-12-31T00:00:00']

## Write new catalog

In [19]:
# Version the catalog by the current UTC date.
# datetime.now(timezone.utc) replaces datetime.now().utcnow(): utcnow() is a
# classmethod, so the now() result was discarded, and utcnow() is deprecated
# since Python 3.12. `datetime` and `timezone` are imported in the top import cell.
last_updated = datetime.now(timezone.utc)
version = last_updated.strftime('v%Y%m%d')
cat_name = f"c3s-cica-atlas_{version}.csv.gz"
cat_path = f"../intake/catalogs/c3s-atlas/{cat_name}"

# Write the catalog as a gzip-compressed CSV, without the row index.
df_new.to_csv(cat_path, index=False, compression="gzip")