# CMIP5 build intake catalog

Build the intake catalog for CMIP5 from the manifest files.

In [None]:
import pandas as pd

## Load variable description

In [None]:
import yaml
import re

In [None]:
# Merge the variable descriptions of all YAML files into one dictionary.
# Files are applied in order, so a variable defined in a later file
# overrides the entry from an earlier one.
variables = dict()

variable_files = [
    "../cmip5/monthly_single-level_variables.yml",
    "../cmip5/monthly_pressure-levels_variables.yml",
    "../cmip5/daily_single-level_variables.yml",
    "../cmip5/daily_pressure_level_variables.yml",
]

for variable_file in variable_files:
    # The context manager closes the file handle once it has been parsed.
    with open(variable_file) as stream:
        variables_ = yaml.safe_load(stream)['variables']
    variables.update(variables_)

variables

In [None]:
# Index the variable descriptions by the short name given in parentheses
# inside each variable key; keys without a parenthesised name are skipped.
# When a key contains several parenthesised words, the last one is used.
short_name_pattern = re.compile(r'\(([\w]+)\)')
lookup = {}
for long_name, attributes in variables.items():
    found = short_name_pattern.findall(long_name)
    if found:
        lookup[found[-1]] = {
            'variable_name': long_name,
            'units': attributes['units'],
            'description': attributes['description'],
        }

lookup

## Load manifest

In [None]:
# Every manifest file lists one URL per line and is named after its label.
manifest_labels = [
    "cmip5-monthly-single-level",
    "cmip5-monthly-latitude-depth",
    "cmip5-monthly-pressure-level",
    "cmip5-daily-single-level",
    "cmip5-daily-pressure-level",
    "cmip5-3hr-single-level",
]

manifest_frames = []
for manifest_label in manifest_labels:
    df_m = pd.read_csv(
        f"../cmip5/manifest_C3S-34a-Lot1_{manifest_label}_http_latest.txt.gz",
        names=["url"],
    )
    # Tag each URL with the manifest it came from.
    df_m["label"] = manifest_label
    manifest_frames.append(df_m)

# Concatenate once instead of repeatedly growing an initially empty frame.
df = pd.concat(manifest_frames, ignore_index=True)[["url", "label"]]

df

In [None]:
# Show the first manifest URL as a sample.
df.url[0]

## Build new dataframe


In [None]:
# Column layout of the catalog; the frame starts empty and the columns are
# filled in step by step below.
catalog_columns = [
    'drs_id',
    'path',
    'size',
    'project',
    'product',
    'institute',
    'model',
    'experiment',
    'time_frequency',
    'realm',
    'table',
    'ensemble',
    'variable',
    'version',
    'start_time',
    'end_time',
    'bbox',
    'level',
    'units',
    'variable_name',
    'description',
    'url',
    'label',
]
df_new = pd.DataFrame(columns=catalog_columns)
df_new

In [None]:
# Drop the THREDDS file-server prefix so that only the relative path remains.
thredds_prefix = "http://data.mips.copernicus-climate.eu/thredds/fileServer/esg_c3s-cmip5/"
url_column = df["url"]
df_new["path"] = url_column.str.replace(thredds_prefix, "", regex=False)
df_new.head()

In [None]:
# Example of the drs_id built in the next cell:
# c3s-cmip5.output1.MPI-M.MPI-ESM-LR.historical.mon.atmos.Amon.r1i1p1.tas.v20120315

In [None]:
# Build the drs_id from the directory part of each path (everything except the
# file name): join the directories with dots and prepend the project name.
df_new["drs_id"] = df_new["path"].map(
    lambda path: 'c3s-cmip5.' + '.'.join(path.split('/')[:-1])
)
df_new.head()

In [None]:
# Show the dot-separated components of the first drs_id.
df_new["drs_id"][0].split(".")

In [None]:
# Facet columns, listed in the order their values appear in a drs_id.
drs_facets = [
    'project',
    'product',
    'institute',
    'model',
    'experiment',
    'time_frequency',
    'realm',
    'table',
    'ensemble',
    'variable',
    'version',
]

# Split every drs_id only once and reuse the parts for all facet columns.
# A drs_id with too few components still raises IndexError.
drs_parts = df_new["drs_id"].apply(lambda drs_id: drs_id.split('.'))
for position, facet in enumerate(drs_facets):
    df_new[facet] = drs_parts.apply(lambda parts, i=position: parts[i])

# Attach the metadata from the variable lookup; variables that are not in the
# lookup get None.
for attribute in ('variable_name', 'units', 'description'):
    df_new[attribute] = df_new["variable"].apply(
        lambda name, key=attribute: lookup.get(name, {}).get(key)
    )

# Carry over the original URL and manifest label unchanged.
df_new['url'] = df["url"]
df_new['label'] = df["label"]
df_new.head()

In [None]:
from datetime import datetime
import calendar

def parse_time(filename):
    """Extract the time range encoded in a data file name.

    The range is read from the last ``_``-separated token of the file name,
    which is expected to look like ``<start>-<end>.nc``. Supported stamp
    layouts are ``YYYYMM``, ``YYYYMMDD``, ``YYYYMMDDHH`` and
    ``YYYYMMDDHHMM``. Start and end are parsed independently, so they may
    use different layouts.

    Stamps without a time of day are set to 12:00. A ``YYYYMM`` start is
    placed on the first day of the month, a ``YYYYMM`` end on the last day
    of the month.

    Args:
        filename: File name (without directories), e.g.
            ``tas_Amon_MPI-ESM-LR_historical_r1i1p1_185001-200512.nc``.

    Returns:
        A tuple ``(start_time, end_time)`` of ISO 8601 strings, or
        ``(None, None)`` if the last token contains no ``-``.

    Raises:
        ValueError: If the token is not exactly ``<start>-<end>``, a stamp
            has an unsupported length, or a stamp is not a valid date.
    """
    # Map the stamp length to its strptime layout.
    stamp_formats = {
        12: "%Y%m%d%H%M",
        10: "%Y%m%d%H",
        8: "%Y%m%d",
        6: "%Y%m",
    }

    def _parse_stamp(stamp, is_end):
        """Convert one stamp to a datetime, filling in the missing fields."""
        stamp_format = stamp_formats.get(len(stamp))
        if stamp_format is None:
            raise ValueError(
                f"unsupported time stamp {stamp!r} in file name {filename!r}"
            )
        parsed = datetime.strptime(stamp, stamp_format)
        if len(stamp) == 6:
            # Monthly stamp: the range runs from the first to the last day.
            if is_end:
                day = calendar.monthrange(parsed.year, parsed.month)[1]
            else:
                day = 1
            return datetime(parsed.year, parsed.month, day, 12)
        if len(stamp) == 8:
            # Daily stamp: use noon as the time of day.
            return parsed.replace(hour=12)
        return parsed

    time_part = filename.split("_")[-1].split(".nc")[0]
    if "-" not in time_part:
        return None, None
    stamps = time_part.split("-")
    if len(stamps) != 2:
        raise ValueError(
            f"expected '<start>-<end>' but got {time_part!r} "
            f"in file name {filename!r}"
        )
    start, end = stamps
    start_time = _parse_stamp(start, is_end=False)
    end_time = _parse_stamp(end, is_end=True)
    return start_time.isoformat(), end_time.isoformat()

In [None]:
# Show the file name (last path component) of the first entry.
df_new["path"][0].split("/")[-1]

In [None]:
# Parse each file name only once, then spread the (start, end) pair over the
# two time columns.
time_ranges = [parse_time(path.split('/')[-1]) for path in df_new["path"]]
df_new['start_time'] = [start_time for start_time, _ in time_ranges]
df_new['end_time'] = [end_time for _, end_time in time_ranges]
df_new.head()

## Write catalog

In [None]:
from datetime import timezone

# The catalog version is the current UTC date, e.g. v20200131.
last_updated = datetime.now(timezone.utc)
version = last_updated.strftime('v%Y%m%d')
cat_name = f"c3s-cmip5_{version}.csv.gz"
cat_path = f"../intake/catalogs/c3s-cmip5/{cat_name}"

# Write the catalog as a gzip-compressed CSV without the index column.
df_new.to_csv(cat_path, index=False, compression="gzip")

## Load catalog

In [None]:
# Read the CSV at cat_path into a new frame and display it.
df_c = pd.read_csv(cat_path)
df_c

In [None]:
# Count the distinct values in each column of the catalog.
df_c.nunique()

## Load intake

In [None]:
import intake

In [None]:
# Open the intake catalog description hosted on GitHub and list its entries.
cat_url = "https://raw.githubusercontent.com/cp4cds/c3s_34g_manifests/master/intake/catalogs/c3s.yaml"
# cat_url = "https://github.com/cehbrecht/c3s_34g_manifests/raw/dev-cmip5/intake/catalogs/c3s.yaml"
cat = intake.open_catalog(cat_url)
list(cat)

In [None]:
# Read the 'c3s-cmip5' catalog entry and display the result.
df_cmip5 = cat['c3s-cmip5'].read()
df_cmip5

In [None]:
# Count the distinct values in each column of the data read from the catalog.
df_cmip5.nunique()