In [1]:
import pandas as pd

# Potentially problematic models
These models are known to have failed to download or to have very long download times. Download everything else first before attempting them.


In [2]:
# Models known to have failed or to download slowly; fetch everything else
# before attempting these.
problem_models = [
    "CESM2",
    "CESM2-WACCM",
    "KACE-1-0-G",
    "TaiESM1",
    "KIOST-ESM",
]

# Download lists of available data from synda

```
fpath=/rds/general/user/cb2714/home/01_wwa/10_misc/00_list-models

synda search project=CMIP6 frequency=day experiment_id=historical,ssp585 variable=tas,tasmax,tasmin,pr -l 5000 > $fpath/vars-cmip6.txt
synda search project=CMIP6 frequency=day experiment_id=highresSST-present,highresSST-future variable=tas,tasmax,tasmin,pr -l 5000 > $fpath/vars-highresmip.txt

synda search project=CMIP6 frequency=day experiment_id=historical,ssp585 variable=tas,pr,sfcWind,hurs -l 10000 > $fpath/fwi-cmip6.txt
synda search project=CMIP6 frequency=day experiment_id=highresSST-present,highresSST-future variable=tas,pr,sfcWind,hurs -l 10000 > $fpath/fwi-highresmip.txt

synda search project=CMIP6 frequency=day experiment_id=historical,ssp585 variable=tos,ta,hus,psl -l 10000 > $fpath/pi-cmip6.txt

```

## Available runs

In [25]:
# Choose which listing to analyse; the file read is "<vset>-<ds>.txt",
# resolved relative to the current working directory.
vset = "fwi"
ds = "highresmip"

# Split each line of the listing on literal "." characters and name the parts.
# The separator is a regular expression, so it is written as a raw string:
# "\." in a normal string literal is an invalid escape sequence
# (DeprecationWarning since Python 3.6, SyntaxWarning since Python 3.12).
# Regex separators require the python parsing engine.
# Column 0 holds whatever precedes the first "." on each line.
df = pd.read_table(
    vset + "-" + ds + ".txt",
    sep=r"\.",
    engine="python",
    header=None,
    names=[0, "activity", "institute", "source_id", "experiment",
           "variant_label", "table", "variable", "grid", "version"],
)

In [26]:
# One row per (model, ensemble member, variable, experiment), then count the
# experiments available for each (model, member, variable) and keep only the
# combinations present in more than one experiment, i.e. both time periods.
mdl_xvar = (
    df.loc[:, ["source_id", "variant_label", "variable", "experiment"]]
    .drop_duplicates()
    .groupby(["source_id", "variant_label", "variable"])
    .count()
    .reset_index()
    .loc[lambda counts: counts["experiment"] > 1]
)

In [27]:
# Restrict to ensemble member r1i1p1f1 (the single most commonly used label)
# and tabulate, per model, which variables are available for that member.
df_r1 = mdl_xvar[mdl_xvar["variant_label"] == "r1i1p1f1"]
ct_r1 = pd.crosstab(
    index=df_r1["variable"],
    columns=[df_r1["source_id"], df_r1["variant_label"]],
).T
ct_r1

Unnamed: 0_level_0,variable,hurs,pr,sfcWind,tasmax
source_id,variant_label,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CMCC-CM2-HR4,r1i1p1f1,0,1,0,0
CMCC-CM2-VHR4,r1i1p1f1,0,1,0,0
EC-Earth3P,r1i1p1f1,0,1,0,1
EC-Earth3P-HR,r1i1p1f1,1,1,1,1
FGOALS-f3-H,r1i1p1f1,0,1,1,0
FGOALS-f3-L,r1i1p1f1,0,1,1,1
GFDL-CM4C192,r1i1p1f1,1,1,1,1
HadGEM3-GC31-HM,r1i1p1f1,1,1,1,1
HadGEM3-GC31-LM,r1i1p1f1,1,1,1,1
HadGEM3-GC31-MM,r1i1p1f1,1,1,1,1


In [28]:
# Models with every necessary variable: a row qualifies when its total equals
# the number of variable columns. The index pairs are (source_id, variant_label);
# only the model name is kept.
mlist = [
    source_id
    for source_id, _ in ct_r1[ct_r1.sum(axis=1) == ct_r1.shape[1]].index
]
",".join(mlist)

'EC-Earth3P-HR,GFDL-CM4C192,HadGEM3-GC31-HM,HadGEM3-GC31-LM,HadGEM3-GC31-MM,HiRAM-SIT-HR,HiRAM-SIT-LR,MPI-ESM1-2-XR,MRI-AGCM3-2-H,MRI-AGCM3-2-S,NICAM16-7S,NICAM16-8S,NICAM16-9S'

In [30]:
# Models that have no r1i1p1f1 entry at all: which of their other ensemble
# members provide every variable and could be added to the list?
df_rx = mdl_xvar[~mdl_xvar["source_id"].isin(df_r1["source_id"])]
ct_rx = pd.crosstab(
    index=df_rx["variable"],
    columns=[df_rx["source_id"], df_rx["variant_label"]],
).T
ct_rx[ct_rx.sum(axis=1) == ct_rx.shape[1]]

Unnamed: 0_level_0,variable,hurs,pr,sfcWind,tasmax
source_id,variant_label,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CNRM-CM6-1,r10i1p1f2,1,1,1,1
CNRM-CM6-1,r2i1p1f2,1,1,1,1
CNRM-CM6-1,r3i1p1f2,1,1,1,1
CNRM-CM6-1,r4i1p1f2,1,1,1,1
CNRM-CM6-1,r5i1p1f2,1,1,1,1
CNRM-CM6-1,r6i1p1f2,1,1,1,1
CNRM-CM6-1,r7i1p1f2,1,1,1,1
CNRM-CM6-1,r8i1p1f2,1,1,1,1
CNRM-CM6-1-HR,r10i1p1f2,1,1,1,1
CNRM-CM6-1-HR,r2i1p1f2,1,1,1,1


In [31]:
# Ensemble sizes: for each model, the number of ensemble members that
# provide each variable.
pd.crosstab(
    index=mdl_xvar["source_id"],
    columns=mdl_xvar["variable"],
)

variable,hurs,pr,sfcWind,tasmax
source_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CMCC-CM2-HR4,0,1,0,0
CMCC-CM2-VHR4,0,1,0,0
CNRM-CM6-1,10,10,8,10
CNRM-CM6-1-HR,9,10,9,10
EC-Earth3P,0,1,0,1
EC-Earth3P-HR,1,1,1,1
FGOALS-f3-H,0,1,1,0
FGOALS-f3-L,0,1,1,1
GFDL-CM4C192,1,1,1,1
HadGEM3-GC31-HM,1,1,1,1


In [32]:
# Full breakdown: one row per (model, ensemble member), one column per
# variable, showing which variables each member provides.
pd.crosstab(
    index=mdl_xvar["variable"],
    columns=[mdl_xvar["source_id"], mdl_xvar["variant_label"]],
).T

Unnamed: 0_level_0,variable,hurs,pr,sfcWind,tasmax
source_id,variant_label,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CMCC-CM2-HR4,r1i1p1f1,0,1,0,0
CMCC-CM2-VHR4,r1i1p1f1,0,1,0,0
CNRM-CM6-1,r10i1p1f2,1,1,1,1
CNRM-CM6-1,r1i1p1f2,1,1,0,1
CNRM-CM6-1,r2i1p1f2,1,1,1,1
CNRM-CM6-1,r3i1p1f2,1,1,1,1
CNRM-CM6-1,r4i1p1f2,1,1,1,1
CNRM-CM6-1,r5i1p1f2,1,1,1,1
CNRM-CM6-1,r6i1p1f2,1,1,1,1
CNRM-CM6-1,r7i1p1f2,1,1,1,1


## CORDEX


In [20]:
# Choose which listing to analyse; the file read is "<vset>-<ds>.txt",
# resolved relative to the current working directory.
# NOTE: this rebinds `vset`, `ds` and `df` used by the cells above.
vset = "fwi"
ds = "cordex"

# Split each line of the listing on literal "." characters and name the parts.
# The separator is a regular expression, so it is written as a raw string:
# "\." in a normal string literal is an invalid escape sequence
# (DeprecationWarning since Python 3.6, SyntaxWarning since Python 3.12).
# Regex separators require the python parsing engine.
# Columns 0 and 1 are the first two dot-separated fields and are left unnamed.
df = pd.read_table(
    vset + "-" + ds + ".txt",
    sep=r"\.",
    engine="python",
    header=None,
    names=[0, 1, "domain", "institute", "driving_model", "experiment",
           "ensemble", "rcm_name", "rcm_version", "frequency", "variable",
           "version"],
)

In [21]:
# One row per (domain, driving model, member, RCM, RCM version, variable,
# experiment), then count the experiments available for each combination and
# keep only those present in more than one experiment, i.e. both time periods.
mdl_xvar = (
    df.loc[:, ["domain", "driving_model", "ensemble", "rcm_name",
               "rcm_version", "variable", "experiment"]]
    .drop_duplicates()
    .groupby(["domain", "driving_model", "ensemble", "rcm_name",
              "rcm_version", "variable"])
    .count()
    .reset_index()
    .loc[lambda counts: counts["experiment"] > 1]
)

In [23]:
# Tabulate variable availability for every GCM/RCM combination, then show
# the combinations that provide every variable (row total == number of columns).
ct_r1 = pd.crosstab(
    index=mdl_xvar["variable"],
    columns=[
        mdl_xvar["domain"],
        mdl_xvar["driving_model"],
        mdl_xvar["ensemble"],
        mdl_xvar["rcm_name"],
        mdl_xvar["rcm_version"],
    ],
).T
ct_r1[ct_r1.sum(axis=1) == ct_r1.shape[1]]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,variable,hurs,pr,sfcWind,tasmax
domain,driving_model,ensemble,rcm_name,rcm_version,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
NAM-22,CCCma-CanESM2,r1i1p1,CRCM5,v1,1,1,1,1
NAM-22,CCCma-CanESM2,r1i1p1,CanRCM4,r2,1,1,1,1
NAM-22,MOHC-HadGEM2-ES,r1i1p1,REMO2015,v1,1,1,1,1
NAM-22,MOHC-HadGEM2-ES,r1i1p1,RegCM4,v4-4-rc8,1,1,1,1
NAM-22,MOHC-HadGEM2-ES,r1i1p1,WRF,v3-5-1,1,1,1,1
NAM-22,MPI-M-MPI-ESM-LR,r1i1p1,CRCM5,v1,1,1,1,1
NAM-22,MPI-M-MPI-ESM-LR,r1i1p1,REMO2015,v1,1,1,1,1
NAM-22,MPI-M-MPI-ESM-LR,r1i1p1,RegCM4,v4-4-rc8,1,1,1,1
NAM-22,MPI-M-MPI-ESM-LR,r1i1p1,WRF,v3-5-1,1,1,1,1
NAM-22,MPI-M-MPI-ESM-MR,r1i1p1,CRCM5,v1,1,1,1,1


In [24]:
# Combinations that are missing at least one variable
# (row total differs from the number of variable columns).
ct_r1[ct_r1.sum(axis=1) != ct_r1.shape[1]]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,variable,hurs,pr,sfcWind,tasmax
domain,driving_model,ensemble,rcm_name,rcm_version,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
NAM-22,CNRM-CERFACS-CNRM-CM5,r1i1p1,CRCM5,v1,0,1,0,1
NAM-22,NOAA-GFDL-GFDL-ESM2M,r1i1p1,CRCM5,v1,0,1,0,1
