## File Pathing for CMIP6 4xCO2 and piControl simulations
#### 10/12/2024 
---

### Steps:
1. Using CEDA we grab a list of all models in the CMIP6 archive (72 Models)
2. Using the ESGF python search api we search for models that include any of the variables of interest
   - tas, huss, hurs, ps (for my use case)
3. We export the search results of this out to individual csv files that we later composite back together
4. From the composite we find the models that have deal breaker variables included
5. Have a function that is used to retrieve any dataset

---

#### Librarys

In [1]:
import numpy as np
import xarray as xr
import pandas as pd
import os
from pyesgf.search import SearchConnection
os.environ["ESGF_PYCLIENT_NO_FACETS_STAR_WARNING"] = "on"

def extract_8_chars(string):
    start_index = string.find('_r') 
    end_index = string.find('_', start_index+1)
    return string[start_index+1:end_index]

def extract_SimNums(string):
    try:
        start_index = string.find('_r')
        end_index = string.find('i', start_index)
        r = int(string[start_index + 2:end_index])

        start_index = string.find('i', end_index)
        end_index = string.find('p', start_index)
        i = int(string[start_index + 1:end_index])

        start_index = string.find('p', end_index)
        end_index = string.find('f', start_index)
        p = int(string[start_index + 1:end_index])

        start_index = string.find('f', end_index)
        end_index = string.find('_', start_index)
        f = int(string[start_index + 1:end_index])
        
        return r, i, p, f
    except Exception as e:
        # Provide feedback for debugging
        print(f"Error processing string '{string}': {e}")
        return None, None, None, None
   
def getPeriod(filename):
    if 'abrupt-4xCO2' in filename: return 'abrupt-4xCO2'
    elif 'piControl' in filename: return 'piControl'
def whichVar(filename):
    if 'tas' in filename: return 'tas'
    elif 'huss' in filename: return 'huss'
    elif 'hurs' in filename: return 'hurs'
    elif 'ps' in filename: return 'ps'
        
conn = SearchConnection('https://esgf.ceda.ac.uk/esg-search', distrib=True) #UK one
# conn = SearchConnection('https://esgf-node.llnl.gov/esg-search', distrib=True) #German one
# conn = SearchConnection('https://esgf-data.dkrz.de/esg-search', distrib=True)  #US one

In [2]:
extract_8_chars('ps_Amon_CMCC-ESM2_abrupt-4xCO2_r1i1p1f1_gn_185001-201412.nc')

'r1i1p1f1'

#### 1: Grabbing the list of models in CMIP6 from CEDA archive

In [81]:
Agencies = os.listdir(f'/badc/cmip6/data/CMIP6/CMIP/')
Models = [os.listdir(f'/badc/cmip6/data/CMIP6/CMIP/{Agency}') for Agency in Agencies]
models = []
for sublist in Models:
    models.extend(sublist)


#### 2: Searching with pyesgf search (time consuming)

In [9]:

def calc(model): 
    if os.path.isfile(f'CO2_4x_url_{model}.csv'):
        print('skip')
        return None
    else:
        print(model) # for monitoring progress of parallel execution
        query = conn.new_context(project="CMIP6",     
                                 experiment_id='abrupt-4xCO2,piControl',
                                 source_id = model,
                                 frequency = 'day', 
                                 # member_id="r1i1p1f1",
                                 variable_id = "tas,huss,ps,hurs")
        results = query.search()
        
        files = []
        for i in range(len(results)):
            try:
                hit = results[i].file_context().search()
            except:
                hit = results[i].file_context().search()
            files += list(map(lambda f: {'model': model,
                                         'filename': f.filename, 
                                         'download_url': f.download_url, 
                                         'opendap_url': f.opendap_url}, hit))
        
        df = pd.DataFrame.from_dict(files)
        if len(df) == 0: return None
        df = df.dropna() # some opendap_urls are not found - so get rid of those
        df[['r', 'i', 'p', 'f']] = pd.DataFrame(
            df['filename'].apply(extract_SimNums).tolist(),
            index=df.index
        )
                
        df['Varient'] = df.filename.apply(extract_8_chars)
        df['period'] = df.filename.apply(getPeriod)
        df['Var'] = df.filename.apply(whichVar)
        df['timeStep'] = 'day'
        df['openDapWorking'] = False
        
        # discreteFiles = df.filename.unique()
        
        for i, dFile in enumerate(discreteFiles):
            for openDap in df[df.filename == dFile].opendap_url:
                try: 
                    xr.open_dataset(openDap)
                    df['openDapWorking'][df.opendap_url == openDap] = True
                    break
                except:
                    pass
        df.to_csv(f'CO2_4x_url_{model}.csv')   # Saves to CSV
        return None
## So turns out not every opendap works - Im going to blow my brains out
## So we need to not drop duplicates and then also deal with brutforcing through figuring out which ones work.

def parallel_execution(func, inputs, processes=None):
    from multiprocessing import Pool, cpu_count

    # Set the number of processes to use (default: number of CPU cores)
    if processes is None:
        processes = cpu_count()

    # Create a pool of worker processes
    with Pool(processes=processes) as pool:
        # Map the function to inputs and distribute across processors
        results = pool.map(func, inputs)

    return results

# processes = 8 works best for me - you won't overload RAM but their server can only process so many search requests
results = parallel_execution(calc, models, processes=8) 


skipBCC-CSM2-MRskipCAS-ESM2-0skipskipskipE3SM-1-1




skip


skip
skipskipskipskip




skipskipskip
skipCanESM5-CanOE

skipskip

EC-Earth3P-VHR

skip
skipskip


skipskipskip

skip
skip
skip

skipskip
skip
skip
UKESM1-1-LL
skip
skip

GISS-E2-1-G-CCskip

skip
GISS-E2-1-G
GISS-E2-1-H
skipskip

skipskip

skip
skipE3SM-1-1-ECAskipskip



skipskipskip


NorCPM1NorESM1-Fskip


skip
GFDL-AM4
skip
skip
NESM3
BCC-ESM1
skip
CIESM
skip
skip
skip
skip
MCM-UA-1-0
CAMS-CSM1-0


#### 3: Composite Models back together and note if any didn't save properly

In [14]:
df = pd.DataFrame()
notIn = []
for model in models:
    try:
        df_add = pd.read_csv(f'CO2_4x_url_{model}.csv', index_col=0)
        df = pd.concat([df, df_add], ignore_index=True)
    except:
        print(f'{model} not included')
        notIn.append(model)

df.to_csv(f'CO2_4x_url_all.csv', index=None)
#6JGFX2W7

BCC-CSM2-MR not included
BCC-ESM1 not included
CAMS-CSM1-0 not included
CAS-ESM2-0 not included
CanESM5-CanOE not included
E3SM-1-1 not included
E3SM-1-1-ECA not included
EC-Earth3P-VHR not included
UKESM1-1-LL not included
GISS-E2-1-G not included
GISS-E2-1-G-CC not included
GISS-E2-1-H not included
NorCPM1 not included
NorESM1-F not included
GFDL-AM4 not included
NESM3 not included
CIESM not included
MCM-UA-1-0 not included


#### 4: Setting variables that must be present to be a usable model - and determining list of usable models

In [88]:
# List of all requested vars and all of those that must be included for a model to be usable
allVars = ['tas', 'huss', 'hurs', 'ps']
insistVars = ['tas', 'huss', 'hurs']

df = pd.read_csv('CO2_4x_url_all.csv', index_col=None)
df = df[df.openDapWorking]


### Some Stats for what you've fetched
print(f'we tried to find information for {len(models)}')
print(f'We are unable to find any files for {len(set(models) - set(df.model.unique())- set(notIn))} models')
print(f'We have yet to process {len(notIn)}')

# some pandas magic to find models where all the insisted upon vars are included
groups = df.groupby(['model',  'Var', 'period'], as_index=False).Var.value_counts()

if len(insistVars) == 0:
    tasHuss = groups.groupby(['model'], as_index = False).model.value_counts()
else:
    tasHuss = groups[~groups['Var'].isin(list(set(allVars) - set(insistVars)))].groupby(['model'], as_index = False).model.value_counts()

goodModels = tasHuss[tasHuss['count'] == len(insistVars)*2].reset_index(drop=True).model    # the 2 in here will mess up if you have a dif number of periods

print(f'we can find {len(goodModels)} models that include {insistVars} for both 4xco2 and PI')

df = df[df['model'].isin(goodModels)]


## So well have to find the pressure feilds from alot of these - either in Amon or day?


shared_combinations = {}

for (period, model), group in df.groupby(['period', 'model']):
    combinations_per_variable = (
        group.groupby('Var')[['r', 'f', 'p', 'i']]
        .apply(lambda var_group: set(tuple(x) for x in var_group.to_numpy()))
        .tolist()
    )
    if combinations_per_variable:
        shared_combinations_for_group = set.intersection(*combinations_per_variable)
    else:
        shared_combinations_for_group = set()
    shared_combinations[(period, model)] = shared_combinations_for_group

filtered_dfs = []

for (period, model), combinations in shared_combinations.items():
    if combinations:  
        first_combination = next(iter(combinations))  
        
        filtered_df = df[
            (df['period'] == period) & 
            (df['model'] == model) & 
            (df[['r', 'f', 'p', 'i']].apply(tuple, axis=1) == first_combination)
        ]
        filtered_dfs.append(filtered_df)
        
df = pd.concat(filtered_dfs, ignore_index=True)
df = df[df['Var'] != 'ps']    # we are going to only use monthly ps to keep things consistant
for model in df.model.unique():
    checkEmpty = False
    for period in df.period.unique():
        if len(df[(df['model'] == model) & (df['period'] == period)]) == 0: checkEmpty = True

    if checkEmpty:
        df = df[~(df['model'] == model)]
# ## This step sees if there is available ps if there is in both then you leave it if theres one or the other you take out both, if theres none you leave it
# monModels = []
# for model in df.model.unique():
#     bools = []    
#     for period in df.period.unique():
#         d = df[(df['model'] == model) & (df['period'] == period)].Var.unique() 
#         bools.append('ps' in set(d))
        
#     if (bools[0] ^ bools[1]):
#         monModels.append(model)
#         df = df[~((df['model'] == model) & (df['Var'] == 'ps')) ]
#     if not(bools[0] or bools[1]):
#         monModels.append(model)
# df =df.reset_index(drop=True)

# dfmon = pd.read_csv('CO2_4x_url_all_psMon.csv')

# dfmon = dfmon.dropna() 
# dfmon[['r', 'i', 'p', 'f']] = pd.DataFrame(
#     dfmon['filename'].apply(extract_SimNums).tolist(),
#     index=dfmon.index
# )

# dfmon['Varient'] = dfmon.filename.apply(extract_8_chars)
# dfmon['period'] = dfmon.filename.apply(getPeriod)
# dfmon['Var'] = dfmon.filename.apply(whichVar)
# dfmon['timeStep'] = 'mon'

# for model in monModels:
#     for period in df.period.unique():
#         Varient = df[(df['model'] == model) & (df['period'] == period)].Varient.unique()[0]
#         dfadd = dfmon[((dfmon['model'] == model) & (dfmon['period'] == period)) & (dfmon['Varient'] == Varient)]
#         df = pd.concat([df, dfadd], ignore_index=True)


df[['start', 'stop']] = df['filename'].apply(lambda x: pd.Series(extractDates(x)))

df.to_csv('CO2_4x_url_usable.csv', index = False)

we tried to find information for 72
We are unable to find any files for 0 models
We have yet to process 40
we can find 32 models that include ['tas', 'huss', 'hurs'] for both 4xco2 and PI


In [2]:
def extractDates(string):
    ncFind = string.find('.nc') 
    ncBack = string.rfind('-', 0, ncFind)
    dback = (ncFind-ncBack)-1
    stop = string[ncBack+1:ncFind]
    start = string[ncBack-dback:ncBack]
    try:
        return extractYear(start), extractYear(stop)
    except:
        print(string)
def extractYear(Date):
    return Date[:4]
extractDates('ps_Amon_TaiESM1_piControl_r1i1p1f1_gn_050101-060012.nc')

('0501', '0600')

#### 5: Making a function to grab any set of opendap_url's 

In [35]:
### from df we are going to find the model, period, varient combo that is needed to complete the ps variables
df = pd.read_csv('CO2_4x_url_usable.csv')

for model in df.model.unique():
    bools = []
    print(model)
    for period in df.period.unique():
        print(set(df[(df['model'] == model) & (df['period'] == period)].Varient.unique()))

len(df.model.unique())

ACCESS-CM2
{'r1i1p1f1'}
{'r1i1p1f1'}
ACCESS-ESM1-5
{'r2i1p1f1'}
{'r1i1p1f1'}
CESM2
{'r1i1p1f1'}
{'r1i1p1f1'}
CESM2-FV2
{'r1i1p1f1'}
{'r1i1p1f1'}
CESM2-WACCM
{'r1i1p1f1'}
{'r1i1p1f1'}
CESM2-WACCM-FV2
{'r1i1p1f1'}
{'r1i1p1f1'}
CMCC-CM2-SR5
{'r1i1p1f1'}
{'r1i1p1f1'}
CMCC-ESM2
{'r1i1p1f1'}
{'r1i1p1f1'}
CNRM-CM6-1
{'r1i1p1f2'}
{'r1i1p1f2'}
CNRM-CM6-1-HR
{'r1i1p1f2'}
{'r1i1p1f2'}
CNRM-ESM2-1
{'r1i1p1f2'}
{'r1i1p1f2'}
CanESM5
{'r1i1p2f1'}
{'r1i1p2f1'}
EC-Earth3
{'r3i1p1f1'}
{'r1i1p1f1'}
EC-Earth3-CC
{'r1i1p1f1'}
{'r1i1p1f1'}
EC-Earth3-Veg
{'r1i1p1f1'}
{'r1i1p1f1'}
EC-Earth3-Veg-LR
{'r1i1p1f1'}
{'r1i1p1f1'}
GFDL-CM4
{'r1i1p1f1'}
{'r1i1p1f1'}
HadGEM3-GC31-LL
{'r1i1p1f3'}
{'r1i1p1f1'}
HadGEM3-GC31-MM
{'r1i1p1f3'}
{'r1i1p1f1'}
IITM-ESM
{'r1i1p1f1'}
{'r1i1p1f1'}
IPSL-CM5A2-INCA
{'r1i1p1f1'}
{'r1i1p1f1'}
KIOST-ESM
{'r1i1p1f1'}
{'r1i1p1f1'}
MIROC-ES2H
{'r3i1p4f2'}
{'r1i1p4f2'}
MIROC-ES2L
{'r1i1p1f2'}
{'r1i1p1f2'}
MIROC6
{'r1i1p1f1'}
{'r1i1p1f1'}
MRI-ESM2-0
{'r5i1p1f1'}
{'r1i1p1f1'}
NorESM2-LM
{'r1i1

31

---
### Thats the daily vars taken care of
- however we need to do monthly for ps on the models where we don't have ps vars

**Steps for this:**
1. Identify which models are close but don't include just ps
2. Use same search as above to find monthly ps
3. Save this to csv files
4. Incorperate this into the url_usable csv file somehow
---

#### 1: Identifying models which would be usable apart from daily ps

18

#### 2: Using search to look for monthly pressure for these models: (time consuming)

In [None]:
def calc(model): 
    print(model) # for monitoring progress of parallel execution
    query = conn.new_context(project="CMIP6",     
                             experiment_id='abrupt-4xCO2,piControl',
                             source_id = model,
                             frequency = 'mon', 
                             # member_id="r1i1p1f1",
                             variable_id = "ps")
    results = query.search()
        
    files = []
    for i in range(len(results)):
        try:
            hit = results[i].file_context().search()
        except:
            hit = results[i].file_context().search()
        files += list(map(lambda f: {'model': model,
                                     'filename': f.filename, 
                                     'download_url': f.download_url, 
                                     'opendap_url': f.opendap_url}, hit))
    
    df = pd.DataFrame.from_dict(files)
    if len(df) == 0: 
        print(model, ': no results ')
        return None
    df = df.dropna() # some opendap_urls are not found - so get rid of those
    df[['r', 'i', 'p', 'f']] = pd.DataFrame(
        df['filename'].apply(extract_SimNums).tolist(),
        index=df.index
    )
            
    df['Varient'] = df.filename.apply(extract_8_chars)
    df['period'] = df.filename.apply(getPeriod)
    df['Var'] = df.filename.apply(whichVar)
    df['timeStep'] = 'mon'
    df['openDapWorking'] = False
    
    discreteFiles = df.filename.unique()
    
    for i, dFile in enumerate(discreteFiles):
        for openDap in df[df.filename == dFile].opendap_url:
            try: 
                xr.open_dataset(openDap)
                df['openDapWorking'][df.opendap_url == openDap] = True
                break
            except:
                pass
    df.to_csv(f'CO2_4x_url_{model}_psMon.csv')   # Saves to CSV
    return None

def parallel_execution(func, inputs, processes=None):
    from multiprocessing import Pool, cpu_count

    # Set the number of processes to use (default: number of CPU cores)
    if processes is None:
        processes = cpu_count()

    # Create a pool of worker processes
    with Pool(processes=processes) as pool:
        # Map the function to inputs and distribute across processors
        results = pool.map(func, inputs)

    return results

df = pd.read_csv('CO2_4x_url_usable.csv')

models = df.model.unique()
# processes = 8 works best for me - you won't overload RAM but their server can only process so many search requests
results = parallel_execution(calc, models, processes=8) 

ACCESS-ESM1-5ACCESS-CM2CESM2-WACCMCESM2-WACCM-FV2
CMCC-CM2-SR5CESM2-FV2CESM2
CMCC-ESM2







  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  return np.asarray(self.get_duck_array(), dtype=dtype)
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df['openDapWorking'][df.opendap_url == openDap] = True
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats i

CNRM-CM6-1


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df['openDapWorking'][df.opendap_url == openDap] = True
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['open

CNRM-CM6-1-HR


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df['openDapWorking'][df.opendap_url == openDap] = True
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['open

CNRM-ESM2-1


  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  return np.asarray(self.get_duck_array(), dtype=dtype)
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df['openDapWorking'][df.opendap_url == openDap] = True
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats i

CanESM5


  var = coder.decode(var, name=name)
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df['openDapWorking'][df.opendap_url == openDap] = True
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#retu

EC-Earth3


  var = coder.decode(var, name=name)
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df['openDapWorking'][df.opendap_url == openDap] = True
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#retu

EC-Earth3-CC


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df['openDapWorking'][df.opendap_url == openDap] = True
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['open

EC-Earth3-Veg


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df['openDapWorking'][df.opendap_url == openDap] = True
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['open

EC-Earth3-Veg-LR


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df['openDapWorking'][df.opendap_url == openDap] = True
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['open

In [47]:
len(models)

31

#### 3: Compositing this all back together

In [85]:
df = pd.DataFrame()
notIn = []
for model in models:
    try:
        df_add = pd.read_csv(f'CO2_4x_url_{model}_psMon.csv', index_col=0)
        df = pd.concat([df, df_add], ignore_index=True)
    except:
        print(f'{model} not included')
        notIn.append(model)
df['timeStep'] = 'mon'


df[['start', 'stop']] = df['filename'].apply(lambda x: pd.Series(extractDates(x)))
df.to_csv(f'CO2_4x_url_all_psMon.csv', index=None)
# --------------------------------------------------------
dfday = pd.read_csv('CO2_4x_url_usable.csv')
dfmon = pd.read_csv('CO2_4x_url_all_psMon.csv')
dfday = dfday[dfday.openDapWorking]
dfmon = dfmon[dfmon.openDapWorking]
dfs = []
i = 0
for model in df.model.unique():
    for period in df.period.unique():
        Varient = dfday[(dfday['model'] == model) & (dfday['period'] == period)].Varient.unique()[0]
        monsub = dfmon[((dfmon['model'] == model) & (dfmon['period'] == period)) & (dfmon['Varient'] == Varient)]
        if len(monsub) == 0: print(model, period)
        dfs.append(monsub)

dfmon = pd.concat(dfs, ignore_index=True)
dfAll = pd.concat([dfday, dfmon ], ignore_index=True)
dfAll.to_csv('CO2_4x_url_usable_wMon.csv')

AWI-CM-1-1-MR not included
AWI-ESM-1-1-LR not included
BCC-CSM2-MR not included
BCC-ESM1 not included
CAMS-CSM1-0 not included
CAS-ESM2-0 not included
FGOALS-f3-L not included
FGOALS-g3 not included
CanESM5-CanOE not included
CMCC-CM2-HR4 not included
E3SM-1-0 not included
E3SM-1-1 not included
E3SM-1-1-ECA not included
E3SM-2-0 not included
EC-Earth3-AerChem not included
EC-Earth3-LR not included
EC-Earth3P-VHR not included
FIO-ESM-2-0 not included
MPI-ESM-1-2-HAM not included
INM-CM4-8 not included
INM-CM5-0 not included
IPSL-CM6A-LR not included
IPSL-CM6A-LR-INCA not included
UKESM1-1-LL not included
ICON-ESM-LR not included
MPI-ESM1-2-HR not included
MPI-ESM1-2-LR not included
GISS-E2-1-G not included
GISS-E2-1-G-CC not included
GISS-E2-1-H not included
GISS-E2-2-G not included
GISS-E2-2-H not included
NorCPM1 not included
NorESM1-F not included
KACE-1-0-G not included
GFDL-AM4 not included
GFDL-ESM4 not included
NESM3 not included
CIESM not included
MCM-UA-1-0 not included
ps_Amon

ValueError: Columns must be same length as key

#### 4: Quality controlling CO2_4x_url_all_psMon - to make sure they all are what we'd expect them to be

In [6]:
df = pd.read_csv('CO2_4x_url_usable_wMon.csv', index_col=0)
pd.set_option('display.max_rows', 400)

df.query("period == 'abrupt-4xCO2' & Var == 'huss' & openDapWorking").filename.unique()
def extractDates(string):
    ncFind = string.find('.nc') 
    ncBack = string.rfind('-', 0, ncFind)
    dback = (ncFind-ncBack)-1
    stop = string[ncBack+1:ncFind]
    start = string[ncBack-dback:ncBack]
    try:
        return extractYear(start), extractYear(stop)
    except:
        print(string)
def extractYear(Date):
    return int(Date[:4])

# figure out the time each models cover
df[['start', 'stop']] = df['filename'].apply(extractDates).apply(pd.Series)

# for each variable - what is the minimum and maximum times they cover
stopMax = df.groupby(['model', 'period', 'Var'], as_index=False).stop.max()
startMin = df.groupby(['model', 'period', 'Var'], as_index=False).start.min()

# find the overlaping times shared for each model and period
stopShared = stopMax.groupby(['model', 'period'], as_index=False).min()
startShared = startMin.groupby(['model', 'period'], as_index=False).max()

span_df = pd.DataFrame({'model': startShared.model, 
              'period': startShared.period, 
              'span': stopShared.stop - startShared.start,
              'startShared': startShared.start,
              'stopShared': stopShared.stop})

# find models that don't have shared data that spans more than 20 years
insufficientSpan = []
for model in span_df.model.unique():
    spans = span_df[span_df['model'] == model].span.to_numpy()
    if any(v < 20 for v in spans):
        insufficientSpan.append(model)

print('the following models dont cover a shared 20 years: ', insufficientSpan)

# remove models with insufficient span
df = df[~df.model.isin(insufficientSpan)]

def inspan(row, padstart, padend):
    return max(row['start'], padstart) <= min(row['stop'], padend)

dfs = []  # Store results for all models and periods
i = 0 
for model in df.model.unique():
    for period in df.period.unique():
        i+=1
        # Calculate padstart and padend based on `span_df`
        mask = (span_df.model == model) & (span_df.period == period)
        if mask.sum() > 0:  # Ensure there's a match in span_df
            end = int(span_df.loc[mask, 'stopShared'].iloc[0])  # Extract the scalar value safely
            padend = end + 3
            padstart = end - 20 - 3
            
            filtered_df = df[(df.model == model) & (df.period == period)].copy()

            # Apply inspan to the filtered DataFrame
            filtered_df.loc[:, 'inspan'] = filtered_df.apply(
                lambda row: inspan(row, padstart, padend), axis=1
            )
            
            dfs.append(filtered_df)

df = pd.concat(dfs, ignore_index=True)
print(len(df.model.unique()))
df = df[df.inspan].reset_index(drop=True)
print(len(df.model.unique()))
span_df = span_df[~span_df.model.isin(insufficientSpan)].reset_index(drop = True)
span_df.to_csv('span_df.csv')
df.to_csv('CO2_4x_url_reduced_wMon.csv', index=None)

the following models dont cover a shared 20 years:  ['EC-Earth3-Veg', 'MRI-ESM2-0']
29
29


#### 5: Adding this into the CO2_4x_url_usable
Have to add a column for if its a monthly or daily var - along with some logic

#### 6: integrated retrevial function for open_dap links
This is what we'll use in the preprocessing steps