# data_d22a

### Purpose
Prepare CMIP6 data for Monte Carlo Drift Correction analysis within [d22a-mcdc](https://github.com/grandey/d22a-mcdc).

### Input data requirements
CMIP6 data that have been post-processed by [p22c-esgf-processing](https://github.com/grandey/p22c-esgf-processing).

### Output data written
NetCDF files in [data/](https://github.com/grandey/d22a-mcdc/tree/main/data/), each containing a global mean time series for a given variable, AOGCM variant, and experiment.

### History
BSG, 2022.

In [1]:
! date

Tue Aug 16 11:09:53 +08 2022


In [2]:
from cdo import Cdo, CDOException
import pandas as pd
import pathlib
import xarray as xr

In [3]:
# Create the CDO interface, passing 'temp' as the directory for CDO temporary files
cdo = Cdo(tempdir='temp')
# Clean the temp directory at start-up (presumably removes files left by earlier sessions - TODO confirm)
cdo.cleanTempDir()

In [4]:
# Module versions: report the CDO version, the cdo.py bindings version, and the xarray version
# so that they are recorded in the notebook output
print(f'CDO version: {cdo.version()}')
print(f'cdo.py bindings version: {cdo.__version__()}')
print(f'xarray version: {xr.__version__}')

CDO version: 2.0.5
cdo.py bindings version: 1.5.4
xarray version: 2022.6.0


## Identify AOGCM variants (source-member pairs)

In [5]:
# Root directory of the regrid_missto0_yearmean_fldmean data produced by p22c-esgf-processing
in_base = pathlib.Path('~/Data/p22c/CMIP6/regrid_missto0_yearmean_fldmean/').expanduser()

# Core variables required
core_var_list = [
    'rsdt', 'rsut', 'rlut',  # R = rsdt-rsut-rlut
    'hfds',  # H (without flux correction)
    'zostoga',  # Z
]

# Experiments required, mapped to the corresponding names
exp_dict = {
    'piControl': 'Control',
    'historical': 'Historical',
    'ssp126': 'SSP1-2.6',
    'ssp245': 'SSP2-4.5',
    'ssp370': 'SSP3-7.0',
    'ssp585': 'SSP5-8.5',
}

# Candidate source-member pairs: every non-hidden <source>_<member> entry found under rsdt/
candidate_pairs = sorted(entry.name for entry in in_base.glob('rsdt/[!.]*_*'))

# Keep a pair only if every core variable has at least one input file for every experiment
source_member_list = [
    pair for pair in candidate_pairs
    if all(any(in_base.glob(f'{name}/{pair}/*.{experiment}.*/*.nc'))
           for name in core_var_list
           for experiment in exp_dict)
]

print(f'{len(source_member_list)} source-member pairs identified.')
source_member_list

21 source-member pairs identified.


['ACCESS-CM2_r1i1p1f1',
 'ACCESS-ESM1-5_r1i1p1f1',
 'CMCC-CM2-SR5_r1i1p1f1',
 'CMCC-ESM2_r1i1p1f1',
 'CNRM-CM6-1_r1i1p1f2',
 'CNRM-ESM2-1_r1i1p1f2',
 'CanESM5_r1i1p1f1',
 'CanESM5_r1i1p2f1',
 'EC-Earth3-Veg-LR_r1i1p1f1',
 'EC-Earth3-Veg_r1i1p1f1',
 'EC-Earth3_r1i1p1f1',
 'GISS-E2-1-G_r1i1p5f1',
 'GISS-E2-1-H_r1i1p1f2',
 'IPSL-CM6A-LR_r1i1p1f1',
 'MIROC6_r1i1p1f1',
 'MPI-ESM1-2-HR_r1i1p1f1',
 'MPI-ESM1-2-LR_r1i1p1f1',
 'MRI-ESM2-0_r1i1p1f1',
 'NorESM2-LM_r1i1p1f1',
 'NorESM2-MM_r1i1p1f1',
 'UKESM1-0-LL_r1i1p1f2']

In [6]:
# Limit to one variant per source AOGCM: the first variant listed for each source is kept
source_list = []  # sources encountered so far, in order of first appearance
retained_pairs = []  # source-member pairs that survive the filter
for pair in source_member_list:
    model = pair.split('_')[0]  # source name is the text before the first underscore
    if model in source_list:
        print(f'Dropping {pair}.')
        continue
    source_list.append(model)
    retained_pairs.append(pair)
source_member_list[:] = retained_pairs  # slice assignment updates the existing list object in place

print(f'{len(source_member_list)} source-member pairs remain.')
source_member_list

Dropping CanESM5_r1i1p2f1.
20 source-member pairs remain.


['ACCESS-CM2_r1i1p1f1',
 'ACCESS-ESM1-5_r1i1p1f1',
 'CMCC-CM2-SR5_r1i1p1f1',
 'CMCC-ESM2_r1i1p1f1',
 'CNRM-CM6-1_r1i1p1f2',
 'CNRM-ESM2-1_r1i1p1f2',
 'CanESM5_r1i1p1f1',
 'EC-Earth3-Veg-LR_r1i1p1f1',
 'EC-Earth3-Veg_r1i1p1f1',
 'EC-Earth3_r1i1p1f1',
 'GISS-E2-1-G_r1i1p5f1',
 'GISS-E2-1-H_r1i1p1f2',
 'IPSL-CM6A-LR_r1i1p1f1',
 'MIROC6_r1i1p1f1',
 'MPI-ESM1-2-HR_r1i1p1f1',
 'MPI-ESM1-2-LR_r1i1p1f1',
 'MRI-ESM2-0_r1i1p1f1',
 'NorESM2-LM_r1i1p1f1',
 'NorESM2-MM_r1i1p1f1',
 'UKESM1-0-LL_r1i1p1f2']

## Merge input data files across time using CDO

In [7]:
# Location in which to write output NetCDF files: 'data' under the current working directory
out_base = pathlib.Path.cwd() / 'data'
# exist_ok=True means no error is raised if the directory already exists, so the cell can be re-run
out_base.mkdir(exist_ok=True)

In [8]:
# Function to merge global mean files across time
def mergetime_using_cdo(
        variable='zostoga',
        experiments=['piControl', 'historical', 'ssp126', 'ssp245', 'ssp370', 'ssp585'],
        source_member_list=['ACCESS-CM2_r1i1p1f1',],
        force=False):
    """
    Merge global mean input files across time using CDO, writing one file per experiment.

    Input files are read from in_base/{variable}/{source_member}/*.{exp}.*/ and output is
    written to out_base/{in_base.name}_mergetime/{variable}/{source_member}/.
    For 'piControl' and 'historical', all available input files are merged. For experiments
    whose name contains 'ssp', the already-written historical output file is merged in as
    well, giving a single time series; 'historical' must therefore be processed before any
    SSP (or its output must already exist), otherwise the SSP is skipped with a message.

    Parameters
    ----------
    variable : str
        Name of the variable to process.
    experiments : iterable of str
        Experiments to process, in order. Only iterated over, never modified.
    source_member_list : iterable of str
        Source-member pairs to process. Only iterated over, never modified.
    force : bool
        If True, re-create output files that already exist. If False, existing files are skipped.

    Returns
    -------
    list of pathlib.Path
        Sorted list of the output files that were written or skipped (already existing).
    """
    # Experiments merged on their own; experiments containing 'ssp' are also merged with historical
    standalone_experiments = ['piControl', 'historical']
    # Lists to hold names of files skipped (due to already existing) and files written
    skipped_list = []
    written_list = []
    # Loop over source-member pairs
    for source_member in source_member_list:
        # Output directory name
        out_dir = out_base.joinpath(f'{in_base.name}_mergetime/{variable}/{source_member}/')
        # Historical output file, required as input when processing an SSP
        hist_fn = out_dir.joinpath(f'{variable}_{source_member}_historical.mergetime.nc')
        # Loop over experiments
        for exp in experiments:
            # Input data directory
            in_dir_list = sorted(in_base.glob(f'{variable}/{source_member}/*.{exp}.*'))
            if len(in_dir_list) == 0:
                continue  # skip
            elif len(in_dir_list) > 1:
                print(f'Warning: {len(in_dir_list)} input directories found for {source_member} {exp} {variable}')
            in_dir = in_dir_list[-1]  # if several directories match, the last in sorted order is used
            # Input files
            in_fn_list = sorted([str(f) for f in in_dir.glob('*.nc')])
            # Has at least one input file been found?
            if len(in_fn_list) == 0:
                continue
            # Reject unrecognised experiments before touching the filesystem, so that no output
            # directory is created and no existing file is wrongly reported as written
            is_ssp = 'ssp' in exp
            if not is_ssp and exp not in standalone_experiments:
                print(f'exp="{exp}" not recognised')
                continue
            # If SSP, check for historical experiment file before proceeding
            if is_ssp and not hist_fn.exists():
                print(f'{variable}: historical file {hist_fn.name} not found')
                continue
            # Make output directory
            out_dir.mkdir(exist_ok=True, parents=True)
            # Output filename
            out_fn = out_dir.joinpath(f'{variable}_{source_member}_{exp}.mergetime.nc')
            # Does the output file already exist?
            if out_fn.exists() and not force:
                skipped_list.append(out_fn)
                continue
            # piControl and historical: merge all available files
            # SSPs: also merge with historical to create single time series
            merge_input_list = [str(hist_fn)] + in_fn_list if is_ssp else in_fn_list
            # Use CDO to merge files (if necessary), and use absolute time axis (-a), NetCDF output (-f nc);
            # SKIP_SAME_TIME=1 asks CDO mergetime to skip timesteps duplicated between input files
            _ = cdo.mergetime(input=' '.join(merge_input_list), output=f'{out_fn}',
                              options='-a -f nc', env={"SKIP_SAME_TIME": "1"})
            # Has output file been written?
            if out_fn.exists():
                written_list.append(out_fn)
            else:
                print(f'Failed to write {out_fn.name}')
    # Print summary
    print(f'{variable}: written {len(written_list)} files; skipped {len(skipped_list)} existing files')
    # Return sorted list of files written and skipped
    return sorted(written_list + skipped_list)

In [9]:
%%time
# Apply mergetime_using_cdo() to merge files of interest
# Each job pairs a variable with the source-member pairs to process for that variable
merge_jobs = [(core_var, source_member_list) for core_var in core_var_list]  # core variables
merge_jobs.append(('hfcorr', ['MRI-ESM2-0_r1i1p1f1',]))  # non-zero hfcorr exists only for MRI-ESM2-0_r1i1p1f1

for job_var, job_members in merge_jobs:
    _ = mergetime_using_cdo(variable=job_var,
                            experiments=exp_dict.keys(),
                            source_member_list=job_members,
                            force=False)

rsdt: written 0 files; skipped 120 existing files
rsut: written 0 files; skipped 120 existing files
rlut: written 0 files; skipped 120 existing files
hfds: written 0 files; skipped 120 existing files
zostoga: written 0 files; skipped 120 existing files
hfcorr: written 0 files; skipped 6 existing files
CPU times: user 89.2 ms, sys: 31.5 ms, total: 121 ms
Wall time: 121 ms


## Total area of earth
Required by mcdc_analysis_d22a.ipynb.

In [10]:
# Location of regrid_missto0_yearmean data produced by p22c-esgf-processing
# (unlike in_base, this directory name has no _fldmean suffix)
in_base2 = pathlib.Path('~/Data/p22c/CMIP6/regrid_missto0_yearmean/').expanduser()

In [11]:
%%time
# Total area for each source-member pair, collected in a dict and converted to a Series at the end
area_dict = {}
# Use rsdt file from historical experiment
var = 'rsdt'
exp = 'historical'
# Loop over source-member pairs
for source_member in source_member_list:
    # Sort the matching files so that the same file is selected on every run
    in_fn_list = sorted(in_base2.glob(f'{var}/{source_member}/*{exp}*/*.nc'))
    if len(in_fn_list) == 0:
        raise FileNotFoundError(f'No {var} {exp} file found for {source_member} in {in_base2}')
    in_fn = in_fn_list[0]
    # Calculate total area using CDO
    area_earth_fn = cdo.fldsum(input=f'-gridarea {in_fn}')  # CDO temp file
    # Read the single value inside a context manager so that the temp file is closed afterwards
    with xr.open_dataset(area_earth_fn) as area_ds:
        area_earth = float(area_ds['cell_area'].data.flatten()[0])
    # Save to dict
    area_dict[source_member] = area_earth

# Series holding total area for each source-member pair
area_ser = pd.Series(area_dict, dtype='float64', name='area_earth')

# Print output
area_ser

CPU times: user 188 ms, sys: 641 ms, total: 829 ms
Wall time: 9.32 s


ACCESS-CM2_r1i1p1f1          5.100645e+14
ACCESS-ESM1-5_r1i1p1f1       5.100645e+14
CMCC-CM2-SR5_r1i1p1f1        5.100645e+14
CMCC-ESM2_r1i1p1f1           5.100645e+14
CNRM-CM6-1_r1i1p1f2          5.100645e+14
CNRM-ESM2-1_r1i1p1f2         5.100645e+14
CanESM5_r1i1p1f1             5.100645e+14
EC-Earth3-Veg-LR_r1i1p1f1    5.100645e+14
EC-Earth3-Veg_r1i1p1f1       5.100645e+14
EC-Earth3_r1i1p1f1           5.100645e+14
GISS-E2-1-G_r1i1p5f1         5.100645e+14
GISS-E2-1-H_r1i1p1f2         5.100645e+14
IPSL-CM6A-LR_r1i1p1f1        5.100645e+14
MIROC6_r1i1p1f1              5.100645e+14
MPI-ESM1-2-HR_r1i1p1f1       5.100645e+14
MPI-ESM1-2-LR_r1i1p1f1       5.100645e+14
MRI-ESM2-0_r1i1p1f1          5.100645e+14
NorESM2-LM_r1i1p1f1          5.100645e+14
NorESM2-MM_r1i1p1f1          5.100645e+14
UKESM1-0-LL_r1i1p1f2         5.100645e+14
Name: area_earth, dtype: float64

In [12]:
# Save to CSV in the output directory; the index column is labelled 'source_member'
out_fn = out_base.joinpath('area_earth.csv')
area_ser.to_csv(out_fn, index_label='source_member')
# Report only the file name, not the full path
print(f'Written {out_fn.name}')

Written area_earth.csv


In [13]:
! date

Tue Aug 16 11:10:04 +08 2022
