### 12/12/2024 - First steps in the 4xCO2 model analysis
##### What's been done so far:
- Models decided upon
- Grabbed file paths for tas, huss, hurs, ps (daily and monthly)

##### What still needs to happen
- Need to look at the full theory to see if there are additional variables that I need to grab
- Need to get the clipping info from Mike
- Need to sort out whether it would be more efficient to download, process, and then delete each dataset - in that case I'd need a fully integrated program that does everything in one pass
---


In [1]:
import sys
sys.path.append('/home/users/chingosa/Functions/')
import CMIPFuncs as func
import numpy as np
import xarray as xr
import matplotlib.pyplot as plt
import pandas as pd
from scipy import stats 
import os
import cftime
import requests
from tqdm import tqdm
from dask.diagnostics import ProgressBar


# Table of source files for the analysis; later cells read its
# model, period, Var, filename and download_url columns.
filePaths = pd.read_csv('CO2_4x_url_reduced_wMon.csv')



---
# Working on time cropping - with span df


In [6]:
# Reload the path table (same CSV as in the import cell); index_col=None
# keeps a plain integer row index rather than using a CSV column as index.
filePaths = pd.read_csv('CO2_4x_url_reduced_wMon.csv', index_col = None)

def download(url, filename):
    """
    Stream the file at `url` to `filename`, showing a tqdm progress bar.

    Adapted from https://utcdw.physics.utoronto.ca/UTCDW_Guidebook/Chapter3/section3.4_climate_model_output_from_ESGF.html#downloading-data

    The payload is written to ``<filename>.part`` first and only moved to
    `filename` once the whole response has been received. An interrupted
    transfer therefore never leaves a truncated file under the final name,
    which matters because callers skip the download when `filename` exists.

    Parameters
    ----------
    url : str
        Download URL.
    filename : str
        Destination path, used exactly as given (a bare name lands in the
        current working directory).

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status, instead of saving the
        error page as if it were the data file.
    """
    print("Downloading ", filename)
    block_size = 1024
    part_name = f"{filename}.part"

    try:
        # The context manager releases the connection even if streaming fails.
        with requests.get(url, stream=True) as r:
            r.raise_for_status()
            status_code = r.status_code
            total_size = int(r.headers.get('content-length', 0))

            # Count real bytes so the bar reads B/KiB/MiB; total=None (unknown
            # length) makes tqdm show a plain counter instead of a 0-length bar.
            with open(part_name, 'wb') as f, \
                 tqdm(total=total_size or None, unit='B',
                      unit_scale=True, unit_divisor=1024) as bar:
                for data in r.iter_content(block_size):
                    f.write(data)
                    bar.update(len(data))
    except BaseException:
        # Do not leave a half-written temp file behind (also on Ctrl-C).
        if os.path.exists(part_name):
            os.remove(part_name)
        raise

    # A mismatch is only reported, not treated as fatal: content-length can
    # legitimately differ from the bytes written (e.g. compressed transfer).
    if total_size != 0 and os.path.getsize(part_name) != total_size:
        print("Downloaded size does not match expected size!\n",
              "FYI, the status code was ", status_code)

    # Atomic move: `filename` only ever appears once the stream has finished.
    os.replace(part_name, filename)


## This is the final time cropping functionality - 6 lines that took 6 days to figure out
def doTimeCrop(ds, model, period, span_df, filePaths):
    '''
    Keep the 20 calendar years of `ds` that end at the shared stop year, and
    drop repeated timestamps.

    The stop year is the 'stopShared' value of the first `span_df` row whose
    `model` and `period` columns match the arguments; timesteps are kept when
    stopShared - 20 < year <= stopShared. (span_df is built elsewhere - the
    original note points at the 10_12_24 notebook.)

    For daily data this is roughly 7300 timesteps, depending on the calendar.

    `filePaths` is accepted for signature compatibility but is not used here.
    Raises IndexError if no row of `span_df` matches `model` / `period`.
    '''
    row_selector = (span_df.model == model) & (span_df.period == period)
    last_year = int(span_df.loc[row_selector, 'stopShared'].iloc[0])
    first_year_exclusive = last_year - 20

    # Year-based window: works on the `time.year` accessor, so it does not
    # depend on the concrete datetime type of the time axis.
    in_window = ((ds['time.year'] > first_year_exclusive)
                 & (ds['time.year'] <= last_year))
    cropped = ds.sel(time=in_window)

    # Keep only the first occurrence of each timestamp.
    is_repeat = cropped['time'].to_index().duplicated()
    return cropped.sel(time=~is_repeat)



def processModel(model):
    '''
    Preprocess one CMIP6 model (abrupt-4xCO2 and piControl) into small
    per-variable netCDF files.

    Inputs read from disk
    ---------------------
    - normGrid: the GFDL-CM4 `sftlf` field; every variable is interpolated
      onto its grid.
    - 'CO2_4x_url_reduced_wMon.csv': the reduced path table (only the files
      covering the times of interest), filtered here to `model`.
    - 'span_df.csv': temporal coverage per model/period, used by doTimeCrop
      to pick the shared 20-year window.

    For each period and each of tas, huss, hurs, ps
    ------------------------------------------------
    1. Skip the work if '<model>_<period>_<Var>_processed.nc' already exists.
    2. Download any source file that is not on disk yet.
    3. Open the sources, crop to the 20-year window, interpolate onto
       normGrid and keep latitudes -40..40 (the tropics).
    4. For 'ps' (a monthly variable) additionally interpolate onto the
       coordinates of the already-processed tas file, so tas must be
       processed first - the variable order below guarantees that.
    5. Rechunk (it is unclear whether the chunking survives the save) and
       write the result. The file is written under a temporary name and
       renamed only after the write has finished, so an interrupted run
       cannot leave a partial file that step 1 would later treat as done.
    6. Delete the source files, which are much larger than the result.

    Takes roughly 10 minutes per model.

    Parameters
    ----------
    model : str
        Model name as it appears in the `model` column of the path table.
    '''
    # Keep the Dataset handle so the underlying file can be closed at the end.
    gridSource = xr.open_dataset('/badc/cmip6/data/CMIP6/ScenarioMIP/NOAA-GFDL/GFDL-CM4/ssp245/r1i1p1f1/fx/sftlf/gr1/latest/sftlf_fx_GFDL-CM4_ssp245_r1i1p1f1_gr1.nc')
    normGrid = gridSource.sftlf
    filePaths = pd.read_csv('CO2_4x_url_reduced_wMon.csv', index_col = None)
    span_df = pd.read_csv('span_df.csv', index_col=0)

    filePaths = filePaths[filePaths.model == model].reset_index(drop=True)

    try:
        for period in filePaths.period.unique():
            for Var in ['tas', 'huss', 'hurs', 'ps']:

                saveName = f'{model}_{period}_{Var}_processed.nc'
                paths = filePaths[(filePaths['period'] == period) & (filePaths['Var'] == Var)].reset_index(drop = True)
                sourceFiles = paths.filename.tolist()

                if not os.path.exists(saveName): # processed file not created yet
                    for url, filename in zip(paths.download_url, sourceFiles):
                        if not os.path.exists(filename): # fetch only what is missing
                            download(url, filename)

                    # Temporary name keeps the .nc suffix; saveName has no
                    # directory part, so a simple prefix stays alongside it.
                    tmpName = f'partial_{saveName}'
                    opened = []
                    try:
                        # NOTE(review): the path table can list files on more than
                        # one grid label for the same variable (the logged run
                        # fetched both gr1 and gr2 files). They are all
                        # concatenated along time here and doTimeCrop then drops
                        # repeated timestamps - confirm mixing grids is intended.
                        source = xr.open_mfdataset(sourceFiles, combine='nested', concat_dim='time', use_cftime=True)
                        opened.append(source)

                        ds = source.drop_vars(['time_bnds'], errors = 'ignore')
                        ds = doTimeCrop(ds, model, period, span_df, filePaths)
                        ds = ds[Var]
                        ds = ds.interp_like(normGrid, kwargs={"fill_value": "extrapolate"}).sel(lat = slice(-40,40))

                        if Var == 'ps':
                            # Monthly ps is interpolated onto the coordinates of
                            # the processed tas file for the same model/period.
                            ds_tas = xr.open_dataset(f'{model}_{period}_tas_processed.nc', use_cftime=True)
                            opened.append(ds_tas)
                            ds = ds.interp_like(ds_tas, kwargs={"fill_value": "extrapolate"})

                        ds = ds.chunk({'time': -1, 'lat': 5})

                        write_job = ds.to_netcdf(tmpName, compute=False)
                        with ProgressBar():
                            print(f"Writing to {saveName}")
                            write_job.compute()

                        # Publish the result only once it is completely written.
                        os.replace(tmpName, saveName)
                    finally:
                        # Release file handles before the sources are deleted,
                        # and never leave a half-written temp file behind.
                        for handle in opened:
                            handle.close()
                        if os.path.exists(tmpName):
                            os.remove(tmpName)

                # Source files are no longer needed once the processed file exists.
                for k in sourceFiles:
                    if os.path.exists(k): os.remove(k)
    finally:
        gridSource.close()






    

# Scratch selections for interactive work. Only `model` is passed on to
# processModel in the next cell.
Var, model, period = 'ps', 'GFDL-CM4', 'piControl'

# ds = ds_colate(['tas'], model, period, filePaths, span_df)
# ds = ds.chunk({'time' : -1})

In [7]:
# Run the download / crop / regrid / save pipeline for the selected model.
# Files that already exist as '<model>_<period>_<Var>_processed.nc' are skipped.
processModel(model)

Writing to GFDL-CM4_abrupt-4xCO2_tas_processed.nc
[########################################] | 100% Completed | 277.92 s
Downloading  huss_day_GFDL-CM4_abrupt-4xCO2_r1i1p1f1_gr1_01210101-01401231.nc


1.08MKiB [01:58, 9.13kKiB/s]                          


Downloading  huss_day_GFDL-CM4_abrupt-4xCO2_r1i1p1f1_gr1_01410101-01501231.nc


539kKiB [00:50, 10.7kKiB/s]                          


Downloading  huss_day_GFDL-CM4_abrupt-4xCO2_r1i1p1f1_gr2_01210101-01401231.nc


284kKiB [00:23, 12.1kKiB/s]                          


Downloading  huss_day_GFDL-CM4_abrupt-4xCO2_r1i1p1f1_gr2_01410101-01501231.nc


142kKiB [00:08, 16.4kKiB/s]                          


Writing to GFDL-CM4_abrupt-4xCO2_huss_processed.nc
[########################################] | 100% Completed | 273.96 s
Downloading  hurs_day_GFDL-CM4_abrupt-4xCO2_r1i1p1f1_gr1_01210101-01401231.nc


1.01MKiB [01:58, 8.54kKiB/s]                          


Downloading  hurs_day_GFDL-CM4_abrupt-4xCO2_r1i1p1f1_gr1_01410101-01501231.nc


507kKiB [00:42, 11.8kKiB/s]                          


Downloading  hurs_day_GFDL-CM4_abrupt-4xCO2_r1i1p1f1_gr2_01210101-01401231.nc


267kKiB [00:45, 5.82kKiB/s]                          


Downloading  hurs_day_GFDL-CM4_abrupt-4xCO2_r1i1p1f1_gr2_01410101-01501231.nc


134kKiB [00:08, 15.1kKiB/s]                          


Writing to GFDL-CM4_abrupt-4xCO2_hurs_processed.nc
[########################################] | 100% Completed | 288.01 s
Downloading  ps_AERmon_GFDL-CM4_abrupt-4xCO2_r1i1p1f1_gr1_010101-015012.nc


69.3kKiB [00:08, 8.10kKiB/s]                          


Writing to GFDL-CM4_abrupt-4xCO2_ps_processed.nc
[########################################] | 100% Completed | 69.63 s
Downloading  tas_day_GFDL-CM4_piControl_r1i1p1f1_gr1_06110101-06301231.nc


850kKiB [00:08, 105kKiB/s]                           


Downloading  tas_day_GFDL-CM4_piControl_r1i1p1f1_gr1_06310101-06501231.nc


850kKiB [00:07, 114kKiB/s]                           


Downloading  tas_day_GFDL-CM4_piControl_r1i1p1f1_gr2_06110101-06301231.nc


227kKiB [00:02, 102kKiB/s]                           


Downloading  tas_day_GFDL-CM4_piControl_r1i1p1f1_gr2_06310101-06501231.nc


227kKiB [00:02, 96.1kKiB/s]                          


Writing to GFDL-CM4_piControl_tas_processed.nc
[########################################] | 100% Completed | 297.65 s
Downloading  huss_day_GFDL-CM4_piControl_r1i1p1f1_gr1_06110101-06301231.nc


1.09MKiB [00:09, 115kKiB/s]                           


Downloading  huss_day_GFDL-CM4_piControl_r1i1p1f1_gr1_06310101-06501231.nc


1.09MKiB [00:09, 113kKiB/s]                           


Downloading  huss_day_GFDL-CM4_piControl_r1i1p1f1_gr2_06110101-06301231.nc


287kKiB [00:02, 105kKiB/s]                           


Downloading  huss_day_GFDL-CM4_piControl_r1i1p1f1_gr2_06310101-06501231.nc


287kKiB [00:02, 107kKiB/s]                           


Writing to GFDL-CM4_piControl_huss_processed.nc
[########################################] | 100% Completed | 303.90 s
Downloading  hurs_day_GFDL-CM4_piControl_r1i1p1f1_gr1_06110101-06301231.nc


1.02MKiB [00:10, 99.2kKiB/s]                          


Downloading  hurs_day_GFDL-CM4_piControl_r1i1p1f1_gr1_06310101-06501231.nc


1.02MKiB [00:10, 95.2kKiB/s]                          


Downloading  hurs_day_GFDL-CM4_piControl_r1i1p1f1_gr2_06110101-06301231.nc


267kKiB [00:02, 110kKiB/s]                           


Downloading  hurs_day_GFDL-CM4_piControl_r1i1p1f1_gr2_06310101-06501231.nc


267kKiB [00:03, 78.2kKiB/s]                           


Writing to GFDL-CM4_piControl_hurs_processed.nc
[########################################] | 100% Completed | 301.51 s
Downloading  ps_AERmon_GFDL-CM4_piControl_r1i1p1f1_gr1_055101-065012.nc


139kKiB [00:13, 10.2kKiB/s]                          


Writing to GFDL-CM4_piControl_ps_processed.nc
[########################################] | 100% Completed | 68.97 s


In [2]:
# Distinct model names present in the path table.
filePaths['model'].unique()

array(['ACCESS-CM2', 'ACCESS-ESM1-5', 'CESM2', 'CESM2-FV2', 'CESM2-WACCM',
       'CESM2-WACCM-FV2', 'CMCC-CM2-SR5', 'CMCC-ESM2', 'CNRM-CM6-1',
       'CNRM-CM6-1-HR', 'CNRM-ESM2-1', 'CanESM5', 'EC-Earth3',
       'EC-Earth3-CC', 'EC-Earth3-Veg-LR', 'GFDL-CM4', 'HadGEM3-GC31-LL',
       'HadGEM3-GC31-MM', 'IITM-ESM', 'IPSL-CM5A2-INCA', 'KIOST-ESM',
       'MIROC-ES2H', 'MIROC-ES2L', 'MIROC6', 'NorESM2-LM', 'NorESM2-MM',
       'SAM0-UNICON', 'TaiESM1', 'UKESM1-0-LL'], dtype=object)

In [2]:
# Open the four processed ACCESS-CM2 abrupt-4xCO2 files (ps, huss, hurs, tas)
# together as a single dataset.
ds = xr.open_mfdataset(
    [f'ACCESS-CM2_abrupt-4xCO2_{name}_processed.nc'
     for name in ('ps', 'huss', 'hurs', 'tas')],
    use_cftime=True,
)

In [None]:
# Quick-look map of the time-mean tas field.
ds['tas'].mean(dim='time').plot()

<matplotlib.collections.QuadMesh at 0x7f2293ba4390>

In [3]:
# Sanity check: largest tas value anywhere in the processed dataset.
ds['tas'].max().values

array(325.07022, dtype=float32)