## Step 1: Extracting variables from reanalysis (BARRA-R2) and projections (BARPA & CSIRO-CCAM) for specific locations
Detailed description of BARRA parameters here: https://opus.nci.org.au/spaces/NDP/pages/338002591/BARRA2+Parameter+Descriptions

In [None]:
import warnings
warnings.filterwarnings('ignore')

import xarray as xr
import os
import sys
import dask.distributed
import glob
from dask.distributed import Client
import tempfile
import dask
import numpy as np
import time

# Import utils
sys.path.append('/home/565/dh4185/mn51-dh4185/repos_collab/nesp_bff/')
import utils
# Static metadata dictionaries
from utils import locations, model_dict, cmap_dict

# Import datafinder
sys.path.append('/home/565/dh4185/mn51-dh4185/repos_collab/dataset_finder/')
from dataset_finder import *

In [None]:
# Dask configuration and start-up of the local cluster used by the cells below.
dask_overrides = {
    # Optional chunking overrides, currently switched off:
    #'array.chunk-size': "256 MiB",
    #'array.slicing.split_large_chunks': True,
    'distributed.comm.timeouts.connect': '120s',
    'distributed.comm.timeouts.tcp': '120s',
    'distributed.comm.retry.count': 10,
    'distributed.scheduler.allowed-failures': 20,
    # This should use the new behaviour which helps with memory pile up
    "distributed.scheduler.worker-saturation": 1.1,
}
dask.config.set(dask_overrides)

# The workers get a freshly created temporary directory as their local directory.
worker_scratch = tempfile.mkdtemp()
client = Client(
    n_workers=25,
    threads_per_worker=1,
    local_directory=worker_scratch,
    memory_limit="63000mb",
)
# Last expression of the cell, so the notebook displays the client.
client

In [None]:
# client.close()

In [None]:
##### Settings
# Setting up the metadata for what should be computed
# - Toggle between hourly and daily data
# - Scenarios: historical, ssp126 or ssp370 (Note, BARRA-R2 has only historical data)
# - RCM: BARPA-R, BARRA-R2 or CCAM-v2203-SN
# - Start year: for reference period use 1985, for 2050 use 2035 (Note, BARRA-R2 can't take years post 2022)
# - End year: for reference period use 2014, for 2050 use 2064   (Note, BARRA-R2 can't take years post 2022)
# - Root directory: Computed output is saved here (don't change). The final output directory depends on the RCM chosen
#####

# Switch between hourly (True) and daily (False) frequency
HOURLY_FREQ = True
_scenario = "historical"
_rcm = "CCAM-v2203-SN"
# GCM used by the dedicated hourly CCAM cell further down
_gcm_ccam_1hr = "ACCESS-CM2"
start_y = 1985
end_y = 2014
root_dir = "/g/data/eg3/nesp_bff/step1_raw_data_extraction/"

### List of hourly and daily variables that go in the datafinder check
vars_1hr_list = ['tas','hurs','huss','sfcWind','psl','uas','vas','clt','rsds','rsdsdir']
vars_day_list = ['tasmax','tasmin','huss','psl','sfcWind','sfcWindmax','rsds','rsdsdir']

# Mapping: descriptive variable name -> list with the variable name used in the data files.
# Every key may appear only once; a repeated key in a dict literal silently overrides the
# earlier entry (the former second 'cloud_cover' entry was such a duplicate and is removed).
vars_1hr = {
    'temperature': ['tas'],
    'cloud_cover': ['clt'],
    'humidity_relative': ['hurs'],
    'humidity_specific': ['huss'],
    'wind_speed_10m': ['sfcWind'],
    'pressure': ['psl'],
    'wind_direction_u': ['uas'],
    'wind_direction_v': ['vas'],
    'solar_global': ['rsds'],
    'solar_direct': ['rsdsdir']
}

# Daily counterpart. Both humidity extremes point at 'huss' because no daily max/min of
# specific humidity is provided; the extraction cells derive them from hourly data.
vars_day = {
    'temperature_max': ['tasmax'],
    'temperature_min': ['tasmin'],
    'humidity_specific_max': ['huss'],
    'humidity_specific_min': ['huss'],
    'pressure': ['psl'],
    'wind_speed_10m': ['sfcWind'],
    'wind_speed_10m_max': ['sfcWindmax'],
    'solar_global': ['rsds'],
    'solar_direct': ['rsdsdir']
}

### Check daily data availability across models
Uses the datafinder tool from ACS to find suitable data. Handy to check if all variables, scenarios and years exist for a given RCM at **daily** timescale. More info here: https://github.com/AusClimateService/dataset_finder 

In [None]:
%%time
#< Specify datasets - do this to find out what models have the required variables
# Search criteria: the selected RCM, all three scenarios, daily timescale and the
# requested range of years.
day_query = {
    "rcm": _rcm,
    "scenario": ["historical", "ssp126", "ssp370"],
    "timescale": "day",
    "year": year_range(start_y, end_y),
}
all_data_day = get_datasets("ACS_DS", **day_query)

# Restrict to the daily variables (exact name match), then condense over "scenario".
day_subset = all_data_day.select(var=vars_day_list, exact_match=True)
select_data_day = day_subset.condense("scenario")
# Last expression of the cell: shows the selection in the notebook.
select_data_day

### Check hourly data availability across models
Uses the datafinder tool from ACS to find suitable data. Handy to check if all variables, scenarios and years exist for a given RCM at **hourly** timescale. More info here: https://github.com/AusClimateService/dataset_finder 

In [None]:
%%time
#< Specify datasets - do this to find out what models have the required variables
# Search criteria: the selected RCM, all three scenarios, hourly timescale and the
# requested range of years.
hourly_query = {
    "rcm": _rcm,
    "scenario": ["historical", "ssp126", "ssp370"],
    "timescale": "1hr",
    "year": year_range(start_y, end_y),
}
all_data_1hr = get_datasets("ACS_DS", **hourly_query)

# Restrict to the hourly variables (exact name match), then condense over "scenario".
hourly_subset = all_data_1hr.select(var=vars_1hr_list, exact_match=True)
select_data_1hr = hourly_subset.condense("scenario")
# Last expression of the cell: shows the selection in the notebook.
select_data_1hr

In [None]:
# Compare the daily and the hourly selection against each other in both directions,
# leaving the "timescale" key out of the comparison.
# NOTE(review): presumably this keeps only entries available at both frequencies --
# confirm against the dataset_finder documentation.
ignored_key = "timescale"
matching_day = select_data_day.find_matches(select_data_1hr, exclude_keys=ignored_key)
matching_1hr = select_data_1hr.find_matches(select_data_day, exclude_keys=ignored_key)

## Process extraction of variables
Does the same as the executable script __step1_extracting_variables.py__. Good for debugging or calculating individual files

In [None]:
%%time

# Extracts every requested variable for each location and each GCM of the selected RCM,
# merges them and writes one netCDF file per location/GCM combination.
# Timescale, output directory and variable list depend on the 'Settings' cell at the top.
freq = "1hr" if HOURLY_FREQ else "day"
_vars = vars_1hr if HOURLY_FREQ else vars_day
# Inclusive length of the requested period in years. The rechunked hourly CCAM input is
# stored as interim files per year, so exactly this many files are expected per variable.
n_years = end_y - start_y + 1
# The two daily variables that are not provided at daily timescale and are derived from hourly huss
huss_extremes = ('humidity_specific_max', 'humidity_specific_min')

# Output location
if _rcm == "CCAM-v2203-SN":
    out_dir = f"{root_dir}CSIRO-CCAM/"
else:
    out_dir = f"{root_dir}{_rcm}/"

print(f"---------- {_rcm} for '{freq}' data ----------")
# Corrects location coordinates specified in locations dictionary in utils.py to ensure the selected grid cell from an RCM is on land.
updated_locations = utils.update_locations(xr.open_dataset(model_dict[_rcm]["sftlf"]).sftlf,locations)

# ======================== MAIN LOOP ============================
# Contains a number of print statements to track progress.

# Iterating through the locations in the updated locations dictionary
for loc in updated_locations:
    start_time_loc = time.time()  # Start timer
    print(f"========================== {loc} =======================")
    lat = updated_locations[loc]['Lat']
    lon = updated_locations[loc]['Lon']
    print(f"Lat: {lat}, Lon: {lon}")

    # Iterating through GCMs for the selected RCM
    for _gcm in model_dict[_rcm]["gcms"]:
        print(f"***** {_gcm} *****")
        # Set to True as soon as one variable cannot be processed; no file is written then.
        should_continue = False

        start_time_gcm = time.time()  # Start timer

        rcm_meta = model_dict[_rcm]
        gcm_meta = rcm_meta["gcms"][_gcm]

        # Part of the file name shared by the output file and the rechunked CCAM input files
        name_stem = (
            f"{rcm_meta['grid']}_"
            f"{_gcm}_{_scenario}_"
            f"{gcm_meta['mdl_run']}_"
            f"{rcm_meta['org']}_"
            f"{_rcm}_{gcm_meta['version']}_"
        )
        # Output file name in line with the naming convention
        out_file = f"{out_dir}{loc}_{name_stem}{freq}_{start_y}-{end_y}.nc"

        # Skip creation of file if it exists. Note, a file might exist in an incomplete state on disk due to an interrupted job (e.g.
        # wall time exceeded or keyboard interrupt in the notebook) and needs to be deleted manually before executing the script again.
        if os.path.exists(out_file):
            print(f'File for {loc} exists in output directory.')
            continue

        print(f"Processing: {out_file}.....")

        # The extracted variables for the processed location are collected here and merged into a single dataset later
        var_list = []

        # Iterating through the variables (daily or hourly var_list)
        for _var in _vars:
            start_time_var = time.time()  # Start timer
            print(f"{_var}: {_vars[_var]}")
            # Name of the variable inside the data files
            file_var = _vars[_var][0]

            # Timescale of the input data read for this variable
            _timescale = freq

            # Maximum and minimum specific humidity (hussmax, hussmin) is not provided at daily timescale and needs to be
            # derived from hourly data.
            if _timescale == "day" and _var in huss_extremes:
                print(f"Use hourly data for {_var}.")
                _timescale = "1hr"

            # BARPA-R is very efficiently chunked for our operation which favours little chunking across time
            # and lots of chunking along lat and lon. CCAM is chunked for each time step but not at all
            # along lat and lon dimensions which requires the dataset to be fully loaded. This takes con-
            # siderable more time to process: BARPA-R day: ~2min, hourly: ~5min. CCAM daily: ~25min, hourly: >7.5h hours
            # Hence, CCAM hourly data is preprocessed (rechunked and stored on /scratch/eg3/dh4185/) to interim files per year,
            # only loaded if all files for one GCM are present.
            if _rcm == "CCAM-v2203-SN" and _timescale == "1hr" and _var not in huss_extremes:
                print("Doing hourly CCAM data...")

                # Read preprocessed/rechunked hourly CCAM from /scratch/eg3
                scratch_dir = f"/scratch/eg3/dh4185/rechunked/{_gcm}/{_scenario}/"
                rechunk_files = sorted(glob.glob(f"{scratch_dir}{file_var}_{name_stem}1hr_*.nc"))

                if not rechunk_files:
                    print(f"No files for GCM {_gcm} and {_var} exists. Run "
                          f"rechunk_ccam.sh first.")
                    should_continue = True
                    break
                if len(rechunk_files) != n_years:
                    print(f"Files don't cover {n_years} years from {start_y} to {end_y}. Check files and "
                          f"rerun rechunk_ccam.sh")
                    for file in rechunk_files:
                        print(file)
                    should_continue = True
                    break

                # Read all years and preprocessing lat/lon selection
                da = xr.open_mfdataset(rechunk_files, parallel=True,
                                       preprocess=lambda ds: utils.preprocess_location(ds, lat, lon))[file_var]
                da = da.chunk({'time': -1}).sel(time=slice(str(start_y),str(end_y)))
                # Aligning time coordinates (mix of variables at half hour and full hours)
                da_all = utils.process_time(da,file_var,_timescale)
                var_list.append(da_all.to_dataset())

            # If BARPA-R or BARRA-R2 at daily or hourly timescale, or CCAM at daily time scale selected
            # process all years at once. The CCAM test uses the frequency chosen in the settings
            # (not the per-variable _timescale), so that the daily humidity extremes, which read
            # hourly input, are handled here as well instead of being rejected below.
            elif _rcm in ["BARPA-R","BARRA-R2"] or (_rcm == "CCAM-v2203-SN" and not HOURLY_FREQ):

                # Get file paths using the ACS dataset finder
                all_data = get_datasets("ACS_DS",
                            rcm = _rcm,
                            gcm = _gcm,
                            scenario = _scenario,
                            grid = rcm_meta["grid"],
                            org = rcm_meta["org"],
                            mdl_run = gcm_meta["mdl_run"],
                            ver = gcm_meta["version"],
                            timescale = _timescale,
                            year = year_range(start_y, end_y)).select(var = _vars[_var], exact_match=True)

                # Read all years and preprocessing lat/lon selection
                da = xr.open_mfdataset(all_data.get_files(), parallel=True,
                                        preprocess=lambda ds: utils.preprocess_location(ds, lat, lon))[file_var]
                da = da.chunk({'time': -1})
                # Using hourly huss data to determine daily hussmax and hussmin
                da_temp = utils.process_humidity(da,_var)
                # Aligning time coordinates (mix of variables at half hour and full hours)
                da_all = utils.process_time(da_temp,file_var,_timescale)
                var_list.append(da_all.to_dataset())

            else:
                print("Inappropriate RCM, GCM, timescale requested. Check Settings.")
                # Do not write a file that only contains the variables processed so far.
                should_continue = True
                break

            print(f"Processing time for {_var}: {((time.time() - start_time_var)/60):.2f} minutes\n")

        if should_continue:
            print("Move to the next GCM.")
            continue  # Move to the next GCM

        # Remove unwanted variables
        cleaned_list = [da.drop_vars(["bnds","height","level_height","model_level_number","sigma"], errors="ignore") for da in var_list]

        # Merge all variables per GCM
        da_var = xr.merge(cleaned_list)
        print(da_var)
        # Write to disk
        da_var.to_netcdf(out_file)

        print(f"Processing time for {_rcm}-{_gcm}: {((time.time() - start_time_gcm)/60):.2f} minutes\n")

    print(f"Processing time for {loc}: {((time.time() - start_time_loc)/60):.2f} minutes\n")

print("Done.")

In [None]:
%%time
#####################################################
################ For hourly CCAM data ###############
#####################################################
# Same extraction as the main cell, restricted to a single GCM and to the rechunked hourly
# CCAM files on /scratch. One merged netCDF file is written per location.

# Sets timescale, output directory and variable list depending on input in 'Settings' cell at the top
freq = "1hr" if HOURLY_FREQ else "day"
_vars = vars_1hr if HOURLY_FREQ else vars_day
# GCM as chosen in the 'Settings' cell
_gcm = _gcm_ccam_1hr
# Inclusive length of the requested period in years; one rechunked file per year is expected
n_years = end_y - start_y + 1

# Output location
if _rcm == "CCAM-v2203-SN":
    out_dir = f"{root_dir}CSIRO-CCAM/"
else:
    out_dir = f"{root_dir}{_rcm}/"

print(f"---------- {_rcm} for '{freq}' data ----------")
# Corrects location coordinates specified in locations dictionary in utils.py to ensure the selected grid cell from an RCM is on land.
updated_locations = utils.update_locations(xr.open_dataset(model_dict[_rcm]["sftlf"]).sftlf,locations)

rcm_meta = model_dict[_rcm]
gcm_meta = rcm_meta["gcms"][_gcm]
# Part of the file name shared by the output file and the rechunked input files. The order
# (model run before organisation) follows the rechunked files and the main extraction cell.
name_stem = (
    f"{rcm_meta['grid']}_"
    f"{_gcm}_{_scenario}_"
    f"{gcm_meta['mdl_run']}_"
    f"{rcm_meta['org']}_"
    f"{_rcm}_{gcm_meta['version']}_"
)
# Rechunked hourly CCAM files are read from here
scratch_dir = f"/scratch/eg3/dh4185/rechunked/{_gcm}/{_scenario}/"

# ======================== MAIN LOOP ============================
# Contains a number of print statements to track progress.

# Iterating through the locations in the updated locations dictionary
for loc in updated_locations:
    start_time_loc = time.time()  # Start timer
    print(f"========================== {loc} =======================")
    lat = updated_locations[loc]['Lat']
    lon = updated_locations[loc]['Lon']
    print(f"Lat: {lat}, Lon: {lon}")
    print(f"***** {_gcm} *****")

    start_time_gcm = time.time()  # Start timer

    # Output file name in line with the naming convention
    out_file = f"{out_dir}{loc}_{name_stem}{freq}_{start_y}-{end_y}.nc"

    # Skip creation of file if it exists. Note, a file might exist in an incomplete state on disk due to an interrupted job (e.g.
    # wall time exceeded or keyboard interrupt in the notebook) and needs to be deleted manually before executing the script again.
    if not os.path.exists(out_file):
        print(f"Processing: {out_file}.....")

        # The extracted variables for the processed location are collected here and merged into a single dataset later
        var_list = []
        # Becomes True when the input of at least one variable is incomplete
        missing_input = False

        # Iterating through the variables (daily or hourly var_list)
        for _var in _vars:
            start_time_var = time.time()  # Start timer
            print(f"{_var}: {_vars[_var]}")
            # Name of the variable inside the data files
            file_var = _vars[_var][0]

            # Timescale string handed to the time alignment
            _timescale = freq

            # Read preprocessed/rechunked hourly CCAM from /scratch/eg3
            rechunk_files = sorted(glob.glob(f"{scratch_dir}{file_var}_{name_stem}1hr_*.nc"))

            if len(rechunk_files) != n_years:
                print(f"Files don't cover {n_years} years from {start_y} to {end_y}. Check files and "
                      f"rerun rechunk_ccam.sh")
                for file in rechunk_files:
                    print(file)
                # Keep looping so every incomplete variable gets reported, but remember
                # that no output file may be written for this location.
                missing_input = True
                continue

            # Read all years and preprocessing lat/lon selection
            da = xr.open_mfdataset(rechunk_files, parallel=True,
                                   preprocess=lambda ds: utils.preprocess_location(ds, lat, lon))[file_var]
            da = da.chunk({'time': -1}).sel(time=slice(str(start_y),str(end_y)))
            # Aligning time coordinates (mix of variables at half hour and full hours).
            # The file variable name is passed, as in the main extraction cell.
            da_all = utils.process_time(da,file_var,_timescale)
            var_list.append(da_all.to_dataset())

            print(f"Processing time for {_var}: {((time.time() - start_time_var)/60):.2f} minutes\n")

        if missing_input:
            # Writing now would leave a partial (or empty) file behind that blocks later reruns.
            print(f"Skipping {out_file}: input files are incomplete.")
        else:
            # Remove unwanted variables
            cleaned_list = [da.drop_vars(["bnds","height","level_height","model_level_number","sigma"], errors="ignore") for da in var_list]

            # Merge all variables per GCM
            da_var = xr.merge(cleaned_list)
            print(da_var)
            # Write to disk
            da_var.to_netcdf(out_file)

            print(f"Processing time for {_rcm}-{_gcm}: {((time.time() - start_time_gcm)/60):.2f} minutes\n")

    else:
        print(f'File for {loc} exists in output directory.')

    print(f"Processing time for {loc}: {((time.time() - start_time_loc)/60):.2f} minutes\n")

print("Done.")

In [None]:
%%time
# Debug cell: opens the rechunked hourly ACCESS-CM2 files of every hourly variable at one
# fixed test point (lat -15, lon 140), aligns the time axis and collects the results in `lst`.
rechunk_dir = f"/scratch/eg3/dh4185/rechunked/ACCESS-CM2/{_scenario}/"
# Fixed part of the rechunked file names that follows the variable name
file_suffix = f"_AUS-10i_ACCESS-CM2_{_scenario}_r4i1p1f1_CSIRO_CCAM-v2203-SN_v1-r1_1hr_*.nc"


def _select_test_point(ds):
    # Lat/lon selection applied to every file while it is opened
    return utils.preprocess_location(ds, -15, 140)


lst = []
for Var in vars_1hr_list:
    print(Var)
    search_pattern = f"{rechunk_dir}{Var}{file_suffix}"
    rechunk_files = sorted(glob.glob(search_pattern))
    # print(len(rechunk_files))
    # Read all years with the lat/lon selection as preprocessing step, then use one chunk along time
    da = xr.open_mfdataset(rechunk_files, parallel=True, preprocess=_select_test_point)[Var]
    da = da.chunk({'time': -1})
    # print(da)
    da_time = utils.process_time(da, Var, "1hr")
    print(da_time)

    as_dataset = da_time.to_dataset()
    lst.append(as_dataset)

In [None]:
# Old script with processing CCAM hourly data one file at a time
%%time
#####################################################
######## For everything NOT hourly CCAM data ########
#####################################################

# Legacy extraction loop: for every location and GCM the requested variables are extracted,
# merged and written to one netCDF file. Hourly CCAM data is first written to temporary
# files (one per input file) which are removed once the final file is on disk.
# Sets timescale, output directory and variable list depending on input in 'Settings' cell at the top
freq = "1hr" if HOURLY_FREQ else "day"
_vars = vars_1hr if HOURLY_FREQ else vars_day
# Output location
if _rcm == "CCAM-v2203-SN":
    out_dir = f"{root_dir}CSIRO-CCAM/"
else:
    out_dir = f"{root_dir}{_rcm}/"

print(f"---------- {_rcm} for '{freq}' data ----------")
# Corrects location coordinates specified in locations dictionary in utils.py to ensure the selected grid cell from an RCM is on land.  
updated_locations = utils.update_locations(xr.open_dataset(model_dict[_rcm]["sftlf"]).sftlf,locations)

# ======================== MAIN LOOP ============================
# Contains a number of print statements to track progress.

# Iterating through the locations in the updated locations dictionary
for loc in updated_locations:
    start_time_loc = time.time()  # Start timer
    print(f"========================== {loc} =======================")
    lat = updated_locations[loc]['Lat']
    lon = updated_locations[loc]['Lon']
    print(f"Lat: {lat}, Lon: {lon}")

    # Iterating through GCMs for the selected RCM
    for _gcm in model_dict[_rcm]["gcms"]:
        print(f"***** {_gcm} *****")
        
        # File list to store temporary files if CCAM and hourly time scale are selected
        files_gcm_list = []
        start_time_gcm = time.time()  # Start timer

        # Specifying output file name in line with naming convention
        out_file = (
            f"{out_dir}{loc}_"
            f"{model_dict[_rcm]['grid']}_"
            f"{_gcm}_{_scenario}_"
            f"{model_dict[_rcm]['gcms'][_gcm]['mdl_run']}_"
            f"{model_dict[_rcm]['org']}_"
            f"{_rcm}_{model_dict[_rcm]['gcms'][_gcm]['version']}_"
            f"{freq}_{start_y}-{end_y}.nc"
        )

        # Skip creation of file if it exists. Note, a file might exist in an incomplete state on disk due to an interrupted job (e.g.
        # wall time exceeded or keyboard interrupt in the notebook) and needs to be deleted manually before executing the script again.
        if not os.path.exists(out_file):
            print(f"Processing: {out_file}.....")

            # Empty var list where the extracted variables for the processed location are stored to merge them into a single dataset later
            var_list = []

            # Iterating through the variables (daily or hourly var_list)
            for _var in _vars:
                start_time_var = time.time()  # Start timer
                print(f"{_var}: {_vars[_var]}")

                # Convert boolean time specifier into string 
                _timescale = "1hr" if HOURLY_FREQ else "day"

                # Maximum and minimum specific humidity (hussmax, hussmin) is not provided at daily timescale and needs to be 
                # derived from hourly data.
                if _timescale == "day" and (_var == 'humidity_specific_max' or _var == 'humidity_specific_min'):
                    print(f"Use hourly data for {_var}.")
                    _timescale = "1hr"

                # Get file paths using the ACS dataset finder
                # NOTE(review): in_dir is assigned but not used anywhere in this cell.
                in_dir = model_dict[_rcm]["root_dir"]
                all_data = get_datasets("ACS_DS",
                                rcm = _rcm,
                                gcm = _gcm,
                                scenario = _scenario,
                                grid = model_dict[_rcm]["grid"],
                                org = model_dict[_rcm]["org"],
                                mdl_run = model_dict[_rcm]["gcms"][_gcm]["mdl_run"],
                                ver = model_dict[_rcm]["gcms"][_gcm]["version"],
                                timescale = _timescale,
                                year = year_range(start_y, end_y)).select(var = _vars[_var], exact_match=True)
                
                # BARPA-R is very efficiently chunked for our operation which favours little chunking across time
                # and lots of chunking along lat and lon. CCAM is chunked for each time step but not at all
                # along lat and lon dimensions which requires the dataset to be fully loaded. This takes con-
                # siderable more time to process: BARPA-R day: ~2min, hourly: ~5min. CCAM daily: ~25min, hourly: >7.5h hours
                # Hence, CCAM hourly data is preprocessed to interim files per year, and then loaded and concatenated.
                if _rcm == "CCAM-v2203-SN" and _timescale == "1hr" and _var not in ['humidity_specific_max', 'humidity_specific_min']:
                    print("Creating temporary files for variables.")
                    start_time_CCAM_1hr = time.time()
                    files = sorted(all_data.get_files())
                    temp_dir = "/g/data/eg3/nesp_bff/step1_raw_data_extraction/CSIRO-CCAM/temp/"

                    # Temp files of this location/variable: file name of the first input file without
                    # its last 20 characters, followed by a wildcard.
                    # NOTE(review): files[0] raises an IndexError when the dataset finder returns no files.
                    pattern = f"{temp_dir}{loc}_{files[0].split('/')[-1][:-20]}*.nc"
                    matching_files = sorted(glob.glob(pattern))
                            
                    if matching_files:
                        # Pick the temp file with the latest modification time and delete it, as it
                        # may stem from an interrupted run (deletion is best effort)
                        most_recent_file = max(matching_files, key=os.path.getmtime)
                        print(f"Deleting most recent temp file for safety: {most_recent_file}")
                        try:
                            os.remove(most_recent_file)
                        except Exception as e:
                            print(f"Could not delete {most_recent_file}: {e}")
    
                    # Temp file names of the current variable, in the order of the sorted input files
                    file_list = []
                    for file in files:
                        print("Doing time period: ",file.split("_")[-1][:-3])
                        out_file_temp = f"{temp_dir}{loc}_{file.split('/')[-1]}"

                        # Validate if the file exists and is readable. If not, delete them and 
                        # recompute. Needed if previous job didn't finish because it hit the walltime and 
                        # an unfinished file is sitting in the directory.
                        # Check if file exists
                        if os.path.exists(out_file_temp):
                            try:
                                # Try to open the file to verify it's valid
                                test = xr.open_dataset(out_file_temp, engine="netcdf4")
                                test.close()
                                print(f"Valid temp file already exists: {out_file_temp}")
                            except (OSError, KeyError, ValueError, RuntimeError) as e:
                                print(f"Warning: {out_file_temp} exists but could not be opened (possibly corrupted). Deleting and recreating.")
                                os.remove(out_file_temp)
                            
                        # Create file if it doesn't already exist (or was just deleted as unreadable)
                        if not os.path.exists(out_file_temp):
                            da = xr.open_dataset(file, chunks={'time':-1}).sel(lat=lat,
                                                                                   lon=lon,
                                                                                   method="nearest")[_vars[_var][0]]
                            # Aligning time coordinates (mix of variables at half hour and full hours)
                            # NOTE(review): the whole list _vars[_var] is passed here, whereas the all-years
                            # branch below passes _var -- confirm which form utils.process_time expects.
                            da_temp = utils.process_time(da,_vars[_var],_timescale)
                            ds = da_temp.to_dataset()

                            # Write temporary file for var and year
                            ds.to_netcdf(out_file_temp)
                            if hasattr(da, 'close'):
                                da.close()

                        # Collecting yearly file names per variable
                        file_list.append(out_file_temp)
                        # Collecting all files names for a GCM (all variables) to collectively delete them from
                        # temp directory when final file is written
                        files_gcm_list.append(out_file_temp)
                        
                    # Open all years for each variable and concatenating them along time
                    da_all = xr.open_mfdataset(file_list, combine='nested',
                                                concat_dim='time', parallel=True)
                    print(da_all)
                    # Append variable dataset list
                    # NOTE(review): the dataset is closed right after being queued for the merge --
                    # confirm its lazily loaded data is still readable when da_var is written below.
                    var_list.append(da_all)
                    da_all.close()
                    print(f"Processing time for {_rcm}-{_gcm} {_var}: {((time.time() - start_time_CCAM_1hr)/60):.2f} minutes\n")

                # If BARPA-R or BARRA-R2 at daily or hourly timescale, or CCAM at daily time scale selected
                # process all years at once.
                else:
                    # Read all years and preprocessing lat/lon selection
                    da = xr.open_mfdataset(all_data.get_files(), parallel=True,
                                            preprocess=lambda ds: utils.preprocess_location(ds, lat, lon))[_vars[_var][0]]
                    da = da.chunk({'time': -1})
                    # Using hourly huss data to determine daily hussmax and hussmin
                    da_temp = utils.process_humidity(da,_var)
                    # Aligning time coordinates (mix of variables at half hour and full hours)
                    da_all = utils.process_time(da_temp,_var,_timescale)
                    var_list.append(da_all.to_dataset())
                                
                print(f"Processing time for {_var}: {((time.time() - start_time_var)/60):.2f} minutes\n")

            # Remove unwanted variables
            cleaned_list = [da.drop_vars(["bnds","height","level_height","model_level_number","sigma"], errors="ignore") for da in var_list]

            # Merge all variables per GCM
            da_var = xr.merge(cleaned_list)
            print(da_var)
            # Write to disk
            da_var.to_netcdf(out_file)

            # Remove temporary files from disk (only reached after the final file has been written)
            for f in files_gcm_list:
                os.remove(f)
            print(f"Processing time for {_rcm}-{_gcm}: {((time.time() - start_time_gcm)/60):.2f} minutes\n")

        else:
            print(f'File for {loc} exists in output directory.')

    print(f"Processing time for {loc}: {((time.time() - start_time_loc)/60):.2f} minutes\n")
    
print("Done.")