In [1]:
import os
from glob import glob
import multiprocessing as mp

import dask
import numpy as np
import pandas as pd
import xarray as xr
import salem

from utils import city_list, gev_metric_ids
import metric_funcs as mf

In [2]:
print(mp.cpu_count())

80


## Calculate metrics 

### Preliminaries

In [3]:
################
#### Paths #####
################
# NOTE: this is run on a different system from other datasets
# Update these for reproduction
from utils import hopper_code_path as project_code_path
from utils import hopper_data_path as project_data_path

tgw_path = "/home/shared/vs498_0001/im3_hyperfacets_tgw/"

tgw_scenarios = os.listdir(tgw_path)

In [4]:
# Worker function must be defined at module level for multiprocessing
def process_file(args):
    file_path, var_id_in, var_id_out, daily_agg_func, log_path = args
    try:
        return mf.tgw_hourly_to_daily(file_path=file_path,
                                     var_id_in=var_id_in,
                                     var_id_out=var_id_out,
                                     agg_func=daily_agg_func,
                                     log_path=log_path)
    except Exception as e:
        print(f"Error processing file {file_path}: {str(e)}")
        return None

# Metric calculation function
def calculate_metric(scenario,
                     var_id_in,
                     var_id_out,
                     daily_agg_func,
                     metric_func,
                     tgw_path=tgw_path):
    # Log path
    log_path = f"{project_code_path}/scripts/logs"
    
    try:
        # Get all hourly files sorted
        file_paths = np.sort(glob(f"{tgw_path}/{scenario}/hourly/*"))
        
        # Prepare arguments for each file
        task_args = [(file_path, var_id_in, var_id_out, daily_agg_func, log_path) 
                    for file_path in file_paths]
        
        # Set number of processes
        num_processes = mp.cpu_count() - 5
        
        # Create a pool of workers and distribute the work
        with mp.Pool(processes=num_processes) as pool:
            # Map the worker function to all tasks
            results = pool.map(process_file, task_args)
            
            # Filter out None results
            ds_daily_all = [ds for ds in results if ds is not None]
        
        # Concatenate all results if we have any
        if ds_daily_all:
            ds_daily_all = xr.concat(ds_daily_all, dim='time')
            
            # Compute annual statistics
            ds_annual = metric_func(ds_daily_all, var_id_out)
            
            return ds_annual
        else:
            print(f"No valid results for scenario {scenario}")
            return None
            
    except Exception as e:
        with open(f"{log_path}/{scenario}_TGW.txt", "w") as f:
            f.write(str(e))
        print(f"Error processing scenario {scenario}: {str(e)}")
        return None

### Calculations

In [5]:
%%time
# max tasmax
var_id_in = 'T2C'
var_id_out = 'tasmax'
daily_agg_func = 'max'
metric_func = mf.calculate_max

# Loop through scenarios
for scenario in tgw_scenarios:
    save_path = f"{project_data_path}/metrics/TGW/max_{var_id_out}_{scenario}.nc"
    if not os.path.exists(save_path):
        # Calculate
        ds_out = calculate_metric(scenario=scenario,
                                  var_id_in=var_id_in,
                                  var_id_out=var_id_out,
                                  daily_agg_func=daily_agg_func,
                                  metric_func=metric_func)
        # Store
        ds_out.to_netcdf(save_path)
        print(f"Done: max_{var_id_out}_{scenario}.nc")
    else:
        print(f"Already done: max_{var_id_out}_{scenario}.nc")

Already done: max_tasmax_rcp85hotter_2060_2099.nc
Already done: max_tasmax_rcp45cooler_2020_2059.nc
Already done: max_tasmax_rcp45cooler_2060_2099.nc
Already done: max_tasmax_rcp85hotter_2020_2059.nc
Already done: max_tasmax_rcp45hotter_2020_2059.nc
Already done: max_tasmax_rcp85cooler_2060_2099.nc
Already done: max_tasmax_rcp85cooler_2020_2059.nc
Already done: max_tasmax_rcp45hotter_2060_2099.nc
Already done: max_tasmax_historical_1980_2019.nc
CPU times: user 1.14 ms, sys: 1.04 ms, total: 2.18 ms
Wall time: 61.3 ms


In [6]:
%%time
# max tasmax
var_id_in = 'T2C'
var_id_out = 'tasmin'
daily_agg_func = 'min'
metric_func = mf.calculate_min

# Loop through scenarios
for scenario in tgw_scenarios:
    save_path = f"{project_data_path}/metrics/TGW/min_{var_id_out}_{scenario}.nc"
    if not os.path.exists(save_path):
        # Calculate
        ds_out = calculate_metric(scenario=scenario,
                                  var_id_in=var_id_in,
                                  var_id_out=var_id_out,
                                  daily_agg_func=daily_agg_func,
                                  metric_func=metric_func)
        # Store
        ds_out.to_netcdf(save_path)
        print(f"Done: min_{var_id_out}_{scenario}.nc")
    else:
        print(f"Already done: min_{var_id_out}_{scenario}.nc")

Already done: min_tasmin_rcp85hotter_2060_2099.nc
Already done: min_tasmin_rcp45cooler_2020_2059.nc
Already done: min_tasmin_rcp45cooler_2060_2099.nc
Already done: min_tasmin_rcp85hotter_2020_2059.nc
Already done: min_tasmin_rcp45hotter_2020_2059.nc
Already done: min_tasmin_rcp85cooler_2060_2099.nc
Already done: min_tasmin_rcp85cooler_2020_2059.nc
Already done: min_tasmin_rcp45hotter_2060_2099.nc
Already done: min_tasmin_historical_1980_2019.nc
CPU times: user 1.54 ms, sys: 816 μs, total: 2.36 ms
Wall time: 34.8 ms


In [None]:
%%time
# max pr
var_id_in = 'PRCP'
var_id_out = 'pr'
daily_agg_func = 'sum'
metric_func = mf.calculate_max

# Loop through scenarios
for scenario in tgw_scenarios:
    save_path = f"{project_data_path}/metrics/TGW/max_{var_id_out}_{scenario}.nc"
    if not os.path.exists(save_path):
        # Calculate
        ds_out = calculate_metric(scenario=scenario,
                                  var_id_in=var_id_in,
                                  var_id_out=var_id_out,
                                  daily_agg_func=daily_agg_func,
                                  metric_func=metric_func)
        # Store
        ds_out.to_netcdf(save_path)
        print(f"Done: max_{var_id_out}_{scenario}.nc")
    else:
        print(f"Already done: max_{var_id_out}_{scenario}.nc")

Already done: max_pr_rcp85hotter_2060_2099.nc
Already done: max_pr_rcp45cooler_2020_2059.nc
Already done: max_pr_rcp45cooler_2060_2099.nc
Already done: max_pr_rcp85hotter_2020_2059.nc
Already done: max_pr_rcp45hotter_2020_2059.nc
Already done: max_pr_rcp85cooler_2060_2099.nc
Already done: max_pr_rcp85cooler_2020_2059.nc
Already done: max_pr_rcp45hotter_2060_2099.nc


In [5]:
%%time
# max cdd
var_id_in = 'T2C'
var_id_out = 'cdd'
daily_agg_func = 'cdd'
metric_func = mf.calculate_max

# Loop through scenarios
for scenario in tgw_scenarios:
    save_path = f"{project_data_path}/metrics/TGW/max_{var_id_out}_{scenario}.nc"
    if not os.path.exists(save_path):
        # Calculate
        ds_out = calculate_metric(scenario=scenario,
                                  var_id_in=var_id_in,
                                  var_id_out=var_id_out,
                                  daily_agg_func=daily_agg_func,
                                  metric_func=metric_func)
        # Store
        ds_out.to_netcdf(save_path)
        print(f"Done: max_{var_id_out}_{scenario}.nc")
    else:
        print(f"Already done: max_{var_id_out}_{scenario}.nc")

Already done: max_cdd_rcp85hotter_2060_2099.nc
Already done: max_cdd_rcp45cooler_2020_2059.nc
Already done: max_cdd_rcp45cooler_2060_2099.nc
Already done: max_cdd_rcp85hotter_2020_2059.nc
Already done: max_cdd_rcp45hotter_2020_2059.nc
Already done: max_cdd_rcp85cooler_2060_2099.nc
Already done: max_cdd_rcp85cooler_2020_2059.nc
Already done: max_cdd_rcp45hotter_2060_2099.nc
Already done: max_cdd_historical_1980_2019.nc
CPU times: user 817 μs, sys: 938 μs, total: 1.76 ms
Wall time: 32.1 ms


In [6]:
%%time
# max hdd
var_id_in = 'T2C'
var_id_out = 'hdd'
daily_agg_func = 'hdd'
metric_func = mf.calculate_max

# Loop through scenarios
for scenario in tgw_scenarios:
    save_path = f"{project_data_path}/metrics/TGW/max_{var_id_out}_{scenario}.nc"
    if not os.path.exists(save_path):
        # Calculate
        ds_out = calculate_metric(scenario=scenario,
                                  var_id_in=var_id_in,
                                  var_id_out=var_id_out,
                                  daily_agg_func=daily_agg_func,
                                  metric_func=metric_func)
        # Store
        ds_out.to_netcdf(save_path)
        print(f"Done: max_{var_id_out}_{scenario}.nc")
    else:
        print(f"Already done: max_{var_id_out}_{scenario}.nc")

Already done: max_hdd_rcp85hotter_2060_2099.nc
Already done: max_hdd_rcp45cooler_2020_2059.nc
Already done: max_hdd_rcp45cooler_2060_2099.nc
Already done: max_hdd_rcp85hotter_2020_2059.nc
Done: max_hdd_rcp45hotter_2020_2059.nc
Done: max_hdd_rcp85cooler_2060_2099.nc
Done: max_hdd_rcp85cooler_2020_2059.nc
Done: max_hdd_rcp45hotter_2060_2099.nc
Done: max_hdd_historical_1980_2019.nc
CPU times: user 1min 48s, sys: 1min 51s, total: 3min 40s
Wall time: 6h 6min 52s
