# Exercise 13: Compute local downscaled CMIP6 ensemble Heat Index

Load needed packages.  Be sure they have been installed using "conda install -c conda-forge packagename".

This Notebook is licensed for free and open consumption under the [Attribution-NonCommercial 4.0 International (CC BY-NC 4.0)](https://creativecommons.org/licenses/by-nc/4.0/) license.

CVS:  $Id: CMIP6-HeatIndex.ipynb,v 25.1 2025/04/16 17:04:53 brikowi Exp $

In [None]:
# Load needed packages
import glob
import matplotlib.pyplot as plt
import urllib.request              # Manage URL's
import numpy as np                 # Basic math and simple arrays
import xarray as xr                # Better & faster multidimensional arrays
import zarr                        # Chunked very fast cloud-based arrays
from sklearn.linear_model import LinearRegression  # Use conda install -c conda-forge scikit-learn

import pandas as pd                # Dataframes, similar to internal spreadsheet
from difflib import get_close_matches
import gcsfs                       # Google cloud filesystem routines
import datetime
import time                        # Get runtime data
startTime = time.time()
import os
import gc

# Define functions to get desired variable from desired model

In [None]:
def getVar (modelList, modelName):
    """Open and fully load the Zarr store of one model.

    Parameters
    ----------
    modelList : pandas.DataFrame
        Catalog rows; must contain the columns 'source_id' and 'zstore'.
    modelName : str
        Exact 'source_id' of the wanted model.

    Returns
    -------
    The dataset opened from the first matching 'zstore' entry, loaded
    into memory.

    Raises
    ------
    IndexError
        If no row of 'modelList' has 'source_id' equal to 'modelName'.
    """
    # A boolean mask (rather than a query string built with an f-string) keeps
    # working when the model name contains quote characters.
    stores = modelList.loc[modelList["source_id"] == modelName, "zstore"]
    if stores.empty:
        # Same exception type as indexing an empty result, but with a message
        # that says which model is missing.
        raise IndexError(f"No zstore entry for model '{modelName}' in the supplied model list")
    zstore = stores.values[0]
    ds = xr.open_zarr(zstore, consolidated = True)
    ds.load()          # Pull all data into memory now (can need a lot of RAM)
    return ds

In [None]:
def heatIndex (T, RH):
    """Return the heat index in degrees Fahrenheit.

    Follows the official Imperial-units procedure at
    https://www.wpc.ncep.noaa.gov/html/heatindex_equation.shtml
    (results can be checked at https://www.wpc.ncep.noaa.gov/html/heatindex.shtml).

    Parameters
    ----------
    T : float
        Air temperature in degrees Celsius (scalar; the branching below
        does not support arrays).
    RH : float
        Relative humidity in percent.

    Returns
    -------
    float
        Heat index in degrees Fahrenheit.
    """
    # Rothfusz regression coefficients
    c1, c2, c3, c4, c5, c6 = -42.379, 2.04901523, 10.14333127, -0.22475541, -6.83783e-3, -5.481717e-2
    c7, c8, c9 = 1.22874e-3, 8.5282e-4, -1.99e-6
    # Convert T to degF
    F = T*9/5. + 32.0

    # Simple Steadman approximation, kept as the result in mild conditions
    HI = 0.5 * (F + 61.0 + (F-68.0)*1.2 + (RH*0.094))

    if (HI+F)/2 > 80.:
        # Hot enough that the full Rothfusz regression is required
        HI = c1 + c2*F + c3*RH + c4*F*RH + c5*F**2 + c6*RH**2 + c7*F**2*RH + c8*F*RH**2 + c9*F**2*RH**2
        if RH < 13.0 and 80. < F < 112.:
            # Very dry air: the NWS procedure SUBTRACTS this adjustment.
            # '** 0.5' is the square root; its argument is positive for 80 < F < 112.
            HI = HI - ((13-RH)/4) * ((17-abs(F-95.))/17) ** 0.5
        elif RH > 85. and 80. < F < 87.:
            # Very humid air: this adjustment is added
            HI = HI + (RH-85)/10 * (87-F)/5

    return HI

In [None]:
# Target point, initially in the DFW Metroplex.
# Longitude is written as 360 minus degrees West.
pointLat, pointLon = 32.7666, 360 - 96.7778

# Get desired models available at Google
Get file list from Google via the given URL, use Pandas (pd) to read the CSV file and save in a *dataframe* "df".  A Pandas dataframe is like a spreadsheet format for Python.

In [None]:
# Read the CSV listing of CMIP6 Zarr stores hosted at Google into the
# dataframe 'df'; the later cells filter this listing with df.query().
df = pd.read_csv('https://storage.googleapis.com/cmip6/cmip6-zarr-consolidated-stores.csv')

## Get list of all available SSP3 7.0 models with *tas*
Find lines in the file-list dataframe "df" that meet the given criteria; the most important are that the variable *tas* is in the file and that it is from a model of experiment *ssp370* (the currently most-likely CO2 emissions scenario).

In [None]:
# Monthly ('Amon') temperature ('tas') entries for the ssp370 experiment,
# restricted to ensemble member r1i1p1f1
df_ssp370TAS = df.query(
    "activity_id=='ScenarioMIP' & table_id == 'Amon' & "
    "variable_id == 'tas' & experiment_id == 'ssp370' & member_id == 'r1i1p1f1'"
)
print(len(df_ssp370TAS), 'forecast files match the search criteria, the first 3 are:')
df_ssp370TAS.head(3)

In [None]:
# Same search as for temperature, but for monthly relative humidity ('hur')
df_ssp370RH = df.query(
    "activity_id=='ScenarioMIP' & table_id == 'Amon' & "
    "variable_id == 'hur' & experiment_id == 'ssp370' & member_id == 'r1i1p1f1'"
)
print(len(df_ssp370RH), 'forecast files match the search criteria, the first 3 are:')
df_ssp370RH.head(3)

## Get list of all historical models with *tas* and *RH* (hur)

In [None]:
# Historical-experiment counterparts of the two forecast searches:
# monthly temperature ('tas') ...
df_historicalTAS = df.query(
    "activity_id == 'CMIP' & table_id == 'Amon' & "
    "variable_id == 'tas' & experiment_id == 'historical' & member_id == 'r1i1p1f1'"
)
# ... and monthly relative humidity ('hur')
df_historicalRH = df.query(
    "activity_id == 'CMIP' & table_id == 'Amon' & "
    "variable_id == 'hur' & experiment_id == 'historical' & member_id == 'r1i1p1f1'"
)
print(len(df_historicalTAS), 'historical files match the TAS search criteria.')
print(len(df_historicalRH), 'historical files match the RH search criteria, the first 3 are:')
df_historicalRH.head(3)

## Make list of models with  historical and future tas and RH
See <a href="https://stackoverflow.com/questions/55898796/how-to-match-keys-of-2-data-frames-and-create-new-df-with-matching-keys">StackOverflow</a> for suggestions.  The current code requires starting with the shortest dataframe (!).

In [None]:
# Keep only models whose 'source_id' appears in BOTH the SSP3-7.0 and the
# historical listings.  The comparison is exact on purpose: a fuzzy match
# would accept sibling models (e.g. 'MPI-ESM1-2-LR' vs 'MPI-ESM1-2-HR') as
# the same model, and the later exact lookup by model name would then fail.
historicalTASmodels = set(df_historicalTAS["source_id"])
TASmatch = [r for r in df_ssp370TAS["source_id"] if r in historicalTASmodels]
hindForeTAS = np.unique(np.array(TASmatch))
historicalRHmodels = set(df_historicalRH["source_id"])
RHmatch = [r for r in df_ssp370RH["source_id"] if r in historicalRHmodels]
hindForeRH = np.unique(np.array(RHmatch))
# Models that have all four data sets (historical and SSP3-7.0, TAS and RH)
modelsWithRH = set(hindForeRH)
hindForeTAS_RH = [r for r in hindForeTAS if r in modelsWithRH]
# print(len(hindForeTAS), "models have both historical and SSP3-7.0 TAS results")
# print(len(hindForeRH), "models have both historical and SSP3-7.0 RH results")
print(len(hindForeTAS_RH), "models have historical and SSP3-7.0 TAS and RH results")
if hindForeTAS_RH:
    # Show one example; clamp the index so a short list does not raise IndexError
    print("e.g. ", hindForeTAS_RH[min(3, len(hindForeTAS_RH) - 1)])

gcs = gcsfs.GCSFileSystem(token='anon')

## Skip statistical downscaling
PRISM does have gridded historical RH, based on "climate-assisted" modeling ([PRISM-Historical](https://prism.oregonstate.edu/historical/)).  It might be questionable for downscaling.

# Compute monthly heat index

Compute the monthly heat index for each hindcast and forecast model, then assemble ensemble statistics from those.  Many of the 29 files require a large amount of RAM during download, so we'll limit ourselves to the 3 fastest-downloading models, listed in *goodSources*.

## Get forecast tas & RH

In [None]:
%%time

# Revised from CMIPP6-HeatIndex.py v. 1.5
# Builds two tables, 'foreTAS' and 'foreRH', with one column per model, and
# rewrites each CSV file after every model so partial results survive a failure.
foreTASfile, foreRHfile = "ssp370forecastTAS.csv", "ssp370forecastRH.csv"
goodSources = ['NorESM2-MM', 'MPI-ESM1-2-LR', 'CanESM5']
# goodSources = ['CanESM5']
for i, model in enumerate(goodSources):
    print(f"Processing model: {model} TAS", end='...', flush=True)

    # ---- Temperature: forecast values at the grid cell nearest the point ----
    ds_TASfore = getVar(df_ssp370TAS, model)
    pointTASfore = ds_TASfore.sel(lat=pointLat, lon=pointLon, method="nearest").tas - 273.15 # Convert to Celsius
    tempDF = pointTASfore.to_dataframe()
    tempDF = tempDF.drop(columns=["lat", "lon", "height"])   # Keep only the data column
    if i > 0:
        # Later models: copy the raw values in, assuming the same time index for all files
        newTAS = tempDF['tas']
        foreTAS[model] = (newTAS.values)
    else:
        # First model starts the table; its single column is named after the model
        foreTAS = tempDF
        foreTAS.columns = [model]
    # Drop the temporary name and reclaim memory
    del tempDF
    gc.collect()

    # Checkpoint the temperature table
    foreTAS.to_csv(foreTASfile)

    # ---- Relative humidity ('hur'): same procedure ----
    # Downloads never finish for some models (e.g. 'EC-Earth3') !?!
    # Others succeed as expected (e.g. 'CanESM5').  Try a timeout?
    # see https://medium.com/@chaoren/how-to-timeout-in-python-726002bf2291
    print("RH", flush=True)
    ds_RHfore = getVar(df_ssp370RH, model)
    pointRHfore = ds_RHfore.sel(lat=pointLat, lon=pointLon, method="nearest").hur
    tempDF = pointRHfore.to_dataframe()
    # N.B. tempDF indices are tuples [cftime, plev], plev is pressure level, 100000.0 Pa is surface
    surfaceRH = tempDF.xs(100000.0, level="plev")
    surfaceRH = surfaceRH.drop(columns=["lat", "lon"])
    # The full multi-level table is no longer needed
    del tempDF
    gc.collect()
    if i > 0:
        # Later models: copy the raw values in, assuming the same time index for all files
        newRH = surfaceRH['hur']
        foreRH[model] = (newRH.values)
    else:
        # First model starts the table; its single column is named after the model
        foreRH = surfaceRH
        foreRH.columns = [model]

    # Checkpoint the humidity table, then release the per-model frame
    foreRH.to_csv(foreRHfile)
    del surfaceRH
    gc.collect()


## Get historical tas & RH

In [None]:
%%time

# Revised from CMIPP6-HeatIndex.py v. 1.5
# Builds two tables, 'hindTAS' and 'hindRH', with one column per model, and
# rewrites each CSV file after every model so partial results survive a failure.
hindTASfile, hindRHfile = "ssp370hindcastTAS.csv", "ssp370hindcastRH.csv"

for i, model in enumerate(goodSources):
    print(f"Processing model: {model} TAS", end='...', flush=True)

    # ---- Temperature: hindcast values at the grid cell nearest the point ----
    ds_TAShind = getVar(df_historicalTAS, model)
    pointTAShind = ds_TAShind.sel(lat=pointLat, lon=pointLon, method="nearest").tas - 273.15 # Convert to Celsius
    tempDF = pointTAShind.to_dataframe()
    tempDF = tempDF.drop(columns=["lat", "lon", "height"])   # Keep only the data column
    if i > 0:
        # Later models: copy the raw values in, assuming the same time index for all files
        newTAS = tempDF['tas']
        hindTAS[model] = (newTAS.values)
    else:
        # First model starts the table; its single column is named after the model
        hindTAS = tempDF
        hindTAS.columns = [model]
    # Drop the temporary name and reclaim memory
    del tempDF
    gc.collect()

    # Checkpoint the temperature table
    hindTAS.to_csv(hindTASfile)

    # ---- Relative humidity ('hur'): same procedure ----
    # Downloads never finish for some models (e.g. 'EC-Earth3') !?!
    # Others succeed as expected (e.g. 'CanESM5').  Try a timeout?
    # see https://medium.com/@chaoren/how-to-timeout-in-python-726002bf2291
    print("RH", flush=True)
    ds_RHhind = getVar(df_historicalRH, model)
    pointRHhind = ds_RHhind.sel(lat=pointLat, lon=pointLon, method="nearest").hur
    tempDF = pointRHhind.to_dataframe()
    # N.B. tempDF indices are tuples [cftime, plev], plev is pressure level, 100000.0 Pa is surface
    surfaceRH = tempDF.xs(100000.0, level="plev")
    surfaceRH = surfaceRH.drop(columns=["lat", "lon"])
    # The full multi-level table is no longer needed
    del tempDF
    gc.collect()
    if i > 0:
        # Later models: copy the raw values in, assuming the same time index for all files
        newRH = surfaceRH['hur']
        hindRH[model] = (newRH.values)
    else:
        # First model starts the table; its single column is named after the model
        hindRH = surfaceRH
        hindRH.columns = [model]

    # Checkpoint the humidity table, then release the per-model frame
    hindRH.to_csv(hindRHfile)
    del surfaceRH
    gc.collect()


# Compute Heat Index

### Make dataframe of combined input data 
historical & forecast TAS & RH

In [None]:
# Stack hindcast rows on top of forecast rows, then add a "mean" column
# holding the across-model average of each row
allTAS = pd.concat([hindTAS, foreTAS])
allTAS = allTAS.assign(mean=allTAS.mean(axis=1, numeric_only=True))
allRH = pd.concat([hindRH, foreRH])
allRH = allRH.assign(mean=allRH.mean(axis=1, numeric_only=True))

# Two-column table of the monthly ensemble means of TAS & RH
meanTAS_RH = pd.DataFrame()
meanTAS_RH["TAS"] = allTAS["mean"].copy()
meanTAS_RH["RH"]  = allRH["mean"].copy()

In [None]:
# Show the table of ensemble-mean TAS and RH as the cell output
meanTAS_RH

### Compute Heat Index for all years
See [IncludeHelp](https://www.includehelp.com/python/pandas-apply-function-with-two-arguments-to-columns.aspx) for applying user function with multiple arguments (heatIndex) to each line of a Pandas DataFrame (meanTAS_RH).  N.B. Heat Index is in units of <sup>o</sup>F

In [None]:
# Evaluate heatIndex() row by row (axis=1) from that row's TAS and RH values
meanTAS_RH['HI'] = meanTAS_RH.apply(
    lambda row: heatIndex(row['TAS'], row['RH']),
    axis=1,
)
meanTAS_RH.head(5)

### Save monthly TAS-RH-HI results to file

In [None]:
# Write the TAS, RH and HI columns (with their index) to a CSV file
meanTAS_RH.to_csv("monthlyTAS-RH-HI.csv")

## Save seasonal HI results

In [None]:
# Seasonal results are computed with xarray's resampling tools, see
# https://stackoverflow.com/questions/64976340/keeping-time-series-while-grouping-by-season-in-xarray
# First convert the heat-index column to an xarray object
allHI = xr.DataArray.from_series(meanTAS_RH['HI'])

# Average over quarters anchored so that one quarter starts in December
seasonalHIarray = allHI.resample(time='QS-DEC').mean("time")

# Custom functions to determine season (using "time.season" gives unintended results for December)
def isJJA(month):
    # True where "month" falls in June through August (NH summer);
    # works element-wise on arrays as well as on single numbers
    notBeforeJune = month >= 6
    notAfterAugust = month <= 8
    return notBeforeJune & notAfterAugust

def isDJF(month):
    # True where "month" is December, January or February (NH winter);
    # works element-wise on arrays as well as on single numbers.
    # For seasonal data labelled by quarter-start month (12, 3, 6, 9) only
    # December matches, so the winter quarter is still the one selected.
    return (month == 12) | (month <= 2)

# Pick the seasonal values whose time label falls in the wanted season and
# write each set to its own CSV file, indexed by year
seasonMonth = seasonalHIarray['time.month']
summerHI = seasonalHIarray.sel(time=isJJA(seasonMonth))
summerTable = pd.DataFrame(summerHI, index=summerHI["time.year"])
summerTable.to_csv("summerHI.csv")
# Winter historical results are absurd for DFW; the file is written anyway
winterHI = seasonalHIarray.sel(time=isDJF(seasonMonth))
winterTable = pd.DataFrame(winterHI, index=winterHI["time.year"])
winterTable.to_csv("winterHI.csv")

In [None]:
# Report the elapsed time since 'startTime' was recorded:
# in minutes for runs of a minute or more, otherwise in seconds
endTime = time.time()
runTime = endTime - startTime
if runTime >= 60.0:
    print(f"Notebook run time = {(runTime/60) :5.2f} min.")
else:
    print(f"Notebook run time = {runTime :5.2f} sec.")