# Urban Heat Index

By Cascade Tuholske 2020.01.21

This notebook takes areal-averaged CHIRTS Tmax for each GHS-UCDB city and down-scaled MERRA-2 humidity data, and calculates the heat index for each city with a Tmax >80F.

[NOAA Heat Index Equation](https://www.wpc.ncep.noaa.gov/html/heatindex_equation.shtml)

**THINGS I HAVE Coded**
- C to F function
- Rothfusz regression and adjustments
- Steadman's

**NOTES**
At first I thought the heat index values for the hottest areas were insane (> 140F), but I spot checked the results and the [NOAA Heat Index Table](https://www.kjrh.com/weather/weather-blog-what-exaclty-is-the-heat-index) simply reds out values where Tmax >40C and RH > 50%. So I guess we are on track ...

#### Dependencies

In [1]:
import pandas as pd
import numpy as np
import xarray as xr
from random import random
from itertools import groupby
from operator import itemgetter
import geopandas as gpd 
import glob
from statistics import mean
import julian
import math

## Functions

In [2]:
def C_to_F(Tmax_C):
    """Convert a temperature (scalar or array-like) from Celsius to Fahrenheit."""
    return Tmax_C * (9/5) + 32

In [20]:
def F_to_C(Tmax_F):
    """Convert a temperature (scalar or array-like) from Fahrenheit to Celsius."""
    offset_F = Tmax_F - 32
    return offset_F * (5/9)

In [23]:
def csv_to_xr(file_in, time_dim, space_dim):
    """Load a per-city daily csv and return it as a 2-D xarray DataArray.

    The csv is read with pandas; the column named by ``space_dim`` supplies the
    row labels and every column from the fourth onward is kept as data. Rows
    containing any missing value are dropped. The shape of the raw table and
    the number of rows kept are printed.

    Args:
        file_in = file name and path of the csv
        time_dim = name given to the column (time) dimension, e.g. 'date'
        space_dim = name of the ID column in the csv (e.g. ID_HDC_G0); also
                    used as the name of the row dimension

    Returns a DataArray with dims (space_dim, time_dim)
    """

    raw = pd.read_csv(file_in)
    print(raw.shape)

    # Keep only the data columns and label the rows with the city IDs
    ids = raw[space_dim]
    values = raw.iloc[:, 3:]
    values.index = ids

    # Cities with any missing day are removed entirely
    complete = values.dropna()
    print(len(complete))

    return xr.DataArray(
        complete.to_numpy(),
        coords=[complete.index, complete.columns],
        dims=[space_dim, time_dim],
    )

In [36]:
def heatindex(Tmax, RH, unit_in, unit_out):

    """Make Heat Index from 2m air temperature and relative humidity following
    NOAA's guidelines: https://www.wpc.ncep.noaa.gov/html/heatindex_equation.shtml.

    Steadman's simple formula is computed first. Where the mean of that value
    and Tmax is below 80F, the Steadman value is returned as the heat index.
    Elsewhere the Rothfusz regression is used, with NOAA's low-humidity
    (subtracted) and high-humidity (added) adjustments applied inside their
    respective temperature / humidity windows.

    It is assumed that Tmax and RH are geographically and temporally aligned
    so they can be combined element-wise.

    Args:
        Tmax = x-array of temperatures
        RH = x-array of relative humidity, in percent
        unit_in = 'C' converts Tmax from C to F before the heat index is
                  applied; any other value leaves Tmax as it is (i.e. F)
        unit_out = 'C' converts the result to C; any other value leaves the
                   result in F

    Returns HI, an array shaped like the inputs
    """

    # Make all data as float
    Tmax = Tmax.astype('float')
    RH = RH.astype('float')

    # 1 convert C to F if needed (all formulas below are in Fahrenheit)
    if unit_in == 'C':
        Tmax = C_to_F(Tmax)

    # 2 Steadman's simple formula
    steadman = 0.5 * (Tmax + 61.0 + ((Tmax - 68.0) * 1.2) + (RH * 0.094))

    # 3 Rothfusz is used where the Steadman/Tmax average is 80F or higher.
    # '>=' so a value of exactly 80 is not left without any formula.
    use_roth = (steadman + Tmax) / 2 >= 80
    roth = (-42.379 + 2.04901523*Tmax + 10.14333127*RH - .22475541*Tmax*RH
            - .00683783*Tmax*Tmax - .05481717*RH*RH + .00122874*Tmax*Tmax*RH
            + .00085282*Tmax*RH*RH - .00000199*Tmax*Tmax*RH*RH)

    # 4 Low-humidity adjustment (subtracted). Restricted to Rothfusz cells so
    # it is never applied on top of a Steadman value.
    use_adj1 = use_roth & (RH < 13) & (Tmax > 80) & (Tmax < 112)
    # The radicand is only non-negative inside the window; clamp it so cells
    # outside the window do not produce sqrt-of-negative warnings.
    adj1 = ((13 - RH) / 4) * np.sqrt(np.maximum(17 - abs(Tmax - 95.), 0) / 17)
    adj1 = adj1.where(use_adj1, 0.0)

    # 5 High-humidity adjustment (added)
    use_adj2 = use_roth & (RH > 85) & (Tmax > 80) & (Tmax < 87)
    adj2 = ((RH - 85) / 10) * ((87 - Tmax) / 5)
    adj2 = adj2.where(use_adj2, 0.0)

    # 6 Adjusted Rothfusz where it applies, Steadman everywhere else
    HI = (roth - adj1 + adj2).where(use_roth, steadman)

    # Convert HI to C if desired
    if unit_out == 'C':
        HI = F_to_C(HI)

    return HI

In [61]:
def apply_heatindex(DIR_Tmax, DIR_RH, DIR_HI, unit_in, unit_out):
    """Function applies NOAA's heatindex to two paired directories w/ CSVs of
    temperature and relative humidity, one pair of files per year.

    Files are paired by the year embedded in their names
    ('GHS-Tmax-DAILY_<year>.csv' and 'GHS-Tmax-RH_<year>.csv'). A year that is
    present in only one directory is reported and skipped, so Tmax from one
    year is never combined with RH from another. One
    'GHS-HI-DAILY_<year>.csv' is written to DIR_HI for each paired year.

    Args:
        DIR_Tmax = the directory where Tmax .csv files are stored
        DIR_RH = the directory where RH .csv files are stored
        DIR_HI = the directory where HI files will be written
        unit_in = temp unit for Tmax (C or F)
        unit_out = desired temp unit for HI (C or F) for the output
    """
    import os  # local import: keeps this cell runnable on its own

    def _files_by_year(directory, prefix):
        "Map the year found after prefix in each csv name to that csv's path"
        return {fn.split(prefix)[1].split('.csv')[0]: fn
                for fn in glob.glob(os.path.join(directory, '*.csv'))}

    Tmax_by_year = _files_by_year(DIR_Tmax, 'GHS-Tmax-DAILY_')
    RH_by_year = _files_by_year(DIR_RH, 'GHS-Tmax-RH_')

    # Report years that cannot be paired instead of silently shifting the pairing
    unpaired = sorted(set(Tmax_by_year) ^ set(RH_by_year))
    if unpaired:
        print('Skipping years without both a Tmax and an RH file: ', unpaired)

    for year in sorted(set(Tmax_by_year) & set(RH_by_year)):

        print('Tmax year is ', year)
        print('RH year is ', year)

        # Read csv as x-array
        Tmax_xr = csv_to_xr(Tmax_by_year[year], time_dim = 'date', space_dim = 'ID_HDC_G0')
        RH_xr = csv_to_xr(RH_by_year[year], time_dim = 'date', space_dim = 'ID_HDC_G0')

        # Make heat index
        hi = heatindex(Tmax_xr, RH_xr, unit_in = unit_in, unit_out = unit_out)

        # write to csv
        df = hi.to_pandas()
        df_out_nm = 'GHS-HI-DAILY_'+year+'.csv'
        df.to_csv(os.path.join(DIR_HI, df_out_nm))
        print(year, ' done \n')

    print('ALL DONE!')

# Run it

In [62]:
DIR_Tmax = '/home/cascade/projects/UrbanHeat/data/interim/CHIRTS-GHS-DAILY-Tmax/'
DIR_RH = '/home/cascade/projects/UrbanHeat/data/interim/CHIRTS-GHS-DAILY-RH/'
DIR_HI = '/home/cascade/projects/UrbanHeat/data/interim/CHIRTS-GHS-DAILY-HI/'

# FIRE HI
apply_heatindex(DIR_Tmax, DIR_RH, DIR_HI, unit_in = 'C', unit_out = 'C')

Tmax year is  1983
RH year is  1983
(13135, 368)
13067
(13135, 368)
13067
1983  done 

Tmax year is  1984
RH year is  1984
(13135, 369)
13067
(13135, 369)
13067
1984  done 

Tmax year is  1985
RH year is  1985
(13135, 368)
13067
(13135, 368)
13067
1985  done 

Tmax year is  1986
RH year is  1986
(13135, 368)
13067
(13135, 368)
13067
1986  done 

Tmax year is  1987
RH year is  1987
(13135, 368)
13067
(13135, 368)
13067
1987  done 

Tmax year is  1988
RH year is  1988
(13135, 369)
13067
(13135, 369)
13067
1988  done 

Tmax year is  1989
RH year is  1989
(13135, 368)
13067
(13135, 368)
13067
1989  done 

Tmax year is  1990
RH year is  1990
(13135, 368)
13067
(13135, 368)
13067
1990  done 

Tmax year is  1991
RH year is  1991
(13135, 368)
13067
(13135, 368)
13067
1991  done 

Tmax year is  1992
RH year is  1992
(13135, 369)
13067
(13135, 369)
13067
1992  done 

Tmax year is  1993
RH year is  1993
(13135, 368)
13067
(13135, 368)
13067
1993  done 

Tmax year is  1994
RH year is  1994
(13135,

# Test

In [34]:
# Test one input per branch of heatindex (Steadman, Roth adj. 1, Roth adj. 2, plain Roth).
# NOTE: this cell unpacks five return values, which needs the debugging variant of
# heatindex that returns (STEADMAN, ADJ1_ROTH, ADJ2_ROTH, ROTH, HI). With a heatindex
# that returns only HI, the unpacking below fails.
temp = xr.DataArray([70, 100, 85, 100])
rh = xr.DataArray([5, 10, 90, 20])
a,b,c,d, e = heatindex(temp, rh, unit_in = 'F', unit_out = 'F')
print('Stead ', a)
print('Adj1', b)
print('Adj2', c)
print('Roth', d)
print('All HI values', e)

Stead  <xarray.DataArray (dim_0: 4)>
array([66.935,  0.   ,  0.   ,  0.   ])
Dimensions without coordinates: dim_0
Adj1 <xarray.DataArray (dim_0: 4)>
array([ 0.      , 94.122483,  0.      ,  0.      ])
Dimensions without coordinates: dim_0
Adj2 <xarray.DataArray (dim_0: 4)>
array([  0.      ,   0.      , 101.780804,   0.      ])
Dimensions without coordinates: dim_0
Roth <xarray.DataArray (dim_0: 4)>
array([ 0.     ,  0.     ,  0.     , 97.47396])
Dimensions without coordinates: dim_0
All HI values <xarray.DataArray (dim_0: 4)>
array([ 66.935   ,  94.122483, 101.780804,  97.47396 ])
Dimensions without coordinates: dim_0


In [35]:
# Test with actual data from HI chart : https://www.weather.gov/safety/heat-index
# Results fit the table
# NOTE: like the previous test cell, this unpacks five return values and so needs the
# debugging variant of heatindex that returns the component arrays as well as HI.
# a, b, c, d are whole component arrays (not per-temperature results), so the
# rh/temp values printed next to them are only labels; compare against `e`.

temp = xr.DataArray([80, 82, 94, 104])
rh = xr.DataArray([40, 40, 40, 40])
a,b,c,d, e = heatindex(temp, rh, unit_in = 'F', unit_out = 'F')
print(rh[0], temp[0], a)
print(rh[0], temp[1], b)
print(rh[0], temp[2], c)
print(rh[0], temp[3], d)
print('All HI values', e)

<xarray.DataArray ()>
array(40) <xarray.DataArray ()>
array(80) <xarray.DataArray (dim_0: 4)>
array([79.58,  0.  ,  0.  ,  0.  ])
Dimensions without coordinates: dim_0
<xarray.DataArray ()>
array(40) <xarray.DataArray ()>
array(82) <xarray.DataArray (dim_0: 4)>
array([0., 0., 0., 0.])
Dimensions without coordinates: dim_0
<xarray.DataArray ()>
array(40) <xarray.DataArray ()>
array(94) <xarray.DataArray (dim_0: 4)>
array([0., 0., 0., 0.])
Dimensions without coordinates: dim_0
<xarray.DataArray ()>
array(40) <xarray.DataArray ()>
array(104) <xarray.DataArray (dim_0: 4)>
array([  0.      ,  81.453392,  97.170973, 118.877065])
Dimensions without coordinates: dim_0
All HI values <xarray.DataArray (dim_0: 4)>
array([ 79.58    ,  81.453392,  97.170973, 118.877065])
Dimensions without coordinates: dim_0


# Test With CSVs

In [37]:
DIR_RH = '/home/cascade/projects/UrbanHeat/data/interim/CHIRTS-GHS-DAILY-RH/'
DIR_Tmax = '/home/cascade/projects/UrbanHeat/data/interim/CHIRTS-GHS-DAILY-Tmax/'
DIR_HI = '/home/cascade/projects/UrbanHeat/data/interim/CHIRTS-GHS-DAILY-HI/'

In [38]:
FN_RH = 'GHS-Tmax-RH_1984.csv'
FN_Tmax = 'GHS-Tmax-DAILY_1984.csv'

In [39]:
RH = csv_to_xr(DIR_RH+FN_RH, space_dim = 'ID_HDC_G0', time_dim = 'date')
Tmax = csv_to_xr(DIR_Tmax+FN_Tmax, space_dim = 'ID_HDC_G0', time_dim = 'date')

(13135, 369)
13067
(13135, 369)
13067


In [40]:
test = heatindex(Tmax, RH, unit_in = 'C', unit_out = 'C')

In [41]:
test_pd = test.to_pandas()

In [42]:
test_pd['1984.07.01'].sort_values(ascending = False).head(50)

ID_HDC_G0
11220    69.308345
11182    68.724773
10101    61.483578
5348     57.905744
6284     57.433921
5530     57.274213
6273     57.244881
6286     57.195767
10099    56.981850
11228    56.764815
6320     56.699861
5523     56.672009
6274     56.379129
11199    56.377981
6308     56.355506
5543     56.245587
11284    56.120548
6306     56.120305
6318     56.069588
6311     55.826775
5548     55.779169
6215     55.771830
5493     55.728903
6314     55.713507
6328     55.711013
6333     55.661648
6325     55.623955
6278     55.591737
6253     55.521060
6223     55.458858
5540     55.421757
5509     55.397743
11162    55.327412
5506     55.325984
11104    55.294701
11131    55.290137
6194     55.211096
6269     55.175960
6281     54.964046
5538     54.913131
11167    54.903616
6251     54.893903
11311    54.855647
6257     54.768511
11255    54.760381
11294    54.734018
11202    54.719885
6232     54.712800
11280    54.613163
6247     54.572206
Name: 1984.07.01, dtype: float64

# Build Dir Walk

In [44]:
# Get files from both RH and temp

DIR_Tmax = '/home/cascade/projects/UrbanHeat/data/interim/CHIRTS-GHS-DAILY-Tmax/'
DIR_RH = '/home/cascade/projects/UrbanHeat/data/interim/CHIRTS-GHS-DAILY-RH/'
DIR_HI = '/home/cascade/projects/UrbanHeat/data/interim/CHIRTS-GHS-DAILY-HI/'

print(DIR_Tmax)
print(DIR_RH) 

Tmax_fn_list = glob.glob(DIR_Tmax+'*.csv')
RH_fn_list = glob.glob(DIR_RH+'*.csv')

for Tmax_fn, RH_fn in zip(sorted(Tmax_fn_list),sorted(RH_fn_list)):
    
    # Check the years RH and Tmax 
    Tmax_year = Tmax_fn.split('GHS-Tmax-DAILY_')[1].split('.csv')[0]
    print('Tmax year is ',Tmax_year)
    RH_year = RH_fn.split('GHS-Tmax-RH_')[1].split('.csv')[0]
    print('RH year is ', RH_year)
    
    # Read csv as x-array
    Tmax_xr = csv_to_xr(Tmax_fn, time_dim = 'date', space_dim = 'ID_HDC_G0')
    RH_xr = csv_to_xr(RH_fn, time_dim = 'date', space_dim = 'ID_HDC_G0')
    
    # Make heat index
    hi = heatindex(Tmax_xr, RH_xr, unit_in = 'C', unit_out = 'F')
    

    # CASCADE GO LOOK AT HOW X-ARRAYS ARE WRITTEN TO CSVS IN EARLIER CODE <<<<---- 
    
    # write to csv
    df = hi.to_pandas()
    df_out_nm = 'GHS-HI-DAILY_'+Tmax_year+'.csv'
    df.to_csv(DIR_HI+df_out_nm)
    print(RH_year, ' done \n')

/home/cascade/projects/UrbanHeat/data/interim/CHIRTS-GHS-DAILY-Tmax/
/home/cascade/projects/UrbanHeat/data/interim/CHIRTS-GHS-DAILY-RH/
Tmax year is  1983
RH year is  1983
(13135, 368)
13067
(13135, 368)
13067
1983  done 

Tmax year is  1984
RH year is  1984
(13135, 369)
13067
(13135, 369)
13067
1984  done 

Tmax year is  1985
RH year is  1985
(13135, 368)
13067
(13135, 368)
13067
1985  done 

Tmax year is  1986
RH year is  1986
(13135, 368)
13067
(13135, 368)
13067
1986  done 

Tmax year is  1987
RH year is  1987
(13135, 368)
13067
(13135, 368)
13067
1987  done 

Tmax year is  1988
RH year is  1988
(13135, 369)
13067
(13135, 369)
13067
1988  done 

Tmax year is  1989
RH year is  1989
(13135, 368)
13067
(13135, 368)
13067
1989  done 

Tmax year is  1990
RH year is  1990
(13135, 368)
13067
(13135, 368)
13067
1990  done 

Tmax year is  1991
RH year is  1991
(13135, 368)
13067
(13135, 368)
13067
1991  done 

Tmax year is  1992
RH year is  1992
(13135, 369)
13067
(13135, 369)
13067
1992  d

In [46]:
df.head()

date,2016.01.01,2016.01.02,2016.01.03,2016.01.04,2016.01.05,2016.01.06,2016.01.07,2016.01.08,2016.01.09,2016.01.10,...,2016.12.22,2016.12.23,2016.12.24,2016.12.25,2016.12.26,2016.12.27,2016.12.28,2016.12.29,2016.12.30,2016.12.31
ID_HDC_G0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5782,-22.592424,-23.554328,2.574384,-0.19271,-19.426103,-24.760829,-28.83201,-25.283141,-26.063285,-23.418672,...,-41.773782,-41.003207,-36.506794,-25.580735,-22.45383,3.642904,6.551416,0.463518,0.936664,-27.764096
3316,21.079766,11.637573,13.558403,-9.893018,2.573208,-22.632429,12.95962,18.931481,15.791468,-0.968569,...,27.752771,21.888568,15.209581,18.406033,14.39437,27.769338,28.28793,25.557817,26.247857,26.464227
5645,6.462986,0.2979,-9.333129,-3.300113,-8.258814,-20.615119,-29.202079,-25.054321,-12.786564,-4.775674,...,-19.774123,-6.890295,25.462953,18.750033,28.544915,29.574194,26.730558,18.537209,0.690558,-9.914011
3185,21.567372,17.21361,18.869283,5.228945,3.181769,-9.277802,-4.43852,9.120031,13.654372,14.644074,...,33.183937,32.004164,29.872098,29.184973,27.761547,26.321243,29.199746,30.961252,34.033286,34.13588
3539,0.165386,21.354151,2.08376,-6.174497,-6.231603,1.162061,0.659924,-3.969674,-3.165011,-12.191002,...,30.159408,28.490874,26.150959,29.76686,28.474605,26.887929,26.165526,27.567009,28.703594,30.255283


# Check HI

In [47]:
df.head()

date,2016.01.01,2016.01.02,2016.01.03,2016.01.04,2016.01.05,2016.01.06,2016.01.07,2016.01.08,2016.01.09,2016.01.10,...,2016.12.22,2016.12.23,2016.12.24,2016.12.25,2016.12.26,2016.12.27,2016.12.28,2016.12.29,2016.12.30,2016.12.31
ID_HDC_G0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5782,-22.592424,-23.554328,2.574384,-0.19271,-19.426103,-24.760829,-28.83201,-25.283141,-26.063285,-23.418672,...,-41.773782,-41.003207,-36.506794,-25.580735,-22.45383,3.642904,6.551416,0.463518,0.936664,-27.764096
3316,21.079766,11.637573,13.558403,-9.893018,2.573208,-22.632429,12.95962,18.931481,15.791468,-0.968569,...,27.752771,21.888568,15.209581,18.406033,14.39437,27.769338,28.28793,25.557817,26.247857,26.464227
5645,6.462986,0.2979,-9.333129,-3.300113,-8.258814,-20.615119,-29.202079,-25.054321,-12.786564,-4.775674,...,-19.774123,-6.890295,25.462953,18.750033,28.544915,29.574194,26.730558,18.537209,0.690558,-9.914011
3185,21.567372,17.21361,18.869283,5.228945,3.181769,-9.277802,-4.43852,9.120031,13.654372,14.644074,...,33.183937,32.004164,29.872098,29.184973,27.761547,26.321243,29.199746,30.961252,34.033286,34.13588
3539,0.165386,21.354151,2.08376,-6.174497,-6.231603,1.162061,0.659924,-3.969674,-3.165011,-12.191002,...,30.159408,28.490874,26.150959,29.76686,28.474605,26.887929,26.165526,27.567009,28.703594,30.255283


In [48]:
# get GHS-UCDB
ghs = gpd.read_file('/home/cascade/projects/UrbanHeat/data/raw/GHS_UCDB/GHS_STAT_UCDB2015MT_GLOBE_R2019A_V1_0.shp')

In [49]:
out = gpd.GeoDataFrame()
out['geometry'] = ghs['geometry']
out['ID_HDC_G0'] = ghs['ID_HDC_G0']

HI_out = gpd.GeoDataFrame()
HI_out['ID_HDC_G0'] = df.index
HI_out['2016.07.01'] = df['2016.07.01']

out = pd.merge(out, HI_out, on = 'ID_HDC_G0', how = 'inner')

In [None]:
#out.to_file('/home/cascade/projects/UrbanHeat/HI_20160701_test.shp')

In [50]:
# Check hotcheck values 

Tmax =  pd.read_csv(Tmax_fn)
RH = pd.read_csv(RH_fn)

check = pd.DataFrame()
check['2016.07.01.HI'] = df['2016.07.01'].sort_values(ascending = False).head(50)
check = check.merge(Tmax[['ID_HDC_G0', '2016.07.01']], on = 'ID_HDC_G0', how = 'inner')
check.rename(columns = {'2016.07.01':'2016.07.01.Tmax'}, inplace = True)
check = check.merge(RH[['ID_HDC_G0', '2016.07.01']], on = 'ID_HDC_G0', how = 'inner')
check.rename(columns = {'2016.07.01':'2016.07.01.RH'}, inplace = True)
# check['2016.07.01.RH'] = RH['2016.07.01'].sort_values(ascending = False).head(50)


# df['2016.07.01'].sort_values(ascending = False).head(50)

In [51]:
check.head()

Unnamed: 0,ID_HDC_G0,2016.07.01.HI,2016.07.01.Tmax,2016.07.01.RH
0,8076,149.982576,42.480167,52.281685
1,7947,149.849048,42.41635,52.465736
2,7801,149.146126,41.838413,54.470192
3,7866,148.968031,41.753212,54.72762
4,7988,148.927533,41.975243,53.759575


In [52]:
checkTmax = xr.DataArray(check['2016.07.01.Tmax'].to_numpy())
checkRH = xr.DataArray(check['2016.07.01.RH'].to_numpy())


In [58]:
# Now check the heat index
checkHI = heatindex(checkTmax, checkRH, unit_in = 'C', unit_out = 'F')

In [59]:
checkHI

<xarray.DataArray (dim_0: 50)>
array([149.982576, 149.849048, 149.146126, 148.968031, 148.927533, 148.537434,
       148.477097, 147.963079, 147.88449 , 147.599435, 147.24237 , 147.044344,
       146.72067 , 146.666001, 146.608057, 146.579288, 146.572298, 146.421116,
       146.400205, 146.34497 , 146.309686, 146.178303, 146.133537, 146.08185 ,
       146.020217, 146.005356, 145.989807, 145.983705, 145.956715, 145.784795,
       145.690167, 145.656583, 145.607071, 145.575731, 145.55699 , 145.320791,
       145.197122, 144.971663, 144.917968, 144.845825, 144.60538 , 144.56525 ,
       144.446476, 144.414984, 144.358871, 144.35668 , 144.345614, 144.283951,
       144.282328, 144.281888])
Dimensions without coordinates: dim_0

In [60]:
# Test with the conditions above test
temp = xr.DataArray([42, 41, 40, 39])
rh = xr.DataArray([52, 52, 52, 52])
heatindex(temp, rh, unit_in = 'C', unit_out = 'F')


<xarray.DataArray (dim_0: 4)>
array([146.188224, 139.536591, 133.219817, 127.237902])
Dimensions without coordinates: dim_0

# Well it makes sense

These crazy high values make sense when checked with NOAA's table. But once you have a Tmax >40C and RH > 40%, the HI values are crazy high ...

# OLD CODE

In [None]:
def csv_to_xr(file_in, time_dim, space_dim):

    """ Function reads in a csv w/ GHS-UCDB IDs and temp, isolates the temp
    and returns a xarray data array with dims set to city ids and dates

    Args:
        file_in = file name and path
        time_dim = name for time dim as a str ... use date :-)
        space_dim = col name for GHS-UCDB IDs as an str (ID_HDC_G0)

    Returns an xr.DataArray with dims (space_dim, time_dim)
    """

    df = pd.read_csv(file_in) # read the file in as a df
    print(df.shape)

    df_id = df[space_dim] # get IDs
    df = df.iloc[:,3:] # get only temp columns (everything from the 4th column on)
    df.index = df_id # set index values
    df_drop = df.dropna() # Drop cities w/ no temp record
    print(len(df_drop))

    arr = df_drop.to_numpy() # turn temp cols into an np array

    # make xr Data Array w/ data as temp and dims as space (e.g. id) and time.
    # Coordinates come from df_drop so they match the rows kept in arr, and the
    # array that is built is the one that is returned.
    xr_da = xr.DataArray(arr, coords=[df_drop.index, df_drop.columns],
                            dims=[space_dim, time_dim])

    return xr_da

#### #2 Function finds all the Tmax events and writes them to a dataframe w/ dates for each city

In [None]:
def tmax_days(xarray, Tthresh):
    """ Function finds, for each city, all the days in the array where
    Tmax > Tthresh. Returns the total number of such days, the dates, the
    temperatures, and the intensity (daily Tmax - Tthresh).

    Args:
        xarray = an xarray object with dims named ID_HDC_G0 (space) and date (time)
        Tthresh = temp threshold

    Returns a dataframe with one row per city and the columns ID_HDC_G0,
    total_days, dates, tmax and tmax_tntensity (the last three hold one
    array per city).
    """

    # empty lists & df
    id_list = []
    date_list = []
    dayTot_list = []
    tmax_list = []
    intensity_list = []
    df_out = pd.DataFrame()

    # subset xarray: values at or below the threshold become NaN, and
    # labels left with no values above it are dropped
    out = xarray.where(xarray > Tthresh, drop = True)
    city_ids = out.ID_HDC_G0.values

    # start loop
    for index, loc in enumerate(out.ID_HDC_G0):

        # select the city once and keep only its days above the threshold
        hot_days = out.sel(ID_HDC_G0 = loc).dropna(dim = 'date')
        dates = hot_days.date.values
        temps = hot_days.values

        id_list.append(city_ids[index]) # get IDS
        date_list.append(dates) # get event dates
        dayTot_list.append(len(dates)) # total number of days above the threshold
        tmax_list.append(temps) # get temp values
        intensity_list.append(temps - Tthresh) # get severity

    # write to a data frame ('tmax_tntensity' spelling is relied on downstream)
    df_out['ID_HDC_G0'] = id_list
    df_out['total_days'] = dayTot_list
    df_out['dates'] = date_list
    df_out['tmax'] = tmax_list
    df_out['tmax_tntensity'] = intensity_list

    # return df_out
    return df_out

#### #3 Function splits the dataset into Tmax events (continuous days >Tmax) for each city

In [None]:
def jul_convert(dates):
    """Parse dates with pandas and return them as Julian dates."""
    parsed = pd.to_datetime(dates)
    return parsed.to_julian_date()

def event_split(dates, ID_HDC_G0, intensity, tmax, total_days):
    """ Splits one city's hot days into events (runs of consecutive dates)
    and computes stats for each event.

    Args:
        dates: the city's hot-day dates, in order; converted to julian days
               here and sliced by position
        ID_HDC_G0: city ID, repeated on every output row
        intensity: array of daily intensity values, aligned with dates
        tmax: array of daily tmax values, aligned with dates
        total_days: total number of tmax days for the city, repeated on
                    every output row

    Returns a dataframe with one row per event: duration, average temp,
    average and total intensity, plus the event's dates, intensities and
    tmax values.
    """

    # julian days are only used to detect runs; the output keeps the input dates
    jul_days = jul_convert(dates)

    durations = []
    dates_per_event = []
    intensity_per_event = []
    tmax_per_event = []
    mean_tmax = []
    mean_intensity = []
    summed_intensity = []

    # Consecutive days share the same (julian day - position) offset, so
    # grouping on that offset yields one group per event
    first = 0
    for _, run in groupby(enumerate(jul_days.values), lambda pair: pair[1] - pair[0]):

        n_days = len([day for _, day in run]) # duration of the event
        last = first + n_days
        span = slice(first, last) # positions of this event in the inputs

        event_intensity = intensity[span]
        event_tmax = tmax[span]

        durations.append(n_days)
        dates_per_event.append(dates[span])
        intensity_per_event.append(event_intensity)
        tmax_per_event.append(event_tmax)
        mean_tmax.append(mean(event_tmax))
        mean_intensity.append(mean(event_intensity))
        summed_intensity.append(event_intensity.sum())

        first = last # next event starts where this one ended

    # write out as a dataframe, one column at a time in this order
    n_events = len(durations)
    columns = (
        ('ID_HDC_G0', [ID_HDC_G0] * n_events),
        ('total_days', [total_days] * n_events),
        ('duration', durations),
        ('avg_temp', mean_tmax),
        ('avg_intensity', mean_intensity),
        ('tot_intensity', summed_intensity),
        ('event_dates', dates_per_event),
        ('intensity', intensity_per_event),
        ('tmax', tmax_per_event),
    )
    df_out = pd.DataFrame()
    for col_name, col_values in columns:
        df_out[col_name] = col_values

    return df_out

#### #4 Function feeds output from function 2 into function 3

In [None]:
def tmax_stats(df_in):
    """ Runs the event_split function on every row of a dataframe to produce
    the desired tmax stats and stacks the per-city results.

        NOTE - If you add arguments to event_split to make more stats,
        be sure to update this function

        args:
            df_in: input dataframe with the columns dates, tmax_tntensity,
                   tmax, ID_HDC_G0 and total_days (one row per city)

        Returns one dataframe holding the event rows of all cities; an
        empty dataframe if df_in has no rows.
    """

    # Collect the per-city frames and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and re-copies the accumulated frame on every call
    frames = []
    for index, row in df_in.iterrows():
        frames.append(event_split(
            row['dates'],           # event dates
            row['ID_HDC_G0'],       # city id
            row['tmax_tntensity'],  # intensity for each day
            row['tmax'],            # tmax for each day
            row['total_days'],      # total number of tmax days
        ))

    # pd.concat raises on an empty list, so keep the empty-input result explicit
    if not frames:
        return pd.DataFrame()

    return pd.concat(frames)

#### #5 Loops through a file list and applies functions 1 - 4 to the data to produce Tmax stats for all tmax events in a given year across all cities in the dataset

In [None]:
def stats_loop(dir_in, dir_out, fn_out, time_dim, space_dim, Tthresh):

    """ Walk a dir of yearly Tmax csvs, run csv_to_xr -> tmax_days ->
    tmax_stats on each one, join the result to the GHS ID table and save
    one csv per year.

    The ID table 'GHS-UCSB-IDS.csv' is read with pandas from the
    module-level DATA_INTERIM path, which must be defined before calling.

    Args:
        dir_in = dir path to loop through (joined by +, so end it with '/')
        dir_out = dir path to save files out (joined by +, so end it with '/')
        fn_out = string to label out files; the year and '.csv' are appended
        time_dim = name for time dim as a str ... use date :-) for csv_to_xr function
        space_dim = col name for GHS-UCDB IDs as an str (ID_HDC_G0) for csv_to_xr function
        Tthresh = temp threshold passed to tmax_days -- 40.6 is used

    """

    # GHS ID table, merged onto every year's stats
    ghs_ids_df = pd.read_csv(DATA_INTERIM + 'GHS-UCSB-IDS.csv')

    for fn in sorted(glob.glob(dir_in + '*.csv')):

        # year is the part of the file name between the prefix and '.csv'
        year = fn.split('GHS-Tmax-DAILY_')[1].split('.csv')[0]
        print(year)

        # csv -> data array -> days over threshold -> per-event stats
        yearly_da = csv_to_xr(fn, time_dim, space_dim)
        yearly_days = tmax_days(yearly_da, Tthresh)
        yearly_stats = tmax_stats(yearly_days)

        # keep only cities present in both tables
        merged = ghs_ids_df.merge(yearly_stats, on='ID_HDC_G0', how = 'inner')

        merged.to_csv(dir_out + fn_out + year + '.csv')

        print(year, 'SAVED!')

#### Run Code

In [None]:
dir_in = '/home/cascade/projects/data_out_urbanheat/CHIRTS-GHS-DAILY/' # output from avg temp
DATA_INTERIM = '/home/cascade/projects/UrbanHeat/data/interim/' # ghs ID list
dir_out = '/home/cascade/projects/data_out_urbanheat/CHIRTS-GHS-Events-Stats/'
fn_out = 'CHIRTS-GHS-Events-Stats'
time_dim = 'date'
space_dim = 'ID_HDC_G0'
Tthresh = 40.6


In [None]:
#stats_loop(dir_in, dir_out, fn_out, time_dim, space_dim, Tthresh)

## QA/QC

In [None]:
# Load files

dir_out = '/home/cascade/projects/data_out_urbanheat/CHIRTS-GHS-Events-Stats/'
fn_out = 'CHIRTS-GHS-Events-Stats'

In [None]:
qc_df = pd.read_csv(dir_out+fn_out+'1983.csv')

In [None]:
qc_df.head()

In [None]:
dc_city = qc_df[qc_df['ID_HDC_G0'] == 5534]

In [None]:
dc_city

## Testing

In [None]:
# File Paths
DAILY_PATH = '/home/cascade/projects/data_out_urbanheat/CHIRTS-GHS-DAILY/' # output from avg temp
DATA_INTERIM = '/home/cascade/projects/data_out_urbanheat/data/interim/'
DATA_OUT = '/home/cascade/projects/data_out/'

In [None]:
# File name to test
fn_in = 'GHS-Tmax-DAILY_1983.csv'

In [None]:
# Open a raw file
xr1983 = csv_to_xr(DAILY_PATH+fn_in, 'date', 'ID_HDC_G0')

In [None]:
days1983 = tmax_days(xr1983, 40.6)

In [None]:
test = days1983[0:30]
test

# Maybe add in days_total

In [None]:
# Build routine for loop through a csv

df_out = pd.DataFrame()

for index, row in test.iterrows():
    dates = row['dates'] # Get event dates
    intensity = row['tmax_tntensity'] # Get intensity for each day
    tmax = row['tmax'] # Get tmax for each day
    ID_HDC_G0 = row['ID_HDC_G0'] # get city id
    total_days = row['total_days'] # get total number of tmax days
    
    df = event_split(dates, ID_HDC_G0, intensity, tmax, total_days)
    
    df_out = df_out.append(df)

In [None]:
df_out

# Fixing code
2019.10.19 Cascade Tuholske

Need to fix the `event_split` function

Somewhere in 1984 is this event sequence: ['1984.01.01' '1984.01.02' '1984.01.07']


**FOUND PROBLEM AND IT IS FIXED**

In [None]:
# File Paths
DAILY_PATH = '/home/cascade/projects/data_out_urbanheat/CHIRTS-GHS-DAILY/' # output from avg temp
DATA_INTERIM = '/home/cascade/projects/data_out_urbanheatv/data/interim/'
DATA_OUT = '/home/cascade/projects/data_out_urbanheat/testout/'

In [None]:
# File name to test
fn_in = 'GHS-Tmax-DAILY_1983.csv'

In [None]:
# Open a raw file
xr1983 = csv_to_xr(DAILY_PATH+fn_in, 'date', 'ID_HDC_G0')

In [None]:
# Find all the tmax days
tmax1983 = tmax_days(xr1983, 40.6)

In [None]:
# Make a subset

test = tmax1983[tmax1983['ID_HDC_G0'] == 6279]
test

In [None]:
for index, row in test.iterrows():
    dates = row['dates'] # Get event dates
    intensity = row['tmax_tntensity'] # Get intensity for each day
    tmax = row['tmax'] # Get tmax for each day
    ID_HDC_G0 = row['ID_HDC_G0'] # get city id
    total_days = row['total_days'] # get total number of tmax days



In [None]:
# Convert to jul days
jul_days = pd.to_datetime(dates).to_julian_date()
jul_days


In [None]:
mjd = 2445507.5 - 43200
dt = julian.from_jd(mjd, fmt='mjd')
print(dt)


In [None]:
dates

In [None]:
dates = ['1983.06.20', '1983.06.23', '1983.06.24', '1983.06.25',
        '1983.06.26', '1983.06.27', '1983.06.28', '1983.06.29',
        '1983.06.30', '1983.07.01', '1983.07.21', '1983.07.22',
        '1983.07.23', '1983.08.01']

pd_dates = pd.to_datetime(dates)
df_dates = pd.DataFrame()
df_dates['dates'] = pd_dates



test = df_dates['dates'].apply(lambda x: x.toordinal())

In [None]:
from datetime import date
date.fromordinal(test[1])

In [None]:
counter = 0

In [None]:
# Slice `dates` by hand at fixed positions (0-9, 10-12 and 13) and print each
# slice with its length, as a reference for what the automated split should give.
# NOTE(review): if `dates` is the hard-coded 1983 list, '1983.06.20' and
# '1983.06.23' are not consecutive, so the first slice spans a gap -- confirm.
event_dates = dates[0:10] # dates of tmax days during each event
print(event_dates)
print(len(event_dates))

event_dates = dates[10:13] # dates of tmax days during each event
print(event_dates)
print(len(event_dates))

event_dates = dates[13:14] # dates of tmax days during each event
print(event_dates)
print(len(event_dates))

In [None]:
# Split the Tmax days into events (runs of consecutive days) and collect the
# dates that belong to each event.
#
# Assumes `jul_days` was computed from the current `dates` (same length and
# order) -- re-run the Julian-day conversion if `dates` has been redefined.
event_dates_list = []  # one entry per event: the dates making up that event
counter = 0            # running total of days consumed so far
start = 0              # slice start into `dates` for the current event
end = 0                # slice end (exclusive) for the current event

# Consecutive days share a constant (day number - position) offset, so
# grouping on that offset yields one group per unbroken run of days.
for k, g in groupby(enumerate(jul_days.values), lambda x: x[1]-x[0]):

    seq = list(map(itemgetter(1), g)) # isolate seq. days
    dur = len(seq) # duration of each event

    counter = counter + dur
    end = counter
    print(end)

    event_dates = dates[start:end] # dates of tmax days during each event
    print(event_dates)

    # Record the event. This append used to be commented out, which left
    # event_dates_list empty even though it is displayed afterwards.
    event_dates_list.append(event_dates)

    start = counter # next event starts where this one ended
    


In [None]:
event_dates_list

In [None]:
df = pd.DataFrame()
df['event_dates'] = event_dates_list
df

In [None]:
import more_itertools as mit

In [None]:
# run tmax stats ------> CLEARLY NOT WORKING

# tmax1983_sub_stats = tmax_stats(tmax1983_sub)

In [None]:
type(tmax1983['dates'][1])

In [None]:
dates = tmax1983_sub['dates']
#jul_days = pd.to_datetime(tmax1983['dates']).to_julian_date()

In [None]:
# Try to break up events
# Only the row values are unpacked here; the event_split call and the append
# to df_out are commented out, so after the loop the names below hold the
# values from the last row of tmax1983_sub.

for index, row in tmax1983_sub.iterrows():
    dates = row['dates'] # Get event dates
    intensity = row['tmax_tntensity'] # Get intensity for each day (key spelled 'tmax_tntensity' [sic]; must match the column name)
    tmax = row['tmax'] # Get tmax for each day
    ID_HDC_G0 = row['ID_HDC_G0'] # get city id
    total_days = row['total_days'] # get total number of tmax days

#     df = event_split(dates, ID_HDC_G0, intensity, tmax, total_days)

#     df_out = df_out.append(df)

dates

In [None]:
type(df_out['events'][0])

#### Test Heat Index

Get some paired values from the [NOAA CHART](https://www.weather.gov/safety/heat-index) and test the output. 

These results seem to be working well, though 104 F and 60 RH produced a higher number than the chart ...

In [None]:
# Paired inputs for checking the heat index: several temperatures, one fixed RH
temp = [25, 27, 35, 40, 41] # temp C
RH = 60
test = pd.DataFrame({'temp': temp, 'RH': RH})

In [None]:
# Run every (temp, RH) pair through make_hi and show the result so it can be
# compared by eye against the NOAA chart. The result used to be assigned but
# never displayed (only blank lines were printed).
for i, row in test.iterrows():
    out = make_hi(row['temp'], row['RH'])
    print('temp C =', row['temp'], ' RH =', row['RH'], ' heat index =', out)
    print('\n')

In [None]:
# turn the temp x array from C to F
# NOTE: this overwrites temp_xr_da, so re-running the cell applies the
# conversion again to values that were already converted.

temp_xr_da = C_to_F(temp_xr_da)

In [None]:
# apply steadman's

HI_steadman = steadman_hi(temp_xr_da, RH_xr_da)

In [None]:

## this doesn't work 
#HI_rothfusz = rothfusz_hi(HI_steadman, temp_xr_da, RH_xr_da)

In [None]:
def C_to_F(Tmax_C):
    """Convert a temperature from degrees Celsius to degrees Fahrenheit.

    Args:
        Tmax_C = temperature in deg C (anything supporting * and +)

    Returns the temperature in deg F.
    """
    scaled = Tmax_C * (9/5)
    return scaled + 32

In [None]:
def steadman_hi(Tmax_F, RH):
    "Simple heat index calculation"
    
    HI_steadman = 0.5 * (Tmax_F + 61.0 + ((Tmax_F-68.0)*1.2) + (RH*0.094))
    
    HI_steadman = (HI_steadman + Tmax_F) / 2
    
    return HI_steadman

In [None]:
def rothfusz_hi(HI_steadman, Tmax_F, RH):
    """Heat index from the Rothfusz regression, with the NOAA adjustments.

    Scalar inputs only: the branches use plain ``if`` tests and ``math.sqrt``,
    neither of which works element-wise on numpy/xarray arrays.

    Args:
        HI_steadman = simple heat index (deg F), e.g. from steadman_hi; it
                      decides whether the regression is applied at all
        Tmax_F = temperature in deg F
        RH = relative humidity in percent

    Returns:
        HI_steadman unchanged when it is below 80F; otherwise the Rothfusz
        regression value with the low- or high-humidity adjustment applied
        where its conditions are met.
    """
    # Below 80F the regression is not applied; keep the simple value.
    # Previously this case left HI_rothfusz unassigned and the function
    # raised UnboundLocalError.
    if HI_steadman < 80:
        return HI_steadman

    HI_rothfusz = (
        -42.379
        + 2.04901523*Tmax_F
        + 10.14333127*RH
        - .22475541*Tmax_F*RH
        - .00683783*Tmax_F*Tmax_F
        - .05481717*RH*RH
        + .00122874*Tmax_F*Tmax_F*RH
        + .00085282*Tmax_F*RH*RH
        - .00000199*Tmax_F*Tmax_F*RH*RH
    )

    if RH < 13 and 80 < Tmax_F < 112:
        # RH below 13% and temperature between 80 and 112F: subtract.
        # The sqrt argument stays >= 0 because |Tmax_F - 95| < 17 here.
        adjustment = ((13-RH)/4)*math.sqrt((17-abs(Tmax_F-95))/17)
        HI_rothfusz = HI_rothfusz - adjustment
    elif RH > 85 and 80 < Tmax_F < 87:
        # RH above 85% and temperature between 80 and 87F: add.
        adjustment = ((RH-85)/10) * ((87-Tmax_F)/5)
        HI_rothfusz = HI_rothfusz + adjustment

    return HI_rothfusz

In [None]:
def make_hi(Tmax_F, RH):
    """Calculate the heat index for a CHIRTS Tmax value.

    Args:
        Tmax_F = Tmax in deg C. Despite the parameter name, the value is run
                 through C_to_F before use, so pass Celsius. The name is kept
                 so existing calls keep working.
        RH = relative humidity, passed on to steadman_hi and rothfusz_hi

    Returns the value produced by rothfusz_hi.
    """
    # Convert the argument itself; the body used to read an undefined name
    # `Tmax_C`, which raised NameError.
    temp_F = C_to_F(Tmax_F)
    HI_steadman = steadman_hi(temp_F, RH)
    HI_rothfusz = rothfusz_hi(HI_steadman, temp_F, RH)

    return HI_rothfusz

In [None]:
DIR_RH = '/home/cascade/projects/UrbanHeat/data/interim/CHIRTS-GHS-DAILY-RH/'
DIR_RAW = '/home/cascade/projects/UrbanHeat/data/interim/CHIRTS-GHS-DAILY-TEMP/'

In [None]:
FN_RH = 'GHS-Tmax-RH_1983.csv'
FN_RAW = 'GHS-Tmax-DAILY_1983.csv'

In [None]:
RH = pd.read_csv(DIR_RH+FN_RH)
RAW = pd.read_csv(DIR_RAW+FN_RAW)

In [None]:
# NOTE(review): this rebinds RH from the DataFrame loaded with pd.read_csv to the
# object returned by csv_to_xr; the later RH clean-up indexes RH with .iloc, which
# needs the DataFrame -- confirm the intended run order or use a separate name.
RH = csv_to_xr(DIR_RH+FN_RH, time_dim = 'date', space_dim = 'ID_HDC_G0')

In [None]:
df_temp_id = RAW['ID_HDC_G0'] # get IDs
df_temp = RAW.iloc[:,3:] # get only temp columns
df_temp.index = df_temp_id # set index values
df_temp_drop = df_temp.dropna() # Drop cities w/ no temp record 
print(len(df_temp_drop))

In [None]:
df_temp_drop.head()

In [None]:
# Index the RH table by city ID and drop rows with missing values
df_RH_id = RH['ID_HDC_G0'] # get IDs
df_RH = RH.iloc[:,3:] # keep columns from position 3 onward (presumably the daily RH values -- TODO confirm)
df_RH.index = df_RH_id # set index values
df_RH_drop = df_RH.dropna() # Drop rows (cities) with any missing RH value
print(len(df_RH_drop))

In [None]:
# try np arrays
# temp_arr = df_temp_drop.to_numpy()
# rh_arr = df_RH_drop.to_numpy()

# Make them into a data array
# Each frame becomes a 2-D DataArray: rows labelled by the frame index
# (dim 'ID_HDC_G0') and columns by the frame's column labels (dim 'date').
temp_xr_da = xr.DataArray(df_temp_drop, coords=[df_temp_drop.index, df_temp_drop.columns], 
                            dims=['ID_HDC_G0', 'date'])
RH_xr_da = xr.DataArray(df_RH_drop, coords=[df_RH_drop.index, df_RH_drop.columns], 
                            dims=['ID_HDC_G0', 'date'])

In [None]:
out = hi(temp_xr_da, RH_xr_da, 'C') # STILL THROWS ERROR FIX IT 

In [None]:
out

# Leap Year Load Issue