# MS Calculations

Notebook to crunch numbers for the MS.

by Cascade Tuholske 2020.02.23 

Updated 2020.08.27 - CPT
Was run on ERA5 RH with CHIRTS-Daily Tmax from ERA5

In [53]:
#### Depdencies 
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
import glob
import matplotlib.dates as mdates
import os

In [54]:
#### Regressions, no intercept addition is needed because we're using SK LEARN HERE 

def lm_func(df, col):
    
    "simple linear model of a time series data, returns coef"
    
    # Get Data
    X_year = np.array(df.groupby('year')['ID_HDC_G0'].mean().index).reshape((-1, 1))
    Y_stats = np.array(df.groupby('year')[col].sum()).reshape((-1, 1))

    # Add Intercept
    X_year_2 = sm.add_constant(X_year)

    # Regress
    model = sm.OLS(Y_stats, X_year_2).fit() 
        
    coef = int(model.params[1])
    #coef = int(coef)
            
    # R2 and P
    r2 = model.rsquared_adj
    p = model.pvalues[0]
    
    return coef, round(r2, 2), round(p, 3)

In [55]:
#### Load Data
DATA = 'wbgtmax30' # UPDATE 

# file paths
DATA_IN = "/scratch/cascade/UEH-daily/stats/"  
FIG_OUT = "/home/cascade/projects/UrbanHeat/figures/"
FN_IN = os.path.join(DATA_IN,DATA+'_EXP.json')
HI_STATS = pd.read_json(FN_IN, orient = 'split')

# Set scale
scale = 10**9


In [56]:
# Drop cites where 1983 had 1 day and none elsewhere

print(len(HI_STATS))
only83 = HI_STATS.groupby('ID_HDC_G0')['tot_days'].sum() == 1 # sum up total days and find those with 1 day
only83 = list(only83[only83 == True].index) # make a list of IDs
sub = HI_STATS[HI_STATS['ID_HDC_G0'].isin(only83)] # subset those IDs
bad_ids = sub[(sub['year'] == 1983) & (sub['tot_days'] == 1)] # drop those from 1983 only
drop_list = list(bad_ids['ID_HDC_G0']) # make a list
HI_STATS= HI_STATS[~HI_STATS['ID_HDC_G0'].isin(drop_list)] # drop those from the list
print(len(HI_STATS))

303518
303178


In [57]:
#### Add In Meta Data (e.g. geographic data)
meta_fn = os.path.join('/home/cascade/projects/UrbanHeat/data/interim/','GHS-UCDB-IDS.csv')
meta_data = pd.read_csv(meta_fn)

#### Merge in meta
HI_STATS = HI_STATS.merge(meta_data, on = 'ID_HDC_G0', how = 'left')

In [58]:
 HI_STATS

Unnamed: 0,ID_HDC_G0,year,tot_days,P,P1983,P2016,people_days,people_days_heat,people_days_pop,CTR_MN_NM,UC_NM_MN,GCPNT_LAT,GCPNT_LON,region,sub-region,intermediate-region
0,18,2006,2,622283.428656,494664.495616,661553.577433,1.244567e+06,989328.991233,255237.866079,United States,Concord [USA],37.913946,-122.048318,Americas,Northern America,Northern America
1,22,2006,2,70915.650551,52064.452435,73006.671133,1.418313e+05,104128.904869,37702.396233,United States,Livermore [USA],37.688409,-121.753980,Americas,Northern America,Northern America
2,26,2006,3,247944.032154,194088.886834,268055.635628,7.438321e+05,582266.660503,161565.435959,United States,Antioch [USA],37.985433,-121.797516,Americas,Northern America,Northern America
3,27,2006,2,90805.062622,80540.779940,93335.494324,1.816101e+05,161081.559880,20528.565365,United States,Fairfield [USA],38.264013,-122.030253,Americas,Northern America,Northern America
4,28,1998,1,68062.052369,59320.971209,91449.606255,6.806205e+04,59320.971209,8741.081160,United States,Tracy [USA],37.730079,-121.431413,Americas,Northern America,Northern America
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
303173,13135,2010,0,,132501.788370,211650.696320,0.000000e+00,0.000000,0.000000,Fiji,Suva [FJI],-18.104802,178.461255,Oceania,Melanesia,Melanesia
303174,13135,2011,0,,132501.788370,211650.696320,0.000000e+00,0.000000,0.000000,Fiji,Suva [FJI],-18.104802,178.461255,Oceania,Melanesia,Melanesia
303175,13135,2012,0,,132501.788370,211650.696320,0.000000e+00,0.000000,0.000000,Fiji,Suva [FJI],-18.104802,178.461255,Oceania,Melanesia,Melanesia
303176,13135,2013,0,,132501.788370,211650.696320,0.000000e+00,0.000000,0.000000,Fiji,Suva [FJI],-18.104802,178.461255,Oceania,Melanesia,Melanesia


In [59]:
#### Check to num days per year greatest > 32
check = HI_STATS.sort_values('tot_days', ascending = False)
check[['CTR_MN_NM', 'UC_NM_MN', 'year', 'tot_days']].head(5)

Unnamed: 0,CTR_MN_NM,UC_NM_MN,year,tot_days
52805,Eritrea,Mits'Iwa [ERI],2016,255
52791,Eritrea,Mits'Iwa [ERI],2002,254
52804,Eritrea,Mits'Iwa [ERI],2015,248
52799,Eritrea,Mits'Iwa [ERI],2010,244
52792,Eritrea,Mits'Iwa [ERI],2003,243


# Global Trends

In [60]:
#### Total Change in people Days
data = HI_STATS.groupby('year')['people_days'].sum()
year = str(data.index[33])
value = str(data.values[33]/10**9)
print('person days in 2016 was '+value+' billion')

year = str(data.index[0])
value = str(data.values[0]/10**9)
print('person days in 1983 was '+value+' billion')

#### Pct Change in Poeple Days 1983 - 2016
pdays16 = data.iloc[len(data) -1]
pdays83 = data.iloc[0]
out = (data.iloc[len(data) -1] - data.iloc[0]) / data.iloc[0] * 100
print('pct increase in people days 83 - 16 is ', out)


person days in 2016 was 118.6608238905162 billion
person days in 1983 was 39.691073649233786 billion
pct increase in people days 83 - 16 is  198.960982862722


In [61]:
#### Rate of change
data = HI_STATS
coef, r2, p = lm_func(data, 'people_days')
print('annual increase in people days ', 'was', coef/10**9, ' p=', p)
coef1, r21, p1 = lm_func(data, 'people_days_heat')
print('annual increase in people days heat ', 'was', coef1/10**9, ' p=', p)
coef2, r22, p2 = lm_func(data, 'people_days_pop')
print('annual increase in people days pop ', 'was', coef2/10**9, ' p=', p)
print('attrib heat ', 'was', coef1 / coef *100, ' p=', p, '\n')

annual increase in people days  was 2.108968631  p= 0.0
annual increase in people days heat  was 0.716240045  p= 0.0
annual increase in people days pop  was 1.392728586  p= 0.0
attrib heat  was 33.96162628840922  p= 0.0 



In [62]:
#### Pct Pday Annual Increase from Heat
coef_pdays, r2_pdays, p_pdays = lm_func(HI_STATS, 'people_days') # regress pdays
coef_heat, r2_heat, p_heat = lm_func(HI_STATS, 'people_days_heat') # regreas heat

print('warming is what pct of total?', coef_heat/coef_pdays *100)

warming is what pct of total? 33.96162628840922


In [63]:
#### Pct heat vs pop
coef_pop, r2_pdays, p_pdays = lm_func(HI_STATS, 'people_days_pop') # regress pdays
coef_heat, r2_heat, p_heat = lm_func(HI_STATS, 'people_days_heat') # regreas heat

print('pct of heat vs pop', coef_heat/coef_pop *100)

pct of heat vs pop 51.42710878485551


# City-level

#### Largest cities compared to global total

In [64]:
#### Top cities
cities_fn = os.path.join('/scratch/cascade/UEH-daily/ms-tables/wbgtmax30_EXP-TOP50.csv')
cities = pd.read_csv(cities_fn)

In [65]:
top = cities.sort_values('coef_pdays', ascending = False).head(25) # get the top ten cities


In [66]:
# What pct of the global annual increase comes from the top 25 cities?
ans = top['coef_pdays'].sum() / coef # coef comes from rate of change cell
print('Top 25 cities of total annual increase', ans * 100)

Top 25 cities of total annual increase 24.658691183911717


#### Pdays

In [67]:
city_coefs_fn = os.path.join('/scratch/cascade/UEH-daily/stats/wbgtmax30_TREND_PDAYS05.json')
city_coefs = pd.read_json(city_coefs_fn, orient = 'split')
GHS = gpd.read_file('/home/cascade/projects/UrbanHeat/data/raw/GHS_UCDB/GHS_STAT_UCDB2015MT_GLOBE_R2019A_V1_0.shp')

In [68]:
len(city_coefs)

6269

In [69]:
#### Number of cities w/ sig increase in exposure?
print('The pct of cities w/ increases in exposure: ', len(city_coefs)/len(GHS)*100)


The pct of cities w/ increases in exposure:  47.727445755614774


In [70]:
len(city_coefs)

6269

In [71]:
ans = len(city_coefs[(city_coefs['GCPNT_LAT'] < 23.5) & (city_coefs['GCPNT_LAT'] > -23.5)]) / len(city_coefs)
print('what pct of pday cities are low lat?', ans*100)

what pct of pday cities are low lat? 49.91226670920402


In [72]:
print('what pct of global pop are cities with sig pdays?')

what pct of global pop are cities with sig pdays?


In [73]:
def country_search(country, data_set):
    "what pct of cities had a p-day increase?"
    print('Num of Cities in '+country+' ', len(data_set[data_set['CTR_MN_NM'] == country]) / len(GHS[GHS['CTR_MN_NM'] == country]) *100)

In [74]:
data_set = city_coefs

In [75]:
country_search('Senegal', data_set)

Num of Cities in Senegal  100.0


In [76]:
country_search('Nigeria', data_set)

Num of Cities in Nigeria  87.99171842650104


In [77]:
country_search('India', data_set)

Num of Cities in India  88.85467980295566


#### Pct of global population exposured

In [78]:
city_coefs_fn = os.path.join('/scratch/cascade/UEH-daily/stats/wbgtmax30_TREND_PDAYS05.json')
city_coefs = pd.read_json(city_coefs_fn, orient = 'split')

In [79]:
pop_fn = os.path.join('/home/cascade/projects/UrbanHeat/data/interim/','GHS-UCDB-Interp.csv')
pop = pd.read_csv(pop_fn)

p16 = pop[['ID_HDC_G0', 'P2016']]

In [80]:
len(p16)

13135

In [81]:
pdays_pop = pd.merge(city_coefs[['ID_HDC_G0']], p16, on = 'ID_HDC_G0', how = 'inner')

In [82]:
ans = pdays_pop['P2016'].sum() / p16['P2016'].sum() * 100 
print('What is the global urban population in 2016', p16['P2016'].sum() / 10**9)
print('How many people live in cities with increasing exp in 2016', pdays_pop['P2016'].sum() / 10**9)
print('What pct of total urban pop has sig increase exp in 2015', ans)

What is the global urban population in 2016 3.5353262985424414
How many people live in cities with increasing exp in 2016 1.709039189516783
What pct of total urban pop has sig increase exp in 2015 48.34176664884915


In [83]:
# From UN-DESA 2018 estimates for total global pop in 2015
ans =  pdays_pop['P2016'].sum() / 7383009000 * 100
print('What pct of total world pop has sig increase exp in 2016', ans)

What pct of total world pop has sig increase exp in 2016 23.148274497793285


In [84]:
# UN-DESA Urban pop in 2015 was  3 981 498
p16['P2016'].sum()

3535326298.5424414

#### Total Heat Days 

In [98]:
city_totdays_fn = os.path.join('/scratch/cascade/UEH-daily/stats/wbgtmax30_TREND_HEATP05.json')
city_totdays = pd.read_json(city_totdays_fn, orient = 'split')

In [99]:
print('What pct of all cities had sig increase in days/yr > WBGT30 C ?')
print(len(city_totdays)/len(GHS))
print(len(city_totdays))

What pct of all cities had sig increase in days/yr > WBGT30 C ?
0.4155310239817282
5458


In [100]:
print('What pct of all cities >1 day / yr in days/yr > WBGT32 C ?')
print(len(city_totdays[city_totdays['coef_totDays'] >= 1])/len(GHS))
print(len(city_totdays[city_totdays['coef_totDays'] >= 1]))

What pct of all cities >1 day / yr in days/yr > WBGT32 C ?
0.17967263037685574
2360


In [None]:
## How many cities day increase per year ... 1, 3
top = len(city_totdays)
bottom = len(city_totdays[city_totdays['coef_totDays'] >= 2])

In [None]:
print(top)
print(bottom)

In [None]:
## What are some big cities (>1m people)?
hot1m = city_totdays[(city_totdays['coef_totDays'] >= 1.5) & (city_totdays['P2016'] >= 10**6)][['coef_totDays', 'UC_NM_MN', 'P2016']].sort_values('coef_totDays')

In [None]:
len(hot1m)

In [None]:
hot1m

In [None]:
2.162029e+07 / 10**6


#### Dehli & Kolkata 

In [None]:
# Delhi 6955 & Kolkata 9691
K = city_coefs[city_coefs['ID_HDC_G0']== 9691]
D = city_coefs[city_coefs['ID_HDC_G0']== 6955]

In [None]:
print('Share of heat Kolkata', K.coef_heat / K.coef_pdays * 100)

In [None]:
print('Share of heat Delhi', D.coef_heat / D.coef_pdays * 100)

#### Populations of specific cities

In [None]:
pop = pd.read_csv(DATA_IN+'interim/GHS-UCDB-Interp.csv')

In [None]:
# 9691, Kolkata 1998
# 2046, Paris 2003
# 4417, Aleppo 2010

In [None]:
ans = pop[pop['ID_HDC_G0'] == 9691]['P2015'] / 10**3
print('Pop of Kolkata in 1998', ans)

## WBGT 32 vs 28

In [95]:
wbgt32 = pd.read_json('/scratch/cascade/UEH-daily/stats/wbgtmax32_TREND_PDAYS05.json', orient = 'split')
wbgt30 = pd.read_json('/scratch/cascade/UEH-daily/stats/wbgtmax30_TREND_PDAYS05.json', orient = 'split')
wbgt28 = pd.read_json('/scratch/cascade/UEH-daily/stats/wbgtmax28_TREND_PDAYS05.json', orient = 'split')

In [97]:
print('how many cities wbgt 32?', len(wbgt32))
print('how many cities wbgt 32?', len(wbgt30))
print('how many cities wbgt 28?', len(wbgt28))
print('no trend', len(meta_data) - len(wbgt28))
print('dif is', 8510 - 6022)

how many cities wbgt 32? 3094
how many cities wbgt 32? 6269
how many cities wbgt 28? 7946
no trend 5189
dif is 2488


# Regional Trends

In [None]:
#### Annual Rates

scale = 10**6
geog = 'sub-region'

for label in np.unique(HI_STATS[geog]):
    label = label
    data = HI_STATS[HI_STATS[geog] == label]
    
    #### Rate of change
    coef, r2, p = lm_func(data, 'people_days')
    print('annual increase in people days '+label, 'was', coef/scale, ' p=', p)
    coef1, r21, p1 = lm_func(data, 'people_days_heat')
    print('annual increase in people days heat '+label, 'was', coef1/scale, ' p=', p)
    coef2, r22, p2 = lm_func(data, 'people_days_pop')
    print('annual increase in people days pop '+label, 'was', coef2/scale, ' p=', p)
    print('attrib heat '+label, 'was', coef1 / coef *100, ' p=', p, '\n')
  

In [None]:
#### Trends for Africa, N & SS
geog = 'region'
location = 'Africa'
data = HI_STATS[HI_STATS[geog] == location]
print(location)

#### Total Change in people Days
data = data.groupby('year')['people_days'].sum()
year = str(data.index[33])
value = str(data.values[33]/10**9)
print('person days in 2016 was '+value+' billion')

year = str(data.index[0])
value = str(data.values[0]/10**9)
print('person days in 1983 was '+value+' billion')

#### Pct Change in Poeple Days 1983 - 2016
pdays16 = data.iloc[len(data) -1]
pdays83 = data.iloc[0]
out = (data.iloc[len(data) -1] - data.iloc[0]) / data.iloc[0] * 100
print('pct increase in people days 83 - 16 is ', out)



In [None]:
#### S Asia as pct of total  global = 5.245146271 B 

print('pct of total pdays from S Asia is ', 1899.70765 / 10**3 / 5.245146271 * 100)

In [None]:
#### Median Slope
region = 'Europe'
col = 'coef_heat'
geog = 'region'
scale = 10**3
result = city_coefs[city_coefs[geog]== region][col].median()
print(region, col, 'is ', result/scale)

# Heat Waves

- 9691 Kolkata 1998
- 2046 Paris 2003
- 4417, Aleppo 2010

In [2]:
def make_data(dir_in, geog, location):
    """Function makes data to plot daily city-level HI Max and average
    Args:
        dir_in = directory to get data
        geog = column for geography, city-level = 'ID_HDC_G0'
        location = usually a city id
    """
    
    fn_list = sorted(glob.glob(dir_in+'*.csv')) # get data
    df_out = pd.DataFrame() # to write dataframe
    
     # get leap year cols from 2016
    hi16 = pd.read_csv(fn_list[33]) 
    cols = list(hi16.iloc[:,9:].columns)
    cols = [year[5:] for year in cols] # cols for data frame
    
    temp_list = [] # empty list for temps
    
    # loop through dir and get data
    for i, fn in enumerate(fn_list):
        df = pd.read_csv(fn) # open data frame
        year_label = [(df.columns[9]).split('.')[0]] # get year
        row = df[df[geog] == location]
        temp = row.iloc[:,9:] # get only temp columns
        
        # add in col for leap years
        if temp.shape[1] == 365:
            temp.insert(loc = 59, column = year_label[0]+'.02.29', value = np.nan, allow_duplicates=False)

        # Set Index & Columns
        temp.index = year_label
        temp.columns = cols # revalue to m.d
    
        # add to list
        temp_list.append(temp)
    
    df_out = pd.concat(temp_list) # make one big dataframe
    
    return df_out

def plot_data(df, year, start, end):#, start, end):
    """ Make the data for a plot
    Args: 
        df = df w/ daily HI max for a given city
        year = year you want to plot against average
        start = start of plot in julian days (e.g 1 - 365/366)
        end = end of plot in julian days
    """

    # Deal with leap year
    if year % 4 !=0:
        df.drop(columns ='02.29', inplace = True)
    
    # Subset data
    start = start - 1 # zero indexing 
    subset = df.iloc[:,start:end]
    
    # HI Max for year
    hi_year = subset.loc[str(year)]
    
    # make 34-avg daily hi and std
    means = subset.mean(axis = 0)
    stds = subset.std(axis = 0)
    
    # make colums to date time
    cols = pd.to_datetime([str(year)+'.'+date for date in hi_year.index])
    
    return hi_year, means, cols, stds

In [3]:
# Find Heat Wave From All DATA
def select_city_year(df, city_id, year):
    "Quick search to find city and years within HI_STATS"
    df_out = df[(df['ID_HDC_G0'] == city_id) & (df['year'] == year)]
    
    return df_out

In [4]:
# open heat events data
events_fn = os.path.join('/scratch/cascade/UEH-daily/stats/','himax406_STATS.json')
events = pd.read_json(events_fn, orient = 'split')

In [35]:
# [4417, 'Aleppo', 2010]  [9691, 'Kolkata', 1998] 
city = select_city_year(events, 9691, 1998)
city

Unnamed: 0,ID_HDC_G0,year,duration,avg_temp,avg_intensity,tot_intensity,event_dates,intensity,tmax,UID
369,9691,1998,1,42.934967,2.334967,2.334967,[1998.03.22],[2.334967],[42.934967],UID-3010498
370,9691,1998,10,43.568205,2.968205,29.682055,"[1998.04.06, 1998.04.07, 1998.04.08, 1998.04.0...","[1.183974, 1.842955, 1.922453, 0.125414, 4.671...","[41.783974, 42.442955, 42.522453, 40.725414, 4...",UID-3010499
371,9691,1998,6,44.825648,4.225648,25.353888,"[1998.04.18, 1998.04.19, 1998.04.20, 1998.04.2...","[3.6564300000000003, 5.748175, 4.377676, 6.766...","[44.25643, 46.348175, 44.977676, 47.36685, 44....",UID-3010500
372,9691,1998,8,44.860171,4.260171,34.081366,"[1998.04.25, 1998.04.26, 1998.04.27, 1998.04.2...","[1.03104, 1.941386, 1.84029, 3.833132, 5.41878...","[41.63104, 42.541386, 42.44029, 44.433132, 46....",UID-3010501
373,9691,1998,1,43.11688,2.51688,2.51688,[1998.05.04],[2.51688],[43.11688],UID-3010502
374,9691,1998,5,42.655299,2.055299,10.276495,"[1998.05.06, 1998.05.07, 1998.05.08, 1998.05.0...","[0.014624, 1.948923, 1.734152, 5.525946, 1.05285]","[40.614624, 42.548923, 42.334152, 46.125946, 4...",UID-3010503
375,9691,1998,53,46.928288,6.328288,335.399261,"[1998.05.12, 1998.05.13, 1998.05.14, 1998.05.1...","[3.658385, 5.072863, 5.40717, 6.700617, 1.9240...","[44.258385, 45.672863, 46.00717, 47.300617, 42...",UID-3010504
376,9691,1998,1,42.06064,1.46064,1.46064,[1998.07.08],[1.4606400000000002],[42.06064],UID-3010505
377,9691,1998,2,42.185946,1.585946,3.171891,"[1998.07.10, 1998.07.11]","[2.957545, 0.21434599999999998]","[43.557545, 40.814346]",UID-3010506
378,9691,1998,5,42.240843,1.640843,8.204213,"[1998.07.13, 1998.07.14, 1998.07.15, 1998.07.1...","[1.014307, 2.590765, 2.17667, 1.752463, 0.6700...","[41.614307, 43.190765, 42.77667, 42.352463, 41...",UID-3010507


In [37]:
# Select event and compare to long term means
# Kolkata 1998 UID-3010504, Aleppo 2010 UID-984387 and UID-984386
event_id = 'UID-3010504'
event = city[city['UID'] == event_id]
tmax = list(event['tmax'])[0]
dates = list(event['event_dates'])[0]

In [38]:
# Look at event tmax 
tmax

[44.258385,
 45.672863,
 46.00717,
 47.300617,
 42.52402,
 42.747894,
 40.94503,
 45.23583,
 49.371773,
 53.958176,
 53.112072,
 51.47835,
 48.916313,
 48.40086,
 42.25589,
 43.499332,
 49.97506,
 53.392044,
 53.647552,
 53.467106,
 52.420647,
 51.869835,
 50.09438,
 49.974426,
 50.773907,
 51.098557,
 50.924046,
 54.331165,
 52.15966,
 51.29312,
 46.17026,
 46.86537,
 44.397667,
 43.608288,
 41.59041,
 44.49989,
 47.334194,
 47.493652,
 47.780872,
 48.224854,
 41.623146,
 41.111736,
 43.190582,
 44.582874,
 41.157124,
 44.693645,
 41.130524,
 43.177464,
 44.312164,
 43.226967,
 43.829956,
 42.336132,
 43.75541]

In [39]:
# Look at event dates 
dates

['1998.05.12',
 '1998.05.13',
 '1998.05.14',
 '1998.05.15',
 '1998.05.16',
 '1998.05.17',
 '1998.05.18',
 '1998.05.19',
 '1998.05.20',
 '1998.05.21',
 '1998.05.22',
 '1998.05.23',
 '1998.05.24',
 '1998.05.25',
 '1998.05.26',
 '1998.05.27',
 '1998.05.28',
 '1998.05.29',
 '1998.05.30',
 '1998.05.31',
 '1998.06.01',
 '1998.06.02',
 '1998.06.03',
 '1998.06.04',
 '1998.06.05',
 '1998.06.06',
 '1998.06.07',
 '1998.06.08',
 '1998.06.09',
 '1998.06.10',
 '1998.06.11',
 '1998.06.12',
 '1998.06.13',
 '1998.06.14',
 '1998.06.15',
 '1998.06.16',
 '1998.06.17',
 '1998.06.18',
 '1998.06.19',
 '1998.06.20',
 '1998.06.21',
 '1998.06.22',
 '1998.06.23',
 '1998.06.24',
 '1998.06.25',
 '1998.06.26',
 '1998.06.27',
 '1998.06.28',
 '1998.06.29',
 '1998.06.30',
 '1998.07.01',
 '1998.07.02',
 '1998.07.03']

In [40]:
#### Heat Index Data
data = 'himax'
data_label = '$WBGT_{max}$ '
DATA_IN = os.path.join('/scratch/cascade/UEH-daily/','GHS-'+data+'/') # output from avg wbgt/hi max
FIG_OUT = '/home/cascade/projects/UrbanHeat/figures/'
DS = u"\N{DEGREE SIGN}"
t = 40.6 # wbgt (30) or hi (46.1) threshold

In [41]:
# Get long term means
# Args
#[4417, 'Aleppo'] 2010 [2046, 'Paris'] 2003 [9691, 'Kolkata'] 1998 ['6955, Dehli']

# Args
city_list = [9691, 'Kolkata']
year = 1998

# April 1 to Sep 30 (Use Julian Days), or 1 - 182 lagos
start = 91 
end = 273

# Make Data 
df = make_data(DATA_IN, 'ID_HDC_G0', city_list[0])
years, means, cols, stds = plot_data(df, year, start, end)

In [42]:
# what is the rank of HImax in the entire record?
sorted(df.to_numpy().flatten(), reverse = True)

[55.76492,
 55.540943000000006,
 55.47238,
 55.034977000000005,
 54.444187,
 54.443565,
 54.352623,
 54.342144,
 54.332671999999995,
 54.331165000000006,
 53.958176,
 53.82029300000001,
 53.72861999999999,
 53.708397,
 53.647552000000005,
 53.46710600000001,
 53.39204399999999,
 53.292942000000004,
 53.16766,
 53.160056999999995,
 53.112072,
 53.07143000000001,
 52.902626,
 52.89521800000001,
 52.868988,
 52.852672999999996,
 52.811066000000004,
 52.779109999999996,
 52.77304,
 52.731167000000006,
 52.670704,
 52.662333999999994,
 52.620667000000005,
 52.612743,
 52.584669999999996,
 52.566722999999996,
 52.475216,
 52.428986,
 52.42276,
 52.420646999999995,
 52.38,
 52.344208,
 52.315533,
 52.309875,
 52.292396999999994,
 52.227932,
 52.16578,
 52.162647,
 52.159659999999995,
 52.13902,
 52.129852,
 52.125885,
 52.120678000000005,
 52.108288,
 52.083546000000005,
 52.071636,
 52.06754,
 52.048534000000004,
 52.026196,
 52.024937,
 51.973704999999995,
 51.932045,
 51.890766,
 51.869834

In [43]:
cols

DatetimeIndex(['1998-04-01', '1998-04-02', '1998-04-03', '1998-04-04',
               '1998-04-05', '1998-04-06', '1998-04-07', '1998-04-08',
               '1998-04-09', '1998-04-10',
               ...
               '1998-09-21', '1998-09-22', '1998-09-23', '1998-09-24',
               '1998-09-25', '1998-09-26', '1998-09-27', '1998-09-28',
               '1998-09-29', '1998-09-30'],
              dtype='datetime64[ns]', length=183, freq=None)

In [44]:
# make a df
df = pd.DataFrame()
df['dates'] = cols
df['max']  = years.to_list()
df['avg'] = means.to_list()
df['dif'] = df['max'] - df['avg']

In [45]:
type(df['dates'][0])

pandas._libs.tslibs.timestamps.Timestamp

In [46]:
type(dates[0][0])

str

In [47]:
# start and end of heat wave
start = dates[0]
end = dates[-1]

In [48]:
out = df[(df['dates'] >= start) & (df['dates'] <= end)]

In [49]:
# peak heat wave
out.max()

dates    1998-07-03 00:00:00
max                  54.3312
avg                  46.8626
dif                  9.01821
dtype: object

In [20]:
out[out['dif'] == out['dif'].max()]

Unnamed: 0,dates,max,avg,dif
121,2010-07-31,47.341,38.285811,9.055189


In [51]:
for dif in df.dif:
    print(dif)

-8.514848352941186
-7.632409029411768
-5.644966235294113
-3.406062205882357
-1.1614617941176348
0.9526582352941162
0.9823153823529509
0.5851120882353129
-1.471458558823521
2.31912011764706
3.466034558823523
4.373313264705878
-0.20679329411765934
2.978790294117651
-0.9681718235294099
-4.212847823529401
-3.0254702647058735
0.43199894117647375
1.9738507058823416
1.0314872647058806
2.929795352941177
0.9579803235294264
-2.3054202058823563
-5.5697241176470556
-1.7806481470588196
-0.2646762058823455
-0.6511630294117623
1.3561781764705785
3.3569348529411727
4.370823470588235
4.25089208823529
2.7447991470588207
-4.05382838235294
0.2782233529411755
-6.904281352941176
-2.9348011470588133
-0.8604608235294151
-1.2189000882352872
1.4717213823529391
-2.833015617647071
-5.668943441176495
-0.0652439705882486
1.1217894117647162
1.569347794117654
3.399449764705871
-1.2908185000000003
-1.6786421470588095
-3.401395147058828
0.8567205294117599
4.6025100882352845
9.018213352941174
7.08439814705882
6.00284214

In [52]:
df[df['dif'] > 8]

Unnamed: 0,dates,max,avg,dif
50,1998-05-21,53.958176,44.939963,9.018213
58,1998-05-29,53.392044,45.332771,8.059273
59,1998-05-30,53.647552,45.51961,8.127942
68,1998-06-08,54.331165,45.479782,8.851383


# Phoenix Caclulations
1990-6-26 Phoenix, AZ Weather History. https://www.wunderground.com/history/daily/KPHX/date/1990-6-26. Weather Underground. (June 10, 2021).<br>A. Minkler, At 118 degrees, Thursday heat in Phoenix breaks daily record set in 1934. The Arizona Republic (2020) (June 18, 2021).


In [None]:
# Functions
def c_to_f(C):
    "Convert HI C to F"
    return 1.8 * C + 32 

def hi_to_wbgt(HI):
    """ Convert HI to WBGT using emprical relationship from Bernard and Iheanacho 2015
    WBGT [◦C] = −0.0034 HI2 + 0.96 HI−34; for HI [◦F]
    """
    
    WBGT = -0.0034*HI**2 + 0.96*HI - 34
    
    return WBGT

In [None]:
# tmax was 122, rh was 11 using https://www.wpc.ncep.noaa.gov/html/heatindex.shtml
# then HI was 49 

# Check out cities

In [None]:
def select_city_year(df, city_id, year):
    "Quick search to find city and years within HI_STATS"
    df_out = df[(df['ID_HDC_G0'] == city_id) & (df['year'] == year)]
    
    return df_out


In [None]:
city = select_city_year(ALL_DATA, 4417, 2010)
city

# LA

In [None]:
from glob import glob 
import os

In [None]:
dir_list = '/home/cascade/projects/UrbanHeat/data/interim/ERA5_HI/'


In [None]:
fn = 'GHS-ERA5-HI_2009.csv'
data = pd.read_csv(dir_list+fn)
city = data[data['ID_HDC_G0'] == 14]

In [None]:
temps_list = []
dates_list = []
for fn in sorted(os.listdir(dir_list)):
    data = pd.read_csv(dir_list+fn)
    city = data[data['ID_HDC_G0'] == 14]
    dates = list(city.iloc[:,3:])
    temps = list(city.iloc[:,3:].values[0])
    dates_list.extend(dates)
    temps_list.extend(temps)

In [None]:
len(dates_list)

# Africa

In [None]:
#### Dependencies
import glob
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib import rcParams
import matplotlib.dates as mdates

In [None]:
def make_data(dir_in, geog, location):
    """Function makes data to plot daily city-level HI Max and average
    Args:
        dir_in = directory to get data
        geog = column for geography, city-level = 'ID_HDC_G0'
        location = usually a city id
    """
    
    fn_list = sorted(glob.glob(dir_in+'*.csv')) # get data
    df_out = pd.DataFrame() # to write dataframe
    
     # get leap year cols from 2016
    hi16 = pd.read_csv(fn_list[33]) 
    cols = list(hi16.iloc[:,3:].columns)
    cols = [year[5:] for year in cols] # cols for data frame
    
    temp_list = [] # empty list for temps
    
    # loop through dir and get data
    for i, fn in enumerate(fn_list):
        df = pd.read_csv(fn) # open data frame
        year_label = [(df.columns[3]).split('.')[0]] # get year
        row = df[df[geog] == location]
        temp = row.iloc[:,3:] # get only temp columns
        
        # add in col for leap years
        if temp.shape[1] == 365:
            temp.insert(loc = 59, column = year_label[0]+'.02.29', value = np.nan, allow_duplicates=False)

        # Set Index & Columns
        temp.index = year_label
        temp.columns = cols # revalue to m.d
    
        # add to list
        temp_list.append(temp)
    
    df_out = pd.concat(temp_list) # make one big dataframe
    
    return df_out

In [None]:
def plot_data(df, year, start, end):#, start, end):
    """ Make the data for a plot
    Args: 
        df = df w/ daily HI max for a given city
        year = year you want to plot against average
        start = start of plot in julian days (e.g 1 - 365/366)
        end = end of plot in julian days
    """

    # Deal with leap year
    if year % 4 !=0:
        df.drop(columns ='02.29', inplace = True)
    
    # Subset data
    start = start - 1 # zero indexing 
    subset = df.iloc[:,start:end]
    
    # HI Max for year
    hi_year = subset.loc[str(year)]
    
    # make 34-avg daily hi
    means = subset.mean(axis = 0)
    
    # make colums to date time
    cols = pd.to_datetime([str(year)+'.'+date for date in hi_year.index])
    
    return hi_year, means, cols

In [None]:
# Args
#[3342, 'Cairo'] #[1910, 'Accra'] 3268, Cape Town
#[4417, 'Aleppo'] 2010 [2046, 'Paris'] 2003 [9691, 'Kolkata'] 1998 ['6955, Dehli']

# Args
city_list = [9691, 'Kolkata'] 
year = 1998
font_size = 10


# April 1 to Sep 30 (Use Julian Days), or 1 - 182 lagos
start = 91 
end = 273

# Labels
hi_label = str(year)+' '+data
labels =  ['avg. '+data, hi_label, str(t)+''+DS+'C']  # <<<<<------------ Be sure to update! 

# Make Data 
df = make_data(DATA_IN, 'ID_HDC_G0', city_list[0])
years, means, cols = plot_data(df, year, start, end)

# Trend First vs. Second Half of Data

In [None]:
data.head()

In [None]:
#### Share of exposure due to heat by 17 year split

## 1983 - 1999
data1 = HI_STATS[(HI_STATS['year'] >= 1983) & (HI_STATS['year'] < 2000)]
coef1pop , r21pop, p1pop  = lm_func(data1 , 'people_days_pop')
coef1heat , r21heat, p1heat = lm_func(data1 , 'people_days_heat')

years = list(np.unique(data1['year']))
plt.plot(years, data1.groupby('year')['people_days_heat'].sum())
sns.regplot(years, data1.groupby('year')['people_days_heat'].sum(), 
            color = 'blue', scatter = False, truncate = True)

## 2000 - 2016
data2 = HI_STATS[(HI_STATS['year'] >= 2000) & (HI_STATS['year'] <= 2016)]
coef2heat , r22heat, p2heat = lm_func(data2 , 'people_days_heat')
coef2pop , r22pop, p1pop  = lm_func(data2 , 'people_days_pop')

years = list(np.unique(data1['year']))
plt.plot(years, data2.groupby('year')['people_days_heat'].sum())
sns.regplot(years, data2.groupby('year')['people_days_heat'].sum(), 
            color = 'orange', scatter = False, truncate = True)

In [None]:
## 2000 - 2016
data2pop = HI_STATS[(HI_STATS['year'] >= 1983) & (HI_STATS['year'] < 2000)]
coef2pop , r22pop, p1pop  = lm_func(data2pop , 'people_days_pop')

data2heat = HI_STATS[(HI_STATS['year'] >= 2000) & (HI_STATS['year'] <= 2016)]
coef2heat , r22heat, p2heat = lm_func(data2heat , 'people_days_heat')


In [None]:
## Estimates
print('From 83 - 99, contribution from heat was', coef1heat/(coef1pop+coef1heat))
print('From 00 - 16, contribution from heat was', coef2heat/(coef2pop+coef2heat))
print('From 83 - 00, heat was', coef1heat/10**9, round(p1heat, 3))
print('From 00 - 16, heat was', coef2heat/10**9)
print('From 83 - 00, pop was', coef1pop/10**9)
print('From 00 - 16, pop was', coef2pop/10**9)