# MS Calculations

Notebook to crunch numbers for the MS.

by Cascade Tuholske 2020.02.23 

Updated 2020.08.27 - CPT
Was run on ERA5 RH with CHIRTS-Daily Tmax from ERA5

In [2]:
#### Depdencies 
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns

In [3]:
#### Regressions, no intercept addition is needed because we're using SK LEARN HERE 

def lm_func(df, col):
    
    "simple linear model of a time series data, returns coef"
    
    # Get Data
    X_year = np.array(df.groupby('year')['ID_HDC_G0'].mean().index).reshape((-1, 1))
    Y_stats = np.array(df.groupby('year')[col].sum()).reshape((-1, 1))

    # Add Intercept
    X_year_2 = sm.add_constant(X_year)

    # Regress
    model = sm.OLS(Y_stats, X_year_2).fit() 
        
    coef = int(model.params[1])
    #coef = int(coef)
            
    # R2 and P
    r2 = model.rsquared_adj
    p = model.pvalues[0]
    
    return coef, round(r2, 2), round(p, 3)

In [4]:
#### Load Data
DATA = 'WBGT32_1D' # UPDATE 

# file paths
DATA_IN = "/home/cascade/projects/UrbanHeat/data/"  
FIG_OUT = "/home/cascade/projects/UrbanHeat/figures/"
FN_IN = 'processed/PNAS-DATA-v2/'+DATA+'_EXP.json'
HI_STATS = pd.read_json(DATA_IN+FN_IN, orient = 'split')

# Set scale
scale = 10**9


In [5]:
# Drop cites where 1983 had 1 day and none elsewhere

print(len(HI_STATS))
only83 = HI_STATS.groupby('ID_HDC_G0')['tot_days'].sum() == 1 # sum up total days and find those with 1 day
only83 = list(only83[only83 == True].index) # make a list of IDs
sub = HI_STATS[HI_STATS['ID_HDC_G0'].isin(only83)] # subset those IDs
bad_ids = sub[(sub['year'] == 1983) & (sub['tot_days'] == 1)] # drop those from 1983 only
drop_list = list(bad_ids['ID_HDC_G0']) # make a list
HI_STATS= HI_STATS[~HI_STATS['ID_HDC_G0'].isin(drop_list)] # drop those from the list
print(len(HI_STATS))

322082
321538


In [6]:
#### Add In Meta Data (e.g. geographic data)
meta_fn = DATA_IN+'interim/GHS-UCDB-IDS.csv'
meta_data = pd.read_csv(meta_fn)

#### Merge in meta
HI_STATS = HI_STATS.merge(meta_data, on = 'ID_HDC_G0', how = 'left')

# Global Trends

In [7]:
#### Total Change in people Days
data = HI_STATS.groupby('year')['people_days'].sum()
year = str(data.index[33])
value = str(data.values[33]/10**9)
print('person days in 2016 was '+value+' billion')

year = str(data.index[0])
value = str(data.values[0]/10**9)
print('person days in 1983 was '+value+' billion')

#### Pct Change in Poeple Days 1983 - 2016
pdays16 = data.iloc[len(data) -1]
pdays83 = data.iloc[0]
out = (data.iloc[len(data) -1] - data.iloc[0]) / data.iloc[0] * 100
print('pct increase in people days 83 - 16 is ', out)


person days in 2016 was 125.6137485348937 billion
person days in 1983 was 44.458472045749346 billion
pct increase in people days 83 - 16 is  182.54175808298746


In [20]:
#### Rate of change
data = HI_STATS
coef, r2, p = lm_func(data, 'people_days')
print('annual increase in people days ', 'was', coef/10**9, ' p=', p)
coef1, r21, p1 = lm_func(data, 'people_days_heat')
print('annual increase in people days heat ', 'was', coef1/10**9, ' p=', p)
coef2, r22, p2 = lm_func(data, 'people_days_pop')
print('annual increase in people days pop ', 'was', coef2/10**9, ' p=', p)
print('attrib heat ', 'was', coef1 / coef *100, ' p=', p, '\n')

annual increase in people days  was 2.217352888  p= 0.0
annual increase in people days heat  was 0.727434647  p= 0.0
annual increase in people days pop  was 1.48991824  p= 0.0
attrib heat  was 32.80644460955103  p= 0.0 



In [9]:
#### Pct Pday Annual Increase from Heat
coef_pdays, r2_pdays, p_pdays = lm_func(HI_STATS, 'people_days') # regress pdays
coef_heat, r2_heat, p_heat = lm_func(HI_STATS, 'people_days_heat') # regreas heat

print('warming is what pct of total?', coef_heat/coef_pdays *100)

warming is what pct of total? 32.807099734551066


# City-level

#### Largest cities compared to global total

In [21]:
#### Top cities
cities = pd.read_csv(DATA_PATH+'processed/PNAS-DATA-v2/WBGT32_1D_EXP-TOP50.csv')

In [22]:
top = cities.sort_values('coef_pdays', ascending = False).head(25) # get the top ten cities


In [23]:
# What pct of the global annual increase comes from the top ten cities?
ans = top['coef_pdays'].sum() / coef
print('Top 25 cities of total annual increase', ans * 100)

Top 25 cities of total annual increase 24.507376418058552


#### Pdays

In [5]:
city_coefs = pd.read_json(DATA_IN+'processed/PNAS-DATA-v2/WBGT32_1D_TREND_EXP05.json', orient = 'split')
GHS = gpd.read_file('/home/cascade/projects/UrbanHeat/data/raw/GHS_UCDB/GHS_STAT_UCDB2015MT_GLOBE_R2019A_V1_0.shp')

In [6]:
len(city_coefs)

6022

In [7]:
#### Number of cities w/ sig increase in exposure?
print('The pct of cities w/ increases in exposure: ', len(city_coefs)/len(GHS)*100)


The pct of cities w/ increases in exposure:  45.84697373429768


In [9]:
len(city_coefs)

6022

In [10]:
ans = len(city_coefs[(city_coefs['GCPNT_LAT'] < 23.5) & (city_coefs['GCPNT_LAT'] > -23.5)]) / len(city_coefs)
print('what pct of pday cities are low lat?', ans*100)

what pct of pday cities are low lat? 50.94652939222849


In [11]:
print('what pct of global pop are cities with sig pdays?')

what pct of global pop are cities with sig pdays?


In [12]:
def country_search(country, data_set):
    "what pct of cities had a p-day increase?"
    print('Num of Cities in '+country+' ', len(data_set[data_set['CTR_MN_NM'] == country]) / len(GHS[GHS['CTR_MN_NM'] == country]) *100)

In [13]:
data_set = city_coefs

In [14]:
country_search('Senegal', data_set)

Num of Cities in Senegal  93.93939393939394


In [15]:
country_search('Nigeria', data_set)

Num of Cities in Nigeria  91.92546583850931


In [16]:
country_search('India', data_set)

Num of Cities in India  86.48399014778325


#### Pct of global population exposured

In [20]:
city_coefs = pd.read_json(DATA_IN+'processed/PNAS-DATA-v2/WBGT32_1D_TREND_EXP05.json', orient = 'split')

In [21]:
pop = pd.read_csv(DATA_IN+'interim/GHS-UCDB-Interp.csv')
p16 = pop[['ID_HDC_G0', 'P2016']]

In [22]:
len(p16)

13135

In [23]:
pdays_pop = pd.merge(city_coefs[['ID_HDC_G0']], p16, on = 'ID_HDC_G0', how = 'inner')

In [24]:
ans = pdays_pop['P2016'].sum() / p16['P2016'].sum() * 100 
print('What is the global urban population in 2016', p16['P2016'].sum())
print('How many people live in cities with increasing exp in 2016', pdays_pop['P2016'].sum())
print('What pct of total urban pop has sig increase exp in 2015', ans)

What is the global urban population in 2016 3535326298.5424414
How many people live in cities with increasing exp in 2016 1661009084.2793088
What pct of total urban pop has sig increase exp in 2015 46.983190348345396


In [25]:
# From UN-DESA 2018 estimates for total global pop in 2015
ans =  pdays_pop['P2016'].sum() / 7383009000 * 100
print('What pct of total world pop has sig increase exp in 2016', ans)

What pct of total world pop has sig increase exp in 2016 22.49772530792403


In [None]:
# UN-DESA Urban pop in 2015 was  3 981 498
p16['P2016'].sum()

#### Total Heat Days 

In [31]:
city_totdays = pd.read_json(DATA_IN+'processed/PNAS-DATA-v2/WBGT32_1D_TREND_HEATP05.json', orient = 'split')

In [34]:
print('What pct of all cities had sig increase in days/yr > WBGT32 C ?')
print(len(city_totdays)/len(GHS))
print(len(city_totdays))

What pct of all cities had sig increase in days/yr > WBGT32 C ?
0.38797106966121053
5096


In [35]:
print('What pct of all cities >1 day / yr in days/yr > WBGT32 C ?')
print(len(city_totdays[city_totdays['coef_totDays'] >= 1])/len(GHS))
print(len(city_totdays[city_totdays['coef_totDays'] >= 1]))

What pct of all cities >1 day / yr in days/yr > WBGT32 C ?
0.1884278644842025
2475


In [46]:
## How many cities day increase per year ... 1, 3
top = len(city_totdays)
bottom = len(city_totdays[city_totdays['coef_totDays'] >= 2])

In [47]:
print(top)
print(bottom)

5096
66


In [59]:
## What are some big cities?
hot50 = city_totdays[(city_totdays['coef_totDays'] >= 1.5) * city_totdays['P2016'] >= 5*10**5][['coef_totDays', 'UC_NM_MN']].sort_values('coef_totDays')

In [60]:
len(hot50)

56

In [48]:
#### Columbo & San Salvador & Conakry
print('Conakry tot days:', city_totdays[city_totdays['ID_HDC_G0'] == 1502]['coef_totDays'].values)
print('columbo tot days:', city_totdays[city_totdays['ID_HDC_G0'] == 8835]['coef_totDays'].values)
print('San Sal tot days:', city_totdays[city_totdays['ID_HDC_G0'] == 321]['coef_totDays'].values)

Conakry tot days: []
columbo tot days: [0.1697479]
San Sal tot days: []


#### Dehli & Kolkata 

In [34]:
# Delhi 6955 & Kolkata 9691
K = city_coefs[city_coefs['ID_HDC_G0']== 9691]
D = city_coefs[city_coefs['ID_HDC_G0']== 6955]

In [36]:
print('Share of heat Kolkata', K.coef_heat / K.coef_pdays * 100)

Share of heat Kolkata 5599    50.402206
dtype: float64


In [38]:
print('Share of heat Delhi', D.coef_heat / D.coef_pdays * 100)

Share of heat Delhi 3201    23.607115
dtype: float64


#### Populations of specific cities

In [None]:
pop = pd.read_csv(DATA_IN+'interim/GHS-UCDB-Interp.csv')

In [None]:
# 9691, Kolkata 1998
# 2046, Paris 2003
# 4417, Aleppo 2010

In [None]:
ans = pop[pop['ID_HDC_G0'] == 9691]['P2015'] / 10**3
print('Pop of Kolkata in 1998', ans)

# Regional Trends

In [None]:
#### Annual Rates

scale = 10**6
geog = 'sub-region'

for label in np.unique(HI_STATS[geog]):
    label = label
    data = HI_STATS[HI_STATS[geog] == label]
    
    #### Rate of change
    coef, r2, p = lm_func(data, 'people_days')
    print('annual increase in people days '+label, 'was', coef/scale, ' p=', p)
    coef1, r21, p1 = lm_func(data, 'people_days_heat')
    print('annual increase in people days heat '+label, 'was', coef1/scale, ' p=', p)
    coef2, r22, p2 = lm_func(data, 'people_days_pop')
    print('annual increase in people days pop '+label, 'was', coef2/scale, ' p=', p)
    print('attrib heat '+label, 'was', coef1 / coef *100, ' p=', p, '\n')
  

In [None]:
#### Trends for Africa, N & SS
geog = 'region'
location = 'Africa'
data = HI_STATS[HI_STATS[geog] == location]
print(location)

#### Total Change in people Days
data = data.groupby('year')['people_days'].sum()
year = str(data.index[33])
value = str(data.values[33]/10**9)
print('person days in 2016 was '+value+' billion')

year = str(data.index[0])
value = str(data.values[0]/10**9)
print('person days in 1983 was '+value+' billion')

#### Pct Change in Poeple Days 1983 - 2016
pdays16 = data.iloc[len(data) -1]
pdays83 = data.iloc[0]
out = (data.iloc[len(data) -1] - data.iloc[0]) / data.iloc[0] * 100
print('pct increase in people days 83 - 16 is ', out)



In [None]:
#### S Asia as pct of total  global = 5.245146271 B 

print('pct of total pdays from S Asia is ', 1899.70765 / 10**3 / 5.245146271 * 100)

In [None]:
#### Median Slope
region = 'Europe'
col = 'coef_heat'
geog = 'region'
scale = 10**3
result = city_coefs[city_coefs[geog]== region][col].median()
print(region, col, 'is ', result/scale)

# Trend First vs. Second Half of Data

In [None]:
data.head()

In [None]:
#### Share of exposure due to heat by 17 year split

## 1983 - 1999
data1 = HI_STATS[(HI_STATS['year'] >= 1983) & (HI_STATS['year'] < 2000)]
coef1pop , r21pop, p1pop  = lm_func(data1 , 'people_days_pop')
coef1heat , r21heat, p1heat = lm_func(data1 , 'people_days_heat')

years = list(np.unique(data1['year']))
plt.plot(years, data1.groupby('year')['people_days_heat'].sum())
sns.regplot(years, data1.groupby('year')['people_days_heat'].sum(), 
            color = 'blue', scatter = False, truncate = True)

## 2000 - 2016
data2 = HI_STATS[(HI_STATS['year'] >= 2000) & (HI_STATS['year'] <= 2016)]
coef2heat , r22heat, p2heat = lm_func(data2 , 'people_days_heat')
coef2pop , r22pop, p1pop  = lm_func(data2 , 'people_days_pop')

years = list(np.unique(data1['year']))
plt.plot(years, data2.groupby('year')['people_days_heat'].sum())
sns.regplot(years, data2.groupby('year')['people_days_heat'].sum(), 
            color = 'orange', scatter = False, truncate = True)

In [None]:
## 2000 - 2016
data2pop = HI_STATS[(HI_STATS['year'] >= 1983) & (HI_STATS['year'] < 2000)]
coef2pop , r22pop, p1pop  = lm_func(data2pop , 'people_days_pop')

data2heat = HI_STATS[(HI_STATS['year'] >= 2000) & (HI_STATS['year'] <= 2016)]
coef2heat , r22heat, p2heat = lm_func(data2heat , 'people_days_heat')


In [None]:
## Estimates
print('From 83 - 99, contribution from heat was', coef1heat/(coef1pop+coef1heat))
print('From 00 - 16, contribution from heat was', coef2heat/(coef2pop+coef2heat))
print('From 83 - 00, heat was', coef1heat/10**9, round(p1heat, 3))
print('From 00 - 16, heat was', coef2heat/10**9)
print('From 83 - 00, pop was', coef1pop/10**9)
print('From 00 - 16, pop was', coef2pop/10**9)

# Heat Waves

- 9691 Kolkata 1998
- 2046 Paris 2003
- 4417, Aleppo 2010

In [None]:
# Find Heat Wave From All DATA
def select_city_year(df, city_id, year):
    "Quick search to find city and years within HI_STATS"
    df_out = df[(df['ID_HDC_G0'] == city_id) & (df['year'] == year)]
    
    return df_out

meta_fn = '/home/cascade/projects/UrbanHeat/data/processed/AllDATA-GHS-ERA5-HI406-META.csv'
ALL_DATA = pd.read_csv(meta_fn)

# [4417, 'Aleppo'] [2046, 'Paris'] [9691, 'Kolkata'] 
city = select_city_year(ALL_DATA, 4417, 2010)
city

In [None]:
# Make a def
df = pd.DataFrame()
df = df.append(means, ignore_index=True)
df = df.append(hi_year, ignore_index=True)
df.columns = cols
df

In [None]:
# Isloate daily HI vs averages 
# Kolkata df.iloc[:,54:72]
# Paris df.iloc[:,125:125+9] 
# Aleppo df.iloc[:,92 :145]
wave = df.iloc[:,92 :145]
wave = wave.transpose()
wave.columns = ['mean', 'HI']
wave['hi_dif'] = wave['HI'] - wave['mean'] 
# wave['>50'] = wave['HI'] - 50
print(wave['hi_dif'].mean())
wave

# Check out cities

In [None]:
def select_city_year(df, city_id, year):
    "Quick search to find city and years within HI_STATS"
    df_out = df[(df['ID_HDC_G0'] == city_id) & (df['year'] == year)]
    
    return df_out


In [None]:
city = select_city_year(ALL_DATA, 4417, 2010)
city

# LA

In [None]:
from glob import glob 
import os

In [None]:
dir_list = '/home/cascade/projects/UrbanHeat/data/interim/ERA5_HI/'


In [None]:
fn = 'GHS-ERA5-HI_2009.csv'
data = pd.read_csv(dir_list+fn)
city = data[data['ID_HDC_G0'] == 14]

In [None]:
temps_list = []
dates_list = []
for fn in sorted(os.listdir(dir_list)):
    data = pd.read_csv(dir_list+fn)
    city = data[data['ID_HDC_G0'] == 14]
    dates = list(city.iloc[:,3:])
    temps = list(city.iloc[:,3:].values[0])
    dates_list.extend(dates)
    temps_list.extend(temps)

In [None]:
len(dates_list)

# Africa

In [None]:
#### Total Change in people Days
print('For Africa ...')
data = HI_STATS[HI_STATS['region'] == 'Africa']
data = data.groupby('year')['people_days'].sum()
year = str(data.index[33])
value = str(data.values[33]/10**9)
print('person days in 2016 was '+value+' billion')

year = str(data.index[0])
value = str(data.values[0]/10**9)
print('person days in 1983 was '+value+' billion')

#### Pct Change in Poeple Days 1983 - 2016
pdays16 = data.iloc[len(data) -1]
pdays83 = data.iloc[0]
out = (data.iloc[len(data) -1] - data.iloc[0]) / data.iloc[0] * 100
print('pct increase in people days 83 - 16 is ', out)


In [None]:
print('What is the avg exp for Africa from 1986 - 2005')
data.iloc[3:23].mean() / 10**9 