# MS Calculations

Notebook to crunch numbers for the MS.

by Cascade Tuholske 2020.02.23 

Updated 2020.08.27 - CPT
Was run on ERA5 RH with CHIRTS-Daily Tmax from ERA5

In [1]:
#### Dependencies
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns

In [2]:
#### Regressions: statsmodels OLS does not add an intercept by default, so lm_func adds one with sm.add_constant

def lm_func(df, col):
    """Fit a simple linear trend to the yearly totals of ``col``.

    Rows of ``df`` are grouped by ``year`` and ``col`` is summed within each
    year; those totals are regressed on the year with statsmodels OLS, using
    an explicitly added intercept column.

    Args:
        df: DataFrame with a ``year`` column and the column named by ``col``.
        col: name of the column to total per year.

    Returns:
        (coef, r2, p): the slope truncated to an int, the adjusted R-squared
        rounded to 2 decimals, and the p-value of the slope rounded to
        3 decimals.
    """

    # Get Data: one total per year; the group keys are the regressor
    yearly = df.groupby('year')[col].sum()
    X_year = np.array(yearly.index).reshape((-1, 1))
    Y_stats = np.array(yearly).reshape((-1, 1))

    # Add Intercept: add_constant prepends the constant column, so index 0 of
    # the fitted params/pvalues is the intercept and index 1 is the slope
    X_year_2 = sm.add_constant(X_year)

    # Regress
    model = sm.OLS(Y_stats, X_year_2).fit()

    coef = int(model.params[1])

    # R2 and P: report the p-value of the slope (index 1), matching `coef`,
    # not the p-value of the intercept (index 0)
    r2 = model.rsquared_adj
    p = model.pvalues[1]

    return coef, round(r2, 2), round(p, 3)

In [3]:
#### Load Data

# file paths (local directories; every input file name below is relative to DATA_IN)
DATA_IN = "/home/cascade/projects/UrbanHeat/data/"
FIG_OUT = "/home/cascade/projects/UrbanHeat/figures/"
FN_IN = 'processed/AllDATA-GHS-ERA5-HI406-PDAYS.csv'

# Set scale
scale = 10**9

# STATS IN (read once)
HI_STATS = pd.read_csv(DATA_IN+FN_IN)

# GHS-UCDB IN
GHS_FN = 'raw/GHS_UCDB/GHS_STAT_UCDB2015MT_GLOBE_R2019A_V1_0.shp'
GHS = gpd.read_file(DATA_IN+GHS_FN)
print('Len of GHS-UCDB (all cities) is', len(GHS))

# Check it
HI_STATS.head()

Len of GHS-UCDB (all cities) is 13135


Unnamed: 0.1,Unnamed: 0,ID_HDC_G0,year,total_days,P,P1983,P2016,people_days,people_days_heat,people_days_pop
0,0,3091,1983,1,53715.040735,53715.040735,58301.880371,53715.040735,53715.040735,0.0
1,1,3091,1984,2,54204.006604,53715.040735,58301.880371,108408.013208,107430.08147,977.931738
2,2,3091,1985,2,54692.972473,53715.040735,58301.880371,109385.944946,107430.08147,1955.863477
3,3,3091,1986,2,55181.938342,53715.040735,58301.880371,110363.876685,107430.08147,2933.795215
4,4,3091,1988,1,56159.870081,53715.040735,58301.880371,56159.870081,53715.040735,2444.829346


In [4]:
# Remove cities whose whole record sums to a single day and whose 1983 row is that day

print(len(HI_STATS))

# City IDs whose total_days, summed over all of their rows, is exactly 1
days_per_city = HI_STATS.groupby('ID_HDC_G0')['total_days'].sum()
only83 = list(days_per_city[days_per_city == 1].index)

# Among those cities, the rows for 1983 with total_days == 1
sub = HI_STATS[HI_STATS['ID_HDC_G0'].isin(only83)]
is_1983 = sub['year'] == 1983
is_one_day = sub['total_days'] == 1
bad_ids = sub[is_1983 & is_one_day]
drop_list = list(bad_ids['ID_HDC_G0'])

# Keep every row whose city is not on the drop list
keep_mask = ~HI_STATS['ID_HDC_G0'].isin(drop_list)
HI_STATS = HI_STATS[keep_mask]
print(len(HI_STATS))

390524
390354


In [5]:
## Attach the geographic metadata columns to every row of HI_STATS
geog = ['region', 'intermediate-region', 'sub-region','CTR_MN_NM', 'ID_HDC_G0', 'GCPNT_LAT', 'GCPNT_LON']
meta_fn = 'processed/AllDATA-GHS-ERA5-HI406-META.csv'
ALL_DATA = pd.read_csv(DATA_IN + meta_fn)

# One metadata row per city: the first occurrence of each ID_HDC_G0
META = ALL_DATA[geog].drop_duplicates('ID_HDC_G0')

## Left join so that every HI_STATS row is kept
HI_STATS = pd.merge(HI_STATS, META, how='left', on='ID_HDC_G0')
HI_STATS.head()


Unnamed: 0.1,Unnamed: 0,ID_HDC_G0,year,total_days,P,P1983,P2016,people_days,people_days_heat,people_days_pop,region,intermediate-region,sub-region,CTR_MN_NM,GCPNT_LAT,GCPNT_LON
0,0,3091,1983,1,53715.040735,53715.040735,58301.880371,53715.040735,53715.040735,0.0,Europe,Eastern Europe,Eastern Europe,Poland,54.088065,18.783016
1,1,3091,1984,2,54204.006604,53715.040735,58301.880371,108408.013208,107430.08147,977.931738,Europe,Eastern Europe,Eastern Europe,Poland,54.088065,18.783016
2,2,3091,1985,2,54692.972473,53715.040735,58301.880371,109385.944946,107430.08147,1955.863477,Europe,Eastern Europe,Eastern Europe,Poland,54.088065,18.783016
3,3,3091,1986,2,55181.938342,53715.040735,58301.880371,110363.876685,107430.08147,2933.795215,Europe,Eastern Europe,Eastern Europe,Poland,54.088065,18.783016
4,4,3091,1988,1,56159.870081,53715.040735,58301.880371,56159.870081,53715.040735,2444.829346,Europe,Eastern Europe,Eastern Europe,Poland,54.088065,18.783016


# Global Trends

In [6]:
#### Global people-days: yearly totals at the two ends of the record
data = HI_STATS.groupby('year')['people_days'].sum()

# Print the yearly total (divided by 10**9) at position 33 and at position 0.
# After the loop `year` and `value` hold the position-0 strings.
for pos, msg in ((33, 'person days in 2016 was '), (0, 'person days in 1983 was ')):
    year = str(data.index[pos])
    value = str(data.values[pos]/10**9)
    print(msg+value+' billion')

#### Percent change in people-days between the first and last yearly totals
pdays16 = data.iloc[-1]
pdays83 = data.iloc[0]
out = (pdays16 - pdays83) / pdays83 * 100
print('pct increase in people days 83 - 16 is ', out)


person days in 2016 was 341.57494466059256 billion
person days in 1983 was 148.1538984504178 billion
pct increase in people days 83 - 16 is  130.55413879298382


In [7]:
#### Rate of change
# Linear trend of the yearly totals for all people-days, the heat part and the
# pop part; each line reports the p-value returned by its own lm_func call.
data = HI_STATS
coef, r2, p = lm_func(data, 'people_days')
print('annual increase in people days ', 'was', coef/10**9, ' p=', p)
coef1, r21, p1 = lm_func(data, 'people_days_heat')
print('annual increase in people days heat ', 'was', coef1/10**9, ' p=', p1)
coef2, r22, p2 = lm_func(data, 'people_days_pop')
print('annual increase in people days pop ', 'was', coef2/10**9, ' p=', p2)
# Heat slope as a percentage of the total slope. No test is run on the ratio
# itself; the p-value printed here is the one from the 'people_days' fit.
print('attrib heat ', 'was', coef1 / coef *100, ' p=', p, '\n')

annual increase in people days  was 5.593851065  p= 0.0
annual increase in people days heat  was 1.137151086  p= 0.0
annual increase in people days pop  was 4.456699978  p= 0.0
attrib heat  was 20.328590675483063  p= 0.0 



In [8]:
#### Heat slope as a percentage of the total people-days slope
# Fit the trend of the yearly totals for both columns
coef_pdays, r2_pdays, p_pdays = lm_func(HI_STATS, 'people_days')
coef_heat, r2_heat, p_heat = lm_func(HI_STATS, 'people_days_heat')

heat_share = coef_heat/coef_pdays *100
print('warming is what pct of total?', heat_share)

warming is what pct of total? 20.328590675483063


# Africa

In [9]:
#### Africa people-days: yearly totals at the two ends of the record
print('For Africa ...')
in_africa = HI_STATS['region'] == 'Africa'
data = HI_STATS[in_africa]
data = data.groupby('year')['people_days'].sum()

# Print the yearly total (divided by 10**9) at position 33 and at position 0.
# After the loop `year` and `value` hold the position-0 strings.
for pos, msg in ((33, 'person days in 2016 was '), (0, 'person days in 1983 was ')):
    year = str(data.index[pos])
    value = str(data.values[pos]/10**9)
    print(msg+value+' billion')

#### Percent change in people-days between the first and last yearly totals
pdays16 = data.iloc[-1]
pdays83 = data.iloc[0]
out = (pdays16 - pdays83) / pdays83 * 100
print('pct increase in people days 83 - 16 is ', out)


For Africa ...
person days in 2016 was 56.29063764754387 billion
person days in 1983 was 17.243777295861413 billion
pct increase in people days 83 - 16 is  226.4402960078468


In [10]:
# Mean of the yearly totals at positions 3..22 of `data`, divided by 10**9.
# NOTE(review): relies on `data` still being the Africa yearly series from the
# previous cell; positions 3..22 map to 1986..2005 only if the index starts at
# 1983 with no missing years -- TODO confirm.
print('What is the avg exp for Africa from 1986 - 2005')
data.iloc[3:23].mean() / 10**9 

What is the avg exp for Africa from 1986 - 2005


27.491700539359275

# City-level

#### Largest cities compared to global total

In [11]:
#### Top cities
# NOTE(review): DATA_IN already ends with '/', so this path contains '//processed'.
cities = pd.read_csv(DATA_IN+'/processed/AllDATA-GHS-ERA5-HI406-MAPDATA_PDAYS_P05.csv')
# NOTE(review): same shapefile name as GHS_FN but under a different absolute root
# than DATA_IN; presumably the same data as `GHS` -- TODO confirm and reuse `GHS`.
ghs = gpd.read_file('/home/cascade/tana-crunch-cascade/projects/UrbanHeat/data/raw/GHS_UCDB/GHS_STAT_UCDB2015MT_GLOBE_R2019A_V1_0.shp')

In [12]:
# City ID -> name lookup taken from the GHS-UCDB table
nms = ghs[['ID_HDC_G0', 'UC_NM_MN']]
top = cities.sort_values('coef_pdays', ascending = False).head(25) # the 25 cities with the largest coef_pdays
# Inner join: a city is kept only if its ID is also present in `nms`
top = top.merge(nms, on = 'ID_HDC_G0', how = 'inner')

In [13]:
# What pct of the global annual increase comes from the cities in `top`?
# `coef` is expected to be the global people_days slope from the 'Rate of change'
# cell; it is rebound by later cells, so run this cell in notebook order.
ans = top['coef_pdays'].sum() / coef
# Report the actual number of cities summed instead of a hard-coded count
print('Top', len(top), 'cities of total annual increase', ans * 100)

Top 10 cities of total annual increase 23.15661343157017


In [14]:
# Mumbai: 38 expressed as a percentage of 62
# NOTE(review): the source of these two numbers is not recorded in this cell -- TODO document
100 * 38 / 62

61.29032258064516

#### Pdays

In [15]:
city_coefs = pd.read_csv(DATA_IN+'processed/AllDATA-GHS-ERA5-HI406-MAPDATA_PDAYS_P05.csv')

In [16]:
len(city_coefs)

7931

In [17]:
#### Rows in city_coefs as a percentage of all GHS-UCDB cities
pct_of_all_cities = len(city_coefs)/len(GHS)*100
print('The pct of cities w/ increases in exposure: ', pct_of_all_cities)


The pct of cities w/ increases in exposure:  60.38066235249334


In [18]:
city_coefs.columns

Index(['Unnamed: 0', 'ID_HDC_G0', 'coef_pdays', 'p_value_pdays', 'coef_heat',
       'p_value_heat', 'coef_pop', 'p_value_pop', 'coef_totDays',
       'p_value_totDays', 'coef_attrib', 'coef_attrib_norm', 'region',
       'intermediate-region', 'sub-region', 'CTR_MN_NM', 'GCPNT_LAT',
       'GCPNT_LON', 'P1983', 'P2016'],
      dtype='object')

In [19]:
len(city_coefs)

7931

In [20]:
# Fraction of city_coefs rows with latitude strictly between -23.5 and 23.5
lat = city_coefs['GCPNT_LAT']
is_low_lat = (lat < 23.5) & (lat > -23.5)
ans = len(city_coefs[is_low_lat]) / len(city_coefs)
print('what pct of pday cities are low lat?', ans*100)

what pct of pday cities are low lat? 52.96936073635102


In [21]:
print('what pct of global pop are cities with sig pdays?')

what pct of global pop are cities with sig pdays?


In [None]:
def country_search(country, data_set):
    """Print the share of a country's GHS cities that appear in ``data_set``.

    The printed number is a percentage (rows of ``data_set`` for the country
    divided by rows of the notebook-level ``GHS`` table for the country,
    times 100), even though the label reads 'Num of Cities'.

    Args:
        country: value to match against the ``CTR_MN_NM`` column.
        data_set: DataFrame with a ``CTR_MN_NM`` column.

    Raises:
        ValueError: if ``GHS`` has no row for ``country`` (for example a
            misspelled name), instead of an opaque ZeroDivisionError.
    """
    n_found = len(data_set[data_set['CTR_MN_NM'] == country])
    n_total = len(GHS[GHS['CTR_MN_NM'] == country])
    if n_total == 0:
        raise ValueError('No GHS cities found for country ' + repr(country))
    print('Num of Cities in '+country+' ', n_found / n_total *100)

In [None]:
data_set = city_coefs

In [None]:
country_search('Senegal', data_set)

In [None]:
country_search('Nigeria', data_set)

In [None]:
country_search('India', data_set)

#### Pct of global population exposed

In [33]:
city_coefs = pd.read_csv(DATA_IN+'processed/AllDATA-GHS-ERA5-HI406-MAPDATA_PDAYS_P05.csv')

In [41]:
pop = pd.read_csv(DATA_IN+'interim/GHS-UCDB-Interp.csv')
p16 = pop[['ID_HDC_G0', 'P2016']]

In [42]:
len(p16)

13135

In [43]:
pdays_pop = pd.merge(city_coefs[['ID_HDC_G0']], p16, on = 'ID_HDC_G0', how = 'inner')

In [47]:
# Share of the summed P2016 population that lives in the cities listed in
# city_coefs. Both terms use the P2016 column, so every label says 2016.
urban_pop16 = p16['P2016'].sum()
exposed_pop16 = pdays_pop['P2016'].sum()
ans = exposed_pop16 / urban_pop16 * 100
print('What is the global urban population in 2016', urban_pop16)
print('How many people live in cities with increasing exp in 2016', exposed_pop16)
print('What pct of total urban pop has sig increase exp in 2016', ans)

What is the global population in 2016 3535326298.5424414
How many people live in cities with increasing exp in 2016 2263351715.490597
What pct of total urban pop has sig increase exp in 2015 64.02101317843676


In [49]:
# From UN-DESA 2018 estimates for total global pop in 2015
# NOTE(review): the denominator is described as a 2015 estimate while the numerator
# sums P2016 and the label says 2016 -- confirm which year should be reported.
ans =  pdays_pop['P2016'].sum() / 7383009000 * 100
print('What pct of total world pop has sig increase exp in 2016', ans)

What pct of total world pop has sig increase exp in 2016 30.65622316714766


In [50]:
# UN-DESA Urban pop in 2015 was  3 981 498
# NOTE(review): presumably the UN figure is in thousands -- TODO confirm.
# Sum of the P2016 column, shown for comparison with that figure
p16['P2016'].sum()

3535326298.5424414

#### Total Heat Days 

In [None]:
city_totdays = pd.read_csv(DATA_IN+'processed/AllDATA-GHS-ERA5-HI406-MAPDATA_TOTDAYS_P05.csv')

In [None]:
# Rows in city_totdays as a percentage of all GHS-UCDB cities (times 100 so the
# value matches the 'pct' label), followed by the raw count
print('What pct of all cities had sig increase in days/yr > 40.6 C Hi?')
print(len(city_totdays)/len(GHS)*100)
print(len(city_totdays))

In [None]:
# Cities whose coef_totDays is at least 1, as a percentage of all GHS-UCDB cities
# (times 100 so the value matches the 'pct' label), followed by the raw count.
# NOTE(review): the label says '>1 day / yr' but the filter keeps values >= 1.
at_least_one = city_totdays[city_totdays['coef_totDays'] >= 1]
print('What pct of all cities >1 day / yr in days/yr > 40.6 C Hi?')
print(len(at_least_one)/len(GHS)*100)
print(len(at_least_one))

In [None]:
## How many cities day increase per year ... 1, 3
# NOTE(review): the threshold used below is 2, which is neither of the values
# named in the comment above -- confirm the intended cut-off.
# NOTE(review): `top` rebinds the name used for the top-cities DataFrame earlier in
# the notebook; re-running that section after this cell will fail.
top = len(city_totdays)
bottom = len(city_totdays[city_totdays['coef_totDays'] >= 2])

In [None]:
print(top)
print(bottom)

In [None]:
#### Colombo & San Salvador & Conakry
# Print the coef_totDays values for three city IDs, one line per city
for city_label, city_id in (('Conakry tot days:', 1502),
                            ('columbo tot days:', 8835),
                            ('San Sal tot days:', 321)):
    city_rows = city_totdays[city_totdays['ID_HDC_G0'] == city_id]
    print(city_label, city_rows['coef_totDays'].values)

#### Populations of specific cities

In [52]:
pop = pd.read_csv(DATA_IN+'interim/GHS-UCDB-Interp.csv')

In [None]:
# 9691, Kolkata 1998
# 2046, Paris 2003
# 4417, Aleppo 2010

In [59]:
# Population column for city 9691 (Kolkata), divided by 10**3.
# The label is built from the same year as the column so the two cannot disagree.
# NOTE(review): the column that was read is P2015; if the 1998 heat-wave year was
# intended, set pop_year = 1998 (assumes a P1998 column exists -- TODO confirm).
pop_year = 2015
ans = pop[pop['ID_HDC_G0'] == 9691]['P' + str(pop_year)] / 10**3
print('Pop of Kolkata in ' + str(pop_year), ans)

Pop of Kolkata in 1998 7475    21620.289279
Name: P2015, dtype: float64


# Regional Trends

In [None]:
#### Annual Rates
# For every sub-region: linear trend of the yearly totals for all people-days,
# the heat part and the pop part, each printed with the p-value of its own fit.
# Note: this cell rebinds the notebook-level names `scale`, `geog`, `data` and
# `coef`, so cells that rely on their earlier values must be run before it.

scale = 10**6
geog = 'sub-region'

for label in np.unique(HI_STATS[geog]):
    data = HI_STATS[HI_STATS[geog] == label]

    #### Rate of change
    coef, r2, p = lm_func(data, 'people_days')
    print('annual increase in people days '+label, 'was', coef/scale, ' p=', p)
    coef1, r21, p1 = lm_func(data, 'people_days_heat')
    print('annual increase in people days heat '+label, 'was', coef1/scale, ' p=', p1)
    coef2, r22, p2 = lm_func(data, 'people_days_pop')
    print('annual increase in people days pop '+label, 'was', coef2/scale, ' p=', p2)
    # Heat slope as a percentage of the total slope; the p-value shown is the
    # one from the 'people_days' fit (no test is run on the ratio itself)
    print('attrib heat '+label, 'was', coef1 / coef *100, ' p=', p, '\n')
  

In [None]:
#### Trends for Africa, N & SS
# Pick the rows whose `geog` column equals `location`
geog = 'region'
location = 'Africa'
data = HI_STATS.loc[HI_STATS[geog] == location]
print(location)

#### Yearly people-days totals at the two ends of the record
data = data.groupby('year')['people_days'].sum()

# Print the yearly total (divided by 10**9) at position 33 and at position 0.
# After the loop `year` and `value` hold the position-0 strings.
for pos, msg in ((33, 'person days in 2016 was '), (0, 'person days in 1983 was ')):
    year = str(data.index[pos])
    value = str(data.values[pos]/10**9)
    print(msg+value+' billion')

#### Percent change in people-days between the first and last yearly totals
pdays16 = data.iloc[-1]
pdays83 = data.iloc[0]
out = (pdays16 - pdays83) / pdays83 * 100
print('pct increase in people days 83 - 16 is ', out)



In [None]:
#### S Asia as pct of total  global = 5.245146271 B 
# Both numbers are hard-coded: 1899.70765 is divided by 10**3 and then by 5.245146271.
# NOTE(review): the global value used here (5.245146271) differs from the
# 5.593851065 printed by the 'Rate of change' cell earlier in the notebook --
# presumably stale; TODO recompute from the regression results.

print('pct of total pdays from S Asia is ', 1899.70765 / 10**3 / 5.245146271 * 100)

In [None]:
#### Median Slope
region = 'Europe'
col = 'coef_heat'
geog = 'region'
scale = 10**3

# Median of `col` over the rows whose `geog` value equals `region`
in_region = city_coefs[geog] == region
result = city_coefs.loc[in_region, col].median()
print(region, col, 'is ', result/scale)

# Trend First vs. Second Half of Data

In [None]:
data.head()

In [None]:
#### Share of exposure due to heat by 17 year split
# Fit the pop and heat trends separately on each half of the record and draw
# the yearly heat totals with a fitted line for each half.

## 1983 - 1999
data1 = HI_STATS[(HI_STATS['year'] >= 1983) & (HI_STATS['year'] < 2000)]
coef1pop , r21pop, p1pop  = lm_func(data1 , 'people_days_pop')
coef1heat , r21heat, p1heat = lm_func(data1 , 'people_days_heat')

# Take the x values from the grouped index so x and y always line up
heat1 = data1.groupby('year')['people_days_heat'].sum()
years = list(heat1.index)
plt.plot(years, heat1)
# x and y are passed by keyword: recent seaborn releases no longer accept them positionally
sns.regplot(x = years, y = heat1,
            color = 'blue', scatter = False, truncate = True)

## 2000 - 2016
data2 = HI_STATS[(HI_STATS['year'] >= 2000) & (HI_STATS['year'] <= 2016)]
coef2heat , r22heat, p2heat = lm_func(data2 , 'people_days_heat')
# Store the second-half p-value under its own name so p1pop is not overwritten
coef2pop , r22pop, p2pop  = lm_func(data2 , 'people_days_pop')

# Second-half totals are plotted against the second-half years
heat2 = data2.groupby('year')['people_days_heat'].sum()
years = list(heat2.index)
plt.plot(years, heat2)
sns.regplot(x = years, y = heat2,
            color = 'orange', scatter = False, truncate = True)

In [None]:
## 2000 - 2016
# Both second-half slopes are fitted on the 2000-2016 rows, so coef2pop and
# coef2heat describe the same period.
data2pop = HI_STATS[(HI_STATS['year'] >= 2000) & (HI_STATS['year'] <= 2016)]
# The p-value goes to p2pop so the first-half p1pop is left untouched
coef2pop , r22pop, p2pop  = lm_func(data2pop , 'people_days_pop')

data2heat = HI_STATS[(HI_STATS['year'] >= 2000) & (HI_STATS['year'] <= 2016)]
coef2heat , r22heat, p2heat = lm_func(data2heat , 'people_days_heat')


In [None]:
## Estimates
# Heat slope as a fraction of the combined (pop + heat) slope, for each half
print('From 83 - 99, contribution from heat was', coef1heat/(coef1pop+coef1heat))
print('From 00 - 16, contribution from heat was', coef2heat/(coef2pop+coef2heat))
# Slopes divided by 10**9; the first half covers 1983-1999 (year < 2000), so
# every first-half label reads '83 - 99'
print('From 83 - 99, heat was', coef1heat/10**9, round(p1heat, 3))
print('From 00 - 16, heat was', coef2heat/10**9)
print('From 83 - 99, pop was', coef1pop/10**9)
print('From 00 - 16, pop was', coef2pop/10**9)

# Heat Waves

- 9691 Kolkata 1998
- 2046 Paris 2003
- 4417, Aleppo 2010

In [None]:
# Find Heat Wave From All DATA
def select_city_year(df, city_id, year):
    """Return the rows of ``df`` for one city in one year.

    Args:
        df: DataFrame with ``ID_HDC_G0`` and ``year`` columns.
        city_id: value to match in ``ID_HDC_G0``.
        year: value to match in ``year``.

    Returns:
        The matching rows of ``df`` (empty if there are none).
    """
    same_city = df['ID_HDC_G0'] == city_id
    same_year = df['year'] == year
    return df[same_city & same_year]

# NOTE(review): this absolute path is the same file as DATA_IN + the relative meta_fn
# used in the 'Add In Meta Data' cell, so ALL_DATA is read a second time here and
# meta_fn is rebound to the absolute form.
meta_fn = '/home/cascade/projects/UrbanHeat/data/processed/AllDATA-GHS-ERA5-HI406-META.csv'
ALL_DATA = pd.read_csv(meta_fn)

# [4417, 'Aleppo'] [2046, 'Paris'] [9691, 'Kolkata'] 
# Rows of ALL_DATA for city 4417 in 2010
city = select_city_year(ALL_DATA, 4417, 2010)
city

In [None]:
# Make a def
# NOTE(review): `means`, `hi_year` and `cols` are not defined in the part of the
# notebook reviewed, so this cell raises NameError on a fresh kernel unless they
# are created elsewhere -- TODO confirm where they come from.
# NOTE(review): DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# on a current pandas these two calls need pd.concat instead.
# Stack the two records as rows, then rename the columns positionally to `cols`.
df = pd.DataFrame()
df = df.append(means, ignore_index=True)
df = df.append(hi_year, ignore_index=True)
df.columns = cols
df

In [None]:
# Isolate the daily HI columns for one heat wave and compare them with the averages.
# Column slices used per city:
#   Kolkata  df.iloc[:, 54:72]
#   Paris    df.iloc[:, 125:125+9]
#   Aleppo   df.iloc[:, 92:145]
wave = df.iloc[:, 92:145].T

# After transposing, the rows of `df` become the columns; exactly two are expected
wave.columns = ['mean', 'HI']
wave['hi_dif'] = wave['HI'].sub(wave['mean'])
print(wave['hi_dif'].mean())
wave

# Check out cities

In [None]:
def select_city_year(df, city_id, year):
    """Return the rows of ``df`` whose city ID and year both match.

    Note: this has the same name and behaviour as the ``select_city_year``
    defined in the 'Heat Waves' section and rebinds it.

    Args:
        df: DataFrame with ``ID_HDC_G0`` and ``year`` columns.
        city_id: value to match in ``ID_HDC_G0``.
        year: value to match in ``year``.

    Returns:
        The matching rows of ``df`` (empty if there are none).
    """
    wanted = (df['ID_HDC_G0'] == city_id) & (df['year'] == year)
    df_out = df[wanted]
    return df_out


In [None]:
city = select_city_year(ALL_DATA, 4417, 2010)
city

# LA

In [None]:
from glob import glob 
import os

In [None]:
dir_list = '/home/cascade/projects/UrbanHeat/data/interim/ERA5_HI/'


In [None]:
# Read one file from dir_list and keep the rows for city ID 14
# (presumably Los Angeles, given the 'LA' section heading -- TODO confirm)
fn = 'GHS-ERA5-HI_2009.csv'
data = pd.read_csv(dir_list+fn)
city = data[data['ID_HDC_G0'] == 14]

In [None]:
# Collect the column labels and the first-row values (from the 4th column on) for
# city ID 14 from every file in dir_list, visiting the files in sorted name order
temps_list, dates_list = [], []
for fn in sorted(os.listdir(dir_list)):
    data = pd.read_csv(dir_list+fn)
    city = data[data['ID_HDC_G0'] == 14]
    daily = city.iloc[:, 3:]
    dates = list(daily)             # column labels
    temps = list(daily.values[0])   # values of the first matching row
    dates_list += dates
    temps_list += temps

In [None]:
len(dates_list)