# MS Calculations

Notebook to crunch numbers for the MS.

by Cascade Tuholske 2020.02.23

In [1]:
#### Depdencies 
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns

In [2]:
#### Regressions, no intercept addition is needed because we're using SK LEARN HERE 

def lm_func(df, col):
    
    "simple linear model of a time series data, returns coef"
    
    # Get Data
    X_year = np.array(df.groupby('year')['ID_HDC_G0'].mean().index).reshape((-1, 1))
    Y_stats = np.array(df.groupby('year')[col].sum()).reshape((-1, 1))

    # Add Intercept
    X_year_2 = sm.add_constant(X_year)

    # Regress
    model = sm.OLS(Y_stats, X_year_2).fit() 
        
    coef = int(model.params[1])
    #coef = int(coef)
            
    # R2 and P
    r2 = model.rsquared_adj
    p = model.pvalues[0]
    
    return coef, round(r2, 2), round(p, 3)

In [3]:
#### Load Data

# file paths
DATA_IN = "/home/cascade/projects/UrbanHeat/data/"  # Note: Need ?dl=1 to make sure this file gets read correctly
FIG_OUT = "/home/cascade/projects/UrbanHeat/figures/"
FN_IN = 'processed/AllDATA-GHS-ERA5-HI406-FIGDATA.csv'
HI_STATS = pd.read_csv(DATA_IN+FN_IN)

# Set scale
scale = 10**9

# STATS IN
HI_STATS = pd.read_csv(DATA_IN+FN_IN)

# GHS-UCDB IN
GHS_FN = 'raw/GHS_UCDB/GHS_STAT_UCDB2015MT_GLOBE_R2019A_V1_0.shp'
GHS = gpd.read_file(DATA_IN+GHS_FN)
print('Len of GHS-UCDB (all cities) is', len(GHS))

# Check it
HI_STATS.head()

Len of GHS-UCDB (all cities) is 13135


Unnamed: 0.1,Unnamed: 0,ID_HDC_G0,year,total_days,P,P1983,P2016,people_days,people_days_heat,people_days_pop
0,0,5645,1983,1,80796.79381,80796.79381,96123.248169,80796.79381,80796.79381,0.0
1,1,5645,1989,2,78918.494262,80796.79381,96123.248169,157836.988523,161593.58762,-3756.599097
2,2,5645,1990,3,78605.444337,80796.79381,96123.248169,235816.333011,242390.38143,-6574.04842
3,3,5645,1994,1,76481.266889,80796.79381,96123.248169,76481.266889,80796.79381,-4315.526921
4,4,5645,2005,1,80904.416534,80796.79381,96123.248169,80904.416534,80796.79381,107.622724


In [4]:
#### Drop cities where P1983 == 0
print(len(HI_STATS))
HI_STATS = HI_STATS[HI_STATS['P1983'] > 0]
print(len(HI_STATS))

#### Drop cities with only one Tmax Day in 1983 and none else because you cannot regress them
#drop_list = [2543, 2560, 3667, 3669, 6122, 6156] # city ids for MERRA2 
drop_list = [832, 1732, 5521] # city IDS for ERA5

HI_STATS= HI_STATS[~HI_STATS['ID_HDC_G0'].isin(drop_list)]
print(len(HI_STATS))

386920
385526
385424


In [7]:
## Add In Meta Data
geog = ['region', 'intermediate-region', 'sub-region','CTR_MN_NM', 'ID_HDC_G0', 'GCPNT_LAT', 'GCPNT_LON']
meta_fn = 'processed/AllDATA-GHS-ERA5-HI406-META.csv'
ALL_DATA = pd.read_csv(DATA_IN+meta_fn)
META = ALL_DATA[geog]
META = META.drop_duplicates('ID_HDC_G0')

## Merge in meta
HI_STATS= HI_STATS.merge(META, on = 'ID_HDC_G0', how = 'left')
HI_STATS.head()


Unnamed: 0.1,Unnamed: 0,ID_HDC_G0,year,total_days,P,P1983,P2016,people_days,people_days_heat,people_days_pop,region,intermediate-region,sub-region,CTR_MN_NM,GCPNT_LAT,GCPNT_LON
0,0,5645,1983,1,80796.79381,80796.79381,96123.248169,80796.79381,80796.79381,0.0,Europe,Eastern Europe,Eastern Europe,Russia,66.083799,76.64658
1,1,5645,1989,2,78918.494262,80796.79381,96123.248169,157836.988523,161593.58762,-3756.599097,Europe,Eastern Europe,Eastern Europe,Russia,66.083799,76.64658
2,2,5645,1990,3,78605.444337,80796.79381,96123.248169,235816.333011,242390.38143,-6574.04842,Europe,Eastern Europe,Eastern Europe,Russia,66.083799,76.64658
3,3,5645,1994,1,76481.266889,80796.79381,96123.248169,76481.266889,80796.79381,-4315.526921,Europe,Eastern Europe,Eastern Europe,Russia,66.083799,76.64658
4,4,5645,2005,1,80904.416534,80796.79381,96123.248169,80904.416534,80796.79381,107.622724,Europe,Eastern Europe,Eastern Europe,Russia,66.083799,76.64658


# Global Trends

In [None]:
#### Total Change in people Days
data = HI_STATS.groupby('year')['people_days'].sum()
year = str(data.index[33])
value = str(data.values[33]/10**9)
print('person days in 2016 was '+value+' billion')

year = str(data.index[0])
value = str(data.values[0]/10**9)
print('person days in 1983 was '+value+' billion')

#### Pct Change in Poeple Days 1983 - 2016
pdays16 = data.iloc[len(data) -1]
pdays83 = data.iloc[0]
out = (data.iloc[len(data) -1] - data.iloc[0]) / data.iloc[0] * 100
print('pct increase in people days 83 - 16 is ', out)

#### Rate of change
data = HI_STATS
coef, r2, p = lm_func(data, 'people_days')
print('annual increase in people days ', 'was', coef/10**9, ' p=', p)
coef1, r21, p1 = lm_func(data, 'people_days_heat')
print('annual increase in people days heat ', 'was', coef1/10**9, ' p=', p)
coef2, r22, p2 = lm_func(data, 'people_days_pop')
print('annual increase in people days pop ', 'was', coef2/10**9, ' p=', p)
print('attrib heat ', 'was', coef1 / coef *100, ' p=', p, '\n')

In [None]:
#### Pct Pday Annual Increase from Heat
coef_pdays, r2_pdays, p_pdays = lm_func(HI_STATS, 'people_days') # regress pdays
coef_heat, r2_heat, p_heat = lm_func(HI_STATS, 'people_days_heat') # regreas heat

print('warming is what pct of total?', coef_heat/coef_pdays *100)

# City-level

#### Pdays

In [None]:
city_coefs = pd.read_csv(DATA_IN+'processed/AllDATA-GHS-ERA5-HI406-MAPDATA_PDAYS_P05.csv')

In [None]:
len(city_coefs)

In [None]:
#### Number of cities w/ sig increase in exposure?
print('The pct of cities w/ increases in exposure: ', len(city_coefs)/len(GHS)*100)


In [None]:
#### Top Fifty Cities Worldwide Pdays
data = city_coefs[['ID_HDC_G0', 'CTR_MN_NM', 'coef_pdays']].sort_values('coef_pdays', ascending = False)
rank = list(range(1,len(data)+1))
data['rank'] = rank
data.head(50) # remove ; to show
#round(data['coef_pdays'].head(5)/10**6, 0) # remove ; to show

In [None]:
def country_search(country):
    "what pct of cities had a p-day increase?"
    print('Num of Cities in '+country+' ', len(data[data['CTR_MN_NM'] == country]) / len(GHS[GHS['CTR_MN_NM'] == country]) *100)

In [None]:
cities = data.head(50)
cities[cities['CTR_MN_NM'] == 'India']

In [None]:
country_search('Senegal')

In [None]:
country_search('Nigeria')

1.502437e+07 / 10**6

In [None]:
1.502437e+07 / 10**6

In [None]:
#### What about Tokyo?
# 12964
print('Tokyo pdays days:', city_coefs[city_coefs['ID_HDC_G0'] == 12964]['coef_pdays'].values)
print('Tokyo pdays heat days:', city_coefs[city_coefs['ID_HDC_G0'] == 12964]['coef_heat'].values)
print('Tokyo pdays heat days:', city_coefs[city_coefs['ID_HDC_G0'] == 12964]['coef_pop'].values)


In [None]:
print('Contribution from heat in Tokyo', 9747286.23462754/15024365.25712823*100)

#### Total Days 

In [None]:
city_totdays = pd.read_csv(DATA_IN+'processed/AllDATA-GHS-ERA5-HI406-MAPDATA_TOTDAYS_P05.csv')

In [None]:
#### How many cities in India?
print('Num of Cities in India :', len(data[data['CTR_MN_NM'] == 'India']) / len(GHS[GHS['CTR_MN_NM'] == 'India']) *100)

In [None]:
#### How many cities in Nigeria?
print('Num of Cities in Nigeria :', len(data[data['CTR_MN_NM'] == 'Nigeria']) / len(GHS[GHS['CTR_MN_NM'] == 'Nigeria']) *100)

In [None]:
len(city_totdays)

In [None]:
## How many cities day increase per year ... 1, 3
top = len(city_totdays)
bottom = len(city_totdays[city_totdays['coef_totDays'] >= 3])

In [None]:
city_totadays.columns

In [None]:
print(top)
print(bottom)

In [None]:
#### Columbo & San Salvador
print('columbo tot days:', city_totdays[city_totdays['ID_HDC_G0'] == 8835]['coef_totDays'].values)
print('San Sal tot days:', city_totdays[city_totdays['ID_HDC_G0'] == 321]['coef_totDays'].values)

# Regional Trends

In [11]:
#### Annual Rates

scale = 10**6
geog = 'sub-region'

for label in np.unique(HI_STATS[geog]):
    label = label
    data = HI_STATS[HI_STATS[geog] == label]
    
    #### Rate of change
    coef, r2, p = lm_func(data, 'people_days')
    print('annual increase in people days '+label, 'was', coef/scale, ' p=', p)
    coef1, r21, p1 = lm_func(data, 'people_days_heat')
    print('annual increase in people days heat '+label, 'was', coef1/scale, ' p=', p)
    coef2, r22, p2 = lm_func(data, 'people_days_pop')
    print('annual increase in people days pop '+label, 'was', coef2/scale, ' p=', p)
    print('attrib heat '+label, 'was', coef1 / coef *100, ' p=', p, '\n')
  

annual increase in people days Australia and New Zealand was 1.791013  p= 0.0
annual increase in people days heat Australia and New Zealand was 0.901771  p= 0.0
annual increase in people days pop Australia and New Zealand was 0.889242  p= 0.0
attrib heat Australia and New Zealand was 50.34977412224255  p= 0.0 

annual increase in people days Central Asia was 1.156314  p= 0.0
annual increase in people days heat Central Asia was 0.49997  p= 0.0
annual increase in people days pop Central Asia was 0.656344  p= 0.0
attrib heat Central Asia was 43.2382553527848  p= 0.0 

annual increase in people days Eastern Asia was 453.078748  p= 0.0
annual increase in people days heat Eastern Asia was 157.375669  p= 0.0
annual increase in people days pop Eastern Asia was 295.703078  p= 0.0
attrib heat Eastern Asia was 34.734727615165035  p= 0.0 

annual increase in people days Eastern Europe was 7.947795  p= 0.011
annual increase in people days heat Eastern Europe was 7.680745  p= 0.011
annual increase i

In [9]:
#### Trends for Africa, N & SS
geog = 'region'
location = 'Africa'
data = HI_STATS[HI_STATS[geog] == location]
print(location)

#### Total Change in people Days
data = data.groupby('year')['people_days'].sum()
year = str(data.index[33])
value = str(data.values[33]/10**9)
print('person days in 2016 was '+value+' billion')

year = str(data.index[0])
value = str(data.values[0]/10**9)
print('person days in 1983 was '+value+' billion')

#### Pct Change in Poeple Days 1983 - 2016
pdays16 = data.iloc[len(data) -1]
pdays83 = data.iloc[0]
out = (data.iloc[len(data) -1] - data.iloc[0]) / data.iloc[0] * 100
print('pct increase in people days 83 - 16 is ', out)



Africa
person days in 2016 was 50.16711694483583 billion
person days in 1983 was 14.595397210158913 billion
pct increase in people days 83 - 16 is  243.7187506614602


In [None]:
#### S Asia as pct of total  global = 5.245146271 B 

print('pct of total pdays from S Asia is ', 1899.70765 / 10**3 / 5.245146271 * 100)

In [None]:
#### Median Slope
region = 'Europe'
col = 'coef_heat'
geog = 'region'
scale = 10**3
result = city_coefs[city_coefs[geog]== region][col].median()
print(region, col, 'is ', result/scale)

# Trend First vs. Second Half of Data

In [None]:
data.head()

In [None]:
#### Share of exposure due to heat by 17 year split

## 1983 - 1999
data1 = HI_STATS[(HI_STATS['year'] >= 1983) & (HI_STATS['year'] < 2000)]
coef1pop , r21pop, p1pop  = lm_func(data1 , 'people_days_pop')
coef1heat , r21heat, p1heat = lm_func(data1 , 'people_days_heat')

years = list(np.unique(data1['year']))
plt.plot(years, data1.groupby('year')['people_days_heat'].sum())
sns.regplot(years, data1.groupby('year')['people_days_heat'].sum(), 
            color = 'blue', scatter = False, truncate = True)

## 2000 - 2016
data2 = HI_STATS[(HI_STATS['year'] >= 2000) & (HI_STATS['year'] <= 2016)]
coef2heat , r22heat, p2heat = lm_func(data2 , 'people_days_heat')
coef2pop , r22pop, p1pop  = lm_func(data2 , 'people_days_pop')

years = list(np.unique(data1['year']))
plt.plot(years, data2.groupby('year')['people_days_heat'].sum())
sns.regplot(years, data2.groupby('year')['people_days_heat'].sum(), 
            color = 'orange', scatter = False, truncate = True)

In [None]:
## 2000 - 2016
data2pop = HI_STATS[(HI_STATS['year'] >= 1983) & (HI_STATS['year'] < 2000)]
coef2pop , r22pop, p1pop  = lm_func(data2pop , 'people_days_pop')

data2heat = HI_STATS[(HI_STATS['year'] >= 2000) & (HI_STATS['year'] <= 2016)]
coef2heat , r22heat, p2heat = lm_func(data2heat , 'people_days_heat')


In [None]:
## Estimates
print('From 83 - 99, contribution from heat was', coef1heat/(coef1pop+coef1heat))
print('From 00 - 16, contribution from heat was', coef2heat/(coef2pop+coef2heat))
print('From 83 - 00, heat was', coef1heat/10**9, round(p1heat, 3))
print('From 00 - 16, heat was', coef2heat/10**9)
print('From 83 - 00, pop was', coef1pop/10**9)
print('From 00 - 16, pop was', coef2pop/10**9)

# Heat Waves

- 9691 Kolkata 1998
- 2046 Paris 2003
- 4417, Aleppo 2010

In [None]:
def select_city_year(df, city_id, year):
    "Quick search to find city and years within HI_STATS"
    df_out = df[(df['ID_HDC_G0'] == city_id) & (df['year'] == year)]
    
    return df_out


In [None]:
aleppo = select_city_year(ALL_DATA, 4417, 2010)

In [None]:
aleppo.loc[669069]

# Old Code

In [None]:
### Rates of exposure due to pop or heat (switch cols)

col = 'coef_pop' # 'coef_heat'
for label in np.unique(all_coefs['sub-region']):
    geog = 'sub-region'
    label = label
    data = all_coefs[all_coefs[geog] == label]c
    results = data[col].quantile([.25, .5, .75])
    
    print(label, ' ', results)
    print('median', data[col].median(), '\n')

In [None]:
#### Delhi
city = 6955
data = HI_STATS[HI_STATS['ID_HDC_G0'] == city]
coef, r2, p = lm_func(data, 'people_days')
print('Delhi increased', coef / 10**6)

In [None]:
#### Lagos
city = 2125
data = HI_STATS[HI_STATS['ID_HDC_G0'] == city]
coef, r2, p = lm_func(data, 'people_days')
print('Lagos increased', coef)

In [None]:
## T-test w/ 17 samples each


In [None]:
#### Miami 
city = 556
data = HI_STATS[HI_STATS['ID_HDC_G0'] == city]
coef, r2, p = lm_func(data, 'people_days')
print('Miami increased', coef)

In [None]:
#### Top Fifty Cities Worldwide Heating
data = all_coefs[['ID_HDC_G0', 'CTR_MN_NM', 'coef_totDays']].sort_values('coef_totDays', ascending = False)
rank = list(range(1,len(data)+1))
data['rank'] = rank
data.head(50)

In [None]:
#### How many cities out all cities experienced warmingp
print('globally : ', len(HI_STATS.drop_duplicates('ID_HDC_G0')) /len(GHS))

## India



In [None]:
### Rates of exposure due to pop or heat (switch cols)

col = 'coef_pop' # 'coef_heat'
for label in np.unique(all_coefs['sub-region']):
    geog = 'sub-region'
    label = label
    data = all_coefs[all_coefs[geog] == label]
    results = data[col].quantile([.25, .5, .75])
    
    print(label, ' ', results)
    print('median', data[col].median(), '\n')

In [None]:
#### Which region contributed the most people-days from warming and Pop?

global_pdays = 5138.121501 * 10**6
global_heat = 1179.223093 * 10**6# see above
global_pop = 3958.898407 * 10**6

for label in np.unique(HI_STATS['sub-region']):
    geog = 'sub-region'
    label = label
    data = HI_STATS[HI_STATS[geog] == label]
    
    #### Rate of change
    coef, r2, p = lm_func(data, 'people_days')
    print('pct annual increase in people days '+label, 'was', coef/global_pdays * 100 , ' p=', p)
    coef1, r21, p1 = lm_func(data, 'people_days_heat')
    print('pct annual increase in people days heat '+label, 'was', coef1/global_heat * 100 , ' p=', p)
    coef2, r22, p2 = lm_func(data, 'people_days_pop')
    print('pct annual increase in people days pop '+label, 'was', coef2/global_pop * 100 , ' p=', p, '\n')

  

In [None]:
#### Total Change in people Days

for label in np.unique(HI_STATS['sub-region']):
    geog = 'sub-region'
    label = label
    data = HI_STATS[HI_STATS[geog] == label]
    
    #### Change 1983 - 2016 
    data = data.groupby('year')['people_days'].sum()
    year1 = str(data.index[33])
    value1 = data.values[33]/10**9
    print(label,' person days in 2016 was ',value1,' billion')

    year2 = str(data.index[0])
    value2 = data.values[0]/10**9
    print(label,' person days in 1983 was ', value2, ' billion')
    
    #### Total Change 
    print('Total change was', (value1 - value2) * 10**3, ' person-days')

    #### Pct Change in Poeple Days 1983 - 2016
    pdays16 = data.iloc[len(data) -1]
    pdays83 = data.iloc[0]
    out = (data.iloc[len(data) -1] - data.iloc[0]) / data.iloc[0] * 100
    print(label,' pct increase in people days 83 - 16 is ', out, '\n')
    

In [None]:
#### 2016 vs 2983
geog = 'region'
label = 'Europe'
data = HI_STATS[HI_STATS[geog] == label]
data = data.groupby('year')['people_days'].sum()
year1 = str(data.index[33])
value1 = data.values[33]/10**9
print(label,' person days in 2016 was ',value1,' billion')

year2 = str(data.index[0])
value2 = data.values[0]/10**9
print(label,' person days in 1983 was ', value2, ' billion')

#### Total Change 
print('Total change was', (value1 - value2) * 10**3, ' person-days')

#### Pct Change in Poeple Days 1983 - 2016
pdays16 = data.iloc[len(data) -1]
pdays83 = data.iloc[0]
out = (data.iloc[len(data) -1] - data.iloc[0]) / data.iloc[0] * 100
print(label,' pct increase in people days 83 - 16 is ', out, '\n')

In [None]:
### Rates of exposure due to pop or heat (switch cols)

col = 'coef_pop' # 'coef_heat'
for label in np.unique(all_coefs['sub-region']):
    geog = 'sub-region'
    label = label
    data = all_coefs[all_coefs[geog] == label]
    results = data[col].quantile([.25, .5, .75])
    
    print(label, ' ', results)
    print('median', data[col].median(), '\n')

In [None]:
#### Delhi
city = 6955
data = HI_STATS[HI_STATS['ID_HDC_G0'] == city]
coef, r2, p = lm_func(data, 'people_days')
print('Delhi increased', coef / 10**6)

In [None]:
#### Lagos
city = 2125
data = HI_STATS[HI_STATS['ID_HDC_G0'] == city]
coef, r2, p = lm_func(data, 'people_days')
print('Lagos increased', coef)

In [None]:
#### Miami 
city = 556
data = HI_STATS[HI_STATS['ID_HDC_G0'] == city]
coef, r2, p = lm_func(data, 'people_days')
print('Miami increased', coef)

In [None]:
#### Top Fifty Cities Worldwide Heating
data = all_coefs[['ID_HDC_G0', 'CTR_MN_NM', 'coef_totDays']].sort_values('coef_totDays', ascending = False)
rank = list(range(1,len(data)+1))
data['rank'] = rank
data.head(50)

In [None]:
#### How many cities out all cities experienced warmingp
print('globally : ', len(HI_STATS.drop_duplicates('ID_HDC_G0')) /len(GHS))

## India



In [None]:
### Rates of exposure due to pop or heat (switch cols)

col = 'coef_pop' # 'coef_heat'
for label in np.unique(all_coefs['sub-region']):
    geog = 'sub-region'
    label = label
    data = all_coefs[all_coefs[geog] == label]
    results = data[col].quantile([.25, .5, .75])
    
    print(label, ' ', results)
    print('median', data[col].median(), '\n')

In [None]:
#### Which region contributed the most people-days from warming and Pop?

global_pdays = 5138.121501 * 10**6
global_heat = 1179.223093 * 10**6# see above
global_pop = 3958.898407 * 10**6

for label in np.unique(HI_STATS['sub-region']):
    geog = 'sub-region'
    label = label
    data = HI_STATS[HI_STATS[geog] == label]
    
    #### Rate of change
    coef, r2, p = lm_func(data, 'people_days')
    print('pct annual increase in people days '+label, 'was', coef/global_pdays * 100 , ' p=', p)
    coef1, r21, p1 = lm_func(data, 'people_days_heat')
    print('pct annual increase in people days heat '+label, 'was', coef1/global_heat * 100 , ' p=', p)
    coef2, r22, p2 = lm_func(data, 'people_days_pop')
    print('pct annual increase in people days pop '+label, 'was', coef2/global_pop * 100 , ' p=', p, '\n')

  

In [None]:
#### Change 1983 and 2016

for label in np.unique(HI_STATS['sub-region']):
    geog = 'sub-region'
    label = label
    data = HI_STATS[HI_STATS[geog] == label]
    
    #### Change 1983 - 2016 
    data = data.groupby('year')['people_days'].sum()
    year1 = str(data.index[33])
    value1 = data.values[33]/10**9
    print(label,' person days in 2016 was ',value1,' billion')

    year2 = str(data.index[0])
    value2 = data.values[0]/10**9
    print(label,' person days in 1983 was ', value2, ' billion')
    
    #### Total Change 
    print('Total change was', (value1 - value2) * 10**3, ' person-days')

    #### Pct Change in Poeple Days 1983 - 2016
    pdays16 = data.iloc[len(data) -1]
    pdays83 = data.iloc[0]
    out = (data.iloc[len(data) -1] - data.iloc[0]) / data.iloc[0] * 100
    print(label,' pct increase in people days 83 - 16 is ', out, '\n')
    

In [None]:
#### Change 1983 and 2016
geog = 'region'
label = 'Europe'
data = HI_STATS[HI_STATS[geog] == label]
data = data.groupby('year')['people_days'].sum()
year1 = str(data.index[33])
value1 = data.values[33]/10**9
print(label,' person days in 2016 was ',value1,' billion')

year2 = str(data.index[0])
value2 = data.values[0]/10**9
print(label,' person days in 1983 was ', value2, ' billion')

#### Total Change 
print('Total change was', (value1 - value2) * 10**3, ' person-days')

#### Pct Change in Poeple Days 1983 - 2016
pdays16 = data.iloc[len(data) -1]
pdays83 = data.iloc[0]
out = (data.iloc[len(data) -1] - data.iloc[0]) / data.iloc[0] * 100
print(label,' pct increase in people days 83 - 16 is ', out, '\n')