# MS Calculations

Notebook to crunch numbers for the MS.

by Cascade Tuholske 2020.02.23

In [1]:
#### Dependencies
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns

In [3]:
#### Regressions: statsmodels OLS does not add an intercept by itself, so a constant column is added explicitly

def lm_func(df, col):
    """Fit a simple linear trend to the yearly totals of ``col``.

    Rows of ``df`` are grouped by ``year``, ``col`` is summed within each
    year, and the yearly sums are regressed on the year with ordinary least
    squares (intercept included).

    Args:
        df: DataFrame with a ``year`` column and the column named by ``col``.
        col: name of the column to total per year and regress against year.

    Returns:
        Tuple ``(coef, r2, p)``: the slope truncated to an int, the adjusted
        R-squared rounded to 2 decimals, and the p-value of the slope rounded
        to 3 decimals.

    Raises:
        ValueError: if ``df`` has fewer than two distinct years, because a
            trend cannot be fitted to a single point.
    """
    # Yearly totals; taking the years from this same series keeps X and Y
    # aligned and avoids depending on an unrelated column for the index.
    yearly = df.groupby('year')[col].sum()
    if len(yearly) < 2:
        raise ValueError('lm_func needs at least two distinct years to fit a trend')

    X_year = np.array(yearly.index).reshape((-1, 1))
    Y_stats = np.array(yearly).reshape((-1, 1))

    # Add Intercept (prepended, so column 0 is the constant and column 1 the year)
    X_year_2 = sm.add_constant(X_year)

    # Regress
    model = sm.OLS(Y_stats, X_year_2).fit()

    # Slope on year, truncated to a whole number
    coef = int(model.params[1])

    # Adjusted R2 and the p-value of the slope (index 1); index 0 is the
    # intercept, whose p-value says nothing about the trend.
    r2 = model.rsquared_adj
    p = model.pvalues[1]

    return coef, round(r2, 2), round(p, 3)

In [4]:
#### Load Data

# Input and output locations (absolute paths on the original author's machine).
# NOTE(review): an earlier comment here said "?dl=1" was needed for the file to
# be read correctly; that looks like a leftover from a URL download and does
# not appear to apply to this local path -- confirm before relying on it.
DATA_IN = "/home/cascade/projects/UrbanHeat/data/"
FIG_OUT = "/home/cascade/projects/UrbanHeat/figures/"
FN_IN = 'processed/All_data_HI406_figdata.csv'
# Alternative input from the old workflow, kept for reference:
#FN_IN = '/home/cascade/projects/UrbanHeat/data/processed/oldworkflow/All_data_Raw406_es_final_pdayadd.csv'

# DATA_IN ends with "/", so joining the two pieces yields the full CSV path
HI_STATS = pd.read_csv(''.join([DATA_IN, FN_IN]))

# Show the first rows as a sanity check
HI_STATS.head()

Unnamed: 0.1,Unnamed: 0,ID_HDC_G0,year,total_days,P,P1983,P2016,people_days,people_days_heat,people_days_pop
0,0,22,1983,2,52064.452435,52064.452435,73006.671133,104128.9,104128.9,0.0
1,1,26,1983,1,194088.886834,194088.886834,268055.635628,194088.9,194088.9,0.0
2,2,27,1983,1,80540.77994,80540.77994,93335.494324,80540.78,80540.78,0.0
3,3,28,1983,9,59320.971209,59320.971209,91449.606255,533888.7,533888.7,0.0
4,4,29,1983,8,336518.836621,336518.836621,533318.453653,2692151.0,2692151.0,0.0


In [5]:
#### Drop cities whose 1983 population is not positive (original note: 40 cities)
#### -- see comments in Fig 2 & 3 code CPT 2020.03.03
print(len(HI_STATS))
has_pop_1983 = HI_STATS['P1983'] > 0
HI_STATS = HI_STATS.loc[has_pop_1983]
print(len(HI_STATS))

#### Drop cities with only one Tmax Day in 1983 and none else because you cannot regress them
drop_list = [2543, 2560, 3667, 3669, 6122, 6156] # city IDS
in_drop_list = HI_STATS['ID_HDC_G0'].isin(drop_list)
HI_STATS = HI_STATS.loc[~in_drop_list]
print(len(HI_STATS))

392972
391612
391408


In [6]:
## Add In Meta Data
geog = ['region', 'intermediate-region', 'sub-region','CTR_MN_NM', 'ID_HDC_G0', 'GCPNT_LAT', 'GCPNT_LON']
meta_fn = 'processed/All_data_HI406_meta.csv'
all_data = pd.read_csv(DATA_IN+meta_fn)
meta = all_data[geog]
meta = meta.drop_duplicates('ID_HDC_G0')

## Merge in meta
HI_STATS= HI_STATS.merge(meta, on = 'ID_HDC_G0', how = 'left')
HI_STATS.head()


Unnamed: 0.1,Unnamed: 0,ID_HDC_G0,year,total_days,P,P1983,P2016,people_days,people_days_heat,people_days_pop,region,intermediate-region,sub-region,CTR_MN_NM,GCPNT_LAT,GCPNT_LON
0,0,22,1983,2,52064.452435,52064.452435,73006.671133,104128.9,104128.9,0.0,Americas,Northern America,Northern America,United States,37.688409,-121.75398
1,1,26,1983,1,194088.886834,194088.886834,268055.635628,194088.9,194088.9,0.0,Americas,Northern America,Northern America,United States,37.985433,-121.797516
2,2,27,1983,1,80540.77994,80540.77994,93335.494324,80540.78,80540.78,0.0,Americas,Northern America,Northern America,United States,38.264013,-122.030253
3,3,28,1983,9,59320.971209,59320.971209,91449.606255,533888.7,533888.7,0.0,Americas,Northern America,Northern America,United States,37.730079,-121.431413
4,4,29,1983,8,336518.836621,336518.836621,533318.453653,2692151.0,2692151.0,0.0,Americas,Northern America,Northern America,United States,35.36381,-119.047535


# Global Trends

In [None]:
#### Total Change in people Days
# Yearly totals of people-days across all cities (index = year, ascending)
data = HI_STATS.groupby('year')['people_days'].sum()

# Last year of the series, taken by position from the end instead of the
# hard-coded position 33 so the cell still works if the year range changes
year = str(data.index[-1])
pdays16 = data.iloc[-1]
value = str(pdays16/10**9)
print('person days in '+year+' was '+value+' billion')

# First year of the series
year = str(data.index[0])
pdays83 = data.iloc[0]
value = str(pdays83/10**9)
print('person days in '+year+' was '+value+' billion')

#### Pct Change in People Days 1983 - 2016 (first year -> last year of the series)
out = (pdays16 - pdays83) / pdays83 * 100
print('pct increase in people days 83 - 16 is ', out)

In [None]:
#### Pct Pday Annual Increase from Heat
# Linear trend of the yearly totals: all people-days, then the heat component
coef_pdays, r2_pdays, p_pdays = lm_func(HI_STATS, col = 'people_days')
coef_heat, r2_heat, p_heat = lm_func(HI_STATS, col = 'people_days_heat')

# Heat trend as a percentage of the total trend
print('warming is what pct of total?', (coef_heat / coef_pdays) * 100)

In [None]:
#### Share of exposure due to heat by period (1983-1999 vs 2000-2016)

## 1983 - 1999
data1 = HI_STATS[(HI_STATS['year'] >= 1983) & (HI_STATS['year'] < 2000)]
coef1pop , r21pop, p1pop  = lm_func(data1 , 'people_days_pop')
coef1heat , r21heat, p1heat = lm_func(data1 , 'people_days_heat')

# Yearly heat totals for this period; the x axis comes from the same series
# so years and values cannot get out of step
heat1 = data1.groupby('year')['people_days_heat'].sum()
years = list(heat1.index)
plt.plot(years, heat1)
# x / y passed by keyword: recent seaborn releases no longer accept them positionally
sns.regplot(x = years, y = heat1,
            color = 'blue', scatter = False, truncate = True)

## 2000 - 2016
data2 = HI_STATS[(HI_STATS['year'] >= 2000) & (HI_STATS['year'] <= 2016)]
coef2heat , r22heat, p2heat = lm_func(data2 , 'people_days_heat')
# Stored as p2pop so the first period's p1pop is not overwritten
coef2pop , r22pop, p2pop  = lm_func(data2 , 'people_days_pop')

# Years of the second period (previously the first period's years were reused,
# which drew the 2000-2016 values over 1983-1999 on the x axis)
heat2 = data2.groupby('year')['people_days_heat'].sum()
years = list(heat2.index)
plt.plot(years, heat2)
sns.regplot(x = years, y = heat2,
            color = 'orange', scatter = False, truncate = True)

In [None]:
## 2000 - 2016
# Both trends in this cell use the 2000-2016 window. The population subset
# previously filtered 1983-1999, which made coef2pop describe the wrong period.
data2pop = HI_STATS[(HI_STATS['year'] >= 2000) & (HI_STATS['year'] <= 2016)]
# Stored as p2pop so the first period's p1pop is not overwritten
coef2pop , r22pop, p2pop  = lm_func(data2pop , 'people_days_pop')

data2heat = HI_STATS[(HI_STATS['year'] >= 2000) & (HI_STATS['year'] <= 2016)]
coef2heat , r22heat, p2heat = lm_func(data2heat , 'people_days_heat')


In [None]:

## Estimates
# Heat share of the combined (pop + heat) trend in each period, as a fraction
print('From 83 - 99, contribution from heat was', coef1heat/(coef1pop+coef1heat))
print('From 00 - 16, contribution from heat was', coef2heat/(coef2pop+coef2heat))
# Trends in billions of people-days per year. The first period is 1983-1999
# (year < 2000), so it is labelled 83 - 99 to match the lines above.
print('From 83 - 99, heat was', coef1heat/10**9, round(p1heat, 3))
print('From 00 - 16, heat was', coef2heat/10**9)
print('From 83 - 99, pop was', coef1pop/10**9)
print('From 00 - 16, pop was', coef2pop/10**9)

# Regional Trends

In [None]:
#### Annual Rates
# For every sub-region: linear trend of total, heat-driven and population-driven
# people-days (printed in millions per year), each with its own p-value.

geog = 'sub-region'
for label in np.unique(HI_STATS[geog]):
    data = HI_STATS[HI_STATS[geog] == label]

    #### Rate of change
    coef, r2, p = lm_func(data, 'people_days')
    print('annual increase in people days '+label, 'was', coef/10**6, ' p=', p)
    coef1, r21, p1 = lm_func(data, 'people_days_heat')
    # p1 / p2 belong to the heat and pop regressions; previously the p-value of
    # the total regression was printed on all three lines
    print('annual increase in people days heat '+label, 'was', coef1/10**6, ' p=', p1)
    coef2, r22, p2 = lm_func(data, 'people_days_pop')
    print('annual increase in people days pop '+label, 'was', coef2/10**6, ' p=', p2)
    # Heat trend as a percentage of the total trend; p here is the total trend's p-value
    print('attrib heat '+label, 'was', coef1 / coef *100, ' p=', p, '\n')
  

In [None]:
# Peek at `data`; when the cells are run in order this is the subset left over
# from the last iteration of the sub-region loop in the previous cell.
data.head()

In [None]:
#### Total Change in people Days
# For every sub-region: people-days in the first and last year of its series,
# the absolute change and the percent change.

geog = 'sub-region'
for label in np.unique(HI_STATS[geog]):
    data = HI_STATS[HI_STATS[geog] == label]

    #### Change 1983 - 2016 (first -> last year present for this sub-region)
    data = data.groupby('year')['people_days'].sum()

    # Last year taken from the end of the series instead of the hard-coded
    # position 33, so a sub-region with missing years is labelled correctly
    year1 = str(data.index[-1])
    pdays16 = data.iloc[-1]
    value1 = pdays16/10**9
    print(label,' person days in '+year1+' was ',value1,' billion')

    year2 = str(data.index[0])
    pdays83 = data.iloc[0]
    value2 = pdays83/10**9
    print(label,' person days in '+year2+' was ', value2, ' billion')

    #### Total Change
    # value1 / value2 are in billions, so * 10**3 expresses the change in millions
    print('Total change was', (value1 - value2) * 10**3, ' million person-days')

    #### Pct Change in People Days 1983 - 2016
    out = (pdays16 - pdays83) / pdays83 * 100
    print(label,' pct increase in people days 83 - 16 is ', out, '\n')
    

In [None]:
#### Europe
# Same summary as the sub-region loop, for the whole 'Europe' region
geog = 'region'
label = 'Europe'
data = HI_STATS[HI_STATS[geog] == label]
data = data.groupby('year')['people_days'].sum()

# Last year taken from the end of the series instead of the hard-coded position 33
year1 = str(data.index[-1])
pdays16 = data.iloc[-1]
value1 = pdays16/10**9
print(label,' person days in '+year1+' was ',value1,' billion')

year2 = str(data.index[0])
pdays83 = data.iloc[0]
value2 = pdays83/10**9
print(label,' person days in '+year2+' was ', value2, ' billion')

#### Total Change
# value1 / value2 are in billions, so * 10**3 expresses the change in millions
print('Total change was', (value1 - value2) * 10**3, ' million person-days')

#### Pct Change in People Days 1983 - 2016
out = (pdays16 - pdays83) / pdays83 * 100
print(label,' pct increase in people days 83 - 16 is ', out, '\n')

# City-level

In [8]:
# Load the table that the city-level cells below read from (presumably the
# per-city regression results -- confirm against the code that writes it).
# NOTE: DATA_IN already ends with "/", so this path contains a doubled slash
# ("data//processed"); it is left as-is here.
all_coefs = pd.read_csv(filepath_or_buffer = DATA_IN + '/processed/All_data_HI406_figdata_map_totDays_p05.csv')

In [9]:
# List the column names available in all_coefs
all_coefs.columns

Index(['Unnamed: 0', 'ID_HDC_G0', 'coef_pdays', 'p_value_pdays',
       'ci_left_pdays', 'ci_right_pdays', 'coef_heat', 'p_value_heat',
       'ci_left_heat', 'ci_right_heat', 'coef_pop', 'p_value_pop',
       'ci_left_pop', 'ci_right_pop', 'coef_totDays', 'p_value_totDays',
       'ci_left_totDays', 'ci_right_totDays', 'coef_attrib',
       'coef_attrib_left', 'coef_attrib_right', 'coef_attrib_norm',
       'coef_attrib_norm_left', 'coef_attrib_norm_right', 'region',
       'intermediate-region', 'sub-region', 'CTR_MN_NM', 'GCPNT_LAT',
       'GCPNT_LON', 'P1983', 'P2016'],
      dtype='object')

In [18]:
### Rates of exposure due to pop or heat (switch cols)
# Prints the quartiles and the median of `col` for every sub-region in all_coefs

col = 'coef_pop' # 'coef_heat'
geog = 'sub-region'
for label in np.unique(all_coefs[geog]):
    # Rows belonging to the current sub-region
    data = all_coefs.loc[all_coefs[geog] == label]
    # 25th, 50th and 75th percentiles of the chosen coefficient
    results = data[col].quantile(q = [.25, .5, .75])

    print(label, ' ', results)
    print('median', data[col].median(), '\n')

Australia and New Zealand   0.25     8800.347274
0.50    19553.819165
0.75    55303.479501
Name: coef_pop, dtype: float64
median 19553.819164701803 

Central Asia   0.25     1112.541037
0.50     2918.923145
0.75    11332.777559
Name: coef_pop, dtype: float64
median 2918.923144751925 

Eastern Asia   0.25      6683.165022
0.50     29018.687993
0.75    108326.820845
Name: coef_pop, dtype: float64
median 29018.687992527684 

Eastern Europe   0.25      789.452527
0.50     2184.046446
0.75    11220.120106
Name: coef_pop, dtype: float64
median 2184.0464461039637 

Latin America and the Caribbean   0.25     38156.119850
0.50    121236.490424
0.75    335545.112120
Name: coef_pop, dtype: float64
median 121236.49042391288 

Melanesia   0.25     32318.880274
0.50     71296.695864
0.75    133255.322773
Name: coef_pop, dtype: float64
median 71296.69586353978 

Northern Africa   0.25     31647.297693
0.50     63903.423008
0.75    125285.852893
Name: coef_pop, dtype: float64
median 63903.42300764535 

In [None]:
# Peek at `data`; when the cells are run in order this is the all_coefs subset
# left over from the last iteration of the sub-region loop in the previous cell.
data.head()

In [None]:
#### Top Fifty Cities Worldwide Pdays
data = all_coefs[['ID_HDC_G0', 'CTR_MN_NM', 'coef_pdays']].sort_values('coef_pdays', ascending = False)
rank = list(range(1,len(data)+1))
data['rank'] = rank
data.head(50)

In [None]:
#### Delhi
city = 6955
data = HI_STATS[HI_STATS['ID_HDC_G0'] == city]
coef, r2, p = lm_func(data, 'people_days')
print('Delhi increased', coef)

In [None]:
#### Lagos
city = 2125
data = HI_STATS[HI_STATS['ID_HDC_G0'] == city]
coef, r2, p = lm_func(data, 'people_days')
print('Lagos increased', coef)

In [None]:
#### Miami 
city = 556
data = HI_STATS[HI_STATS['ID_HDC_G0'] == city]
coef, r2, p = lm_func(data, 'people_days')
print('Miami increased', coef)

In [None]:
#### Top Fifty Cities Worldwide Heating
data = all_coefs[['ID_HDC_G0', 'CTR_MN_NM', 'coef_totDays']].sort_values('coef_totDays', ascending = False)
rank = list(range(1,len(data)+1))
data['rank'] = rank
data.head(50)