# Figure 3 Final

Notebook to make data for figure 3 for ms <br>
Actual figure will be rendered in QGIS  <br>
by Cascade Tuholske 2020.02.23

In [1]:
#### Depdencies 
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns

In [2]:
#### Functions
def pop_stat(df, geog, stats):
    
    """Finds linear coef for increase in stat by a given geography from 1983 - 2016, as well
    as the pct change in population of the cities within the given geography"""

    # Get results
    labels = []
    delt_list = []
    r2_list = []
    coef_list = []
    p_list = []
    df_out = pd.DataFrame()

    for label, df_geog in df.groupby(geog):

        # Get Data
        X_year = np.array(df_geog.groupby('year')['ID_HDC_G0'].mean().index).reshape((-1, 1))
        Y_stats = np.array(df_geog.groupby('year')[stats].sum()).reshape((-1, 1))

        # Add Intercept
        X_year_2 = sm.add_constant(X_year)

        # Regress
        model = sm.OLS(Y_stats, X_year_2).fit() 
        
        # Get slope
        # first param in intercept coef, second is slope of line but if slope = 0, then intecept
        if len(model.params) == 2:
            coef = model.params[1]
            
        else:
            coef = model.params[0]
        
        # R2 and P
        r2 = model.rsquared_adj
        p = model.pvalues[0]

        # Pop change
        delt = df_geog.drop_duplicates('ID_HDC_G0').copy()
        delt['delt_pop'] = delt['P2016'] - delt['P1983']
        delt = delt['delt_pop'].sum()

        # Make lists
        labels.append(label)
        r2_list.append(r2)
        coef_list.append(coef)
        p_list.append(p)
        delt_list.append(delt)

    # Make data frame
    df_out[geog] = labels
    df_out['p_delt'] = delt_list
    df_out['r2'] = r2_list
    df_out['coef'] = coef_list
    df_out['p_value'] = [round(elem, 4) for elem in p_list]

    return df_out

In [3]:
#### Load Data
# file path
DATA_IN = "/home/cascade/projects/UrbanHeat/data/"  # Note: Need ?dl=1 to make sure this file gets read correctly
FIG_OUT = "/home/cascade/projects/UrbanHeat/figures/"

# Raw Heat
FN_IN = 'processed/All_data_HI406_figdata.csv'
stats = pd.read_csv(DATA_IN+FN_IN)

# scale the date in the plot 
scale = 10**9 

In [5]:
#### Get Regions
geog = 'sub-region'
meta_fn = 'processed/All_data_HI406_meta.csv'
meta_in = pd.read_csv(DATA_IN+meta_fn)
meta = meta_in[[geog, 'ID_HDC_G0']]
stats = stats.merge(meta, on = 'ID_HDC_G0', how = 'inner')

In [6]:
stats.head()

Unnamed: 0.1,Unnamed: 0,ID_HDC_G0,year,total_days,P,P1983,P2016,people_days,people_days_heat,people_days_pop,sub-region
0,0,22,1983,2,52064.452435,52064.452435,73006.671133,0.000104,0.000104,0.0,Northern America
1,0,22,1983,2,52064.452435,52064.452435,73006.671133,0.000104,0.000104,0.0,Northern America
2,0,22,1983,2,52064.452435,52064.452435,73006.671133,0.000104,0.000104,0.0,Northern America
3,0,22,1983,2,52064.452435,52064.452435,73006.671133,0.000104,0.000104,0.0,Northern America
4,0,22,1983,2,52064.452435,52064.452435,73006.671133,0.000104,0.000104,0.0,Northern America


In [9]:
#### Find Regions where the trend in people days is sig at 0.05

# Args
heat_stat = 'total_days'
pdays_stat = 'people_days'
geog = 'sub-region'

# Find Sig Regions
df_region = pop_stat(stats, geog, heat_stat)
#df_region = df_region[df_region['p_value'] < 0.05]

# Get the regions in a list and re-order 
sig_regions = list(df_region[geog])
sig_regions = [sig_regions[10], sig_regions[11],  sig_regions[4],
               sig_regions[6], sig_regions[1], sig_regions[9],
               sig_regions[7], sig_regions[8], sig_regions[5],
               sig_regions[0], sig_regions[3], sig_regions[2]]
len(sig_regions)

12

In [10]:
df_region

Unnamed: 0,sub-region,p_delt,r2,coef,p_value
0,Australia and New Zealand,4394766.0,0.072034,619.622,0.0868
1,Central Asia,8186318.0,0.083458,694.2773,0.0656
2,Eastern Asia,199971300.0,0.199361,156921.2,0.01
3,Eastern Europe,2381901.0,0.315138,1663.884,0.0004
4,Latin America and the Caribbean,100205400.0,0.48829,712142.8,0.0
5,Melanesia,1109717.0,0.465274,22491.79,0.0
6,Northern Africa,62736100.0,0.399074,120924.8,0.0001
7,Northern America,44173530.0,0.301328,30601.69,0.0009
8,Northern Europe,545307.9,0.089138,7.268449,0.0511
9,South-eastern Asia,122393100.0,0.405202,1436896.0,0.0001


In [None]:
#### Or use All regions --- Reorder if needed
### THROWS AN ERROR WHEN DOING ALL REGIONS ...

# sig_regions = ['Australia and New Zealand',
#  'Central Asia',
#  'Eastern Asia',
#  'Eastern Europe',
#  'Latin America and the Caribbean',
#  'Melanesia',
#  'Northern Africa',
#  'Northern America',
#  'Northern Europe',
#  'South-eastern Asia',
#  'Southern Asia',
#  'Southern Europe',
#  'Sub-Saharan Africa',
#  'Western Asia',
#  'Western Europe']

In [None]:
# Make an empty df
df_out = pd.DataFrame(columns = ['ID_HDC_G0', 'p_delt', 'r2', 'coef_heat', 'p_value', 'coef_pday'])

In [None]:
# Get city-level data
for i, region in enumerate(sig_regions):
    
    df_region = stats[stats[geog] == region] # select region

    # Get people days dues to heat coef
    heat = pop_stat(df_region, 'ID_HDC_G0', 'people_days_heat') # get stats 
    heat.rename(columns={"coef": "coef_heat"}, inplace = True)
    
    # Get people days total
    pdays_all = pop_stat(df_region, 'ID_HDC_G0', 'people_days_pop') # get stats
    pdays_all.rename(columns={"coef": "coef_pop"}, inplace = True)
    
    # Merge pdays_all and heat
    heat = heat.merge(pdays_all[['ID_HDC_G0', 'coef_pop']], on = 'ID_HDC_G0', how = 'left')
    
    # Get people days total
    pdays_all = pop_stat(df_region, 'ID_HDC_G0', 'people_days') # get stats
    pdays_all.rename(columns={"coef": "coef_pday"}, inplace = True)
    
    # try to merge them
    heat = heat.merge(pdays_all[['ID_HDC_G0', 'coef_pday']], on = 'ID_HDC_G0', how = 'left')
    
    # add them on
    df_out = df_out.append(heat)
    print(len(df_out))

In [None]:
df_out.head()

In [None]:
#### Merge Back in Meta Data
# meta_fn = 'processed/All_data_HI406_meta.csv'
# meta_in = pd.read_csv(DATA_IN+meta_fn)
# meta = meta_in[['ID_HDC_G0', 'CTR_MN_NM','region', 'sub-region', 'intermediate-region', 'GCPNT_LAT', 'GCPNT_LON']]
# df_out = df_out.merge(meta, on = 'ID_HDC_G0', how = 'inner')


In [None]:
df_out.head()

In [None]:
#### Save out file 