# Data Check

This notebook was created in July/August 2020 to double-check the data for the manuscript (MS) before I submit it.
I also want to check whether the GHS-UCDB files contain place names that we can use to update the figures in the MS and to build out more supplementary results.

In [1]:
#### Dependencies
import rasterio 
import numpy as np
import pandas as pd
import geopandas as gpd
from rasterstats import zonal_stats
from rasterio import features
import os
import xarray as xr
import fnmatch
import time
import multiprocessing as mp 
from glob import glob
from multiprocessing import Pool

#### Area Avg Function

In [2]:
def area_avg(tempRst, polyRst, ids_df, col_name=None, nodata=-9999):
    """Average a temp/RH raster over each GHS-UCDB polygon and join it to the id table.

    Parameters
    ----------
    tempRst : open raster dataset
        Object whose ``read(1)`` returns the 2-D value grid (temperature or
        relative humidity). Cells equal to ``nodata`` are ignored.
    polyRst : open raster dataset
        Object whose ``read(1)`` returns the 2-D grid of rasterized polygon
        ids. Cells with a value <= 0 are treated as "no city". Expected to be
        on the same grid as ``tempRst``.
    ids_df : pandas.DataFrame
        Table with an ``ID_HDC_G0`` column; the per-polygon means are merged
        onto it.
    col_name : str, optional
        Name of the column that receives the per-polygon mean. When omitted,
        the module-level ``date`` variable is used, which is what earlier
        versions of this function always did.
    nodata : number, optional
        Value in ``tempRst`` that marks missing cells (default ``-9999``).

    Returns
    -------
    pandas.DataFrame
        ``ids_df`` outer-merged on ``ID_HDC_G0`` with the per-polygon means.
    """

    if col_name is None:
        # Backward compatibility: existing callers rely on the global `date`
        # set in an earlier cell to label the output column.
        col_name = date

    # Wrap band 1 of each raster as a 2-D xarray DataArray with (y, x) dims
    tempRst_da = xr.DataArray(tempRst.read(1), dims = ['y', 'x'])
    polyRst_da = xr.DataArray(polyRst.read(1), dims = ['y', 'x'])

    # Put both grids in one Dataset so they can be masked and grouped together
    ds = xr.Dataset(data_vars =
            {'ghs' : (['y', 'x'], polyRst_da),
            'temp' : (['y', 'x'], tempRst_da),})

    # UPDATED 2019-08-19: mask the value-raster pixels FIRST, then GHS.
    # Cells where the value raster holds the no-data marker become NaN in
    # both variables.
    # TODO (carried over from the original): double check this mask.
    ds_mask = ds.where(ds.temp != nodata, drop = False)

    # Then drop every cell that does not belong to a GHS city (id <= 0)
    ds_mask = ds_mask.where(ds_mask.ghs > 0, drop = False)

    # Mean of the remaining cells per polygon id.
    # NOTE(review): `xr.ALL_DIMS` is reportedly deprecated in newer xarray
    # releases in favour of `...` - confirm against the installed version.
    avg = ds_mask.groupby('ghs').mean(xr.ALL_DIMS)

    # Polygon ids and their mean values as 1-D numpy arrays of equal length
    avg_ID = np.array(avg.ghs)
    avg_temp = np.array(avg.temp)

    # Sanity output: both lengths should be identical
    print(len(avg_ID))
    print(len(avg_temp))

    # Column order matches the original: value column first, then the id
    df_avg = pd.DataFrame({col_name: avg_temp, 'ID_HDC_G0': avg_ID})

    # Merge onto the table that was passed in. The original merged the global
    # `ghs_ids_df` here and silently ignored the `ids_df` argument.
    return ids_df.merge(df_avg, on='ID_HDC_G0', how = 'outer')

# Check coef meaning 

In [13]:
# Root of the project data tree; every input of this section lives below it
DATA_PATH = '/home/cascade/projects/UrbanHeat/data/'

# Input files: GHS-UCDB population table plus the trend-coefficient and
# exposure JSON exports
GHS_POP_FN = f'{DATA_PATH}interim/GHS-UCDB-Interp.csv'
COEF_FN = f'{DATA_PATH}processed/PNAS-DATA-v2/HI406_2D_TREND_EXP05.json'
EXP_FN = f'{DATA_PATH}processed/PNAS-DATA-v2/HI461_1D_EXP.json'

# Load the three tables; both JSON files use the 'split' orientation
GHS_POP = pd.read_csv(GHS_POP_FN)
COEF, EXP = (pd.read_json(fn, orient = 'split') for fn in (COEF_FN, EXP_FN))

In [14]:
# Urban-centre id to inspect (the saved output below lists it as Stockton [USA])
city_id = 34

# Keep only that city's rows from the coefficient and exposure tables
city_coef, city_exp = (
    table.loc[table['ID_HDC_G0'] == city_id] for table in (COEF, EXP)
)

In [15]:
# Show the exposure rows (from EXP) for the selected city
city_exp

Unnamed: 0,ID_HDC_G0,year,tot_days,P,P1983,P2016,people_days,people_days_heat,people_days_pop
120,34,1983,1,236515.041712,236515.041712,365617.969896,236515.0,236515.0,0.0
121,34,1984,2,238715.510187,236515.041712,365617.969896,477431.0,473030.1,4400.936951
122,34,1988,4,247517.38409,236515.041712,365617.969896,990069.5,946060.2,44009.369513
123,34,1990,6,251918.321041,236515.041712,365617.969896,1511510.0,1419090.0,92419.675977
124,34,1991,4,254455.204371,236515.041712,365617.969896,1017821.0,946060.2,71760.650638
125,34,1992,2,256992.087701,236515.041712,365617.969896,513984.2,473030.1,40954.091979
126,34,1993,2,259528.971031,236515.041712,365617.969896,519057.9,473030.1,46027.858639
127,34,1995,4,264602.737691,236515.041712,365617.969896,1058411.0,946060.2,112350.783919
128,34,1996,8,267139.621021,236515.041712,365617.969896,2137117.0,1892120.0,244996.634479
129,34,1997,3,269676.504352,236515.041712,365617.969896,809029.5,709545.1,99484.38792


In [16]:
# Show the first coefficient row (from COEF) for the selected city, to compare
# with the exposure rows displayed above
city_coef.head(1)

Unnamed: 0,ID_HDC_G0,coef_pdays,p_value_pdays,coef_heat,p_value_heat,coef_pop,p_value_pop,coef_totDays,p_value_totDays,coef_attrib,coef_attrib_norm,CTR_MN_NM,UC_NM_MN,GCPNT_LAT,GCPNT_LON,region,sub-region,intermediate-region,P1983,P2016
4,34,71696.9372,0.0357,4047.316222,0.9359,67649.620978,0.0,0.017112,0.9359,0.887099,0.943538,United States,Stockton [USA],37.973783,-121.295244,Americas,Northern America,Northern America,236515.041712,365617.969896


## Check RH is ERA5 

In [None]:
# Date to check, written YYYY.MM.DD as it appears in the raster file names
date = '1983.02.02'

# Both raster paths contain a per-year directory; derive it from `date` so the
# two cannot drift apart when a different date is checked
date_year = date.split('.')[0]

# Dirs
DATA_OUT = '/home/cascade/projects/UrbanHeat/data/interim/ERA5_RH/'
DATA_INTERIM = '/home/cascade/projects/UrbanHeat/data/interim/'

# ERA5 RH raster for `date`
ERA5_RH_FN = '/home/chc-ftp_out/products/CHIRTSdaily/global_tifs_p05/RHum/'+date_year+'/RH.'+date+'.tif'
ERA5_RH = rasterio.open(ERA5_RH_FN)

# MERRA2 RH raster for `date`
MERRA2_RH_FN ='/home/CHIRTS/daily_ERA5/w-MERRA2.params/'+date_year+'/RH.'+date+'.tif'
MERRA2_RH = rasterio.open(MERRA2_RH_FN)

# Load the table of GHS ids (a plain CSV, so it is read with pandas rather
# than GeoPandas)
ghs_ids_fn = 'GHS-UCSB-IDS.csv'
ghs_ids_df = pd.read_csv(DATA_INTERIM+ghs_ids_fn)

# Open the rasterized GHS-UCDB polygons
ghs_polyRst_fn = 'GHS_UCDB_Raster_touched.tif'
ghs_polyRst = rasterio.open(DATA_INTERIM+ghs_polyRst_fn)

In [None]:
# Per-polygon mean of the MERRA2 RH raster, merged onto the GHS id table.
# The value column is named after the global `date`.
MERRA2_RH_GHS = area_avg(MERRA2_RH, ghs_polyRst, ghs_ids_df)

In [None]:
# Per-polygon mean of the ERA5 RH raster, merged onto the GHS id table.
# The value column is named after the global `date`.
ERA5_RH_GHS = area_avg(ERA5_RH, ghs_polyRst, ghs_ids_df)

In [None]:
# Open the RH table currently used in the analysis (believed to be ERA5; this
# section checks that assumption).
# NOTE: the year in the file name is hard-coded and must match the year of
# `date`, because a later cell selects the `date` column from this table.
current_RH_FN = '/home/cascade/projects/UrbanHeat/data/interim/ERA5_RH/GHS-ERA5-RH_1983.csv'
current_RH = pd.read_csv(current_RH_FN)

In [None]:
# Peek at the first rows of the current RH table
current_RH.head()

In [None]:
# Put the freshly computed MERRA2 means next to the values in the current RH
# table for the same date.
# The LEFT frame is MERRA2 and the RIGHT frame is the current table, so the
# suffixes are given explicitly in that order. The previous renames labelled
# the MERRA2 column `_cpt` and the current-table column `_MERRA2`, i.e. the
# two labels were swapped.
# `_cpt` presumably marks the author's current table - confirm the label.
check_df = pd.merge(
    MERRA2_RH_GHS,
    current_RH[['ID_HDC_G0', date]],
    on = 'ID_HDC_G0',
    how = 'inner',
    suffixes = ('_MERRA2', '_cpt'),
)

In [None]:
# Add the freshly computed ERA5 means for the same date.
# check_df has no column named `date` at this point, so the incoming column
# keeps its plain name and is relabelled afterwards. The label is built from
# `date` instead of a literal so it stays correct when a different date is
# checked.
# NOTE: run this cell once per fresh check_df; re-running it alone merges the
# ERA5 column a second time.
check_df = pd.merge(check_df, ERA5_RH_GHS[['ID_HDC_G0', date]], on = 'ID_HDC_G0', how = 'inner')
check_df = check_df.rename(columns={date: date + '_ERA5'})

In [None]:
# Display the side-by-side comparison of the current, MERRA2 and ERA5 values
check_df

# Check GHS-UCDB for names

In [None]:
# Read the raw GHS-UCDB shapefile with GeoPandas so its attribute columns can
# be inspected for place names
ghs_ucdb_fn = '/home/cascade/projects/UrbanHeat/data/raw/GHS_UCDB/GHS_STAT_UCDB2015MT_GLOBE_R2019A_V1_0.shp'
ghs_ucdb = gpd.read_file(ghs_ucdb_fn)

In [None]:
# List every attribute column of the shapefile, one name per line
for col in ghs_ucdb.columns.tolist():
    print(col)

In [None]:
# Inspect the UC_NM_SRC column (presumably the source of the urban-centre
# name - confirm in the GHS-UCDB documentation)
ghs_ucdb['UC_NM_SRC']

In [None]:
# Ids to spot-check: Kolkata (9691), Paris (2046), Aleppo (4417)

# Name column (UC_NM_MN) of the row whose id matches
ghs_ucdb.loc[ghs_ucdb['ID_HDC_G0'] == 4417, 'UC_NM_MN']