# Data Check

This notebook was created in July/August 2020 to double check the data for the MS before I submit it.
I also want to check if the GHS-UCDB files have place names that we can use to update the figures in the MS and build out more supplement results.

In [1]:
#### Dependencies
import rasterio 
import numpy as np
import pandas as pd
import geopandas as gpd
from rasterstats import zonal_stats
from rasterio import features
import os
import xarray as xr
import fnmatch
import time
import multiprocessing as mp 
from glob import glob
from multiprocessing import Pool

#### Area Avg Function

In [2]:
def area_avg(tempRst, polyRst, ids_df):
    
    """ Function takes a temp/rh raster, polygon raster and ghs-ids to find area avg of ghs-ucdb polys"""
    
    # Make arrays into x    array DataArray
    tempRst_da = xr.DataArray(tempRst.read(1), dims = ['y', 'x']) # y and x are our 2-d labels
    polyRst_da = xr.DataArray(polyRst.read(1), dims = ['y', 'x'])

    # Make xarray dataset
    ds = xr.Dataset(data_vars = 
            {'ghs' : (['y', 'x'], polyRst_da),
            'temp' : (['y', 'x'], tempRst_da),})

    # UPDATED 2019-08-19 Mask the CHIRTS PIXELS FIRST, THEN GHS
    # Mask values from chirt that are ocean in ghs and chirt in our ds 
    ds_mask = ds.where(ds.temp != -9999, drop = False) #<<<<------ need to double check this

    # Mask pixels for both ghs and chirts where ghs cities are not present
    ds_mask = ds_mask.where(ds_mask.ghs > 0, drop = False)

    # Group poly_IDs find temp
    avg = ds_mask.groupby('ghs').mean(xr.ALL_DIMS)

    # turn GHS IDS and avg. CHIRTMax values into 1-D numpy arrays of equal length
    avg_ID = np.array(avg.ghs)
    avg_temp = np.array(avg.temp)

    print(len(avg_ID))
    print(len(avg_temp))

    # turn chirt max and IDS into a DF
    df_avg = pd.DataFrame()
    df_avg[date] = avg_temp
    df_avg['ID_HDC_G0'] = avg_ID

    # merge the df
    ids_df = ghs_ids_df.merge(df_avg, on='ID_HDC_G0', how = 'outer')
    
    return ids_df

## Check RH is ERA5 

In [3]:
# date to check 
date = '1983.02.02'

# Dirs
DATA_OUT = '/home/cascade/projects/UrbanHeat/data/interim/ERA5_RH/' 
DATA_INTERIM = '/home/cascade/projects/UrbanHeat/data/interim/'

# ERA5 FN 
ERA5_RH_FN = '/home/chc-ftp_out/products/CHIRTSdaily/global_tifs_p05/RHum/1983/RH.'+date+'.tif'
ERA5_RH = rasterio.open(ERA5_RH_FN)

# MERRA 2 FN
MERRA2_RH_FN ='/home/CHIRTS/daily_ERA5/w-MERRA2.params/1983/RH.'+date+'.tif'
MERRA2_RH = rasterio.open(MERRA2_RH_FN)

# Open the file with GeoPANDAS read_file
ghs_ids_fn = 'GHS-UCSB-IDS.csv'
ghs_ids_df = pd.read_csv(DATA_INTERIM+ghs_ids_fn)

# Open Polygon Raster
ghs_polyRst_fn = 'GHS_UCDB_Raster_touched.tif'
ghs_polyRst = rasterio.open(DATA_INTERIM+ghs_polyRst_fn)

In [4]:
MERRA2_RH_GHS = area_avg(MERRA2_RH, ghs_polyRst, ghs_ids_df)

13067
13067


In [5]:
ERA5_RH_GHS = area_avg(ERA5_RH, ghs_polyRst, ghs_ids_df)

13072
13072


In [6]:
# Open my current ERA5 RH Data or what I think is ERA5
current_RH_FN = '/home/cascade/projects/UrbanHeat/data/interim/ERA5_RH/GHS-ERA5-RH_1983.csv'
current_RH = pd.read_csv(current_RH_FN)

In [7]:
current_RH.head()

Unnamed: 0.1,Unnamed: 0,ID_HDC_G0,CTR_MN_NM,1983.01.01,1983.01.02,1983.01.03,1983.01.04,1983.01.05,1983.01.06,1983.01.07,...,1983.12.22,1983.12.23,1983.12.24,1983.12.25,1983.12.26,1983.12.27,1983.12.28,1983.12.29,1983.12.30,1983.12.31
0,0,5782,Russia,70.4588,75.14415,57.5747,78.48598,73.564224,54.77919,64.68439,...,94.6909,55.307,86.86023,95.86625,100.0,94.35248,90.32634,96.291595,83.88049,65.3701
1,1,3316,Russia,78.63648,84.30023,75.21151,78.14096,89.907974,90.51669,81.38997,...,83.181015,83.39253,100.0,94.860085,96.46097,99.67202,100.0,100.0,81.44434,71.243
2,2,5645,Russia,100.0,87.11761,71.00068,48.90888,53.354855,65.923996,64.28545,...,85.09533,53.07567,81.656166,84.950325,62.699684,71.29387,73.483406,68.009155,80.20846,65.54414
3,3,3185,Finland,90.72903,100.0,95.85354,81.20574,92.373825,93.32554,99.97878,...,67.47721,90.884,98.63177,97.165474,100.0,100.0,100.0,100.0,100.0,99.89831
4,4,3539,Russia,99.517075,98.2619,89.710014,77.465775,81.36922,64.009605,97.27532,...,88.17868,98.90047,96.488976,88.18032,88.679146,87.29457,89.28673,87.11443,97.0547,74.403336


In [14]:
check_df = pd.merge(MERRA2_RH_GHS, current_RH[['ID_HDC_G0', date]], on = 'ID_HDC_G0', how = 'inner')
check_df.rename(columns={'1983.02.02_x':'1983.02.02_cpt'}, inplace=True)
check_df.rename(columns={'1983.02.02_y':'1983.02.02_MERRA2'}, inplace=True)

In [15]:
check_df = pd.merge(check_df, ERA5_RH_GHS[['ID_HDC_G0', date]], on = 'ID_HDC_G0', how = 'inner')
check_df.rename(columns={'1983.02.02':'1983.02.02_ERA5'}, inplace=True)

In [16]:
check_df

Unnamed: 0,ID_HDC_G0,CTR_MN_NM,1983.02.02_cpt,1983.02.02_MERRA2,1983.02.02_ERA5
0,5782,Russia,46.510246,46.510246,95.552055
1,3316,Russia,99.898506,99.898506,96.487511
2,5645,Russia,62.028629,62.028630,94.270050
3,3185,Finland,99.505447,99.505450,95.659050
4,3539,Russia,63.400898,63.400898,97.655205
...,...,...,...,...,...
13130,13046,New Zealand,61.427849,61.427850,95.706703
13131,1116,Argentina,52.373001,52.373000,91.055717
13132,1114,Chile,56.439636,56.439636,93.537193
13133,1161,Argentina,60.396267,60.396267,95.979973


# Check GHS-UCSB for names

In [18]:
ghs_ucdb_fn = '/home/cascade/projects/UrbanHeat/data/raw/GHS_UCDB/GHS_STAT_UCDB2015MT_GLOBE_R2019A_V1_0.shp'
ghs_ucdb = gpd.read_file(ghs_ucdb_fn)

In [30]:
for col in ghs_ucdb.columns:
    print(col)

ID_HDC_G0
QA2_1V
AREA
BBX_LATMN
BBX_LONMN
BBX_LATMX
BBX_LONMX
GCPNT_LAT
GCPNT_LON
CTR_MN_NM
CTR_MN_ISO
XBRDR
XCTR_NBR
XC_NM_LST
XC_ISO_LST
GRGN_L1
GRGN_L2
UC_NM_MN
UC_NM_LST
UC_NM_SRC
H75_NBR
H90_NBR
H00_NBR
H75_AREA
H90_AREA
H00_AREA
E_BM_NM_LS
E_SL_LST
EL_AV_ALS
E_KG_NM_LS
E_RB_NM_LS
E_WR_P_90
E_WR_P_00
E_WR_P_14
E_WR_T_90
E_WR_T_00
E_WR_T_14
B75
B90
B00
B15
P75
P90
P00
P15
BUCAP75
BUCAP90
BUCAP00
BUCAP15
NTL_AV
GDP90_SM
GDP00_SM
GDP15_SM
INCM_CMI
DEV_CMI
TT2CC
E_GR_AV90
E_GR_AV00
E_GR_AV14
E_GR_AH90
E_GR_AM90
E_GR_AL90
E_GR_AT90
E_GR_AH00
E_GR_AM00
E_GR_AL00
E_GR_AT00
E_GR_AH14
E_GR_AM14
E_GR_AL14
E_GR_AT14
E_EC2E_E75
E_EC2E_E90
E_EC2E_E00
E_EC2E_E12
E_EC2E_R75
E_EC2E_R90
E_EC2E_R00
E_EC2E_R12
E_EC2E_I75
E_EC2E_I90
E_EC2E_I00
E_EC2E_I12
E_EC2E_T75
E_EC2E_T90
E_EC2E_T00
E_EC2E_T12
E_EC2E_A75
E_EC2E_A90
E_EC2E_A00
E_EC2E_A12
E_EC2O_E75
E_EC2O_E90
E_EC2O_E00
E_EC2O_E12
E_EC2O_R75
E_EC2O_R90
E_EC2O_R00
E_EC2O_R12
E_EC2O_I75
E_EC2O_I90
E_EC2O_I00
E_EC2O_I12
E_EC2O_T75
E_EC2O_T90
E_EC2O_T

In [33]:
ghs_ucdb['UC_NM_SRC']

0        GRUMP
1        GRUMP
2           NE
3        GRUMP
4           NE
         ...  
13130    GRUMP
13131    GRUMP
13132    GRUMP
13133    GRUMP
13134    GRUMP
Name: UC_NM_SRC, Length: 13135, dtype: object

In [37]:
# Kolkata (9691), Paris (2046), Aleppo (4417)

ghs_ucdb[ghs_ucdb['ID_HDC_G0'] == 4417]['UC_NM_MN'] 

2377    Halab (Aleppo) [SYR]
Name: UC_NM_MN, dtype: object