# Data Check

This notebook was created in July/August 2020 to double-check the data for the manuscript (MS) before I submit it.
I also want to check whether the GHS-UCDB files contain place names that we can use to update the figures in the MS and build out more supplementary results.

In [1]:
#### Dependencies
import rasterio 
import numpy as np
import pandas as pd
import geopandas as gpd
from rasterstats import zonal_stats
from rasterio import features
import os
import xarray as xr
import fnmatch
import time
import multiprocessing as mp 
from glob import glob
from multiprocessing import Pool

## Check that the RH data are ERA5

In [2]:
# --- Directories -----------------------------------------------------------
DATA_OUT = '/home/cascade/projects/UrbanHeat/data/interim/ERA5_RH/' 
DATA_INTERIM = '/home/cascade/projects/UrbanHeat/data/interim/'

# --- Input file names ------------------------------------------------------
# RH raster for 1983.01.01 (absolute path, under the CHIRTSdaily products tree)
ERA5_RH_FN = '/home/chc-ftp_out/products/CHIRTSdaily/global_tifs_p05/RHum/1983/RH.1983.01.01.tif'
# ID table and polygon-ID raster, both located in DATA_INTERIM
# NOTE(review): the CSV name says "UCSB" while the raster says "UCDB" -- confirm
ghs_ids_fn = 'GHS-UCSB-IDS.csv'
polyRst_fn = 'GHS_UCDB_Raster_touched.tif'

# --- Open everything -------------------------------------------------------
# The rasterio handles are left open on purpose: the next cell reads from them.
tempRst = rasterio.open(ERA5_RH_FN)

# The ID table is a plain CSV, so it is read with pandas (not geopandas)
ghs_ids_df = pd.read_csv(f"{DATA_INTERIM}{ghs_ids_fn}")

# Raster whose pixel values are compared against 0 and grouped on downstream
polyRst = rasterio.open(f"{DATA_INTERIM}{polyRst_fn}")

In [5]:
#### Calc Area Avg. ERA5 RH for '1983.01.01' for GHS-UCDB
#
# Averages band 1 of the RH raster (`tempRst`) over each group of pixels that
# share a value in band 1 of the polygon raster (`polyRst`), then joins the
# per-ID averages onto `ghs_ids_df` as a column named after `date`.
# The cell is safe to re-run: a `date` column left by a previous run is
# replaced instead of being duplicated as `<date>_x` / `<date>_y`.

date = '1983.01.01'

# Value treated as "no data" in the RH raster.
# NOTE(review): the original author flagged this as needing a double check --
# confirm against `tempRst.nodata`.
rh_nodata = -9999

# Polygon-ID raster (band 1) as a 2-D DataArray
polyRst_da = xr.DataArray(polyRst.read(1), dims = ['y', 'x'])

# RH raster (band 1) as a 2-D DataArray; y and x are our 2-d labels
tempRst_da = xr.DataArray(tempRst.read(1), dims = ['y', 'x'])

# Both rasters must share one pixel grid, otherwise the per-pixel masking and
# grouping below are meaningless. Fail with a clear message if they do not.
if polyRst_da.shape != tempRst_da.shape:
    raise ValueError(
        'Raster shapes differ: polyRst %s vs tempRst %s'
        % (polyRst_da.shape, tempRst_da.shape)
    )

# Make xarray dataset. The DataArrays are passed directly (they already carry
# their dims) rather than being wrapped again in a (dims, data) tuple.
# The variable is still called 'temp' for continuity, but it holds RH values.
ds = xr.Dataset(data_vars = {'ghs' : polyRst_da, 'temp' : tempRst_da})

# UPDATED 2019-08-19 Mask the value raster FIRST, THEN GHS
# Pixels whose RH value equals the no-data value become NaN in BOTH variables
ds_mask = ds.where(ds.temp != rh_nodata, drop = False)

# Keep only pixels where the polygon raster is > 0; everything else becomes NaN
ds_mask = ds_mask.where(ds_mask.ghs > 0, drop = False)

# Group pixels by polygon ID and average over all dimensions
avg = ds_mask.groupby('ghs').mean(xr.ALL_DIMS)

# turn GHS IDs and the per-ID average RH into 1-D numpy arrays of equal length
avg_ID = np.array(avg.ghs)
avg_temp = np.array(avg.temp)

print(len(avg_ID))
print(len(avg_temp))

# One row per ID: the averaged value (column named by `date`) and the ID
df_avg = pd.DataFrame({date: avg_temp, 'ID_HDC_G0': avg_ID})

# merge the df; drop any stale `date` column first so re-running this cell
# does not produce suffixed duplicate columns
ghs_ids_df = (
    ghs_ids_df
    .drop(columns = [date], errors = 'ignore')
    .merge(df_avg, on='ID_HDC_G0', how = 'outer')
)

13072
13072


In [6]:
# Preview the first five rows of the merged table
ghs_ids_df.head(n=5)

Unnamed: 0,ID_HDC_G0,CTR_MN_NM,1983.01.01
0,5782,Russia,94.164597
1,3316,Russia,96.823662
2,5645,Russia,95.182632
3,3185,Finland,98.555817
4,3539,Russia,94.960381


In [7]:
# Open current ERA5 RH Data for 1983.01.01 ... or what I think is ERA5
current_RH_FN = '/home/cascade/projects/UrbanHeat/data/interim/ERA5_RH/GHS-ERA5-RH_1983.csv'
current_RH = 