# Notebook to explore updating PNAS data
- rewrite code to make json not csv <br>
- why are we dropping city-year combos in the pdays routine?

In [7]:
# Dependencies
import pandas as pd
import numpy as np
import geopandas as gpd
from glob import glob
import os
import xarray as xr

In [2]:
# Data path
# Root data directory. Set the URBANHEAT_DATA_PATH environment variable to
# point the notebook at another copy of the data; when it is unset the
# original location below is used unchanged.
# os.path.join(..., '') guarantees the trailing separator, which the cells
# that build paths with DATA_PATH+'interim/...' depend on.
DATA_PATH = os.path.join(
    os.environ.get('URBANHEAT_DATA_PATH', '/home/cascade/projects/UrbanHeat/data/'),
    '')


### data check

In [3]:
# Explore files - 1983
# List what is in the interim directory, then open the first ERA5_HI csv
# (alphabetical order) -- expected to be the 1983 file -- and preview it.
content = os.listdir(f'{DATA_PATH}interim/')
fns = glob(f'{DATA_PATH}interim/ERA5_HI/*csv')
fns.sort()
fn = fns[0]
df1983 = pd.read_csv(fn)
df1983.head()

Unnamed: 0.1,Unnamed: 0,ID_HDC_G0,CTR_MN_NM,1983.01.01,1983.01.02,1983.01.03,1983.01.04,1983.01.05,1983.01.06,1983.01.07,...,1983.12.22,1983.12.23,1983.12.24,1983.12.25,1983.12.26,1983.12.27,1983.12.28,1983.12.29,1983.12.30,1983.12.31
0,0,5782,Russia,-49.590861,-38.042995,-36.750948,-28.374049,-26.626346,-21.826344,-21.446321,...,-27.149449,-36.333571,-30.164864,-19.821702,-14.509835,-14.25968,-16.59624,-14.705007,-14.277887,-16.171796
1,1,3316,Russia,-4.088565,-2.252119,-6.184317,-13.70558,-7.631833,-7.392443,-5.487767,...,-12.845948,-18.931796,-8.486376,-8.990228,-15.775358,-19.373392,-19.813643,-12.64872,-13.315901,-15.861366
2,2,5645,Russia,-38.693309,-23.243517,-13.361891,-11.548153,-16.044299,-18.756276,-24.4268,...,-20.344301,-26.453421,-16.92111,-2.669969,-4.402858,-9.637487,-11.979378,-15.932049,-15.186321,-12.247702
3,3,3185,Finland,-0.263081,0.372479,-5.046791,-2.224906,-1.583173,-0.187118,-0.431349,...,-6.141768,-3.382719,-2.95968,-7.515815,-13.45751,-4.966569,-5.707407,-10.071544,-9.986938,-10.397102
4,4,3539,Russia,-10.14669,-9.364139,-20.047189,-16.052717,-12.344176,-5.424159,-0.371261,...,-5.950157,-2.916705,-1.73435,-1.815293,-6.173747,-10.115407,-10.211072,-10.359057,-10.35903,-11.863613


In [4]:
# Preview the first five rows of the 1983 table again
df1983.head(n=5)

Unnamed: 0.1,Unnamed: 0,ID_HDC_G0,CTR_MN_NM,1983.01.01,1983.01.02,1983.01.03,1983.01.04,1983.01.05,1983.01.06,1983.01.07,...,1983.12.22,1983.12.23,1983.12.24,1983.12.25,1983.12.26,1983.12.27,1983.12.28,1983.12.29,1983.12.30,1983.12.31
0,0,5782,Russia,-49.590861,-38.042995,-36.750948,-28.374049,-26.626346,-21.826344,-21.446321,...,-27.149449,-36.333571,-30.164864,-19.821702,-14.509835,-14.25968,-16.59624,-14.705007,-14.277887,-16.171796
1,1,3316,Russia,-4.088565,-2.252119,-6.184317,-13.70558,-7.631833,-7.392443,-5.487767,...,-12.845948,-18.931796,-8.486376,-8.990228,-15.775358,-19.373392,-19.813643,-12.64872,-13.315901,-15.861366
2,2,5645,Russia,-38.693309,-23.243517,-13.361891,-11.548153,-16.044299,-18.756276,-24.4268,...,-20.344301,-26.453421,-16.92111,-2.669969,-4.402858,-9.637487,-11.979378,-15.932049,-15.186321,-12.247702
3,3,3185,Finland,-0.263081,0.372479,-5.046791,-2.224906,-1.583173,-0.187118,-0.431349,...,-6.141768,-3.382719,-2.95968,-7.515815,-13.45751,-4.966569,-5.707407,-10.071544,-9.986938,-10.397102
4,4,3539,Russia,-10.14669,-9.364139,-20.047189,-16.052717,-12.344176,-5.424159,-0.371261,...,-5.950157,-2.916705,-1.73435,-1.815293,-6.173747,-10.115407,-10.211072,-10.359057,-10.35903,-11.863613


### Let's rewrite how the data is made

In [6]:
#### Step 1 - Function Loads all Tmax Data as an X-array
def read_data(dir_path, space_dim, time_dim):
    """ Function reads in all Tmax .csv files, places their daily columns side
    by side in sorted-filename order, and returns the whole record as an
    xarray DataArray.

    Args:
        dir_path = path to .csv files; '*.csv' is appended to it directly, so
                   it must end with its trailing '/'
        space_dim = col name for GHS-UCDB IDs as a str (ID_HDC_G0)
        time_dim = name for time dim as a str ... use date :-)

    Returns:
        xr.DataArray with dims (space_dim, time_dim). The space coordinates are
        the city IDs of the rows that were kept, the time coordinates are the
        date column headers of every file.

    Raises:
        FileNotFoundError if no .csv file matches dir_path

    Notes:
        - Rows are matched across files by their position, and the IDs are read
          from the first file, so every file is assumed to list the same cities
          in the same order -- TODO confirm against the files.
        - Within each file a city with any missing daily value is dropped. A
          city dropped from one file but present in another keeps its row, with
          NaN for the dates of the file it was dropped from.
    """
    # `glob` here is the function brought in by `from glob import glob`
    fn_list = sorted(glob(dir_path+'*.csv'))
    if not fn_list:
        raise FileNotFoundError('No .csv files found for ' + dir_path + '*.csv')

    df_id = None
    date_list = []
    temp_frames = []

    # Open all Tmax files and collect their temperature columns
    for fn in fn_list:
        # Open the CSV
        df = pd.read_csv(fn)

        # Get the city ids from the first file
        if df_id is None:
            df_id = df[space_dim]

        # Columns from position 3 onward are treated as the daily Tmax columns
        df_temp = df.iloc[:,3:]
        date_list.extend(df_temp.columns)

        # Drop cities w/ no temp record
        temp_frames.append(df_temp.dropna())

    # Merge once, outside the loop; axis=1 aligns the frames on their row index
    df_out = pd.concat(temp_frames, axis=1)
    print(df_out.shape)

    # Look the IDs up by the surviving row labels so that every coordinate
    # belongs to the row it sits next to, even after rows were dropped
    city_ids = df_id.loc[df_out.index].to_numpy()

    # make data into an array
    tmax_arr = df_out.to_numpy()

    # Make data into an xr.DataArray
    tmax_xr_da = xr.DataArray(tmax_arr, coords=[city_ids, date_list],
                              dims=[space_dim, time_dim])
    return tmax_xr_da

In [25]:
def tmax_days(xarray, Tthresh):
    """ Function finds, for each city, the days where Tmax > Tthresh. Returns one
    row per city holding the event dates, the temperatures on those dates, and
    the intensity (daily Tmax - Tthresh).

    Args:
        xarray = an xarray object with dims named 'ID_HDC_G0' (space) and
                 'date' (time); both names are hard-coded below
        Tthresh = temp threshold as a number, in the same units as the data

    Returns:
        pd.DataFrame with columns 'ID_HDC_G0', 'dates', 'tmax' and
        'tmax_tntensity'; the last three hold one array per city. The number
        of event days is not stored separately -- it is the length of 'dates'.
    """

    # empty lists
    id_list = []
    date_list = []
    tmax_list = []
    intensity_list = []

    # subset xarray: values at or below the threshold become NaN, and
    # drop = True removes labels that have no value above it at all
    out = xarray.where(xarray > Tthresh, drop = True)
    city_ids = out.ID_HDC_G0.values

    # start loop
    for index, loc in enumerate(out.ID_HDC_G0):
        # select the city and strip its NaN (non-event) days a single time,
        # then reuse the result for dates, temperatures and intensity
        events = out.sel(ID_HDC_G0 = loc).dropna(dim = 'date')
        event_tmax = events.values

        id_list.append(city_ids[index]) # get IDS
        date_list.append(events.date.values) # get event dates
        tmax_list.append(event_tmax) # get temp values
        intensity_list.append(event_tmax - Tthresh) # get severity

    # NOTE: a per-city 'total_days' column was disabled earlier (#CPT 2020.02.23)

    # write to a data frame
    # NOTE: the 'tmax_tntensity' key is misspelled but kept as is, since code
    # reading this frame may rely on the existing column name
    df_out = pd.DataFrame({
        'ID_HDC_G0': id_list,
        'dates': date_list,
        'tmax': tmax_list,
        'tmax_tntensity': intensity_list,
    })

    # return df_out
    return df_out

In [39]:
# Make data into an xr.DataArray
space_dim = 'ID_HDC_G0'
time_dim = 'date'

df_temp = df1983.iloc[:,3:] # get only temp columns
df_temp_drop = df_temp.dropna() # drop cities with a missing daily value

# Take the IDs from the rows that survived dropna(), not from every row:
# otherwise the ID coordinate is longer than the data as soon as one city
# is dropped and xr.DataArray rejects the mismatched sizes
df_id = list(df1983.loc[df_temp_drop.index, space_dim])
date_list = list(df_temp_drop.columns)

tmax_arr = df_temp_drop.to_numpy()

tmax_xr_da = xr.DataArray(tmax_arr, coords=[df_id, date_list], 
                     dims=[space_dim, time_dim])

In [40]:
# Days in the 1983 array above a threshold of 40.6
# (same units as the table values -- presumably deg C, confirm)
df1983_406 = tmax_days(xarray=tmax_xr_da, Tthresh=40.6)


In [41]:
# Display the table returned by tmax_days: one row per city ID with its
# event dates, temperatures and intensities
df1983_406

Unnamed: 0,ID_HDC_G0,dates,tmax,tmax_tntensity
0,3091,[1983.05.16],[42.49354861557129],[1.8935486155712908]
1,1813,[1983.07.12],[41.30586334280715],[0.7058633428071488]
2,1794,[1983.07.12],[42.11767744492123],[1.517677444921226]
3,1786,[1983.07.12],[42.38930445203122],[1.7893044520312174]
4,3179,[1983.05.16],[41.95857979155281],[1.3585797915528062]
...,...,...,...,...
10310,1036,"[1983.02.05, 1983.11.27]","[41.921277502043, 45.52775676609304]","[1.3212775020429959, 4.92775676609304]"
10311,1040,"[1983.02.05, 1983.02.06, 1983.11.27]","[42.82940140527403, 43.346543012073575, 45.686...","[2.229401405274025, 2.746543012073573, 5.08628..."
10312,1087,"[1983.01.01, 1983.01.05, 1983.01.12, 1983.01.2...","[42.1275297653516, 41.70304743074981, 46.36813...","[1.527529765351602, 1.103047430749811, 5.76813..."
10313,1081,"[1983.01.05, 1983.02.04, 1983.02.05, 1983.11.27]","[40.94428549255453, 40.62568825715032, 41.7781...","[0.34428549255452623, 0.025688257150321192, 1...."
