# Tmax Stats Final

A notebook to subset daily Tmax for the 13000 GHS urban areas to identify dates > 40 °C, consecutive days > 40 °C, etc.

Moved from cpt_tmax_subset to clean up all the code on 2019-09-24

**Need to subset**
- Days per year 
- Duration of each event 
- Intensity of each day during each event (>40.6)
- Avg temp
- Avg intensity

#### Dependencies

In [1]:
import pandas as pd
import numpy as np
import xarray as xr
from random import random
from itertools import groupby
from operator import itemgetter
import geopandas as gpd 
import glob
from statistics import mean

#### Functions

In [2]:
def csv_to_xr(file_in, time_dim, space_dim):

    """ Read a csv of GHS-UCDB city IDs and daily temperatures and return the
    temperatures as an xarray DataArray indexed by city id and date.

    Args:
        file_in = file name and path of the csv
        time_dim = name to give the time dimension, as a str (e.g. 'date')
        space_dim = name of the csv column holding the GHS-UCDB IDs, as a str
                    (e.g. 'ID_HDC_G0'); also used as the space dimension name

    Returns:
        xr.DataArray with dims (space_dim, time_dim). Rows (cities) that
        contain any missing temperature are dropped.

    Prints the shape of the raw csv and the number of cities kept.
    """

    raw = pd.read_csv(file_in)
    print(raw.shape)

    # Everything from the 4th column onward is treated as temperature data
    # (assumes the first three columns are metadata -- TODO confirm)
    temps = raw.iloc[:, 3:]
    temps.index = raw[space_dim] # label each row with its city id

    # Keep only cities with a complete temperature record
    valid = temps.dropna()
    print(len(valid))

    # Rows become the space dim, csv column headers become the time dim
    return xr.DataArray(
        valid.to_numpy(),
        coords=[valid.index, valid.columns],
        dims=[space_dim, time_dim],
    )

In [3]:
def tmax_days(xarray, Tthresh):
    """ Find, for each city, every day in the record where Tmax > Tthresh.

    Args:
        xarray = an xarray DataArray with dims named 'ID_HDC_G0' (city) and
                 'date' (both names are hard-coded in this function)
        Tthresh = temperature threshold, in the same units as the data

    Returns:
        DataFrame with one row per city that has at least one day above the
        threshold, with columns:
            ID_HDC_G0      = city id
            total_days     = number of days with Tmax > Tthresh
            dates          = array of the dates of those days
            tmax           = array of Tmax on those days
            tmax_tntensity = array of (Tmax - Tthresh) on those days
                             (spelling kept as-is: tmax_stats reads this name)
    """

    # empty lists & df
    id_list = []
    date_list = []
    dayTot_list = []
    tmax_list = []
    intensity_list = []
    df_out = pd.DataFrame()

    # Mask values that do not exceed the threshold (they become NaN) and drop
    # cities / dates that have no exceedance at all
    out = xarray.where(xarray > Tthresh, drop = True)

    for city_id in out.ID_HDC_G0.values:
        # Select the city and drop its below-threshold (NaN) days ONCE,
        # then reuse the result for every statistic
        hot = out.sel(ID_HDC_G0 = city_id).dropna(dim = 'date')
        hot_dates = hot.date.values
        hot_temps = hot.values

        id_list.append(city_id) # city id
        date_list.append(hot_dates) # dates above threshold
        dayTot_list.append(len(hot_dates)) # total days above threshold
        tmax_list.append(hot_temps) # temps on those days
        intensity_list.append(hot_temps - Tthresh) # exceedance over threshold

    # write to a data frame
    df_out['ID_HDC_G0'] = id_list
    df_out['total_days'] = dayTot_list
    df_out['dates'] = date_list
    df_out['tmax'] = tmax_list
    df_out['tmax_tntensity'] = intensity_list

    return df_out

In [4]:
def jul_convert(dates):
    """ Convert dates to Julian dates.

    Args:
        dates: a list/array of dates in any form pd.to_datetime accepts

    Returns:
        The Julian date of each input date (one float per date)
    """
    jul_days = pd.to_datetime(dates).to_julian_date()

    return jul_days

def event_split(dates, ID_HDC_G0, intensity, tmax, total_days):
    """ Split one city's hot days into events (runs of consecutive days) and
    compute stats for each event.

    Args:
        dates: array of the city's hot-day dates, assumed to be in
               chronological order (consecutive-day detection relies on it)
        ID_HDC_G0: city ID
        intensity: numpy.ndarray of intensity values, aligned with dates
        tmax: numpy.ndarray of tmax values, aligned with dates
        total_days: total number of tmax days in a year for the city

    Returns:
        DataFrame with one row per event and columns ID_HDC_G0, total_days,
        duration, avg_temp, avg_intensity, tot_intensity, events (the dates
        in the event), intensity and tmax (the per-day values in the event).
    """

    # lists to fill
    city_id_list = []
    tot_days_list = []
    dates_list = []
    dur_list = []
    intensity_list = []
    tmax_list = []
    avg_temp_list = []
    avg_int_list = []
    tot_int_list = []

    # data frame out
    df_out = pd.DataFrame()

    # turn days into julian days
    jul_days = jul_convert(dates)

    # Consecutive days share the same (julian day - position) value, so
    # groupby on that difference yields one group per run of sequential days.
    start = 0 # position of the current event's first day within the inputs
    for _, run in groupby(enumerate(jul_days.values), lambda x: x[1]-x[0]):

        dur = sum(1 for _ in run) # duration of the event in days
        stop = start + dur

        # Slice THIS event's days (not the first `dur` days of the record)
        days = dates[start:stop] # dates of tmax days during the event
        intense = intensity[start:stop] # intensity of each day during event
        temp = tmax[start:stop] # temp of each day during event

        # fill lists
        city_id_list.append(ID_HDC_G0)
        tot_days_list.append(total_days)
        dur_list.append(dur)
        dates_list.append(days)
        intensity_list.append(intense)
        tmax_list.append(temp)
        avg_temp_list.append(mean(temp)) # avg. temp during event
        avg_int_list.append(mean(intense)) # avg. intensity during event
        tot_int_list.append(intense.sum()) # total intensity during event

        start = stop # next event begins where this one ended

    # write out as a dataframe
    df_out['ID_HDC_G0'] = city_id_list
    df_out['total_days'] = tot_days_list
    df_out['duration'] = dur_list
    df_out['avg_temp'] = avg_temp_list
    df_out['avg_intensity'] = avg_int_list
    df_out['tot_intensity'] = tot_int_list
    df_out['events'] = dates_list
    df_out['intensity'] = intensity_list
    df_out['tmax'] = tmax_list

    return df_out

In [5]:
def tmax_stats(df_in):
    """ Run event_split on every row of a dataframe to produce the desired
    tmax stats.

        NOTE - If you add arguments to event_split to make more stats,
        be sure to update this function

        args:
            df_in: input dataframe with one row per city and the columns
                   'dates', 'tmax_tntensity', 'tmax', 'ID_HDC_G0' and
                   'total_days' (as produced by tmax_days)

        returns:
            dataframe with the per-event rows of every city stacked together
            (each city's rows keep the index event_split gave them); an empty
            dataframe if df_in has no rows
    """
    # Collect the per-city frames and concatenate once at the end:
    # DataFrame.append was removed in pandas 2.0 and copied the whole
    # accumulated frame on every call.
    frames = []

    for index, row in df_in.iterrows():
        dates = row['dates'] # Get event dates
        intensity = row['tmax_tntensity'] # Get intensity for each day
        tmax = row['tmax'] # Get tmax for each day
        ID_HDC_G0 = row['ID_HDC_G0'] # get city id
        total_days = row['total_days'] # get total number of tmax days

        frames.append(event_split(dates, ID_HDC_G0, intensity, tmax, total_days))

    # pd.concat raises on an empty list, so handle "no cities" explicitly
    if not frames:
        return pd.DataFrame()

    return pd.concat(frames)

In [9]:
def stats_loop(dir_in, dir_out, fn_out, time_dim, space_dim, Tthresh):

    """ Process every yearly csv in a directory: convert it with csv_to_xr,
    find the hot days with tmax_days, compute event stats with tmax_stats,
    join the result to the GHS ID table and write one csv per year.

    Args:
        dir_in = dir path to loop through (must end with a path separator)
        dir_out = dir path to save files out (must end with a path separator)
        fn_out = string prefix for the output file names
        time_dim = name for time dim as a str, passed to csv_to_xr
        space_dim = col name for GHS-UCDB IDs as a str (ID_HDC_G0), passed to csv_to_xr
        Tthresh = temp threshold passed to tmax_days -- 40.6 is used

    Relies on the notebook-level global DATA_INTERIM for the location of
    'GHS-UCSB-IDS.csv'. Prints each year before processing and again,
    followed by 'SAVED!', once its file is written.
    """

    # GHS ID table (plain csv, read with pandas)
    id_table = pd.read_csv(DATA_INTERIM + 'GHS-UCSB-IDS.csv')

    # Every csv in the input dir, processed in sorted (i.e. year) order
    for csv_path in sorted(glob.glob(dir_in + '*.csv')):

        # The year is whatever sits between the file-name prefix and '.csv'
        year = csv_path.split('GHS-Tmax-DAILY_')[1].split('.csv')[0]
        print(year)

        daily_da = csv_to_xr(csv_path, time_dim, space_dim) # csv -> data array
        hot_days = tmax_days(daily_da, Tthresh) # days above threshold, per city
        event_stats = tmax_stats(hot_days) # per-event stats

        # Attach the ID-table columns (countries, per the original note).
        # TODO (original author): this merge still needs fixing
        merged = id_table.merge(event_stats, on='ID_HDC_G0', how='inner')

        # One output file per year
        merged.to_csv(dir_out + fn_out + year + '.csv')

        print(year, 'SAVED!')

#### Run Code

In [7]:
# Settings for the full run. DATA_INTERIM is not passed as an argument:
# stats_loop reads it as a global, so it must be defined before the call.
dir_in = '/home/cascade/projects/data_out_urbanheat/CHIRTS-GHS-DAILY/' # yearly daily-Tmax csvs (output from avg temp)
DATA_INTERIM = '/home/cascade/projects/UrbanHeat/data/interim/' # dir holding the GHS ID list
dir_out = '/home/cascade/projects/data_out_urbanheat/CHIRTS-GHS-Events-Stats/' # where the per-year stats csvs are written
fn_out = 'CHIRTS-GHS-Events-Stats' # output file-name prefix (year + '.csv' is appended)
time_dim = 'date' # name of the time dimension
space_dim = 'ID_HDC_G0' # csv column / dimension holding the city ids
Tthresh = 40.6 # temperature threshold


In [10]:
# Process every yearly csv in dir_in and write one stats csv per year to dir_out
stats_loop(dir_in, dir_out, fn_out, time_dim, space_dim, Tthresh)

1983
(13135, 368)
13067
1983 SAVED!
1984
(13135, 369)
13067
1984 SAVED!
1985
(13135, 368)
13067
1985 SAVED!
1986
(13135, 368)
13067
1986 SAVED!
1987
(13135, 368)
13067
1987 SAVED!
1988
(13135, 369)
13067
1988 SAVED!
1989
(13135, 368)
13067
1989 SAVED!
1990
(13135, 368)
13067
1990 SAVED!
1991
(13135, 368)
13067
1991 SAVED!
1992
(13135, 369)
13067
1992 SAVED!
1993
(13135, 368)
13067
1993 SAVED!
1994
(13135, 368)
13067
1994 SAVED!
1995
(13135, 368)
13067
1995 SAVED!
1996
(13135, 369)
13067
1996 SAVED!
1997
(13135, 368)
13067
1997 SAVED!
1998
(13135, 368)
13067
1998 SAVED!
1999
(13135, 368)
13067
1999 SAVED!
2000
(13135, 369)
13067
2000 SAVED!
2001
(13135, 368)
13067
2001 SAVED!
2002
(13135, 368)
13067
2002 SAVED!
2003
(13135, 368)
13067
2003 SAVED!
2004
(13135, 369)
13067
2004 SAVED!
2005
(13135, 368)
13067
2005 SAVED!
2006
(13135, 368)
13067
2006 SAVED!
2007
(13135, 368)
13067
2007 SAVED!
2008
(13135, 369)
13067
2008 SAVED!
2009
(13135, 368)
13067
2009 SAVED!
2010
(13135, 368)
13067
2010

#### Testing

In [None]:
# File paths used by the test cells below
DAILY_PATH = '/home/cascade/projects/data_out_urbanheat/CHIRTS-GHS-DAILY/' # yearly daily-Tmax csvs (output from avg temp)
DATA_INTERIM = '/home/cascade/projects/UrbanHeat/data/interim/' # interim data dir
DATA_OUT = '/home/cascade/projects/data_out/' # general output dir

In [None]:
# Single yearly file to test with (appended to DAILY_PATH when it is opened)
fn_in = 'GHS-Tmax-DAILY_1983.csv'

In [None]:
# Open a raw file as a data array with dims ('ID_HDC_G0', 'date')
xr1983 = csv_to_xr(DAILY_PATH+fn_in, 'date', 'ID_HDC_G0')

In [None]:
# Per-city days above the 40.6 threshold for the test year
days1983 = tmax_days(xr1983, 40.6)

In [None]:
# Inspect the row for a single city (id 6279)
test = days1983[days1983['ID_HDC_G0'] == 6279]
test

# Note: the per-city day count is available as the 'total_days' column

In [None]:
# Check what type the 'dates' column selection comes back as
type(test['dates'])

In [None]:
# Build routine for loop through a csv (same steps as tmax_stats)

# Collect per-city frames and concatenate once: DataFrame.append was removed
# in pandas 2.0.
frames = []

for index, row in days1983.iterrows():
    dates = row['dates'] # Get event dates
    intensity = row['tmax_tntensity'] # Get intensity for each day
    tmax = row['tmax'] # Get tmax for each day
    ID_HDC_G0 = row['ID_HDC_G0'] # get city id
    total_days = row['total_days'] # get total number of tmax days (column name written by tmax_days)

    df = event_split(dates, ID_HDC_G0, intensity, tmax, total_days)

    frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame
df_out = pd.concat(frames) if frames else pd.DataFrame()

In [None]:
# Preview the first rows of the per-event stats
df_out.head()