# Tmax Subset

A notebook to subset daily Tmax for the ~13,000 GHS urban areas to identify dates with Tmax > 40 °C, runs of consecutive days with Tmax > 40 °C, etc.

### Dependencies

In [1]:
import glob
import os
from itertools import groupby
from operator import itemgetter
from random import random

import geopandas as gpd
import numpy as np
import pandas as pd
import xarray as xr

In [2]:
def csv_to_xr(file_in, time_dim, space_dim):
    """ Read a csv of GHS-UCDB IDs and daily temperatures and return the
    temperatures as a 2-D xarray DataArray indexed by city ID and date.

    The first three columns of the csv are treated as metadata and skipped;
    every remaining column is treated as one day of temperature data and its
    header becomes the coordinate along ``time_dim``. Cities with a missing
    value on any day are dropped. The raw csv shape and the number of cities
    kept are printed as a quick sanity check.

    Args:
        file_in = file name and path
        time_dim = name for time dim as a str ... use date :-)
        space_dim = col name for GHS-UCDB IDs as an str (ID_HDC_G0)

    Returns:
        xr.DataArray with dims (space_dim, time_dim)
    """

    df = pd.read_csv(file_in)  # read the file in as a df
    print(df.shape)

    # Keep only the temperature columns, re-indexed by city ID. The copy
    # avoids mutating a view of the original frame.
    df_temp = df.iloc[:, 3:].copy()
    df_temp.index = df[space_dim]

    df_temp_drop = df_temp.dropna()  # drop cities w/ an incomplete temp record
    print(len(df_temp_drop))

    # xr.Dataset takes no `dims` argument and cannot wrap a bare ndarray, so
    # the previous xr.Dataset(...) call raised TypeError. Everything
    # downstream (.where / .sel / .date) expects a DataArray anyway.
    temp_xr_da = xr.DataArray(
        df_temp_drop.to_numpy(),
        coords=[df_temp_drop.index, df_temp_drop.columns],
        dims=[space_dim, time_dim],
    )

    return temp_xr_da

In [3]:
def temp_eventTot(xarray, Tthresh, year):
    """ Function returns the number of days within a year where Tmax > Tthresh for each city.

    Only cities with at least one day above the threshold appear in the
    result, because ``where(..., drop=True)`` removes rows that are all NaN.

    Args:
        xarray = an xarray DataArray with dims ('ID_HDC_G0', 'date')
        Tthresh = temp threshold; days strictly above it are counted
        year = label used as the name of the count column in the output

    Returns:
        pd.DataFrame with columns 'ID_HDC_G0' and <year>
    """

    # NOTE: the dimension names 'ID_HDC_G0' and 'date' are hard-coded here,
    # so the input must have been built with exactly those names.
    out = xarray.where(xarray > Tthresh, drop=True)

    # Values at or below the threshold are NaN after the mask, so counting
    # the non-NaN entries along 'date' gives the number of exceedance days
    # per city in one vectorized call instead of one .sel() per city.
    counts = out.count(dim='date')

    df_out = pd.DataFrame()
    df_out['ID_HDC_G0'] = counts.ID_HDC_G0.values
    df_out[year] = counts.values

    return df_out

In [4]:
def eventTot_loop(dir_nm, time_dim, space_dim, Tthresh):

    """ Loop through a dir with csvs to calc the total number of events greater than a threshold.
        Leap years explain the difference in shapes 368 vs 369

    One count column per year is outer-merged onto the GHS ID table, so cities
    that never exceed the threshold in a given year hold NaN in that column.

    Args:
        dir_nm = dir path to loop through
        time_dim = name for time dim as a str ... use date :-) for csv_to_xr function
        space_dim = col name for GHS-UCDB IDs as an str (ID_HDC_G0) for csv_to_xr function
        Tthresh = temp threshold for temp_eventTot function (the notebook uses 40.6)

    Returns:
        pd.DataFrame: the GHS ID table with one added column per year
    """

    prefix = 'GHS-Tmax-DAILY_'

    # Open the GHS-ID list with pandas (the path comes from the notebook-level
    # DATA_INTERIM constant)
    ghs_ids_fn = 'GHS-UCSB-IDS.csv'
    ghs_ids_df = pd.read_csv(DATA_INTERIM+ghs_ids_fn)

    # Get file list from the directory that was passed in (previously the
    # global DAILY_PATH was used and dir_nm was silently ignored). Matching
    # on the prefix keeps unrelated csvs from breaking the year parse below.
    fn_list = glob.glob(os.path.join(dir_nm, prefix + '*.csv'))

    for fn in sorted(fn_list):

        # Get year for arg for temp_eventTot function
        year = os.path.basename(fn).split(prefix)[1].split('.csv')[0]
        print(year)

        temp_xr_da = csv_to_xr(fn, time_dim, space_dim)

        df_out = temp_eventTot(temp_xr_da, Tthresh, year)

        # TODO: original author flagged this merge as needing a fix
        ghs_ids_df = ghs_ids_df.merge(df_out, on='ID_HDC_G0', how='outer')

    # build in later drop all NA GHS-IDs

    return ghs_ids_df


In [5]:
def temp_eventL(xarray, Tthresh, year):
    """ Function returns, for each city, the number of days within a year where Tmax > Tthresh.

    NOTE(review): the name and the original docstring describe event *length*
    (days in a row above the threshold), but the body counts all days above
    the threshold, consecutive or not. A later cell in this notebook redefines
    temp_eventL with a two-argument signature.

    Args:
        xarray = an xarray DataArray with dims ('ID_HDC_G0', 'date')
        Tthresh = temp threshold; days strictly above it are counted
        year = label used as the name of the count column in the output

    Returns:
        pd.DataFrame with columns 'ID_HDC_G0' and <year>
    """

    # NOTE: the dimension names 'ID_HDC_G0' and 'date' are hard-coded here,
    # so the input must have been built with exactly those names.
    out = xarray.where(xarray > Tthresh, drop = True)
    id_list = []
    event_L = []
    df_out = pd.DataFrame()

    for index, loc in enumerate(out.ID_HDC_G0):
        id_list.append(out.ID_HDC_G0.values[index])
        # Previously appended to `event_tot`, which is not defined in this
        # function and raised NameError on the first city.
        event_L.append(len(out.sel(ID_HDC_G0 = loc).dropna(dim = 'date').date.values))

    df_out['ID_HDC_G0'] = id_list
    df_out[year] = event_L

    return df_out

In [9]:
# File Paths
DAILY_PATH = '/home/cascade/projects/data_out_urbanheat/CHIRTS-GHS-DAILY/'
DATA_INTERIM = '/home/cascade/projects/UrbanHeat/data/interim/'
DATA_OUT = '/home/cascade/projects/data_out/'

In [10]:
# File name to test
fn_in = 'GHS-Tmax-DAILY_1983.csv'

In [11]:
xr1993 = csv_to_xr(DAILY_PATH+fn_in, 'date', 'ID_HDC_G0')

(13135, 368)
13067


In [12]:
xr1993

<xarray.DataArray (ID_HDC_G0: 13067, date: 365)>
array([[-43.921947, -33.71345 , -33.054974, ..., -12.416152, -13.232986,
        -15.403823],
       [ -4.804248,  -3.914425,  -7.533999, ...,  -5.186461, -10.945722,
        -16.29516 ],
       [-23.904118, -17.422953, -13.182008, ..., -12.788978, -11.337886,
        -10.00939 ],
       ...,
       [ 16.028023,  17.73603 ,  20.493294, ...,  14.559421,  15.160739,
         15.184024],
       [ 16.420553,  17.87142 ,  22.519674, ...,  15.680964,  16.169733,
         16.039179],
       [ 16.6943  ,  17.559229,  21.480919, ...,  14.446052,  15.235602,
         14.005591]])
Coordinates:
  * ID_HDC_G0  (ID_HDC_G0) int64 5782 3316 5645 3185 ... 1116 1114 1161 1169
  * date       (date) object '1983.01.01' '1983.01.02' ... '1983.12.31'

In [13]:
out = xr1993.where(xr1993 > 40.6, drop = True)
out

<xarray.DataArray (ID_HDC_G0: 4483, date: 315)>
array([[nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       ...,
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan]])
Coordinates:
  * ID_HDC_G0  (ID_HDC_G0) int64 2784 2372 6156 2833 ... 1077 12977 13012 13020
  * date       (date) object '1983.01.04' '1983.01.05' ... '1983.12.29'

In [31]:
type(out)

xarray.core.dataarray.DataArray

In [28]:
type(test)

method

In [14]:
# Scratch cell: for every city left after thresholding, print its ID and
# collect the dates on which it exceeded the threshold.
#event_tot.append(len(out.sel(ID_HDC_G0 = loc).dropna(dim = 'date').date.values))

arr = []  # one array of exceedance dates per city, in the order of out.ID_HDC_G0
for index, loc in enumerate(out.ID_HDC_G0):
    print(out.ID_HDC_G0.values[index])
    x = (out.sel(ID_HDC_G0 = loc).dropna(dim = 'date').date.values) # dates left for this city once NaNs are dropped
    arr.append(x)

2784
2372
6156
2833
2885
2367
2334
2331
2258
2952
6414
5066
2275
2965
2280
5884
2186
2862
6564
2264
2529
2504
2559
2964
2538
2586
2246
6181
5928
5950
6411
171
166
5649
5974
6174
964
7465
6035
6130
6110
5881
12111
5879
6007
5877
6003
481
5885
5887
6596
6679
5898
1871
5897
5901
5899
3049
6006
12078
6014
5895
5904
3077
6070
6509
6008
6093
6009
6084
6068
6078
6125
6074
6080
6111
6104
6123
6005
878
6107
6002
6136
409
6128
6101
550
6108
6090
6112
6072
913
6621
6065
6105
6129
6012
3167
576
6094
5365
6066
6081
6087
5438
6038
6095
6043
933
6049
6036
6000
5961
6067
10697
5954
5751
5977
5468
5996
5461
5495
5979
5956
525
6296
5988
5441
5991
5807
10827
6292
10796
10712
5951
5976
5995
405
5857
408
483
5980
5993
484
6031
40
491
6053
6039
6045
3416
487
6427
6027
6024
4566
42
6064
4588
5889
6083
4599
34
6474
5893
4666
6060
33
4506
4551
6058
28
6011
5942
35
5880
6140
36
5886
6022
5908
4701
6050
5858
4793
39
6682
5851
2747
4841
4587
6023
4642
2745
6697
4531
4865
6706
98
5853
5888
4700
5842
4511
5395
44
5

7787
8880
9092
8305
8829
8893
8996
9142
9333
8352
8937
8144
8904
8557
8441
9233
6257
6665
8376
9069
7924
8208
8262
8055
8791
5864
6463
7492
9269
9343
6507
6785
7458
8315
8814
9110
9180
9310
210
8468
8881
212
8430
8720
8517
9093
8013
9193
4250
9040
8919
8945
9348
8636
9054
8334
7800
8156
9137
6246
6574
8852
8988
9016
6546
8923
8976
9059
9081
9159
8545
8873
8898
7423
8450
8950
4247
8821
9187
9383
6261
6630
6918
8018
6997
8767
8899
9306
8959
8990
9111
138
6480
8706
9062
8452
8870
5923
6413
7537
7574
8890
4242
6435
9100
8384
9324
8784
8120
9094
9208
9143
7300
8064
8214
9235
9019
9138
4243
7801
8938
9167
5866
7020
7151
8324
8547
9148
7746
8531
7487
7060
8991
9106
8086
8286
6565
8597
8977
9177
9216
4238
7396
8820
8874
8884
9024
9190
9329
8446
9045
6855
9156
9386
6911
7626
9080
8390
8989
9097
9025
9041
9171
8780
9298
8632
9124
4225
6253
8847
9144
8582
7700
8598
9084
7563
8472
8858
8877
8914
209
4230
4233
6526
8425
8453
8656
8683
8736
9048
9283
8799
9150
70
8130
8273
8566
8670
8831
9365
7112
7

8626
9116
10221
10203
7932
7964
8504
8426
8589
7642
8088
4479
8570
7435
7570
8163
5788
10212
8110
7385
10916
10183
10188
10204
7848
9028
7702
10263
10195
10235
10287
7836
8921
10271
8005
7188
7865
7261
10219
8479
9029
10304
8641
10298
7546
10239
7808
5493
8969
8360
7737
8758
8601
10198
5506
10245
8962
10254
8285
7753
8029
8690
8930
7213
7415
10225
8943
7291
8863
7343
7712
8455
5509
10215
2649
8964
7606
8842
10276
8437
10202
7214
8901
8361
10261
7288
8927
7510
8817
10597
8671
2487
1717
3476
4447
7292
8039
8627
7237
7799
8892
10248
7995
8856
7882
7399
8910
8981
10228
7740
8854
10246
8729
7218
7272
8129
1490
4431
4689
7347
4691
1523
1499
5523
4714
8822
8941
11245
8679
1487
7729
1478
8879
7450
7894
7349
8094
8628
7421
8466
4715
5024
8362
10522
1482
10236
232
8730
4713
7314
10968
5530
8872
7273
8629
8532
8702
8042
7463
7620
8584
1954
7304
8522
7331
7984
7444
8657
228
7875
8766
7372
7534
7703
1508
8553
8646
5846
8458
8715
5545
7816
7741
5844
7627
2365
8637
11864
8193
8121
7402
8527
4341
7322

In [None]:
# Run routine
# all_events_df = eventTot_loop(DAILY_PATH, 'date', 'ID_HDC_G0', 40.6)

In [None]:
all_events_df.head()

In [None]:
# Move IDS to Index 

all_events_df = all_events_df.set_index(['ID_HDC_G0', 'CTR_MN_NM'], drop = True)
all_events_df.head()

In [None]:
# Drop NaNs
all_events_df_drop = all_events_df.dropna(how = 'all')
all_events_df_drop.shape

In [None]:
all_events_df_drop.head()

In [None]:
df_out = all_events_df_drop.copy()

In [None]:
df_out['ID_HDC_G0'] = all_events_df_drop.index

In [None]:
df_out.head()

In [None]:
def temp_eventL(xarray, Tthresh):
    """ Function collects, for each city, every day within a year where Tmax > Tthresh.
    Returns the number of such days, the dates, the temperatures, and the
    severity (daily Tmax - Tthresh).

    NOTE(review): 'Event_Length' is the total number of days above the
    threshold for the city; the days are not required to be consecutive.

    Args:
        xarray = an xarray DataArray with dims ('ID_HDC_G0', 'date')
        Tthresh = temp threshold; days strictly above it are kept

    Returns:
        pd.DataFrame with one row per city and columns 'ID_HDC_G0',
        'Event_Length', 'Event_Dates', 'Event_Temps', 'Event_Severity'
    """

    # empty lists & df
    id_list = []
    date_list = []
    eventL_list = []
    temp_list = []
    severity_list = []
    df_out = pd.DataFrame()

    # subset xarray: values at or below the threshold become NaN, and cities /
    # dates that are entirely NaN are dropped
    out = xarray.where(xarray > Tthresh, drop = True)
    city_ids = out.ID_HDC_G0.values

    # start loop
    for index, loc in enumerate(out.ID_HDC_G0):
        # Select the city once and reuse it; the same selection used to be
        # recomputed for each of the four lists below.
        city_events = out.sel(ID_HDC_G0 = loc).dropna(dim = 'date')
        event_dates = city_events.date.values
        event_temps = city_events.values

        id_list.append(city_ids[index]) # get IDS
        date_list.append(event_dates) # get event dates
        eventL_list.append(len(event_dates)) # get event lengths
        temp_list.append(event_temps) # get temp values
        severity_list.append(event_temps - Tthresh) # get severity

    # write to a data frame
    df_out['ID_HDC_G0'] = id_list
    df_out['Event_Length'] = eventL_list
    df_out['Event_Dates'] = date_list
    df_out['Event_Temps'] = temp_list
    df_out['Event_Severity'] = severity_list

    # return df_out
    return df_out

In [None]:
def eventL_loop(dir_nm, fn_out, time_dim, space_dim, Tthresh):

    """ Loop through a dir with csvs to apply temp_eventL function and save out a .csv for each year

    Each year's events are inner-merged with the GHS ID table, so only cities
    present in both are written. Output goes to DATA_OUT + fn_out + year + '.csv'.

    Args:
        dir_nm = dir path to loop through
        fn_out = string to label out files
        time_dim = name for time dim as a str ... use date :-) for csv_to_xr function
        space_dim = col name for GHS-UCDB IDs as an str (ID_HDC_G0) for csv_to_xr function
        Tthresh = temp threshold for temp_eventL function (the notebook uses 40.6)

    Returns:
        None; one csv per input year is written as a side effect
    """

    prefix = 'GHS-Tmax-DAILY_'

    # Open the GHS-ID list with pandas (the path comes from the notebook-level
    # DATA_INTERIM constant)
    ghs_ids_fn = 'GHS-UCSB-IDS.csv'
    ghs_ids_df = pd.read_csv(DATA_INTERIM+ghs_ids_fn)

    # Get file list from the directory that was passed in (previously the
    # global DAILY_PATH was used and dir_nm was silently ignored). Matching
    # on the prefix keeps unrelated csvs from breaking the year parse below.
    fn_list = glob.glob(os.path.join(dir_nm, prefix + '*.csv'))

    for fn in sorted(fn_list):

        # Get year to label the output file
        year = os.path.basename(fn).split(prefix)[1].split('.csv')[0]
        print(year)

        temp_xr_da = csv_to_xr(fn, time_dim, space_dim)

        df_out = temp_eventL(temp_xr_da, Tthresh)

        # TODO: original author flagged this merge as needing a fix
        ghs_ids_df_out = ghs_ids_df.merge(df_out, on='ID_HDC_G0', how = 'inner')

        ghs_ids_df_out.to_csv(DATA_OUT+fn_out+year+'.csv')

        print(year, 'SAVED!')

In [None]:
# File Paths 

# UPDATE AS NEEDED <<<<< ------------------------------------------
DAILY_PATH = '/home/cascade/projects/data_out/CHIRTS-GHS-DAILY/'
DATA_INTERIM = '/home/cascade/projects/UrbanHeat/data/interim/'
DATA_OUT = '/home/cascade/projects/data_out/CHIRTS-GHS-Events/'

In [None]:
# File name
fn_out = 'CHIRTS-GHS-Events'
dir_nm = DAILY_PATH
time_dim = 'date'
space_dim = 'ID_HDC_G0'
Tthresh = 40.6


In [None]:
eventL_loop(dir_nm, fn_out, time_dim, space_dim, Tthresh)

### Plot it

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.pyplot import figure
%matplotlib inline

# Histogram of the per-city number of days above the threshold for one year.
# `india` is expected to hold one column per year, named by the year string.
figure(num=None, figsize=(8, 6), dpi=80, facecolor='w', edgecolor='k')
y = range(1,125)  # NOTE(review): not referenced in the visible cells
year = '2010'
country = 'INDIA'
plt.hist(india[year], bins = 125)
# The label used to end in a dangling "in "; finish it with the country name.
plt.xlabel('Number of Days in '+year+' where Tmax >40c in '+country)
plt.ylabel('Number of cities')
plt.title(country+': For all cities with Tmax >40, how many days in '+year+' were >40C? ')

In [None]:
# MAP BACK TO POLYGONS AND LOOK AT IT 
SHP_DIR = '/Users/cascade/Github/UrbanHeat/data/raw/ghs-ucdb/'
shp_fn = 'GHS_STAT_UCDB2015MT_GLOBE_R2019A_V1_0.shp'
shps = gpd.read_file(SHP_DIR+shp_fn)

In [None]:
df_ghs = gpd.GeoDataFrame()
df_ghs['geometry'] = shps.geometry
df_ghs['ID_HDC_G0'] = shps.ID_HDC_G0

In [None]:
df_merge = df_ghs.merge(events, on='ID_HDC_G0', how = 'inner') #<<<<----- NEED TO FIX THIS

In [None]:
# Write it out
DATA_INTERIM = '/Users/cascade/Github/UrbanHeat/data/interim/'
fn_out = 'GHS-TmaxDaily-events.shp'
df_merge.to_file(DATA_INTERIM+fn_out)

# Old Code

In [None]:
# This will return the ID and Date where Tmax is greater than 40 as a dict, but will not return actual temperatures
# Rows of Tmax stand in for cities and columns for dates.

Tmax = np.random.randint(20, high=50, size=(3,10)) # Make a 3x10 array of random ints
print(Tmax)
results = np.where(Tmax > 40) # row indices and column indices of the hits
coords = list(zip(results[0], results[1])) # zip the i and js into (row, col) tuples

# group by rows: each entry is (row index, [column indices of that row's hits])
b = [(k, list(list(zip(*g))[1])) for k, g in groupby(coords, itemgetter(0))] # group by rows

print(b)
dict_out = dict(b) # turn into a dict, where keys are city ids and values are dates
dict_out


In [None]:
for key, value in dict_out.items():
    print(key, value)

In [None]:
np.where(Tmax > 40, Tmax, Tmax*0) 

In [None]:
np.where(Tmax > 40) 

In [None]:
np.argwhere(Tmax>1)

In [None]:
def temp_search(array, thresh=40):
    """ Find, for each row of a 2-D array, the column indices where the value
    is greater than a threshold.

    Args:
        array = 2-D numpy array (rows = cities, columns = dates)
        thresh = threshold; values strictly greater are kept (default 40,
                 the value that used to be hard-coded)

    Returns:
        dict mapping row index -> list of column indices, in ascending order.
        Rows with no value above the threshold are absent from the dict.
    """
    hits = np.where(array > thresh) # row indices and column indices of the hits
    dict_out = {}
    # np.where reports hits in row-major order, so appending as we go keeps
    # each row's columns sorted and the keys in ascending row order.
    for row, col in zip(hits[0], hits[1]):
        dict_out.setdefault(row, []).append(col)

    return dict_out

In [None]:
file_in = '/Users/cascade/Desktop/GHS-Tmax-DAILY_1983.csv'

df = pd.read_csv(file_in)

In [None]:
df.head()

In [None]:
df_sub = df.iloc[:,3:]

In [None]:
df_sub_drop = df.dropna(how='all')

In [None]:
df_sub.head()
arr = df_sub.to_numpy()

In [None]:
arr.shape

In [None]:
np.unique(arr)

In [None]:
tmax_search = temp_search(arr)

In [None]:
# Make some fake data
Tmax = np.random.randint(20, high=50, size=(3,10))
locs = ['001', '002', '003']
times = pd.date_range('2000-01-01', periods=10)

In [None]:
foo = xr.DataArray(Tmax, coords=[locs, times], dims=['space', 'times'])

In [None]:
foo

In [None]:
out = foo.where(foo > 40, drop = True)
out

In [None]:
for loc in out.space:
    print(len(out.sel(space = loc).dropna(dim = 'times').times.values))

In [None]:
for x in out.space.values:
    print(x)

In [None]:
out.space.values[0]

In [None]:
xr1993 = csv_to_xr(DAILY_PATH+fn_in, 'date', 'ID_HDC_G0')
out = xr1993.where(xr1993 > 40.6, drop = True)

In [None]:
40 - out.sel(ID_HDC_G0 = 5885).dropna(dim = 'date').values

In [None]:
# Scratch version of the per-city event extraction, run directly on the
# thresholded array `out` and written into notebook-level variables.
#event_tot.append(len(out.sel(ID_HDC_G0 = loc).dropna(dim = 'date').date.values))

id_list = []
date_list = []
eventL_list = []
temp_list = []
df_out = pd.DataFrame()

# start loop 
for index, loc in enumerate(out.ID_HDC_G0):

    id_list.append(out.ID_HDC_G0.values[index]) # get IDS
    date_list.append(out.sel(ID_HDC_G0 = loc).dropna(dim = 'date').date.values) # get event dates
    eventL_list.append(len(out.sel(ID_HDC_G0 = loc).dropna(dim = 'date').date.values)) # number of days above the threshold
    temp_list.append(out.sel(ID_HDC_G0 = loc).dropna(dim = 'date').values) #get temp values

# write to a data frame (one row per city; dates and temps are stored as arrays)
df_out['ID_HDC_G0'] = id_list
df_out['Event_Length'] = eventL_list
df_out['Event_Dates'] = date_list
df_out['Event_Temps'] = temp_list


In [None]:
df_out.head(50)

In [None]:
date_list

In [None]:
# Run routine
# all_events_df = eventTot_loop(DAILY_PATH, 'date', 'ID_HDC_G0', 40.6)

In [None]:
all_events_df.head()

In [None]:
# Move IDS to Index 

all_events_df = all_events_df.set_index(['ID_HDC_G0', 'CTR_MN_NM'], drop = True)
all_events_df.head()

In [None]:
# Drop NaNs
all_events_df_drop = all_events_df.dropna(how = 'all')
all_events_df_drop.shape

In [None]:
all_events_df_drop.head()

In [None]:
df_out = all_events_df_drop.copy()

In [None]:
df_out['ID_HDC_G0'] = all_events_df_drop.index

In [None]:
df_out.head()

In [None]:
all_events_df_drop = all_events_df_drop.reset_index()

In [None]:
all_events_df_drop.head()

In [None]:
#all_events_df_drop.to_csv(DATA_OUT+'20190831_TMax-GHS_TotEvents83-2016.csv')

In [None]:
india = all_events_df_drop[all_events_df_drop['CTR_MN_NM'] == 'India']

In [None]:
india.head()