# Edge Search

Notebook to search for extreme heat events that overlap years.
Fix to resolve problem TBD.

By Cascade Tuholske, 2019-10-19


### Preliminary Findings

- In the entire record, there are 97 events that start on Jan 1.
- In the entire record, there are 94 events that end of Dec 31.

Of these, it looks like 5 were from the same city and bridged two years

#### Depdencies

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
fn = "/home/cascade/projects/data_out_urbanheat/All_data20191107.csv"  # Note: Need ?dl=1 to make sure this file gets read correctly
df = pd.read_csv(fn)
df.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'Event_ID', 'ID_HDC_G0', 'CTR_MN_NM',
       'total_days', 'duration', 'avg_temp', 'avg_intensity', 'tot_intensity',
       'event_dates', 'intensity', 'tmax', 'year', 'region', 'sub-region',
       'intermediate-region', 'GCPNT_LAT', 'GCPNT_LON'],
      dtype='object')

#### Find Edges

Build query to find dates XXXX.12.31 and XXXX.01.01 
Events col are strings

In [210]:
df.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'Event_ID', 'ID_HDC_G0', 'CTR_MN_NM',
       'total_days', 'duration', 'avg_temp', 'avg_intensity', 'tot_intensity',
       'event_dates', 'intensity', 'tmax', 'year', 'region', 'sub-region',
       'intermediate-region', 'GCPNT_LAT', 'GCPNT_LON'],
      dtype='object')

In [35]:
def date_search(df, date):
    
    """ Searches Tmax data frame to find dates within a Tmax event with the goal of finding 12.31-01.01 overlap
    Args:
        df = tmax df
        data = date you want to find
    
    Returns df with event id, event dates, city id, year, and tmax temps 
    """

    event_id_list = []
    event_dates_list = []
    city_id_list = []
    event_year_list = []
    tmax_list = []
    
    for index, row in df.iterrows():
        if date in row['event_dates']:
            
            event_id = row['Event_ID']
            event_dates = row['event_dates']
            city_id = row['ID_HDC_G0']
            event_year = row['year']
            tmax = row['tmax']
            
            event_id_list.append(event_id)
            event_dates_list.append(event_dates)
            city_id_list.append(city_id)
            event_year_list.append(event_year)
            tmax_list.append(tmax)
    
    df_out = pd.DataFrame()
    df_out['ID_HDC_G0'] = city_id_list
    df_out['Event_ID'] = event_id_list
    df_out['tmax'] = tmax_list
    df_out['event_dates'] = event_dates_list
    df_out['year'] = event_year_list
    
    return df_out

In [36]:
# Dec 31 Events

df_1231 = date_search(df, '12.31')

In [37]:
# Jan 1 Events

df_0101 = date_search(df, '01.01')

In [38]:
df_1231.head()

Unnamed: 0,ID_HDC_G0,Event_ID,tmax,event_dates,year
0,1506,633720,[40.846912],['2008.12.31'],2008
1,5519,117936,[40.956707 41.224346 41.3726 ],['1987.12.29' '1987.12.30' '1987.12.31'],1987
2,5519,849342,[41.66478],['2015.12.31'],2015
3,5551,117949,[40.935562 40.927567],['1987.12.30' '1987.12.31'],1987
4,5551,849363,[40.852245],['2015.12.31'],2015


In [39]:
# Check len

print(len(df_0101))
print(len(df_1231))

94
97


In [40]:
# See how many cities overlap

df_1231['ID_HDC_G0'].isin(df_0101['ID_HDC_G0']).value_counts()

True     59
False    38
Name: ID_HDC_G0, dtype: int64

In [46]:
# Merge based on city ID to only include overlaps

merge = pd.merge(df_1231, df_0101, on = 'ID_HDC_G0', how = 'inner')

In [47]:
merge

Unnamed: 0,ID_HDC_G0,Event_ID_x,tmax_x,event_dates_x,year_x,Event_ID_y,tmax_y,event_dates_y,year_y
0,5519,117936,[40.956707 41.224346 41.3726 ],['1987.12.29' '1987.12.30' '1987.12.31'],1987,451678,[40.648888],['2001.01.20'],2001
1,5519,849342,[41.66478],['2015.12.31'],2015,451678,[40.648888],['2001.01.20'],2001
2,5534,90648,[40.77078],['1986.12.31'],1986,45819,[41.395733 41.12193 ],['1984.01.01' '1984.01.02'],1984
3,5534,90648,[40.77078],['1986.12.31'],1986,141081,[41.37887 41.26748],['1988.01.01' '1988.01.02'],1988
4,5534,90648,[40.77078],['1986.12.31'],1986,404842,[41.404846 41.652008 41.585175 41.031097],['1999.01.01' '1999.01.02' '1999.01.03' '1999....,1999
...,...,...,...,...,...,...,...,...,...
314,3398,118141,[41.71855],['1987.12.31'],1987,68044,[42.620567 42.454918],['1985.01.01' '1985.01.02'],1985
315,3398,118141,[41.71855],['1987.12.31'],1987,141242,[43.16907 43.610813],['1988.01.01' '1988.01.02'],1988
316,3398,118141,[41.71855],['1987.12.31'],1987,451891,[40.78469],['2001.01.06'],2001
317,3398,118141,[41.71855],['1987.12.31'],1987,451892,[41.174187],['2001.01.08'],2001


In [43]:
merge.dtypes

ID_HDC_G0          int64
Event_ID_x       float64
tmax_x            object
event_dates_x     object
year_x           float64
Event_ID_y       float64
tmax_y            object
event_dates_y     object
year_y           float64
dtype: object

In [22]:
#merge.to_csv('/home/cascade/projects/data_out_urbanheat/jandecoverlap_v2.csv')

In [53]:
# Look for years the are one apart and get rows

out = []

for i, year in merge.iterrows():
        if year['year_y'] - year['year_x'] == 1:
            out.append(i)
out

[13, 34, 111, 123, 142, 175, 202, 236, 250, 273, 284, 299, 309, 315]

In [173]:
# Get the rows with dec 31 - jan 1

overlap = merge.loc[out]
overlap

Unnamed: 0,ID_HDC_G0,Event_ID_x,tmax_x,event_dates_x,year_x,Event_ID_y,tmax_y,event_dates_y,year_y
13,5534,117974,[41.446693 42.30564 42.97953 43.17466 ],['1987.12.28' '1987.12.29' '1987.12.30' '1987....,1987,141081,[41.37887 41.26748],['1988.01.01' '1988.01.02'],1988
34,5534,376959,[40.947575],['1998.12.31'],1998,404842,[41.404846 41.652008 41.585175 41.031097],['1999.01.01' '1999.01.02' '1999.01.03' '1999....,1999
111,5534,849422,[42.601044],['2015.12.31'],2015,882573,[41.114815 41.550606 41.746284],['2016.01.01' '2016.01.02' '2016.01.03'],2016
123,5541,117989,[41.22316 42.082108 42.755997 42.951126],['1987.12.28' '1987.12.29' '1987.12.30' '1987....,1987,141093,[41.167503 41.056114],['1988.01.01' '1988.01.02'],1988
142,5541,376974,[40.806267],['1998.12.31'],1998,404854,[41.231552 41.478714 41.41188 40.857803],['1999.01.01' '1999.01.02' '1999.01.03' '1999....,1999
175,5541,849443,[42.398857],['2015.12.31'],2015,882595,[40.954025 41.389816 41.585495],['2016.01.01' '2016.01.02' '2016.01.03'],2016
202,5520,118003,[41.090614 41.926125 42.600014 42.84202 ],['1987.12.28' '1987.12.29' '1987.12.30' '1987....,1987,141104,[40.96762 40.755005],['1988.01.01' '1988.01.02'],1988
236,5520,849463,[42.149605],['2015.12.31'],2015,882617,[40.83622 41.154823 41.233315],['2016.01.01' '2016.01.02' '2016.01.03'],2016
250,5390,377007,[40.932465],['1998.12.31'],1998,404896,[40.849594],['1999.01.01'],1999
273,5390,849508,[40.60667],['2015.12.31'],2015,882679,[41.256462 40.64724 ],['2016.01.01' '2016.01.02'],2016


#### Make new data from overlaps

In [174]:
def string_hunt(string_list, out_list, dtype):
    """ Helped function to pull tmax record strings from a list with , and turn them into ints"""
    for i in string_list: # set the strings from X list
        if len(i) > 1:
            if '[' in i:
                
                record = i[1:]
                
                if ']' in record:
                    
                    record =  record[:-1]
                    out_list.append(dtype(record))
                else:
                    out_list.append(dtype(record))
            
            elif ']' in i:
                record = i[:-1]
                out_list.append(dtype(record))
            
            else:
                record = i
                out_list.append(dtype(record))
    
    return out_list

In [211]:
# loop by row to get temps

df_out = pd.DataFrame()

# Lists for df
temps_list_list = []
dates_list_list = []
duration_list = []
avg_temp_list = []
intensity_list = []
avg_intensity_list = []
tot_intensity_list = []
city_id_list = []
year_list = []

### Tempature
for i, row in overlap.iterrows():
    
    temps_list = [] # make list to populate
    
    temps_x = (row['tmax_x'].split(' ')) # split up the strings from X list
    temps_list = string_hunt(temps_x, temps_list, float)
    
    temps_y = (row['tmax_y'].split(' ')) # split up the strings from X list
    temps_list = string_hunt(temps_y, temps_list, float)
    
    temps_list_list.append(temps_list) # append list for df
    
    ### Dates
    dates_list = [] # make list to populate
    
    dates_x = (row['event_dates_x'].split(' ')) # split up the strings from X list
    dates_list = string_hunt(dates_x, dates_list, str)
    
    dates_y = (row['event_dates_y'].split(' ')) # split up the strings from X list
    dates_list = string_hunt(dates_y, dates_list, str)
    
    dates_list_list.append(dates_list) # append list for df 
    
    ### Duration
    duration = len(temps_list)
    duration_list.append(duration)
    
    ### Intensity [x - 13 for x in a]
    intensity = [x - 40.6 for x in temps_list] # <<<<<<-------------------------- UPDATE AS NEEDED
    intensity_list.append(intensity)
    

    ### Avg_temp
    avg_temp = np.mean(temps_list)
    avg_temp_list.append(avg_temp)
    
    ### avg_intensity
    avg_intensity = np.mean(intensity)
    avg_intensity_list.append(avg_intensity)
    
    ### tot_intensity
    tot_intensity = np.sum(intensity)
    tot_intensity_list.append(tot_intensity)
    
    ### city_id & total days & year, etc
    city_id = row['ID_HDC_G0']
    city_id_list.append(city_id)
    
    year = row['year_x']
    year_list.append(year)
    

    
    #avg_temp	avg_intensity	tot_intensity	event_dates	intensity

In [212]:
df_out['ID_HDC_G0'] = city_id_list
df_out['year'] = year_list
df_out['tmax'] = temps_list_list
df_out['event_dates'] = dates_list_list
df_out['duration'] = duration_list
df_out['avg_temp'] = avg_temp_list
df_out['intensity'] = intensity_list
df_out['tot_intensity'] = tot_intensity_list
df_out['avg_intensity'] = avg_intensity_list
df_out

Unnamed: 0,ID_HDC_G0,year,tmax,event_dates,duration,avg_temp,intensity,tot_intensity,avg_intensity
0,5534,1987,"[41.446693, 42.30564, 42.97953, 43.17466, 41.3...","['1987.12.28', '1987.12.29', '1987.12.30', '19...",6,42.092146,"[0.8466930000000019, 1.7056399999999954, 2.379...",8.952873,1.492145
1,5534,1998,"[40.947575, 41.404846, 41.652008, 41.585175, 4...","['1998.12.31', '1999.01.01', '1999.01.02', '19...",5,41.32414,"[0.3475749999999991, 0.8048459999999977, 1.052...",3.620701,0.72414
2,5534,2015,"[42.601044, 41.114815, 41.550606, 41.746284]","['2015.12.31', '2016.01.01', '2016.01.02', '20...",4,41.753187,"[2.0010440000000003, 0.5148149999999987, 0.950...",4.612749,1.153187
3,5541,1987,"[41.22316, 42.082108, 42.755997, 42.951126, 41...","['1987.12.28', '1987.12.29', '1987.12.30', '19...",6,41.872668,"[0.6231599999999986, 1.4821079999999967, 2.155...",7.636008,1.272668
4,5541,1998,"[40.806267, 41.231552, 41.478714, 41.41188, 40...","['1998.12.31', '1999.01.01', '1999.01.02', '19...",5,41.157243,"[0.20626699999999687, 0.6315519999999992, 0.87...",2.786216,0.557243
5,5541,2015,"[42.398857, 40.954025, 41.389816, 41.585495]","['2015.12.31', '2016.01.01', '2016.01.02', '20...",4,41.582048,"[1.7988569999999982, 0.35402500000000003, 0.78...",3.928193,0.982048
6,5520,1987,"[41.090614, 41.926125, 42.600014, 42.84202, 40...","['1987.12.28', '1987.12.29', '1987.12.30', '19...",6,41.6969,"[0.49061400000000077, 1.3261249999999976, 2.00...",6.581398,1.0969
7,5520,2015,"[42.149605, 40.83622, 41.154823, 41.233315]","['2015.12.31', '2016.01.01', '2016.01.02', '20...",4,41.343491,"[1.5496049999999997, 0.23621999999999588, 0.55...",2.973963,0.743491
8,5390,1998,"[40.932465, 40.849594]","['1998.12.31', '1999.01.01']",2,40.89103,"[0.3324649999999991, 0.24959400000000187]",0.582059,0.29103
9,5390,2015,"[40.60667, 41.256462, 40.64724]","['2015.12.31', '2016.01.01', '2016.01.02']",3,40.836791,"[0.006669999999999732, 0.6564619999999977, 0.0...",0.710372,0.236791


#### Add Meta data back

In [219]:
list(cols_to_use)

['CTR_MN_NM',
 'Event_ID',
 'GCPNT_LAT',
 'GCPNT_LON',
 'Unnamed: 0',
 'Unnamed: 0.1',
 'intermediate-region',
 'region',
 'sub-region',
 'total_days']

In [220]:
cols_to_use = df.columns.difference(df_out.columns)

df_out_merge = pd.merge(df_out, df[list[cols_to_use]], on = 'ID_HDC_G0', )

TypeError: 'type' object is not subscriptable

In [215]:
cols_to_use

Index(['CTR_MN_NM', 'Event_ID', 'GCPNT_LAT', 'GCPNT_LON', 'Unnamed: 0',
       'Unnamed: 0.1', 'intermediate-region', 'region', 'sub-region',
       'total_days'],
      dtype='object')

#### Drop overlap events based on event id

#### Add in new events with new event ids

## NEXT:

I need to write a function that compares years with the resulting subset of Dec 31 and Jan 1 and then I think we will just drop all these from the record and then add them on as heat waves that overlap years later. There should be about 100 of them

# Old Code

In [None]:
            
            
    
#     temps_y = (row['tmax_y'].split(' ')) # split up the strings from Y list
            
#     for i in temps_y: # set the strings from Y list
#         if len(i) > 1:
#             if '[' in i:

#                 temp = i[1:]

#                 if ']' in temp:

#                     temp =  temp[:-1]
#                     temps_list.append(float(temp))
#                 else:
#                     temps_list.append(float(temp))

#             else:
#                 temp = i
#                 temps_list.append(float(temp))



# counter = 0
# for i, row in overlap.iterrows():
#     test = (row['tmax_x'].split(' '))
#     for i in test:
#         if len(i) > 1:
#             print(i)
            
#             counter = counter + 1
# #         if '[' in i:
# #             print('yes')
# #             print(i[1:])
# #         if ']' in i:
# #             print(i[:-1])
# #         print(i)
# print(counter)

In [None]:

#     for i in temps_x: # set the strings from X list
#         if len(i) > 1:
#             if '[' in i:
                
#                 temp = i[1:]
                
#                 if ']' in temp:
                    
#                     temp =  temp[:-1]
#                     temps_list.append(float(temp))
#                 else:
#                     temps_list.append(float(temp))
            
#             elif ']' in i:
#                 temp = i[:-1]
#                 temps_list.append(float(temp))
            
#             else:
#                 temp = i
#                 temps_list.append(float(temp))
                
#             #temps_list_list.append(temps_list) # append list for df
    
#     temps_y = (row['tmax_y'].split(' ')) # split up the strings from X list

#     for i in temps_y: # set the strings from y list
#         if len(i) > 1:
#             if '[' in i:
                
#                 temp = i[1:]
                
#                 if ']' in temp:
                    
#                     temp =  temp[:-1]
#                     temps_list.append(float(temp))
#                 else:
#                     temps_list.append(float(temp))
            
#             elif ']' in i:
#                 temp = i[:-1]
#                 temps_list.append(float(temp))
            
#             else:
#                 temp = i
#                 temps_list.append(float(temp))
                       
#     temps_list_list.append(temps_list) # append list for df
    
#     print(temps_list)

In [108]:
for i, row in overlap.iterrows():
    print(overlap.loc[i]['tmax_x'])

[41.446693 42.30564  42.97953  43.17466 ]
[40.947575]
[42.601044]
[41.22316  42.082108 42.755997 42.951126]
[40.806267]
[42.398857]
[41.090614 41.926125 42.600014 42.84202 ]
[42.149605]
[40.932465]
[40.60667]
[41.324394 43.38537 ]
[41.113922 41.99298  42.158752 43.608948 44.592743 42.73523  44.078674
 45.146576 44.022064 41.813843]
[42.265797 43.199844]
[41.71855]


In [None]:
[41.446693 42.30564 42.97953  43.17466
[40.947575]
[42.601044]
[41.22316 42.082108 42.755997 42.951126]
[40.806267]
[42.398857]
[41.090614 41.926125 42.600014 42.84202
[42.149605]
[40.932465]
[40.60667]
[41.324394 43.38537
[41.113922 41.99298 42.158752 43.608948 44.592743 42.73523 44.078674
45.146576 44.022064 41.813843]
[42.265797 43.199844]
[41.71855]