# Edge Search

Notebook to search for extreme heat events that overlap years.
Fix to resolve problem TBD.

By Cascade Tuholske, 2019-10-19


### Preliminary Findings

- In the entire record, there are 97 events that start on Jan 1.
- In the entire record, there are 94 events that end of Dec 31.

Of these, it looks like 5 were from the same city and bridged two years

#### Depdencies

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [28]:
fn = "/home/cascade/projects/data_out_urbanheat/All_data20191107.csv"  # Note: Need ?dl=1 to make sure this file gets read correctly
df = pd.read_csv(fn)
df.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'Event_ID', 'ID_HDC_G0', 'CTR_MN_NM',
       'total_days', 'duration', 'avg_temp', 'avg_intensity', 'tot_intensity',
       'event_dates', 'intensity', 'tmax', 'year', 'region', 'sub-region',
       'intermediate-region', 'GCPNT_LAT', 'GCPNT_LON'],
      dtype='object')

#### Find Edges

Build query to find dates XXXX.12.31 and XXXX.01.01 
Events col are strings

In [3]:
df.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'Event_ID', 'ID_HDC_G0', 'CTR_MN_NM',
       'total_days', 'duration', 'avg_temp', 'avg_intensity', 'tot_intensity',
       'event_dates', 'intensity', 'tmax', 'year', 'region', 'sub-region',
       'intermediate-region', 'GCPNT_LAT', 'GCPNT_LON'],
      dtype='object')

In [4]:
def date_search(df, date):
    
    """ Searches Tmax data frame to find dates within a Tmax event with the goal of finding 12.31-01.01 overlap
    Args:
        df = tmax df
        data = date you want to find
    
    Returns df with event id, event dates, city id, year, and tmax temps 
    """

    event_id_list = []
    event_dates_list = []
    city_id_list = []
    event_year_list = []
    tmax_list = []
    total_days_list = []
    
    for index, row in df.iterrows():
        if date in row['event_dates']:
            
            event_id = row['Event_ID']
            event_dates = row['event_dates']
            city_id = row['ID_HDC_G0']
            event_year = row['year']
            tmax = row['tmax']
            total_days = row['total_days']
            
            event_id_list.append(event_id)
            event_dates_list.append(event_dates)
            city_id_list.append(city_id)
            event_year_list.append(event_year)
            tmax_list.append(tmax)
            total_days_list.append(total_days)
    
    df_out = pd.DataFrame()
    df_out['ID_HDC_G0'] = city_id_list
    df_out['Event_ID'] = event_id_list
    df_out['tmax'] = tmax_list
    df_out['event_dates'] = event_dates_list
    df_out['year'] = event_year_list
    df_out['total_days'] = total_days_list
    
    return df_out

In [5]:
# Dec 31 Events

df_1231 = date_search(df, '12.31')

In [6]:
# Jan 1 Events

df_0101 = date_search(df, '01.01')

In [7]:
df_1231.head()

Unnamed: 0,ID_HDC_G0,Event_ID,tmax,event_dates,year,total_days
0,1506,633720,[40.846912],['2008.12.31'],2008,70
1,5519,117936,[40.956707 41.224346 41.3726 ],['1987.12.29' '1987.12.30' '1987.12.31'],1987,31
2,5519,849342,[41.66478],['2015.12.31'],2015,74
3,5551,117949,[40.935562 40.927567],['1987.12.30' '1987.12.31'],1987,12
4,5551,849363,[40.852245],['2015.12.31'],2015,31


In [8]:
# Check len

print(len(df_0101))
print(len(df_1231))

94
97


In [9]:
# See how many cities overlap

df_1231['ID_HDC_G0'].isin(df_0101['ID_HDC_G0']).value_counts()

True     59
False    38
Name: ID_HDC_G0, dtype: int64

In [10]:
# Merge based on city ID to only include overlaps

merge = pd.merge(df_1231, df_0101, on = 'ID_HDC_G0', how = 'inner')

In [11]:
merge

Unnamed: 0,ID_HDC_G0,Event_ID_x,tmax_x,event_dates_x,year_x,total_days_x,Event_ID_y,tmax_y,event_dates_y,year_y,total_days_y
0,5519,117936,[40.956707 41.224346 41.3726 ],['1987.12.29' '1987.12.30' '1987.12.31'],1987,31,451678,[40.648888],['2001.01.20'],2001,50
1,5519,849342,[41.66478],['2015.12.31'],2015,74,451678,[40.648888],['2001.01.20'],2001,50
2,5534,90648,[40.77078],['1986.12.31'],1986,86,45819,[41.395733 41.12193 ],['1984.01.01' '1984.01.02'],1984,103
3,5534,90648,[40.77078],['1986.12.31'],1986,86,141081,[41.37887 41.26748],['1988.01.01' '1988.01.02'],1988,84
4,5534,90648,[40.77078],['1986.12.31'],1986,86,404842,[41.404846 41.652008 41.585175 41.031097],['1999.01.01' '1999.01.02' '1999.01.03' '1999....,1999,92
...,...,...,...,...,...,...,...,...,...,...,...
314,3398,118141,[41.71855],['1987.12.31'],1987,8,68044,[42.620567 42.454918],['1985.01.01' '1985.01.02'],1985,5
315,3398,118141,[41.71855],['1987.12.31'],1987,8,141242,[43.16907 43.610813],['1988.01.01' '1988.01.02'],1988,7
316,3398,118141,[41.71855],['1987.12.31'],1987,8,451891,[40.78469],['2001.01.06'],2001,3
317,3398,118141,[41.71855],['1987.12.31'],1987,8,451892,[41.174187],['2001.01.08'],2001,3


In [12]:
merge.dtypes

ID_HDC_G0         int64
Event_ID_x        int64
tmax_x           object
event_dates_x    object
year_x            int64
total_days_x      int64
Event_ID_y        int64
tmax_y           object
event_dates_y    object
year_y            int64
total_days_y      int64
dtype: object

In [13]:
# Look for years the are one apart and get rows

out = []

for i, year in merge.iterrows():
        if year['year_y'] - year['year_x'] == 1:
            out.append(i)
out

[13, 34, 111, 123, 142, 175, 202, 236, 250, 273, 284, 299, 309, 315]

In [14]:
# Get the rows with dec 31 - jan 1

overlap = merge.loc[out]
overlap

Unnamed: 0,ID_HDC_G0,Event_ID_x,tmax_x,event_dates_x,year_x,total_days_x,Event_ID_y,tmax_y,event_dates_y,year_y,total_days_y
13,5534,117974,[41.446693 42.30564 42.97953 43.17466 ],['1987.12.28' '1987.12.29' '1987.12.30' '1987....,1987,99,141081,[41.37887 41.26748],['1988.01.01' '1988.01.02'],1988,84
34,5534,376959,[40.947575],['1998.12.31'],1998,79,404842,[41.404846 41.652008 41.585175 41.031097],['1999.01.01' '1999.01.02' '1999.01.03' '1999....,1999,92
111,5534,849422,[42.601044],['2015.12.31'],2015,126,882573,[41.114815 41.550606 41.746284],['2016.01.01' '2016.01.02' '2016.01.03'],2016,141
123,5541,117989,[41.22316 42.082108 42.755997 42.951126],['1987.12.28' '1987.12.29' '1987.12.30' '1987....,1987,92,141093,[41.167503 41.056114],['1988.01.01' '1988.01.02'],1988,75
142,5541,376974,[40.806267],['1998.12.31'],1998,76,404854,[41.231552 41.478714 41.41188 40.857803],['1999.01.01' '1999.01.02' '1999.01.03' '1999....,1999,84
175,5541,849443,[42.398857],['2015.12.31'],2015,112,882595,[40.954025 41.389816 41.585495],['2016.01.01' '2016.01.02' '2016.01.03'],2016,129
202,5520,118003,[41.090614 41.926125 42.600014 42.84202 ],['1987.12.28' '1987.12.29' '1987.12.30' '1987....,1987,83,141104,[40.96762 40.755005],['1988.01.01' '1988.01.02'],1988,70
236,5520,849463,[42.149605],['2015.12.31'],2015,103,882617,[40.83622 41.154823 41.233315],['2016.01.01' '2016.01.02' '2016.01.03'],2016,113
250,5390,377007,[40.932465],['1998.12.31'],1998,10,404896,[40.849594],['1999.01.01'],1999,33
273,5390,849508,[40.60667],['2015.12.31'],2015,59,882679,[41.256462 40.64724 ],['2016.01.01' '2016.01.02'],2016,71


#### Make new data from overlaps

In [15]:
def string_hunt(string_list, out_list, dtype):
    """Helper function to pull tmax record strings from a list without , and turn them into ints"""
    for i in string_list: # set the strings from X list
        if len(i) > 1:
            if '[' in i:
                
                record = i[1:]
                
                if ']' in record:
                    
                    record =  record[:-1]
                    out_list.append(dtype(record))
                else:
                    out_list.append(dtype(record))
            
            elif ']' in i:
                record = i[:-1]
                out_list.append(dtype(record))
            
            else:
                record = i
                out_list.append(dtype(record))
    
    return out_list

In [16]:
# loop by row to get temps

df_out = pd.DataFrame()

# Lists for df
temps_list_list = []
dates_list_list = []
duration_list = []
avg_temp_list = []
intensity_list = []
avg_intensity_list = []
tot_intensity_list = []
city_id_list = []
year_x_list = []
year_y_list = []
event_x_id_list = [] # <<<<<---- going to use the ID for the Dec date for now
event_y_id_list = [] # <<<<<---- going to use the ID for the Dec date for now
total_days_x_list = [] # total number of days added to first year
total_days_y_list = [] # total number of days subtracted first year

### Tempature
for i, row in overlap.iterrows():
    
    ### Temp and Days
    temps_list = [] # make list to populate
    
    temps_x = (row['tmax_x'].split(' ')) # split up the strings from X list
    temps_list = string_hunt(temps_x, temps_list, float)

    dur_x = len(temps_list) # duration first year 
    
    temps_y = (row['tmax_y'].split(' ')) # split up the strings from X list
    temps_list = string_hunt(temps_y, temps_list, float)

    dur_y = len(temps_list) - dur_x # duration second year
    
    temps_list_list.append(temps_list)
    
    ## Total Days
    total_days_x = row['total_days_x'] + dur_y
    total_days_y = row['total_days_y'] - dur_y
    
    total_days_x_list.append(total_days_x)
    total_days_y_list.append(total_days_y)
   
    ### Dates
    dates_list = [] # make list to populate
    
    dates_x = (row['event_dates_x'].split(' ')) # split up the strings from X list
    dates_list = string_hunt(dates_x, dates_list, str)
    
    dates_y = (row['event_dates_y'].split(' ')) # split up the strings from X list
    dates_list = string_hunt(dates_y, dates_list, str)
    
    dates_list_list.append(dates_list) # append list for df 
    
    ### Duration
    duration = len(temps_list)
    duration_list.append(duration)
    
    ### Intensity [x - 13 for x in a]
    intensity = [x - 40.6 for x in temps_list] # <<<<<<-------------------------- UPDATE AS NEEDED
    intensity_list.append(intensity)
    
    ### Avg_temp
    avg_temp = np.mean(temps_list)
    avg_temp_list.append(avg_temp)
    
    ### avg_intensity
    avg_intensity = np.mean(intensity)
    avg_intensity_list.append(avg_intensity)
    
    ### tot_intensity
    tot_intensity = np.sum(intensity)
    tot_intensity_list.append(tot_intensity)
    
    ### city_id & total days & year, etc
    city_id = row['ID_HDC_G0']
    city_id_list.append(city_id)
    
    ### Year
    year_x = row['year_x']
    year_x_list.append(year_x)
    
    year_y = row['year_y']
    year_y_list.append(year_y)
    
    ### event ID
    event_x_id = row['Event_ID_x']
    event_x_id_list.append(event_x_id)
    event_y_id = row['Event_ID_y']
    event_y_id_list.append(event_y_id)
    
    #avg_temp	avg_intensity	tot_intensity	event_dates	intensity

In [17]:
df_out['ID_HDC_G0'] = city_id_list
df_out['Event_ID_x'] = event_x_id_list
df_out['Event_ID_y'] = event_y_id_list
df_out['year_x'] = year_x_list
df_out['year_y'] = year_y_list
df_out['total_days_x'] = total_days_x_list
df_out['total_days_y'] = total_days_y_list
df_out['tmax'] = temps_list_list
df_out['event_dates'] = dates_list_list
df_out['duration'] = duration_list
df_out['avg_temp'] = avg_temp_list
df_out['intensity'] = intensity_list
df_out['tot_intensity'] = tot_intensity_list
df_out['avg_intensity'] = avg_intensity_list
df_out.head(1)



Unnamed: 0,ID_HDC_G0,Event_ID_x,Event_ID_y,year_x,year_y,total_days_x,total_days_y,tmax,event_dates,duration,avg_temp,intensity,tot_intensity,avg_intensity
0,5534,117974,141081,1987,1988,101,82,"[41.446693, 42.30564, 42.97953, 43.17466, 41.3...","['1987.12.28', '1987.12.29', '1987.12.30', '19...",6,42.092146,"[0.8466930000000019, 1.7056399999999954, 2.379...",8.952873,1.492145


#### Fix Total Days for Cities

In [None]:
df_out[df_out['year_x'] == 1984]

In [None]:
for i in zip(years_x,id_x):
    print(i)

In [None]:
years_x = list(df_out['year_x'])
id_x = list(df_out['ID_HDC_G0'])
total_days_x = list(df_out['total_days_x'])

df_sub = df[df['ID_HDC_G0'].isin(id_x)]
df_sub = df_sub[df_sub['year'].isin(years_x)]

print(np.unique(df_sub['ID_HDC_G0']))
print(np.unique(df_sub['year']))

print(df_sub.shape)

df_sub = df[(df['ID_HDC_G0'].isin(id_x)) & (df['year'].isin(years_x))]
df_sub.shape

In [None]:
test = df_sub[0:50]

In [None]:
days_list = []

for i, row_a in test.iterrows():
    for j, row_b in df_out.iterrows():
        if (row_a['ID_HDC_G0'] == row_b['ID_HDC_G0']) & (row_a['year'] == row_b['year_x']):
#             print('Year A is ', row_a['year'], ' ', 'Year B is ', row_b['year_x'])
#             print('ID A is ', row_a['ID_HDC_G0'],' ', 'ID B is', row_b['ID_HDC_G0'])
#             print('Days A', row_a['total_days'], "Days B ", row_b['total_days_x'])
            day = row_b['total_days_x']
            days_list.append(day)


In [None]:
for i in test['total_days']:
    print(i)

#### Add Meta data back

In [None]:
cols_to_use = df.columns.difference(df_out.columns)
cols_list = list(cols_to_use)
cols_list.append('ID_HDC_G0')

df_cols = df[cols_list]
df_cols = df_cols.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])

df_cols = df_cols.drop_duplicates('ID_HDC_G0', keep = 'first')

#df_out_merge = df_out.merge(df_out, df[cols_list], on = 'ID_HDC_G0', how = 'inner')

In [None]:
df_cols.head()

In [None]:
df_out_merge = df_out.merge(df_cols, on = 'ID_HDC_G0', how = 'inner')

In [None]:
df_out_merge.shape

In [None]:
df_out_merge.head(17)

#### Drop overlap events based on event id

In [None]:
# Get events

overlap
jan_ids = list(overlap['Event_ID_y'])
dec_ids = list(overlap['Event_ID_x'])

In [None]:
# Drop Events from Dataset

print(len(df))
df_copy = df.copy()

# Jan
for event in jan_ids:
    df_copy = df_copy[df_copy['Event_ID'] != event]
    
for event in dec_ids:
    df_copy = df_copy[df_copy['Event_ID'] != event]

print(len(df_copy))

#### Add in new events with new event ids

In [None]:
# Merge 
df_copy = df_copy.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])
df_copy.head()

In [None]:
print(len(df_copy))
print(len(df_out_merge))

print(df_copy.columns)
print(df_out_merge.columns)

In [None]:
print(len(df_copy))
print(len(df_out))
df_copy = df_copy.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])
df_final = []
df_final = pd.concat([df_copy, df_out_merge], sort = True)
print(len(df_final))

In [None]:
df_final.head(50)

#### Add start date

In [None]:
test_df = df_final.loc[10]

In [None]:
null_data = df_final[df_final.isnull().any(axis=1)]

In [None]:
null_data

## NEXT:

I need to write a function that compares years with the resulting subset of Dec 31 and Jan 1 and then I think we will just drop all these from the record and then add them on as heat waves that overlap years later. There should be about 100 of them

# Old Code

In [None]:
            
            
    
#     temps_y = (row['tmax_y'].split(' ')) # split up the strings from Y list
            
#     for i in temps_y: # set the strings from Y list
#         if len(i) > 1:
#             if '[' in i:

#                 temp = i[1:]

#                 if ']' in temp:

#                     temp =  temp[:-1]
#                     temps_list.append(float(temp))
#                 else:
#                     temps_list.append(float(temp))

#             else:
#                 temp = i
#                 temps_list.append(float(temp))



# counter = 0
# for i, row in overlap.iterrows():
#     test = (row['tmax_x'].split(' '))
#     for i in test:
#         if len(i) > 1:
#             print(i)
            
#             counter = counter + 1
# #         if '[' in i:
# #             print('yes')
# #             print(i[1:])
# #         if ']' in i:
# #             print(i[:-1])
# #         print(i)
# print(counter)

In [None]:

#     for i in temps_x: # set the strings from X list
#         if len(i) > 1:
#             if '[' in i:
                
#                 temp = i[1:]
                
#                 if ']' in temp:
                    
#                     temp =  temp[:-1]
#                     temps_list.append(float(temp))
#                 else:
#                     temps_list.append(float(temp))
            
#             elif ']' in i:
#                 temp = i[:-1]
#                 temps_list.append(float(temp))
            
#             else:
#                 temp = i
#                 temps_list.append(float(temp))
                
#             #temps_list_list.append(temps_list) # append list for df
    
#     temps_y = (row['tmax_y'].split(' ')) # split up the strings from X list

#     for i in temps_y: # set the strings from y list
#         if len(i) > 1:
#             if '[' in i:
                
#                 temp = i[1:]
                
#                 if ']' in temp:
                    
#                     temp =  temp[:-1]
#                     temps_list.append(float(temp))
#                 else:
#                     temps_list.append(float(temp))
            
#             elif ']' in i:
#                 temp = i[:-1]
#                 temps_list.append(float(temp))
            
#             else:
#                 temp = i
#                 temps_list.append(float(temp))
                       
#     temps_list_list.append(temps_list) # append list for df
    
#     print(temps_list)

In [None]:
for i, row in overlap.iterrows():
    print(overlap.loc[i]['tmax_x'])

In [None]:
[41.446693 42.30564 42.97953  43.17466
[40.947575]
[42.601044]
[41.22316 42.082108 42.755997 42.951126]
[40.806267]
[42.398857]
[41.090614 41.926125 42.600014 42.84202
[42.149605]
[40.932465]
[40.60667]
[41.324394 43.38537
[41.113922 41.99298 42.158752 43.608948 44.592743 42.73523 44.078674
45.146576 44.022064 41.813843]
[42.265797 43.199844]
[41.71855]