# Edge Search

Notebook to search for extreme heat events that overlap years.
Fix to resolve problem TBD.

By Cascade Tuholske, 2019-10-19


### Preliminary Findings

- In the entire record, there are 97 events that start on Jan 1.
- In the entire record, there are 94 events that end of Dec 31.

Of these, it looks like 5 were from the same city and bridged two years

#### Depdencies

In [8]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from time import time
import xarray as xr

In [3]:
fn = "/home/cascade/projects/UrbanHeat/data/processed/All_data_HI406.csv"  # Note: Need ?dl=1 to make sure this file gets read correctly
df = pd.read_csv(fn)
df.columns

Index(['Unnamed: 0', 'Event_ID', 'ID_HDC_G0', 'CTR_MN_NM', 'total_days',
       'duration', 'avg_temp', 'avg_intensity', 'tot_intensity', 'event_dates',
       'intensity', 'tmax', 'year'],
      dtype='object')

## 1. Find Edges

Build query to find dates XXXX.12.31 and XXXX.01.01 
Events col are strings

In [32]:
df.head()

Unnamed: 0.1,Unnamed: 0,Event_ID,ID_HDC_G0,CTR_MN_NM,total_days,duration,avg_temp,avg_intensity,tot_intensity,event_dates,intensity,tmax,year
0,0,1,4362,Russia,4,1,41.21623,0.61623,0.61623,['1983.07.01'],[0.61622976],[41.21622976],1983
1,1,2,4362,Russia,4,3,41.744828,1.144828,3.434484,['1983.07.03' '1983.07.04' '1983.07.05'],[0.40430535 0.39696466 2.63321359],[41.00430535 40.99696466 43.23321359],1983
2,2,3,5380,Russia,1,1,47.953803,7.353803,7.353803,['1983.07.03'],[7.35380338],[47.95380338],1983
3,3,4,5872,Russia,3,1,40.744342,0.144342,0.144342,['1983.06.23'],[0.14434172],[40.74434172],1983
4,4,5,5872,Russia,3,1,41.27792,0.67792,0.67792,['1983.06.25'],[0.67792041],[41.27792041],1983


In [5]:
def date_search(df, date):
    
    """ Searches Tmax data frame to find dates within a Tmax event with the goal of finding 12.31-01.01 overlap
    Args:
        df = tmax df
        data = date you want to find
    
    Returns df with event id, event dates, city id, year, and tmax temps 
    """

    event_id_list = []
    event_dates_list = []
    city_id_list = []
    event_year_list = []
    tmax_list = []
    total_days_list = []
    
    for index, row in df.iterrows():
        if date in row['event_dates']:
            
            event_id = row['Event_ID']
            event_dates = row['event_dates']
            city_id = row['ID_HDC_G0']
            event_year = row['year']
            tmax = row['tmax']
            total_days = row['total_days']
            
            event_id_list.append(event_id)
            event_dates_list.append(event_dates)
            city_id_list.append(city_id)
            event_year_list.append(event_year)
            tmax_list.append(tmax)
            total_days_list.append(total_days)
    
    df_out = pd.DataFrame()
    df_out['ID_HDC_G0'] = city_id_list
    df_out['Event_ID'] = event_id_list
    df_out['tmax'] = tmax_list
    df_out['event_dates'] = event_dates_list
    df_out['year'] = event_year_list
    df_out['total_days'] = total_days_list
    
    return df_out

In [10]:
# Dec 31 Events
start = time()
df_1231 = date_search(df, '12.31')
end = time()
print('This took', start - end, 'seconds')

This took -473.5447449684143 seconds


In [12]:
# Jan 1 Events
df_0101 = date_search(df, '01.01')

In [13]:
df_1231.head()

Unnamed: 0,ID_HDC_G0,Event_ID,tmax,event_dates,year,total_days
0,354,100397,[41.64383481 40.60793244],['1983.12.30' '1983.12.31'],1983,296
1,808,116528,[40.66734582],['1983.12.31'],1983,211
2,879,118686,[40.77659557],['1983.12.31'],1983,268
3,584,119932,[40.9906877 40.88687027],['1983.12.30' '1983.12.31'],1983,236
4,835,120212,[40.69073848 42.43803085],['1983.12.30' '1983.12.31'],1983,294


In [14]:
# Check len
print(len(df_0101))
print(len(df_1231))

20240
16860


In [15]:
# See how many cities overlap

df_1231['ID_HDC_G0'].isin(df_0101['ID_HDC_G0']).value_counts()

True     16581
False      279
Name: ID_HDC_G0, dtype: int64

In [16]:
# Merge based on city ID to only include overlaps
merge = pd.merge(df_1231, df_0101, on = 'ID_HDC_G0', how = 'inner')

In [17]:
merge.head()

Unnamed: 0,ID_HDC_G0,Event_ID_x,tmax_x,event_dates_x,year_x,total_days_x,Event_ID_y,tmax_y,event_dates_y,year_y,total_days_y
0,354,100397,[41.64383481 40.60793244],['1983.12.30' '1983.12.31'],1983,296,100366,[43.64394254 42.58659489 43.45413085 42.265967...,['1983.01.01' '1983.01.02' '1983.01.03' '1983....,1983,296
1,354,100397,[41.64383481 40.60793244],['1983.12.30' '1983.12.31'],1983,296,260464,[41.28366604 40.95141962],['1984.01.01' '1984.01.02'],1984,222
2,354,100397,[41.64383481 40.60793244],['1983.12.30' '1983.12.31'],1983,296,583918,[40.98153566],['1986.01.01'],1986,236
3,354,100397,[41.64383481 40.60793244],['1983.12.30' '1983.12.31'],1983,296,746046,[41.60371676],['1987.01.01'],1987,282
4,354,100397,[41.64383481 40.60793244],['1983.12.30' '1983.12.31'],1983,296,917877,[42.84500802 44.09398242 44.69369739 43.593172...,['1988.01.01' '1988.01.02' '1988.01.03' '1988....,1988,215


In [18]:
merge.dtypes

ID_HDC_G0         int64
Event_ID_x        int64
tmax_x           object
event_dates_x    object
year_x            int64
total_days_x      int64
Event_ID_y        int64
tmax_y           object
event_dates_y    object
year_y            int64
total_days_y      int64
dtype: object

In [19]:
# Look for years the are one apart and get rows

out = []

for i, year in merge.iterrows():
        if year['year_y'] - year['year_x'] == 1:
            out.append(i)
out

[1,
 33,
 64,
 125,
 155,
 189,
 219,
 249,
 279,
 309,
 339,
 369,
 399,
 429,
 459,
 489,
 519,
 549,
 579,
 624,
 639,
 654,
 684,
 727,
 756,
 757,
 758,
 829,
 844,
 873,
 888,
 985,
 1049,
 1050,
 1051,
 1070,
 1090,
 1127,
 1146,
 1188,
 1234,
 1235,
 1236,
 1237,
 1260,
 1305,
 1317,
 1329,
 1342,
 1453,
 1476,
 1499,
 1522,
 1589,
 1612,
 1701,
 1702,
 1703,
 1726,
 1749,
 1795,
 1818,
 1841,
 1886,
 1931,
 1954,
 1977,
 2000,
 2097,
 2116,
 2135,
 2190,
 2232,
 2287,
 2306,
 2343,
 2362,
 2381,
 2400,
 2443,
 2456,
 2469,
 2482,
 2507,
 2532,
 2545,
 2558,
 2583,
 2610,
 2647,
 2660,
 2673,
 2686,
 2711,
 2736,
 2749,
 2762,
 2787,
 2801,
 2826,
 2871,
 2935,
 2946,
 2957,
 2968,
 2991,
 3004,
 3017,
 3030,
 3059,
 3072,
 3085,
 3112,
 3145,
 3162,
 3179,
 3212,
 3245,
 3278,
 3295,
 3312,
 3329,
 3346,
 3379,
 3396,
 3413,
 3430,
 3457,
 3468,
 3510,
 3531,
 3542,
 3553,
 3564,
 3586,
 3609,
 3622,
 3635,
 3648,
 3677,
 3690,
 3703,
 3743,
 3780,
 3788,
 3812,
 3933,
 3951,


In [107]:
# Get the rows with dec 31 - jan 1

overlap = merge.loc[out]
overlap.head()

Unnamed: 0,ID_HDC_G0,Event_ID_x,tmax_x,event_dates_x,year_x,total_days_x,Event_ID_y,tmax_y,event_dates_y,year_y,total_days_y
1,354,100397,[41.64383481 40.60793244],['1983.12.30' '1983.12.31'],1983,296,260464,[41.28366604 40.95141962],['1984.01.01' '1984.01.02'],1984,222
33,354,746080,[43.66745054 45.03831865 47.39934079 45.742721...,['1987.12.14' '1987.12.15' '1987.12.16' '1987....,1987,282,917877,[42.84500802 44.09398242 44.69369739 43.593172...,['1988.01.01' '1988.01.02' '1988.01.03' '1988....,1988,215
64,354,1246579,[41.96780346 41.71945055 41.05406109 42.084555...,['1990.12.22' '1990.12.23' '1990.12.24' '1990....,1990,287,1409544,[40.61181345 42.09862007 42.40291151 42.179612...,['1991.01.01' '1991.01.02' '1991.01.03' '1991....,1991,273
125,354,2393669,[41.34481498 43.38049558 41.11187846 40.91361081],['1997.12.28' '1997.12.29' '1997.12.30' '1997....,1997,287,2562213,[42.15636625 43.81814282 44.54720453 42.052629...,['1998.01.01' '1998.01.02' '1998.01.03' '1998....,1998,303
155,354,2562243,[42.05110364 42.36287296],['1998.12.30' '1998.12.31'],1998,303,2738036,[43.15258856 40.97809066 41.10608037 43.194516...,['1999.01.01' '1999.01.02' '1999.01.03' '1999....,1999,196


In [97]:
test_x = overlap['event_dates_x'][33]
dt_x = test_x.replace('\n', '').replace("[", '').replace(']', '').replace(' ', ' ').replace('\'','').split(' ')

In [98]:
dt_x

['1987.12.14',
 '1987.12.15',
 '1987.12.16',
 '1987.12.17',
 '1987.12.18',
 '1987.12.19',
 '1987.12.20',
 '1987.12.21',
 '1987.12.22',
 '1987.12.23',
 '1987.12.24',
 '1987.12.25',
 '1987.12.26',
 '1987.12.27',
 '1987.12.28',
 '1987.12.29',
 '1987.12.30',
 '1987.12.31']

In [99]:
test_y = overlap['event_dates_y'][33]
dt_y = test_y.replace('\n', '').replace("[", '').replace(']', '').replace(' ', ' ').replace('\'','').split(' ')

In [100]:
dt_x + dt_y

['1987.12.14',
 '1987.12.15',
 '1987.12.16',
 '1987.12.17',
 '1987.12.18',
 '1987.12.19',
 '1987.12.20',
 '1987.12.21',
 '1987.12.22',
 '1987.12.23',
 '1987.12.24',
 '1987.12.25',
 '1987.12.26',
 '1987.12.27',
 '1987.12.28',
 '1987.12.29',
 '1987.12.30',
 '1987.12.31',
 '1988.01.01',
 '1988.01.02',
 '1988.01.03',
 '1988.01.04',
 '1988.01.05',
 '1988.01.06',
 '1988.01.07']

In [178]:
# Lists to populate
dates_xy_list = []
Tmax_xy_list = []
new_x_total_days_list = []
new_y_total_days_list = []
intensity_list = []
year_x = []
year_y = []
event_x_id_list = []
event_y_id_list = []

for index, row in overlap.iterrows():
    
    ## X referes to the left hand event/year, Y referes to the right hand event/year
    
    # Merge x-y dates overlap
    dates_x = overlap['event_dates_x'][index].replace('\n', '').replace("[", '').replace(']', '').replace(' ', ' ').replace('\'','').split(' ')
    dates_y = overlap['event_dates_y'][index].replace('\n', '').replace("[", '').replace(']', '').replace(' ', ' ').replace('\'','').split(' ')
    dates_xy = dates_x + dates_y
    dates_xy_list.append(dates_xy)
    
    # Merge x-y dates overlap
    Tmax_x = overlap['tmax_x'][index].replace('  ', ' ').replace('\n', '').replace("[", '').replace(']', '').split(' ')
    Tmax_y = overlap['tmax_y'][index].replace('  ', ' ').replace('\n', '').replace("[", '').replace(']', '').split(' ')
    Tmax_xy = Tmax_x + Tmax_y
    Tmax_xy_list.append(Tmax_xy)
    
    # Add duration to x, subtract from y
#     new_x_total_days_list.append(overlap['total_days_x'][index] + len(dates_y))
#     new_y_total_days_list.append(overlap['total_days_y'][index] - len(dates_y))
    
#     ### Intensity [x - 13 for x in a]
#     intensity = [x - 40.6 for x in Tmax_xy_list] # <<<<<<-------------------------- UPDATE TMAX AS NEEDED
#     intensity_list.append(intensity)
    
#     ### Avg_temp
#     avg_temp = np.mean(temps_list)
#     avg_temp_list.append(avg_temp)
    
#     ### avg_intensity
#     avg_intensity = np.mean(intensity)
#     avg_intensity_list.append(avg_intensity)
    
#     ### tot_intensity
#     tot_intensity = np.sum(intensity)
#     tot_intensity_list.append(tot_intensity)
    
#     ### city_id & total days & year, etc
#     city_id = row['ID_HDC_G0']
#     city_id_list.append(city_id)
    
#     ### Year
#     year_x = row['year_x']
#     year_x_list.append(year_x)
    
#     year_y = row['year_y']
#     year_y_list.append(year_y)
    
#     ### event ID
#     event_x_id = row['Event_ID_x']
#     event_x_id_list.append(event_x_id)
#     event_y_id = row['Event_ID_y']
#     event_y_id_list.append(event_y_id)
    
    

In [179]:
Tmax_xy_list

[['41.64383481', '40.60793244', '41.28366604', '40.95141962'],
 ['43.66745054',
  '45.03831865',
  '47.39934079',
  '45.74272138',
  '44.37116105',
  '45.80005995',
  '45.85098181',
  '44.09870307',
  '45.54975649',
  '45.64126847',
  '44.9756013',
  '44.06068416',
  '43.36969024',
  '44.61114423',
  '44.55263444',
  '44.86192521',
  '44.18199414',
  '44.47591597',
  '42.84500802',
  '44.09398242',
  '44.69369739',
  '43.59317236',
  '42.46278798',
  '42.38256745',
  '40.95981847'],
 ['41.96780346',
  '41.71945055',
  '41.05406109',
  '42.08455547',
  '41.49368503',
  '40.98529322',
  '41.3688164',
  '41.88215165',
  '41.51349583',
  '41.73569697',
  '40.61181345',
  '42.09862007',
  '42.40291151',
  '42.179612',
  '',
  '42.42721704',
  '43.61403444',
  '42.53453395',
  '42.09121693',
  '42.61701929',
  '42.01980374'],
 ['41.34481498',
  '43.38049558',
  '41.11187846',
  '40.91361081',
  '42.15636625',
  '43.81814282',
  '44.54720453',
  '42.05262949',
  '42.13273148',
  '41.22365517'

In [168]:
overlap['tmax_x'][33]

'[43.66745054 45.03831865 47.39934079 45.74272138 44.37116105 45.80005995\n 45.85098181 44.09870307 45.54975649 45.64126847 44.9756013  44.06068416\n 43.36969024 44.61114423 44.55263444 44.86192521 44.18199414 44.47591597]'

In [165]:
wtf

['43.66745054',
 '45.03831865',
 '47.39934079',
 '45.74272138',
 '44.37116105',
 '45.80005995',
 '45.85098181',
 '44.09870307',
 '45.54975649',
 '45.64126847',
 '44.9756013',
 '',
 '44.06068416',
 '43.36969024',
 '44.61114423',
 '44.55263444',
 '44.86192521',
 '44.18199414',
 '44.47591597']

In [131]:
print(len(dates_xy_list))
print(len(Tmax_xy_list))
new_x_total_days_list = []
new_y_total_days_list = []
intensity_list = []
year_x = []
year_y = []
event_x_id_list = []
event_y_id_list = []

10985
0


In [128]:
df_overlap = pd.DataFrame()

df_overlap['ID_HDC_G0'] = city_id_list
df_overlap['Event_ID_x'] = event_x_id_list
df_overlap['Event_ID_y'] = event_y_id_list
df_overlap['year_x'] = year_x_list
df_overlap['year_y'] = year_y_list
df_overlap['new_x_total_days'] = new_x_total_days_list
df_overlap['new_y_total_days'] = new__total_days_list
df_overlap['tmax'] = temps_list_list
df_overlap['event_dates'] = dates_list_list
df_overlap['duration'] = duration_list
df_overlap['avg_temp'] = avg_temp_list
df_overlap['intensity'] = intensity_list
df_overlap['tot_intensity'] = tot_intensity_list
df_overlap['avg_intensity'] = avg_intensity_list
df_overlap.head(1)

ValueError: Length of values does not match length of index

## 2. Make new data from overlaps

In [26]:
def string_hunt(string_list, out_list, dtype):
    """Helper function to pull tmax record strings from a list of Tmaxs, and turn dates into ints"""
    for i in string_list: # set the strings from X list
        if len(i) > 1:
            if '[' in i:
                
                record = i[1:]
                
                if ']' in record:
                    
                    record =  record[:-1]
                    out_list.append(dtype(record))
                else:
                    out_list.append(dtype(record))
            
            elif ']' in i:
                record = i[:-1]
                out_list.append(dtype(record))
            
            else:
                record = i
                out_list.append(dtype(record))
    
    return out_list

In [None]:
test = string_hunt(overlap['event_dates_x'][1], out_l)

In [27]:
# loop by row to get temps

df_overlap = pd.DataFrame()

# Lists for df
temps_list_list = []
dates_list_list = []
duration_list = []
avg_temp_list = []
intensity_list = []
avg_intensity_list = []
tot_intensity_list = []
city_id_list = []
year_x_list = []
year_y_list = []
event_x_id_list = [] # <<<<<---- going to use the ID for the Dec date for now
event_y_id_list = [] # <<<<<---- going to use the ID for the Dec date for now
total_days_x_list = [] # total number of days added to first year
total_days_y_list = [] # total number of days subtracted first year

### Tempature
for i, row in overlap.iterrows():
    
    ### Temp and Days
    temps_list = [] # make list to populate
    
    temps_x = (row['tmax_x'].split(' ')) # split up the strings from X list
    temps_list = string_hunt(temps_x, temps_list, float)

    dur_x = len(temps_list) # duration first year 
    
    temps_y = (row['tmax_y'].split(' ')) # split up the strings from X list
    temps_list = string_hunt(temps_y, temps_list, float)

    dur_y = len(temps_list) - dur_x # duration second year
    
    temps_list_list.append(temps_list)
    
    ## Total Days
    total_days_x = row['total_days_x'] + dur_y # add event dur from year x
    total_days_y = row['total_days_y'] - dur_y # subtract event dur from year y
    
    total_days_x_list.append(total_days_x)
    total_days_y_list.append(total_days_y)
   
    ### Dates
    dates_list = [] # make list to populate
    
    dates_x = (row['event_dates_x'].split(' ')) # split up the strings from X list
    dates_list = string_hunt(dates_x, dates_list, str)
    
    dates_y = (row['event_dates_y'].split(' ')) # split up the strings from X list
    dates_list = string_hunt(dates_y, dates_list, str)
    
    dates_list_list.append(dates_list) # append list for df 
    
    ### Duration
    duration = len(temps_list)
    duration_list.append(duration)
    
    ### Intensity [x - 13 for x in a]
    intensity = [x - 40.6 for x in temps_list] # <<<<<<-------------------------- UPDATE TMAX AS NEEDED
    intensity_list.append(intensity)
    
    ### Avg_temp
    avg_temp = np.mean(temps_list)
    avg_temp_list.append(avg_temp)
    
    ### avg_intensity
    avg_intensity = np.mean(intensity)
    avg_intensity_list.append(avg_intensity)
    
    ### tot_intensity
    tot_intensity = np.sum(intensity)
    tot_intensity_list.append(tot_intensity)
    
    ### city_id & total days & year, etc
    city_id = row['ID_HDC_G0']
    city_id_list.append(city_id)
    
    ### Year
    year_x = row['year_x']
    year_x_list.append(year_x)
    
    year_y = row['year_y']
    year_y_list.append(year_y)
    
    ### event ID
    event_x_id = row['Event_ID_x']
    event_x_id_list.append(event_x_id)
    event_y_id = row['Event_ID_y']
    event_y_id_list.append(event_y_id)
    
    #avg_temp	avg_intensity	tot_intensity	event_dates	intensity

In [122]:
df_overlap['ID_HDC_G0'] = city_id_list
df_overlap['Event_ID_x'] = event_x_id_list
df_overlap['Event_ID_y'] = event_y_id_list
df_overlap['year_x'] = year_x_list
df_overlap['year_y'] = year_y_list
df_overlap['total_days_x'] = total_days_x_list
df_overlap['total_days_y'] = total_days_y_list
df_overlap['tmax'] = temps_list_list
df_overlap['event_dates'] = dates_list_list
df_overlap['duration'] = duration_list
df_overlap['avg_temp'] = avg_temp_list
df_overlap['intensity'] = intensity_list
df_overlap['tot_intensity'] = tot_intensity_list
df_overlap['avg_intensity'] = avg_intensity_list
df_overlap.head(1)



ValueError: Length of values does not match length of index

In [30]:
df_overlap['event_dates'][1]

["'1987.12.14'",
 "'1987.12.15'",
 "'1987.12.16'",
 "'1987.12.17'",
 "'1987.12.18'\n",
 "'1987.12.19'",
 "'1987.12.20'",
 "'1987.12.21'",
 "'1987.12.22'",
 "'1987.12.23'\n",
 "'1987.12.24'",
 "'1987.12.25'",
 "'1987.12.26'",
 "'1987.12.27'",
 "'1987.12.28'\n",
 "'1987.12.29'",
 "'1987.12.30'",
 "'1987.12.31'",
 "'1988.01.01'",
 "'1988.01.02'",
 "'1988.01.03'",
 "'1988.01.04'",
 "'1988.01.05'\n",
 "'1988.01.06'",
 "'1988.01.07'"]

## 3. Fix Total Days for Cities

Here we subtract the event days from the Jan year (y) from year y and we add those dates to year x so on balance the dates from the jan year are now just added to the earlier year

- the X list is the year on the left and the total days goes up by the added days from year y (right)

In [31]:
# Get List of Years and Cities for the dec-jan overlap and then find them in the dataset

# Start with year_x
years_x = list(df_overlap['year_x'])
id_x = list(df_overlap['ID_HDC_G0'])
total_days_x = list(df_overlap['total_days_x'])

x_list = []
for i in zip(years_x,id_x, total_days_x):
    x_list.append(i)

for x in x_list:
    print(x)

(1983, 354, 298)
(1987, 354, 289)
(1990, 354, 297)
(1997, 354, 294)
(1998, 354, 311)
(2001, 354, 291)
(2002, 354, 323)
(2004, 354, 316)
(2005, 354, 273)
(2006, 354, 317)
(2007, 354, 294)
(2008, 354, 248)
(2009, 354, 321)
(2010, 354, 238)
(2011, 354, 283)
(2012, 354, 315)
(2013, 354, 316)
(2014, 354, 334)
(2015, 354, 344)
(1987, 808, 237)
(1989, 808, 160)
(1990, 808, 199)
(1994, 808, 177)
(1998, 808, 272)
(2000, 808, 161)
(2000, 808, 160)
(2000, 808, 164)
(2006, 808, 208)
(2010, 808, 226)
(2012, 808, 221)
(2013, 808, 239)
(1992, 879, 219)
(2000, 879, 196)
(2000, 879, 196)
(2000, 879, 195)
(2002, 879, 279)
(2006, 879, 243)
(2012, 879, 258)
(2014, 879, 252)
(1991, 584, 196)
(2000, 584, 161)
(2000, 584, 161)
(2000, 584, 162)
(2000, 584, 163)
(2002, 584, 199)
(2012, 584, 219)
(2013, 584, 226)
(2014, 584, 225)
(1983, 835, 295)
(1989, 835, 234)
(1990, 835, 287)
(1991, 835, 248)
(1992, 835, 232)
(1995, 835, 307)
(1996, 835, 288)
(2000, 835, 272)
(2000, 835, 270)
(2000, 835, 275)
(2001, 835, 26

(1996, 4722, 155)
(1997, 4722, 149)
(1998, 4722, 190)
(1999, 4722, 165)
(2000, 4722, 155)
(2000, 4722, 160)
(2000, 4722, 154)
(2000, 4722, 154)
(2000, 4722, 157)
(2000, 4722, 155)
(2003, 4722, 181)
(2004, 4722, 168)
(2006, 4722, 177)
(2007, 4722, 183)
(2008, 4722, 204)
(2009, 4722, 188)
(2010, 4722, 193)
(2011, 4722, 175)
(2012, 4722, 158)
(2013, 4722, 169)
(2014, 4722, 173)
(2015, 4722, 167)
(1983, 4772, 188)
(1986, 4772, 114)
(1987, 4772, 184)
(1989, 4772, 114)
(1990, 4772, 159)
(1991, 4772, 158)
(1992, 4772, 170)
(1993, 4772, 144)
(1994, 4772, 119)
(1996, 4772, 149)
(1997, 4772, 132)
(1998, 4772, 176)
(1999, 4772, 150)
(2000, 4772, 150)
(2000, 4772, 157)
(2000, 4772, 154)
(2000, 4772, 149)
(2000, 4772, 149)
(2003, 4772, 152)
(2004, 4772, 164)
(2006, 4772, 168)
(2007, 4772, 172)
(2008, 4772, 198)
(2009, 4772, 180)
(2010, 4772, 203)
(2011, 4772, 183)
(2012, 4772, 144)
(2014, 4772, 168)
(2015, 4772, 154)
(1983, 4594, 120)
(1986, 4594, 75)
(1991, 4594, 91)
(1992, 4594, 104)
(1993, 4594,

(2000, 12792, 198)
(2002, 12792, 254)
(2009, 12792, 259)
(2013, 12792, 263)
(2015, 12792, 290)
(1996, 1886, 126)
(2001, 1886, 122)
(2002, 1886, 130)
(2005, 1886, 152)
(2008, 1886, 112)
(2009, 1886, 147)
(2002, 1779, 79)
(2005, 1779, 130)
(1985, 11062, 271)
(1986, 11062, 270)
(1987, 11062, 298)
(1989, 11062, 289)
(1990, 11062, 309)
(1992, 11062, 307)
(1994, 11062, 324)
(2000, 11062, 313)
(2000, 11062, 316)
(2000, 11062, 314)
(2000, 11062, 313)
(2000, 11062, 313)
(2000, 11062, 318)
(2002, 11062, 336)
(2003, 11062, 327)
(2005, 11062, 334)
(2009, 11062, 312)
(2011, 11062, 306)
(2013, 11062, 305)
(2015, 11062, 397)
(1987, 5739, 93)
(1998, 5739, 80)
(2012, 5739, 117)
(2015, 5739, 109)
(1987, 3348, 117)
(1992, 3348, 84)
(1996, 3348, 99)
(2000, 3348, 103)
(2000, 3348, 104)
(2000, 12945, 97)
(2000, 12945, 98)
(2000, 12945, 97)
(2015, 12945, 238)
(1987, 5709, 136)
(1998, 5709, 98)
(2000, 5709, 99)
(2000, 5709, 103)
(2000, 5709, 102)
(2000, 5709, 100)
(2000, 5709, 99)
(2008, 5709, 76)
(2012, 5709

(2000, 1524, 140)
(2000, 1524, 143)
(2000, 1524, 140)
(2000, 8164, 105)
(2000, 444, 75)
(2000, 444, 70)
(2000, 444, 71)
(2000, 444, 71)
(2000, 444, 72)
(2002, 444, 142)
(2006, 444, 100)
(2011, 444, 101)
(2014, 444, 126)
(2015, 444, 210)
(2000, 440, 75)
(2000, 440, 70)
(2000, 440, 70)
(2000, 440, 71)
(2000, 440, 71)
(2000, 440, 72)
(2002, 440, 142)
(2006, 440, 114)
(2011, 440, 108)
(2014, 440, 135)
(2015, 440, 214)
(2000, 13077, 47)
(2000, 13077, 46)
(2000, 13077, 47)
(2000, 13077, 46)
(2015, 13077, 76)
(2000, 13091, 36)
(2000, 13091, 37)
(2000, 13091, 38)
(2008, 13091, 34)
(2000, 12876, 69)
(2000, 12876, 68)
(2000, 12876, 70)
(2015, 12876, 157)
(2014, 13133, 19)
(2015, 252, 245)
(2001, 1473, 214)
(2001, 1460, 230)
(2001, 1465, 225)
(2001, 1469, 208)
(2001, 1462, 233)
(2001, 1471, 207)
(2009, 1476, 242)
(2009, 1480, 238)
(2001, 1493, 91)
(2008, 1502, 153)
(2013, 546, 121)
(2006, 527, 78)
(2014, 527, 111)
(2015, 3864, 33)
(2015, 3874, 34)
(2005, 345, 171)
(2005, 353, 179)
(2005, 346, 169

# Attempt to //

In [None]:
df_copy = df.copy()

In [None]:
df_copy.shape

In [None]:
# set number of cores to use
cpu_num = 4
marker = int(len(df_copy)/cpu_num)

In [None]:
counter = 0
start = 0 
marker_list = []

for x in range(cpu_num):
    start = start + marker
    marker_list.append(start)

marker_list = marker_list[:-1]

In [None]:
len(df_copy.loc[:marker_list[0]])

# BELOW THIS WILL WORK YA GOOF BALL 

In [None]:
#### START HERE


n = 200000  #chunk row size
list_df = [df_copy[i:i+n] for i in range(0,df_copy.shape[0],n)]
    



In [None]:
for n in list_df:
    print(len(n))

In [None]:
df_out = df.loc[counter:marker_list[0]]

In [None]:
len(df_out)

In [None]:
counter+marker

In [None]:
marker_list[1]

# Attempt to //

In [None]:
# Search df for i list and replace days
# this is super slow but it works

df_copy = df.copy()

for x in x_list:
    for i, row in df_copy.iterrows():
        if (row['year'] == x[0]) & (row['ID_HDC_G0'] == x[1]):
            print(df_copy.loc[i,'total_days'])
            df_copy.loc[i,'total_days'] = x[2]
            print(df_copy.loc[i,'total_days'])
            

In [None]:
# Start with year_y 
years_y = list(df_overlap['year_y'])
id_y = list(df_overlap['ID_HDC_G0'])
total_days_y = list(df_overlap['total_days_y'])

y_list = []
for i in zip(years_y, id_y, total_days_y):
    y_list.append(i)

for y in y_list:
    print(y)

In [None]:
# Run on y_list

for y in y_list:
    for i, row in df_copy.iterrows():
        if (row['year'] == y[0]) & (row['ID_HDC_G0'] == y[1]):
            print(df_copy.loc[i,'total_days'])
            df_copy.loc[i,'total_days'] = y[2]
            print(df_copy.loc[i,'total_days'])

In [None]:
# Make a copy as back up in case you over right df_copy

df_copy_extra = df_copy.copy()

In [None]:
# this csv is the results of step three so it doesn't have to be repeated
#df_copy.to_csv( "/home/cascade/projects/data_out_urbanheat/Edgesearchstep3_20200102.csv")

# 4. Add Meta data back

In [None]:
fn = "/home/cascade/projects/data_out_urbanheat/Edgesearchstep3_20200102.csv"  # Note: Need ?dl=1 to make sure this file gets read correctly
df_copy = pd.read_csv(fn)
df_copy.head()

In [None]:
# copy overlap 

df_overlap_copy = df_overlap.copy()

In [None]:
print(len(df_overlap_copy))
df_overlap_copy.head(1)

In [None]:
# Get columns to merge and merge
df_cols = df_copy[['CTR_MN_NM', 'ID_HDC_G0']]
df_cols = df_cols.drop_duplicates('ID_HDC_G0')

df_overlap_copy = df_overlap_copy.merge(df_cols, on = 'ID_HDC_G0', how = 'inner')

In [None]:
print(len(df_overlap_copy))
df_overlap_copy

In [None]:
# drop and rename columns

df_overlap_copy.rename(columns = {'year_x':'year'}, inplace = True) 
df_overlap_copy.rename(columns = {'total_days_x':'total_days'}, inplace = True) 

In [None]:
df_overlap_copy

# 5. Drop overlapped years and add in new DF

#### Drop overlap events based on event id from all events

In [None]:
overlap.head(1)

In [None]:
# Get events

jan_ids = list(overlap['Event_ID_y'])
dec_ids = list(overlap['Event_ID_x'])

In [None]:
# Drop Events from Dataset

print(len(df_copy))
df_events = df_copy.copy()

# Jan
for event in jan_ids:
    df_events = df_events[df_events['Event_ID'] != event]
    
for event in dec_ids:
    df_events = df_events[df_events['Event_ID'] != event]

print(len(df_events))

#### Add in new events with new event ids

In [None]:
# Merge 
df_events = df_events.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])
df_events.head()

In [None]:
print(len(df_events))
print(len(df_overlap_copy))

print(df_events.columns)
print(df_overlap_copy.columns)

In [None]:
# Make 'x' event ids for final df
df_overlap_copy['Event_ID'] = df_overlap_copy['Event_ID_x']

In [None]:
# drop event x y event ID cols 
cols_to_use = df_overlap_copy.columns.difference(df_events.columns) # find missing columns
cols_list = list(cols_to_use) # list
cols_list

df_overlap_copy = df_overlap_copy.drop(columns = cols_list)
df_overlap_copy

In [None]:
print(len(df_events))
print(len(df_overlap_copy))

df_final = pd.concat([df_events, df_overlap_copy], sort = True)

print(len(df_final))

# print(len(df_out_copy_merge))
# print(len(df_events))
# df_copy = df_copy.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])
# df_final = []
# df_final = pd.concat([df_copy, df_out_merge], sort = True)
# print(len(df_final))

In [None]:
# Save it out

FN_OUT = "/home/cascade/projects/data_out_urbanheat/All_data20191231.csv"  # Note: Need ?dl=1 to make sure this file gets read correctly
df_final.to_csv(FN_OUT)

### Sanity Check

In [None]:
for x in x_list:
    print(x)

In [None]:
for i, row in df_final[(df_final['year'] == 1987) & (df_final['ID_HDC_G0'] == 5534)].iterrows():
    print(row['total_days'])

In [None]:
for i, row in df[(df['year'] == 1987) & (df['ID_HDC_G0'] == 5534)].iterrows():
    print(row['total_days'])

# 6. Add start date

In [None]:
len(df_final)

In [None]:
# Find First Data

test = df_final[0:10]

start_date_list = []
for i, row in df_final.iterrows():
    if type(row['event_dates']) is str:
        start_date = row['event_dates'].split('\'')[1]
        start_date_list.append(start_date)
    else:
        print(row['event_dates'][0])
        start_date = row['event_dates'][0]
        start_date_list.append(start_date)
#     else:
#         print(i)
#         print(row['event_dates'][0])
# #     print(row['event_dates'].split('\'')[1])
# #     dates = (row['event_dates'].split(' '))

In [None]:
len(start_date_list)

In [None]:
df_final['start_date'] = start_date_list

In [None]:
FN_OUT = "/home/cascade/projects/data_out_urbanheat/All_data20191109_final.csv"  # Note: Need ?dl=1 to make sure this file gets read correctly
df_final.to_csv(FN_OUT)

## NEXT:

I need to write a function that compares years with the resulting subset of Dec 31 and Jan 1 and then I think we will just drop all these from the record and then add them on as heat waves that overlap years later. There should be about 100 of them

# Old Code

In [None]:
            
            
    
#     temps_y = (row['tmax_y'].split(' ')) # split up the strings from Y list
            
#     for i in temps_y: # set the strings from Y list
#         if len(i) > 1:
#             if '[' in i:

#                 temp = i[1:]

#                 if ']' in temp:

#                     temp =  temp[:-1]
#                     temps_list.append(float(temp))
#                 else:
#                     temps_list.append(float(temp))

#             else:
#                 temp = i
#                 temps_list.append(float(temp))



# counter = 0
# for i, row in overlap.iterrows():
#     test = (row['tmax_x'].split(' '))
#     for i in test:
#         if len(i) > 1:
#             print(i)
            
#             counter = counter + 1
# #         if '[' in i:
# #             print('yes')
# #             print(i[1:])
# #         if ']' in i:
# #             print(i[:-1])
# #         print(i)
# print(counter)

In [None]:

#     for i in temps_x: # set the strings from X list
#         if len(i) > 1:
#             if '[' in i:
                
#                 temp = i[1:]
                
#                 if ']' in temp:
                    
#                     temp =  temp[:-1]
#                     temps_list.append(float(temp))
#                 else:
#                     temps_list.append(float(temp))
            
#             elif ']' in i:
#                 temp = i[:-1]
#                 temps_list.append(float(temp))
            
#             else:
#                 temp = i
#                 temps_list.append(float(temp))
                
#             #temps_list_list.append(temps_list) # append list for df
    
#     temps_y = (row['tmax_y'].split(' ')) # split up the strings from X list

#     for i in temps_y: # set the strings from y list
#         if len(i) > 1:
#             if '[' in i:
                
#                 temp = i[1:]
                
#                 if ']' in temp:
                    
#                     temp =  temp[:-1]
#                     temps_list.append(float(temp))
#                 else:
#                     temps_list.append(float(temp))
            
#             elif ']' in i:
#                 temp = i[:-1]
#                 temps_list.append(float(temp))
            
#             else:
#                 temp = i
#                 temps_list.append(float(temp))
                       
#     temps_list_list.append(temps_list) # append list for df
    
#     print(temps_list)

In [None]:
for i, row in overlap.iterrows():
    print(overlap.loc[i]['tmax_x'])

In [None]:
[41.446693 42.30564 42.97953  43.17466
[40.947575]
[42.601044]
[41.22316 42.082108 42.755997 42.951126]
[40.806267]
[42.398857]
[41.090614 41.926125 42.600014 42.84202
[42.149605]
[40.932465]
[40.60667]
[41.324394 43.38537
[41.113922 41.99298 42.158752 43.608948 44.592743 42.73523 44.078674
45.146576 44.022064 41.813843]
[42.265797 43.199844]
[41.71855]

In [None]:
            
#             print(df.loc[i,'year'])
#             print(df.loc[i,'total_days'])
            #print(row['year'], x[0], row['ID_HDC_G0'], x[1], row['total_days'], x[2])

# df_copy = df.copy()
# condition = (row['ID_HDC_G0'] == x[0]) #

# for i, row in df_sub.iterrows():
#     for x in x_list:
#         if condition:
#             print(row['total_days'], ' '+ x[2])

# for x in x_list:
#     print(df_sub.where((['year'] == x[0])))

# for i, row in df_copy.itter
# test = df[df['ID_HDC_G0'] == 5534]
# test = test[test['year'] == 1987]
# for i in zip(years_x,id_x, total_days_x):
#     print(i)

In [None]:
df_out_copy.head(1)

In [None]:
df_out_copy = df_out_copy.drop(columns = ['year_y', 'total_days_y']) # drop ys


In [None]:
df_out_copy.head(1)

In [None]:
# Rename x columns

df_out_copy.rename(columns = {'year_x':'year'}, inplace = True) 
df_out_copy.rename(columns = {'total_days_x':'total_days'}, inplace = True) 

df_out_copy.head()

In [None]:
# Merge in Missing colums

cols_to_use = df.columns.difference(df_out_copy.columns) # find missing columns
cols_list = list(cols_to_use) # list
cols_list.append('ID_HDC_G0') # add IDS

df_copy_cols = df_copy[cols_list] # .copy() # make a copy so you don't f it up
df_copy_cols = df_copy_cols.drop(columns=['Unnamed: 0', 'Unnamed: 0.1']) # remove weird cols

df_copy_cols = df_copy_cols.drop_duplicates('ID_HDC_G0', keep = 'first') # drop duplicates

In [None]:
# merge them in 
df_out_copy_merge = df_out_copy.merge(df_copy_cols, on = 'ID_HDC_G0', how = 'inner')

In [None]:
df_out_copy_merge.shape

In [None]:
df_out_copy_merge.head(1)

#### Drop overlap events based on event id

In [None]:
# Get events

overlap
jan_ids = list(overlap['Event_ID_y'])
dec_ids = list(overlap['Event_ID_x'])

In [None]:
# Drop Events from Dataset

print(len(df))
df_copy = df.copy()

# Jan
for event in jan_ids:
    df_copy = df_copy[df_copy['Event_ID'] != event]
    
for event in dec_ids:
    df_copy = df_copy[df_copy['Event_ID'] != event]

print(len(df_copy))

#### Add in new events with new event ids

In [None]:
# Merge 
df_copy = df_copy.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])
df_copy.head()

In [None]:
print(len(df_copy))
print(len(df_out_merge))

print(df_copy.columns)
print(df_out_merge.columns)

In [None]:
print(len(df_copy))
print(len(df_out))
df_copy = df_copy.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])
df_final = []
df_final = pd.concat([df_copy, df_out_merge], sort = True)
print(len(df_final))

In [None]:
df_final.head(50)

In [None]:
# cols_to_use = df.columns.difference(df_overlap_copy.columns) # find missing columns
# cols_list = list(cols_to_use) # list
# cols_list.append('ID_HDC_G0') # add IDS
# df_cols = df_copy[cols_list]
# df_cols.head()

In [None]:
# Drop duplicates ids

df_cols = df_cols.drop_duplicates('ID_HDC_G0', keep = 'first')
df_cols = df_cols.drop(columns = ['Unnamed: 0', ''])
df_cols.head()

In [None]:
# Merge in meta data to overlap events

df_overlap_copy_merge = df_overlap_copy.merge(df_cols, on = 'ID_HDC_G0', how = 'inner')

In [None]:
len(df_overlap_copy_merge)

In [None]:
df_overlap_copy_merge.head()

In [None]:
# drop and rename columns

df_overlap_copy_merge.rename(columns = {'year_x':'year'}, inplace = True) 
df_overlap_copy_merge.rename(columns = {'total_days_x':'total_days'}, inplace = True) 