# Tmax Subset

A notebook to subset daily Tmax for the 13000 GHS urban areas to identify dates >40 °C, runs of consecutive days >40 °C, etc.

**Need to subset**
- Days per year (done)
- Duration of each event (done)
- Intensity of each day during each event (>40.6)

### Dependencies

In [1]:
import pandas as pd
import numpy as np
import xarray as xr
from random import random
from itertools import groupby
from operator import itemgetter
import geopandas as gpd 
import glob
from statistics import mean

In [2]:
def csv_to_xr(file_in, time_dim, space_dim):
    """ Load a daily temperature csv and hand it back as a 2-D xarray DataArray.

    Every column from the fourth one onward is treated as a temperature
    column. Rows with a missing value in any of those columns are dropped.
    The shape of the raw table and the number of rows kept are printed.

    Args:
        file_in = file name and path of the csv
        time_dim = name for the time dim as a str ... use date :-)
        space_dim = col name for GHS-UCDB IDs as a str (ID_HDC_G0); it is also
                    used as the name of the space dim

    Returns:
        xr.DataArray with dims (space_dim, time_dim); the space coords are the
        IDs and the time coords are the temperature column labels
    """

    table = pd.read_csv(file_in)
    print(table.shape)

    # keep only the temperature columns and label the rows with the city IDs
    temps = table.iloc[:, 3:]
    temps.index = table[space_dim]

    # cities without a complete temperature record are removed
    temps = temps.dropna()
    print(len(temps))

    return xr.DataArray(
        temps.to_numpy(),
        coords=[temps.index, temps.columns],
        dims=[space_dim, time_dim],
    )

In [3]:
def temp_eventTot(xarray, Tthresh, year):
    """ Function returns the number of days within a year where Tmax > Tthresh for each city.

    Cities that never exceed the threshold are not included in the output.

    Args:
        xarray = an xarray DataArray with dims ('ID_HDC_G0', 'date')
        Tthresh = temp threshold (int or float)
        year = label used as the name of the day-count column in the output

    Returns:
        pandas DataFrame with an 'ID_HDC_G0' column and a column named `year`
        holding the number of days above the threshold for that city
    """

    # NOTE: the dim names 'ID_HDC_G0' and 'date' are hard-coded below, so the
    # input has to be built with exactly those names.

    # Everything at or below the threshold becomes NaN; drop=True removes the
    # cities (and dates) that have no value above the threshold at all.
    out = xarray.where(xarray > Tthresh, drop=True)

    # The non-NaN cells left for a city are exactly its days above the
    # threshold, so one vectorised count replaces a .sel()/.dropna() per city.
    day_counts = out.count(dim='date')

    df_out = pd.DataFrame()
    df_out['ID_HDC_G0'] = out['ID_HDC_G0'].values
    df_out[year] = day_counts.values

    return df_out

In [4]:
def eventTot_loop(dir_nm, time_dim, space_dim, Tthresh):

    """ Loop through a dir with csvs to calc the total number of days greater than a threshold.
        Leap years explain the difference in shapes 368 vs 369

    One day-count column per year is merged onto the GHS-ID table, which is
    read from DATA_INTERIM (module-level path).

    Args:
        dir_nm = dir path to loop through; every '*.csv' in it is processed and
                 each file name must contain 'GHS-Tmax-DAILY_<year>.csv'
        time_dim = name for time dim as a str ... use date :-) for csv_to_xr function
        space_dim = col name for GHS-UCDB IDs as a str (ID_HDC_G0) for csv_to_xr function
        Tthresh = temp threshold for temp_eventTot function, e.g. 40.6

    Returns:
        pandas DataFrame: the GHS-ID table with one extra column per year
    """
    import os  # local import: only needed to build the glob pattern

    # Open the GHS-ID list (a plain csv, read with pandas)
    ghs_ids_fn = 'GHS-UCSB-IDS.csv'
    ghs_ids_df = pd.read_csv(DATA_INTERIM+ghs_ids_fn)

    # Get the file list from the directory that was passed in, rather than
    # from the DAILY_PATH global, so the dir_nm argument is honoured
    fn_list = glob.glob(os.path.join(dir_nm, '*.csv'))

    for fn in sorted(fn_list):

        # Get year for arg for temp_eventTot function
        year = fn.split('GHS-Tmax-DAILY_')[1].split('.csv')[0]
        print(year)

        temp_xr_da = csv_to_xr(fn, time_dim, space_dim)

        df_out = temp_eventTot(temp_xr_da, Tthresh, year)

        # outer merge keeps cities with no days over the threshold (NaN for that year)
        # TODO (original author's marker): "NEED TO FIX THIS"
        ghs_ids_df = ghs_ids_df.merge(df_out, on='ID_HDC_G0', how = 'outer')

    # TODO: build in later -- drop all NA GHS-IDs

    return ghs_ids_df


In [5]:
def temp_eventL_a(xarray, Tthresh, year):
    """ Function returns the number of days within a year where Tmax > Tthresh for each city.

    NOTE: despite the name, this does not measure the length of runs of
    consecutive days; it counts all days above the threshold per city.
    TODO (original author's marker): rename or remove this function.

    Args:
        xarray = an xarray DataArray with dims ('ID_HDC_G0', 'date')
        Tthresh = temp threshold (int or float)
        year = label used as the name of the day-count column in the output

    Returns:
        pandas DataFrame with an 'ID_HDC_G0' column and a column named `year`
    """

    # NOTE: the dim names 'ID_HDC_G0' and 'date' are hard-coded below, so be
    # careful with col names when building the input.
    out = xarray.where(xarray > Tthresh, drop = True)
    id_list = []
    event_L = []
    df_out = pd.DataFrame()

    for index, loc in enumerate(out.ID_HDC_G0):
        id_list.append(out.ID_HDC_G0.values[index])
        # append to event_L: the list this function actually initialises
        # (the previous code used an undefined name here)
        event_L.append(len(out.sel(ID_HDC_G0 = loc).dropna(dim = 'date').date.values))

    df_out['ID_HDC_G0'] = id_list
    df_out[year] = event_L

    return df_out

In [13]:
def temp_eventL(xarray, Tthresh):
    """ Function collects, for each city, every day within a year where Tmax > Tthresh.
    Returns the total number of such days, the dates, the temperatures, and the
    severity (daily Tmax - Tthresh). Runs of consecutive days are NOT separated here.

    Cities that never exceed the threshold are not included in the output.

    Args:
        xarray = an xarray DataArray with dims ('ID_HDC_G0', 'date')
        Tthresh = temp threshold (int or float)

    Returns:
        pandas DataFrame with one row per city and the columns 'ID_HDC_G0',
        'Days_Total', 'Event_Dates', 'Event_Temps' and 'Event_Severity'; the
        last three hold one array per city
    """

    # empty lists & df
    id_list = []
    date_list = []
    dayTot_list = []
    temp_list = []
    severity_list = []
    df_out = pd.DataFrame()

    # subset xarray: values at or below the threshold become NaN, and cities
    # or dates with nothing above the threshold are dropped
    out = xarray.where(xarray > Tthresh, drop = True)
    city_ids = out.ID_HDC_G0.values

    # start loop
    for city_id, loc in zip(city_ids, out.ID_HDC_G0):
        # select the city once and reuse it for every derived value
        hot_days = out.sel(ID_HDC_G0 = loc).dropna(dim = 'date')
        dates = hot_days.date.values
        temps = hot_days.values

        id_list.append(city_id) # get IDS
        date_list.append(dates) # get event dates
        dayTot_list.append(len(dates)) # total days over the threshold
        temp_list.append(temps) # get temp values
        severity_list.append(temps - Tthresh) # get severity

    # write to a data frame
    df_out['ID_HDC_G0'] = id_list
    df_out['Days_Total'] = dayTot_list
    df_out['Event_Dates'] = date_list
    df_out['Event_Temps'] = temp_list
    df_out['Event_Severity'] = severity_list

    return df_out

In [6]:
def eventL_loop(dir_nm, fn_out, time_dim, space_dim, Tthresh):

    """ Loop through a dir with csvs to apply temp_eventL function and save out a .csv for each year

    The GHS-ID table is read from DATA_INTERIM and each yearly result is
    written to DATA_OUT (both module-level paths). Nothing is returned.

    Args:
        dir_nm = dir path to loop through; every '*.csv' in it is processed and
                 each file name must contain 'GHS-Tmax-DAILY_<year>.csv'
        fn_out = string to label out files; the year and '.csv' are appended
        time_dim = name for time dim as a str ... use date :-) for csv_to_xr function
        space_dim = col name for GHS-UCDB IDs as a str (ID_HDC_G0) for csv_to_xr function
        Tthresh = temp threshold for temp_eventL function, e.g. 40.6
    """
    import os  # local import: only needed to build the glob pattern

    # Open the GHS-ID list (a plain csv, read with pandas)
    ghs_ids_fn = 'GHS-UCSB-IDS.csv'
    ghs_ids_df = pd.read_csv(DATA_INTERIM+ghs_ids_fn)

    # Get the file list from the directory that was passed in, rather than
    # from the DAILY_PATH global, so the dir_nm argument is honoured
    fn_list = glob.glob(os.path.join(dir_nm, '*.csv'))

    for fn in sorted(fn_list):

        # Get year to label the output file
        year = fn.split('GHS-Tmax-DAILY_')[1].split('.csv')[0]
        print(year)

        temp_xr_da = csv_to_xr(fn, time_dim, space_dim)

        df_out = temp_eventL(temp_xr_da, Tthresh)

        # inner merge keeps only the cities that have days over the threshold
        # TODO (original author's marker): "NEED TO FIX THIS"
        ghs_ids_df_out = ghs_ids_df.merge(df_out, on='ID_HDC_G0', how = 'inner')

        ghs_ids_df_out.to_csv(DATA_OUT+fn_out+year+'.csv')

        print(year, 'SAVED!')

### Testing

In [7]:
# File Paths
DAILY_PATH = '/home/cascade/projects/data_out_urbanheat/CHIRTS-GHS-DAILY/'
DATA_INTERIM = '/home/cascade/projects/UrbanHeat/data/interim/'
DATA_OUT = '/home/cascade/projects/data_out/'

In [8]:
# File name to test
fn_in = 'GHS-Tmax-DAILY_1983.csv'

In [9]:
xr1993 = csv_to_xr(DAILY_PATH+fn_in, 'date', 'ID_HDC_G0')

(13135, 368)
13067


In [14]:
xr1993

<xarray.DataArray (ID_HDC_G0: 13067, date: 365)>
array([[-43.921947, -33.71345 , -33.054974, ..., -12.416152, -13.232986,
        -15.403823],
       [ -4.804248,  -3.914425,  -7.533999, ...,  -5.186461, -10.945722,
        -16.29516 ],
       [-23.904118, -17.422953, -13.182008, ..., -12.788978, -11.337886,
        -10.00939 ],
       ...,
       [ 16.028023,  17.73603 ,  20.493294, ...,  14.559421,  15.160739,
         15.184024],
       [ 16.420553,  17.87142 ,  22.519674, ...,  15.680964,  16.169733,
         16.039179],
       [ 16.6943  ,  17.559229,  21.480919, ...,  14.446052,  15.235602,
         14.005591]])
Coordinates:
  * ID_HDC_G0  (ID_HDC_G0) int64 5782 3316 5645 3185 ... 1116 1114 1161 1169
  * date       (date) object '1983.01.01' '1983.01.02' ... '1983.12.31'

In [15]:
event1993 = temp_eventL(xr1993, 40.6)

In [16]:
event1993[400:450]

Unnamed: 0,ID_HDC_G0,Days_Total,Event_Dates,Event_Temps,Event_Severity
400,6279,14,"[1983.06.22, 1983.06.23, 1983.06.24, 1983.06.2...","[40.637226, 42.146217, 43.225623999999996, 40....","[0.03722599999999687, 1.5462169999999986, 2.62..."
401,6295,10,"[1983.06.23, 1983.06.24, 1983.06.26, 1983.06.2...","[41.303112, 42.403027, 41.33662, 42.872417, 41...","[0.7031119999999973, 1.8030270000000002, 0.736..."
402,6229,14,"[1983.06.22, 1983.06.23, 1983.06.24, 1983.06.2...","[42.190994, 42.450027, 43.509415000000004, 40....","[1.590994000000002, 1.8500269999999972, 2.9094..."
403,6249,15,"[1983.06.22, 1983.06.23, 1983.06.24, 1983.06.2...","[41.742354999999996, 43.00139, 44.029526000000...","[1.142354999999995, 2.4013899999999992, 3.4295..."
404,6236,16,"[1983.06.22, 1983.06.23, 1983.06.24, 1983.06.2...","[41.7475, 42.906532, 43.937798, 41.069748, 42....","[1.1475000000000009, 2.306531999999997, 3.3377..."
405,1664,1,[1983.08.05],[41.430878],[0.8308779999999985]
406,6263,15,"[1983.06.22, 1983.06.23, 1983.06.24, 1983.06.2...","[41.94552, 43.20455, 44.232690000000005, 41.33...","[1.3455200000000005, 2.604549999999996, 3.6326..."
407,6309,5,"[1983.06.24, 1983.06.27, 1983.06.28, 1983.06.2...","[41.467690000000005, 41.93708, 40.816628, 41.0...","[0.8676900000000032, 1.3370800000000003, 0.216..."
408,12850,1,[1983.08.06],[41.18087],[0.5808699999999973]
409,1661,2,"[1983.06.15, 1983.08.05]","[40.63337, 40.745804]","[0.0333699999999979, 0.14580399999999827]"


#### Find breaks in time series

https://stackoverflow.com/questions/40118037/how-can-i-detect-gaps-and-consecutive-periods-in-a-time-series-in-pandas

In [20]:
# Get event dates
days = event1993[event1993['ID_HDC_G0'] == 6279]['Event_Dates']

# turn into datetime
days = pd.to_datetime(days.values[0])
days = days[0].to_julian_date()
days

2445507.5

In [21]:
# parse the 'YYYY.MM.DD' date strings into a DatetimeIndex
pd.to_datetime(days, format = '%Y.%m.%d')

DatetimeIndex(['1983-06-22', '1983-06-23', '1983-06-24', '1983-06-25',
               '1983-06-26', '1983-06-27', '1983-06-28', '1983-06-29',
               '1983-06-30', '1983-07-01', '1983-07-21', '1983-07-22',
               '1983-07-23', '1983-08-01'],
              dtype='datetime64[ns]', freq=None)

In [None]:
# Get intensity for each day, note 'Event_Severity' is really intensity
intensity = event1993[event1993['ID_HDC_G0'] == 6279]['Event_Severity']
intensity = intensity.values[0]
intensity

In [None]:
tmax = event1993[event1993['ID_HDC_G0'] == 6279]['Event_Temps']
tmax = tmax.values[0]
tmax

In [None]:
tmax[0:3].sum()

In [None]:
# TODO: add a tidbit to convert the julian days back to dates

def event_split(days, tmax, intensity, ID_HDC_G0, country):
    """ Splits one city's hot days into events, i.e. runs of sequential days.

    days, tmax and intensity must line up position by position: the i-th
    temperature and intensity belong to the i-th day.

    Args:
        days: pandas index (anything with `.values`) of day numbers, e.g.
              julian dates, in ascending order
        tmax: numpy.ndarray of tmax values, one per entry in days
        intensity: numpy.ndarray of intensity values, one per entry in days
        ID_HDC_G0: city ID, copied onto every output row
        country: country name, copied onto every output row

    Returns:
        pandas DataFrame with one row per event and the columns 'ID_HDC_G0',
        'CTR_MN_NM', 'duration', 'avg_temp', 'avg_intensity', 'tot_intensity',
        'events' (list of days), 'intensity' and 'tmax' (arrays per event)
    """

    # lists to fill
    city_id_list = []
    country_list = []
    dur_list = []
    avg_temp_list = []
    avg_int_list = []
    tot_int_list = []
    event_list = []
    intensity_list = []
    tmax_list = []

    # position of the first day of the current event inside tmax / intensity
    start = 0

    # (day number - position) is constant within a run of sequential days,
    # so grouping on it isolates each event
    for _, run in groupby(enumerate(days.values), lambda x: x[1]-x[0]):

        event_days = list(map(itemgetter(1), run)) # isolate seq. days
        dur = len(event_days) # event duration
        stop = start + dur

        # slice at the event's own offset; slicing from 0 would hand every
        # event the values of the first one
        intense = intensity[start:stop] # intensity of each day during event
        temp = tmax[start:stop] # temp of each day during event

        city_id_list.append(ID_HDC_G0)
        country_list.append(country)
        dur_list.append(dur)
        avg_temp_list.append(mean(temp)) # avg. temp during event
        avg_int_list.append(mean(intense)) # avg. intensity during event
        tot_int_list.append(intense.sum()) # total intensity during event
        event_list.append(event_days)
        intensity_list.append(intense)
        tmax_list.append(temp)

        start = stop

    df_out = pd.DataFrame({
        'ID_HDC_G0': city_id_list,
        'CTR_MN_NM': country_list,
        'duration': dur_list,
        'avg_temp': avg_temp_list,
        'avg_intensity': avg_int_list,
        'tot_intensity': tot_int_list,
        'events': event_list,
        'intensity': intensity_list,
        'tmax': tmax_list,
    })

    return df_out

In [None]:
df_out = event_split(days, tmax, intensity, 'city1', 'country')

df_out

#### Start running it on one file and then build the loop

In [None]:
DATA_IN = '/home/cascade/projects/data_out_urbanheat/CHIRTS-GHS-Events/'
fn = 'CHIRTS-GHS-Events1983.csv'
events1983 = pd.read_csv(DATA_IN+fn)

In [None]:
events1983.head()

In [None]:
# Trial run on the first two rows of events1983: pull out the values that
# event_split takes as arguments and convert the dates to julian days.
for index, row in events1983.head(n=2).iterrows():
    days = row['Event_Dates']
    tmax = row['Event_Temps'] 
    intensity = row['Event_Severity']
    city = row['ID_HDC_G0']
    country = row['CTR_MN_NM']
    
    # turn into datetime
    # NOTE(review): row['Event_Dates'] is a single cell value here; if the csv
    # stored the dates as text it is presumably a str with no `.values` --
    # confirm the stored format and parse it before this call.
    days = pd.to_datetime(days.values[0])
    days = days.to_julian_date()
    days
    
    print(days)
    # event_split call left disabled until the date conversion above works
#     df_test = event_split(days, tmax, intensity, city, country)
#     print(df_test['duration'])    

### Running

In [None]:
# File Paths 

# UPDATE AS NEEDED <<<<< ------------------------------------------
DAILY_PATH = '/home/cascade/projects/data_out_urbanheat/CHIRTS-GHS-DAILY/'
DATA_INTERIM = '/home/cascade/projects/UrbanHeat/data/interim/'
DATA_OUT = '/home/cascade/projects/data_out_urbanheat/CHIRTS-GHS-Events/'

In [None]:
# File name
fn_out = 'CHIRTS-GHS-Events'
dir_nm = DAILY_PATH
time_dim = 'date'
space_dim = 'ID_HDC_G0'
Tthresh = 40.6


In [None]:
eventL_loop(dir_nm, fn_out, time_dim, space_dim, Tthresh)

### Plot it

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.pyplot import figure
# IPython magic: only valid when this cell runs inside a notebook
%matplotlib inline

# Histogram of the `year` column of `india` across cities.
# NOTE: `india` is not defined above this cell; it is created further down the
# notebook by filtering all_events_df_drop on CTR_MN_NM == 'India', so that
# cell has to be run first.
# NOTE(review): the column presumably holds days-over-threshold counts per
# city -- confirm against the table that `india` is built from.
figure(num=None, figsize=(8, 6), dpi=80, facecolor='w', edgecolor='k')
y = range(1,125)  # not used by the plotting calls below
year = '2010'
country = 'INDIA'
plt.hist(india[year], bins = 125)
plt.xlabel('Number of Days in '+year+' where Tmax >40c in ')
plt.ylabel('Number of cities')
plt.title(country+': For all cities with Tmax >40, how many days in '+year+' were >40C? ')

In [None]:
# MAP BACK TO POLYGONS AND LOOK AT IT 
SHP_DIR = '/Users/cascade/Github/UrbanHeat/data/raw/ghs-ucdb/'
shp_fn = 'GHS_STAT_UCDB2015MT_GLOBE_R2019A_V1_0.shp'
shps = gpd.read_file(SHP_DIR+shp_fn)

In [None]:
df_ghs = gpd.GeoDataFrame()
df_ghs['geometry'] = shps.geometry
df_ghs['ID_HDC_G0'] = shps.ID_HDC_G0

In [None]:
df_merge = df_ghs.merge(events, on='ID_HDC_G0', how = 'inner') #<<<<----- NEED TO FIX THIS

In [None]:
# Write it out
DATA_INTERIM = '/Users/cascade/Github/UrbanHeat/data/interim/'
fn_out = 'GHS-TmaxDaily-events.shp'
df_merge.to_file(DATA_INTERIM+fn_out)

# Old Code

In [None]:
# This will return the ID and Date where Tmax is greater than 40 as a dict, but will not return actual tempatures 

Tmax = np.random.randint(20, high=50, size=(3,10)) # Make a 3x10 random list
print(Tmax)
results = np.where(Tmax > 40) # find the index and rows
coords = list(zip(results[0], results[1])) # zip the i and js into tuples

b = [(k, list(list(zip(*g))[1])) for k, g in groupby(coords, itemgetter(0))] # group by rows

print(b)
dict_out = dict(b) # turn into a dict, where keys are city ids and values are dates
dict_out


In [None]:
for key, value in dict_out.items():
    print(key, value)

In [None]:
np.where(Tmax > 40, Tmax, Tmax*0) 

In [None]:
np.where(Tmax > 40) 

In [None]:
np.argwhere(Tmax>1)

In [None]:
def temp_search(array):
    """ Map each row of a 2-D array to the column positions where the value is > 40.

    Rows without any value above 40 do not appear in the result.

    Args:
        array = 2-D numpy array (rows are cities, columns are dates)

    Returns:
        dict with row indices as keys and lists of column indices as values
    """
    hits = np.where(array > 40)
    row_idx, col_idx = hits[0], hits[1]

    # np.where reports matches row by row, so appending in order keeps the
    # rows, and the columns within each row, in ascending order
    found = {}
    for r, c in zip(row_idx, col_idx):
        found.setdefault(r, []).append(c)

    return found

In [None]:
file_in = '/Users/cascade/Desktop/GHS-Tmax-DAILY_1983.csv'

df = pd.read_csv(file_in)

In [None]:
df.head()

In [None]:
df_sub = df.iloc[:,3:]

In [None]:
df_sub_drop = df.dropna(how='all')

In [None]:
df_sub.head()
arr = df_sub.to_numpy()

In [None]:
arr.shape

In [None]:
np.unique(arr)

In [None]:
tmax_search = temp_search(arr)

In [None]:
# Make some fake data
Tmax = np.random.randint(20, high=50, size=(3,10))
locs = ['001', '002', '003']
times = pd.date_range('2000-01-01', periods=10)

In [None]:
foo = xr.DataArray(Tmax, coords=[locs, times], dims=['space', 'times'])

In [None]:
foo

In [None]:
out = foo.where(foo > 40, drop = True)
out

In [None]:
for loc in out.space:
    print(len(out.sel(space = loc).dropna(dim = 'times').times.values))

In [None]:
for x in out.space.values:
    print(x)

In [None]:
out.space.values[0]

In [None]:
xr1993 = csv_to_xr(DAILY_PATH+fn_in, 'date', 'ID_HDC_G0')
out = xr1993.where(xr1993 > 40.6, drop = True)

In [None]:
40 - out.sel(ID_HDC_G0 = 5885).dropna(dim = 'date').values

In [None]:
#event_tot.append(len(out.sel(ID_HDC_G0 = loc).dropna(dim = 'date').date.values))

id_list = []
date_list = []
eventL_list = []
temp_list = []
df_out = pd.DataFrame()

# start loop 
for index, loc in enumerate(out.ID_HDC_G0):

    id_list.append(out.ID_HDC_G0.values[index]) # get IDS
    date_list.append(out.sel(ID_HDC_G0 = loc).dropna(dim = 'date').date.values) # get event dates
    eventL_list.append(len(out.sel(ID_HDC_G0 = loc).dropna(dim = 'date').date.values)) # get event lengths
    temp_list.append(out.sel(ID_HDC_G0 = loc).dropna(dim = 'date').values) #get temp values

# write to a data frame
df_out['ID_HDC_G0'] = id_list
df_out['Event_Length'] = eventL_list
df_out['Event_Dates'] = date_list
df_out['Event_Temps'] = temp_list


In [None]:
df_out.head(50)

In [None]:
date_list

In [None]:
# Run routine
# all_events_df = event_loop(DAILY_PATH, 'date', 'ID_HDC_G0', 40.6)

In [None]:
all_events_df.head()

In [None]:
# Move IDS to Index 

all_events_df = all_events_df.set_index(['ID_HDC_G0', 'CTR_MN_NM'], drop = True)
all_events_df.head()

In [None]:
# Drop NaNs
all_events_df_drop = all_events_df.dropna(how = 'all')
all_events_df_drop.shape

In [None]:
all_events_df_drop.head()

In [None]:
df_out = all_events_df_drop.copy()

In [None]:
df_out['ID_HDC_G0'] = all_events_df_drop.index

In [None]:
df_out.head()

In [None]:
all_events_df_drop = all_events_df_drop.reset_index()

In [None]:
all_events_df_drop.head()

In [None]:
#all_events_df_drop.to_csv(DATA_OUT+'20190831_TMax-GHS_TotEvents83-2016.csv')

In [None]:
india = all_events_df_drop[all_events_df_drop['CTR_MN_NM'] == 'India']

In [None]:
india.head()

# Old Code

In [None]:
# try this https://stackoverflow.com/questions/52901387/find-group-of-consecutive-dates-in-pandas-dataframe

dt = test[test['ID_HDC_G0'] == 6279]['Event_Dates']
day = pd.Timedelta('1d')

In [None]:
city = test[test['ID_HDC_G0'] == 6279]
city_list = city.Event_Dates.tolist()

df = pd.DataFrame()
df['dates'] = city_list
df.dates.tolist()

In [None]:
dates = ['1983.06.20', '1983.06.23', '1983.06.24', '1983.06.25',
        '1983.06.26', '1983.06.27', '1983.06.28', '1983.06.29',
        '1983.06.30', '1983.07.01', '1983.07.21', '1983.07.22',
        '1983.07.23', '1983.08.01']

In [None]:
pd_dates = pd.to_datetime(dates)
shift = pd_dates.shift(1, freq = 'D')
day = pd.Timedelta('1d')

df = pd.DataFrame()
df['dates'] = pd_dates
# df['shift'] = shift

In [None]:
in_block = ((df - df.shift(-1)).abs() == day)
in_block 

In [None]:
filt = df.loc[in_block]
filt

In [None]:
df.diff()

In [None]:
dt_list = dt.tolist()

In [None]:
dt_list = ['1983.06.22', '1983.06.23', '1983.06.24', '1983.06.25',
       '1983.06.26', '1983.06.27', '1983.06.28', '1983.06.29',
       '1983.06.30', '1983.07.01', '1983.07.21', '1983.07.22',
       '1983.07.23', '1983.08.01']


In [None]:
in_block = ((dt - dt.shift(-1)).abs() == day) | (dt.diff() == day)
in_block

In [None]:
# try this https://stackoverflow.com/questions/52901387/find-group-of-consecutive-dates-in-pandas-dataframe

dt = test[test['ID_HDC_G0'] == 6279]['Event_Dates']
day = pd.Timedelta('1d')

In [None]:
city = test[test['ID_HDC_G0'] == 6279]
city_list = city.Event_Dates.tolist()

df = pd.DataFrame()
df['dates'] = city_list
df.dates.tolist()

In [None]:
dates = ['1983.06.20', '1983.06.23', '1983.06.24', '1983.06.25',
        '1983.06.26', '1983.06.27', '1983.06.28', '1983.06.29',
        '1983.06.30', '1983.07.01', '1983.07.21', '1983.07.22',
        '1983.07.23', '1983.08.01']

In [None]:
pd_dates = pd.to_datetime(dates)
shift = pd_dates.shift(1, freq = 'D')
day = pd.Timedelta('1d')

df = pd.DataFrame()
df['dates'] = pd_dates
# df['shift'] = shift

In [None]:
in_block = ((df - df.shift(-1)).abs() == day)
in_block 

In [None]:
filt = df.loc[in_block]
filt

In [None]:
df.diff()

In [None]:
dt_list = dt.tolist()

In [None]:
dt_list = ['1983.06.22', '1983.06.23', '1983.06.24', '1983.06.25',
       '1983.06.26', '1983.06.27', '1983.06.28', '1983.06.29',
       '1983.06.30', '1983.07.01', '1983.07.21', '1983.07.22',
       '1983.07.23', '1983.08.01']


In [None]:
in_block = ((dt - dt.shift(-1)).abs() == day) | (dt.diff() == day)
in_block

In [None]:
### Another idea
# https://stackoverflow.com/questions/2361945/detecting-consecutive-integers-in-a-list

from itertools import groupby
from operator import itemgetter
data = [1, 4,5,6, 10, 15,16,17,18, 22, 25,26,27,28]

# (value - position) is constant within a run of consecutive integers, so
# grouping on it yields one group per run
for k, g in groupby(enumerate(data), lambda x: x[1]-x[0]):
    # map() is lazy in Python 3: build a list so the run itself is printed
    # instead of a map object
    print(list(map(itemgetter(1), g)))

In [None]:
L = [1,  4,5,6, 10, 15,16,17,18, 22, 25,26,27,28]
for k, g in groupby(enumerate(L), lambda x: x[1]-x[0] ) :
  print (list(map(itemgetter(1), g)))

In [None]:
dates = ['1983.06.20', '1983.06.23', '1983.06.24', '1983.06.25',
        '1983.06.26', '1983.06.27', '1983.06.28', '1983.06.29',
        '1983.06.30', '1983.07.01', '1983.07.21', '1983.07.22',
        '1983.07.23', '1983.08.01']

pd_dates = pd.to_datetime(dates)
df_dates = pd.DataFrame()
df_dates['dates'] = pd_dates



test = df_dates['dates'].apply(lambda x: x.toordinal())


In [None]:
for k, g in groupby(enumerate(test), lambda x: x[1]-x[0]):
  print (list(map(itemgetter(1), g)))

In [None]:
#event1993[['Event_Dates','Event_Severity']].apply(event_split)

In [None]:
# events_shift = events.shift(1, freq = 'D')

#events.to_julian_date() - events_shift.to_julian_date()

# turn into list 
# events_list = [list(i) for i in events.to_list()][0]

# turn into ordinal dates
# (pd.to_datetime(events.values[0]))

# pd_events = pd.to_datetime(events_list)
# df_events = pd.DataFrame()
# df_events['events'] = pd_events

# df_events_ord = df_events['events'].apply(lambda x: x.toordinal())
# (df_events_ord)

In [None]:
pd_events = pd.to_datetime(events_list)
df_events = pd.DataFrame()
df_events['events'] = pd_events



df_events_ord = df_events['events'].apply(lambda x: x.toordinal())
(df_events_ord)

In [None]:
for k, g in groupby(enumerate(df_events_ord), lambda x: x[1]-x[0]):
    print(list(map(itemgetter(1), g)))

In [None]:
# def events_split():
#     events = (test[test['ID_HDC_G0'] == 6279]['Event_Dates'])
#     events_list = events.to_list()
#     events_list = [list(i) for i in events_list]
#     events_list = events_list[0]
#     events_list