In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import datetime as dt
import re
from geopy.distance import vincenty
import matplotlib.dates as mdts
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
from time import time

# Filtering of lane-blocking incidents only

In [None]:
def filter_blk_desc(path):
    """Keep only lane-blocking incident descriptions for every file in ``<path>/det``.

    Each CSV in ``<path>/det`` is read, filtered on its ``Desc`` column, reduced to
    the first row per incident ``ID`` and written under the same base name to
    ``<path>/inc/blkg`` (created if it does not exist).

    A description is kept when ALL of the following hold:
      * it does not contain ``'NOT BLK'``;
      * it contains one of ``' BLKG'``, ``' BLKD'``, ``' BLKING'``;
      * it contains ``' ENTIR'`` or ``' LN'``;
      * it does not contain ``'IF LN'``;
      * it does not match ``IF [ENSW]B`` (e.g. "IF NB").

    Parameters
    ----------
    path : str or pathlib.Path
        Month folder containing the ``det`` sub-folder.

    Returns
    -------
    None
        Results are written to disk.
    """
    root = Path(path)
    src_dir = root / 'det'
    dest_dir = root / 'inc' / 'blkg'
    # The original assumed the destination existed and failed in to_csv otherwise.
    dest_dir.mkdir(parents=True, exist_ok=True)

    for child in sorted(src_dir.iterdir()):
        if not child.is_file():
            # e.g. .ipynb_checkpoints; read_csv on a directory would raise.
            continue
        det_month = pd.read_csv(str(child), parse_dates=['Timestamp'])

        # Missing descriptions can never match; the row-wise `in` test used to
        # raise TypeError on NaN.
        desc = det_month['Desc'].fillna('').astype(str)

        def has(token):
            return desc.str.contains(token, regex=False)

        keep = (
            ~has('NOT BLK')
            & (has(' BLKG') | has(' BLKD') | has(' BLKING'))
            & (has(' ENTIR') | has(' LN'))
            & ~has('IF LN')
            # Character class without commas: "[E,N,S,W]" also matched a literal ','.
            & ~desc.str.contains(r'IF [ENSW]B', regex=True)
        )

        # Non-inplace de-duplication avoids mutating a slice of det_month.
        blkg_desc = det_month[keep].drop_duplicates(subset=['ID'], keep='first')
        blkg_desc.to_csv(str(dest_dir / (child.stem + '.csv')), index=False)

In [None]:
# One work folder per calendar month; every folder is filtered independently.
dir_list = [
    '../data/PeMS/Incidents/work_folder/Months/' + month + '/'
    for month in ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                  'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')
]
for a_dir in dir_list:
    filter_blk_desc(a_dir)

# Construction of time windows with incidents

In [2]:
def get_next_downstream_station(stations, incident, correction=0):
    """Return the stations just downstream of an incident, nearest first.

    Only stations on the incident's freeway and direction are considered.
    'Downstream' means further along the direction of travel: smaller latitude
    for 'S', larger for 'N', smaller longitude for 'W', larger longitude for any
    other direction value.

    A station qualifies when its along-axis offset (``Distance``) lies in
    (0, 0.01), its cross-axis offset (``Distance_aux``) is below 0.01 in absolute
    value, and it is beyond the incident by more than ``correction``. Offsets are
    raw coordinate differences.

    Returns a DataFrame (index reset) with the station columns plus ``Distance``
    and ``Distance_aux``, sorted by ``Distance``.
    """
    heading = incident.Freeway_direction
    inc_lat = incident.Latitude
    inc_lon = incident.Longitude
    same_road = (stations['Fwy'] == incident.Freeway) & (stations['Dir'] == heading)
    candidates = stations[same_road].copy()

    # Pick the axis the freeway runs along and the axis perpendicular to it.
    if heading in ('S', 'N'):
        along, across = 'Latitude', 'Longitude'
        along_ref, across_ref = inc_lat, inc_lon
    else:
        along, across = 'Longitude', 'Latitude'
        along_ref, across_ref = inc_lon, inc_lat

    if heading in ('S', 'W'):
        # Travel goes towards smaller coordinates.
        candidates['Distance'] = along_ref - candidates[along]
        candidates['Distance_aux'] = across_ref - candidates[across]
        is_ahead = candidates[along] < along_ref - correction
    else:
        # Travel goes towards larger coordinates.
        candidates['Distance'] = candidates[along] - along_ref
        candidates['Distance_aux'] = across_ref - candidates[across]
        is_ahead = candidates[along] > along_ref + correction

    is_close = (
        (candidates['Distance_aux'].abs() < 0.01)
        & (candidates['Distance'] < 0.01)
        & (candidates['Distance'] > 0)
    )
    selected = candidates[is_ahead & is_close]
    return selected.sort_values('Distance').reset_index(drop=True)

In [3]:
def get_next_upstream_station(stations, incident, correction=0):
    """Return the stations just upstream of an incident, nearest first.

    Mirror image of the downstream lookup: candidates share the incident's
    freeway and direction and lie *behind* it relative to the direction of
    travel (smaller latitude for 'N', larger for 'S', smaller longitude for 'E',
    larger longitude for any other direction value, i.e. westbound).

    A station qualifies when ``Distance`` is in (0, 0.01), ``|Distance_aux|`` is
    below 0.01 and it is behind the incident by more than ``correction``.
    Offsets are raw coordinate differences.

    Returns a DataFrame (index reset) with the station columns plus ``Distance``
    and ``Distance_aux``, sorted by ``Distance``.
    """
    heading = incident.Freeway_direction
    here = {'Latitude': incident.Latitude, 'Longitude': incident.Longitude}
    on_road = stations[(stations['Fwy'] == incident.Freeway)
                       & (stations['Dir'] == heading)].copy()

    # (axis of travel, perpendicular axis, upstream stations have the smaller coordinate?)
    layout = {
        'N': ('Latitude', 'Longitude', True),
        'S': ('Latitude', 'Longitude', False),
        'E': ('Longitude', 'Latitude', True),
    }
    # Anything else is handled as westbound.
    axis, cross, behind_is_smaller = layout.get(heading, ('Longitude', 'Latitude', False))

    if behind_is_smaller:
        on_road['Distance'] = here[axis] - on_road[axis]
        is_behind = on_road[axis] < here[axis] - correction
    else:
        on_road['Distance'] = on_road[axis] - here[axis]
        is_behind = on_road[axis] > here[axis] + correction
    on_road['Distance_aux'] = here[cross] - on_road[cross]

    offset = on_road['Distance']
    is_close = (np.abs(on_road['Distance_aux']) < 0.01) & (offset < 0.01) & (offset > 0)
    return on_road[is_behind & is_close].sort_values(['Distance']).reset_index(drop=True)

In [4]:
def get_next_wrapper(inc, func, stations):
    """Row-wise adapter: look up the nearest station for one incident.

    ``func(stations, inc)`` must return a DataFrame of candidate stations ordered
    nearest first. The ``ID``, ``Latitude``, ``Longitude`` and ``Lanes`` of the
    first candidate are returned as a Series; when there are no candidates an
    all-zero integer Series with the same labels is returned instead, so callers
    can detect a miss via ``ID == 0``.
    """
    wanted = ['ID', 'Latitude', 'Longitude', 'Lanes']
    nxt = func(stations, inc)
    if nxt.size > 0:
        return nxt.iloc[0][wanted]
    placeholder = pd.DataFrame(np.zeros((1, 4), dtype=int), columns=wanted)
    return placeholder.iloc[0]

In [5]:
def obtain_time_series(an_inc, full_series, mins_before, mins_after, chrcteristic='Avg Occupancy'):
    """Cut one measurement window around an incident for both neighbouring stations.

    The window is ``[Timestamp - mins_before, Timestamp + mins_after)`` around
    ``an_inc.Timestamp``. Values of column ``chrcteristic`` are taken from
    ``full_series`` for station ``an_inc.ID_Prev`` and for station
    ``an_inc.ID_Next``.

    Returns a Series whose labels are ``'<chrcteristic>_<k>'`` for the previous
    station followed by ``'Next <chrcteristic>_<k>'`` for the next station, where
    ``k`` is the position of the reading inside the window.
    """
    window_start = an_inc.Timestamp - dt.timedelta(minutes=mins_before)
    window_end = an_inc.Timestamp + dt.timedelta(minutes=mins_after)
    inside = (full_series.Timestamp >= window_start) & (full_series.Timestamp < window_end)

    def readings(station_id, label_prefix):
        # Positional relabelling: 0, 1, 2, ... -> '<prefix>0', '<prefix>1', ...
        values = full_series[(full_series.Station == station_id) & inside][chrcteristic]
        return values.reset_index(drop=True).add_prefix(label_prefix)

    prev_part = readings(an_inc.ID_Prev, chrcteristic + '_')
    next_part = readings(an_inc.ID_Next, 'Next ' + chrcteristic + '_')
    return pd.concat((prev_part, next_part))

In [6]:
def create_accident_windows(path, mins_before=40, mins_after=40):
    """Build measurement windows around the blocking incidents of one month folder.

    For every daily incident file in ``<path>/inc/light`` (sorted by name) the
    series file at the same position in ``<path>/series/smoothed`` is used.
    Incidents are kept when their ``IncidentID`` appears in the first file of
    ``<path>/inc/blkg`` and their timestamp leaves room for the whole window
    inside the day. Each remaining incident is matched to its nearest upstream
    and downstream station and the 'Avg Occupancy', 'Total Flow' (per lane) and
    'Avg Speed' readings of both stations are appended as columns.

    The concatenated result is written to
    ``<path>/result/accident_windows_next.csv`` and returned.

    Parameters
    ----------
    path : str
        Month folder.
    mins_before, mins_after : int
        Window extent before / after the incident timestamp, in minutes.

    Returns
    -------
    pandas.DataFrame
        One row per incident with station metadata and window columns; empty
        when no incident qualified.
    """
    # Stations with fewer rows than this in a daily series file are discarded.
    min_rows_per_station = 284
    characteristics = ('Avg Occupancy', 'Total Flow', 'Avg Speed')

    path_inc = path + '/inc/light'
    srs_path = path + '/series/smoothed'
    stations_pth = path + '/stations'
    blk_path = path + '/inc/blkg'
    dest_path = path + '/result'
    cols = ['IncidentID', 'Timestamp', 'Latitude', 'Longitude', 'Freeway', 'Freeway_direction']
    inc_dtypes = {'IncidentID': np.int64, 'Latitude': np.float64, 'Longitude': np.float64,
                  'Freeway': np.int32, 'Freeway_direction': object}

    blkg_desc = pd.read_csv(blk_path + '/' + sorted(Path(blk_path).iterdir())[0].name,
                            parse_dates=['Timestamp'])

    # Loop-invariant work hoisted out of the per-day loop: the directory
    # listings and the (single) station metadata file.
    series_files = sorted(Path(srs_path).iterdir())
    stations_file = stations_pth + '/' + sorted(Path(stations_pth).iterdir())[0].name
    stations_all = pd.read_csv(stations_file).drop(
        columns=['User_ID_1', 'User_ID_2', 'User_ID_3', 'User_ID_4'])

    daily_frames = []
    for i, child in enumerate(sorted(Path(path_inc).iterdir())):
        inc_file = pd.read_csv(path_inc + '/' + child.name, parse_dates=['Timestamp'],
                               dtype=inc_dtypes)[cols]
        inc_file = inc_file[inc_file.IncidentID.isin(blkg_desc.ID)]
        if inc_file.empty:
            # No blocking incident that day; the date lookup below would raise
            # IndexError on an empty frame.
            continue

        # The day is taken from the middle incident of the file.
        middle = inc_file.Timestamp.iloc[int(inc_file.shape[0] / 2)]
        day_start_time = dt.datetime.combine(middle.date(), dt.time(hour=0, minute=0))
        min_timestamp = day_start_time + dt.timedelta(minutes=20 + mins_before)
        max_timestamp = day_start_time + dt.timedelta(hours=24, minutes=-mins_after)
        too_early_or_late = (inc_file.Timestamp < min_timestamp) | (inc_file.Timestamp >= max_timestamp)
        inc_file = inc_file[~too_early_or_late].copy()
        if inc_file.empty:
            # Nothing left to match; the station lookup would otherwise yield a
            # frame without ID_Prev / ID_Next.
            continue

        # NOTE(review): incident and series files are paired purely by sorted
        # position - confirm both folders hold one file per day.
        srs = pd.read_csv(srs_path + '/' + series_files[i].name, parse_dates=['Timestamp'])
        short_stations = srs.groupby('Station').filter(
            lambda x: len(x) < min_rows_per_station).Station.unique()
        srs = srs[~srs.Station.isin(short_stations)]
        if srs.size == 0:
            continue

        st_active = stations_all[stations_all.ID.isin(srs.Station.unique())]
        srs = srs.merge(st_active[['ID', 'Lanes']], how='left', left_on='Station', right_on='ID')
        srs['Total Flow'] = srs['Total Flow'] / srs['Lanes']  # flow per lane

        inc_file_prv = inc_file.apply(get_next_wrapper, axis=1,
                                      args=(get_next_upstream_station, st_active))
        for col in inc_file_prv.columns:
            inc_file[col + '_Prev'] = inc_file_prv[col]
        inc_file_nxt = inc_file.apply(get_next_wrapper, axis=1,
                                      args=(get_next_downstream_station, st_active))
        for col in inc_file_nxt.columns:
            inc_file[col + '_Next'] = inc_file_nxt[col]

        # ID 0 is the "no station found" placeholder of get_next_wrapper.
        unmatched = (inc_file.ID_Prev == 0) | (inc_file.ID_Next == 0)
        inc_file = inc_file[~unmatched]
        if inc_file.size == 0:
            continue

        for chrct in characteristics:
            windows = inc_file.apply(obtain_time_series, axis=1,
                                     args=(srs, mins_before, mins_after, chrct))
            inc_file = pd.concat((inc_file, windows), axis=1)
        daily_frames.append(inc_file)

    # Single concat instead of growing the frame day by day.
    result = pd.concat(daily_frames, axis=0) if daily_frames else pd.DataFrame()
    Path(dest_path).mkdir(parents=True, exist_ok=True)
    result.to_csv(dest_path + '/accident_windows_next.csv', index=False)
    print(path + ' complete')
    return result

In [None]:
# Build the accident windows month by month; the last month's frame stays bound
# to accdnt_windows after the loop.
dir_list = [
    '../data/PeMS/Incidents/work_folder/Months/' + month + '/'
    for month in ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                  'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')
]
for a_dir in dir_list:
    accdnt_windows = create_accident_windows(a_dir)

In [None]:
# Stack the twelve monthly accident-window files into one yearly frame.
dir_list = [
    '../data/PeMS/Incidents/work_folder/Months/' + month + '/result/accident_windows_next.csv'
    for month in ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                  'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')
]
# Read everything first and concatenate once: growing a frame with pd.concat
# inside the loop copies all earlier rows on every iteration.
accdnt_windows = pd.concat(
    [pd.read_csv(a_file, parse_dates=['Timestamp']) for a_file in dir_list]
)

In [None]:
# col_order = accdnt_windows.columns[[26,30,27,28,24,25,29,0,1,4,5,6,7,8,9,10,11,2,3,31,32,35,36,37,38,39,40,41,42,33,34,12,13,16,17,18,19,20,21,22,23,14,15]]

# Column layout of the yearly accident-window file: incident/station metadata
# first, then 16 readings per measure for the previous (upstream) station,
# then the same for the next (downstream) station.
col_order = [
    'IncidentID', 'Timestamp',
    'Latitude_Prev', 'Latitude_Next',
    'Longitude_Prev', 'Longitude_Next',
    'Freeway', 'Freeway_direction',
    'ID_Prev', 'ID_Next',
    'Lanes_Prev', 'Lanes_Next',
]
col_order += [
    '{}{}_{}'.format(station_prefix, measure, tick)
    for station_prefix in ('', 'Next ')
    for measure in ('Avg Occupancy', 'Total Flow', 'Avg Speed')
    for tick in range(16)
]

In [None]:
# Keep only the columns listed in col_order, in that order, and persist the
# yearly accident-window table.
accdnt_windows = accdnt_windows.loc[:, col_order]
accdnt_windows.to_csv(
    '../data/PeMS/Incidents/work_folder/year_accdnt_wndw.csv',
    index=False,
)

# Construction of incident-free windows

In [None]:
def get_next_neighbour(stations, station, correction=0):
    """Return the ID of the closest station ahead of ``station``, or 0 if none.

    Candidates share the station's freeway (``Fwy``) and direction (``Dir``) and
    lie ahead of it along the direction of travel: smaller latitude for 'S',
    larger for 'N', smaller longitude for 'W', larger longitude for any other
    direction value. Their offset along that axis must be in (0, 0.025) (raw
    coordinate difference) and exceed ``correction``.
    """
    heading = station.Dir
    lon = station.Longitude
    lat = station.Latitude
    pool = stations[(stations['Fwy'] == station.Fwy) & (stations['Dir'] == heading)].copy()

    # Coordinate that changes along the direction of travel.
    if heading in ('S', 'N'):
        coord, origin = pool['Latitude'], lat
    else:
        coord, origin = pool['Longitude'], lon

    if heading in ('S', 'W'):
        pool['Distance'] = origin - coord
        is_ahead = coord < origin - correction
    else:
        pool['Distance'] = coord - origin
        is_ahead = coord > origin + correction

    in_reach = (pool['Distance'].abs() < 0.025) & (pool['Distance'] > 0)
    pool = pool[is_ahead & in_reach].sort_values('Distance').reset_index(drop=True)
    if pool.size == 0:
        return 0
    return pool.iloc[0].ID

In [None]:
def get_next_neighbour_full_data(stations, station, correction=0):
    """Return all nearby stations ahead of ``station`` as a DataFrame, nearest first.

    Candidates share the station's ``Fwy`` and ``Dir``. A candidate is kept when
    it lies ahead along the direction of travel by more than ``correction``, its
    along-axis offset ``Distance`` is in (0, 0.01) and its cross-axis offset
    ``Distance_aux`` is below 0.01 in absolute value (raw coordinate
    differences). The returned frame has a fresh index and carries both offset
    columns.
    """
    heading = station.Dir
    position = {'Latitude': station.Latitude, 'Longitude': station.Longitude}
    matches_road = (stations['Fwy'] == station.Fwy) & (stations['Dir'] == heading)
    candidates = stations[matches_road].copy()

    north_south = heading == 'S' or heading == 'N'
    along = 'Latitude' if north_south else 'Longitude'
    across = 'Longitude' if north_south else 'Latitude'
    # 'S' and 'W' travel towards smaller coordinates; everything else towards larger.
    towards_smaller = heading == 'S' or heading == 'W'

    if towards_smaller:
        candidates['Distance'] = position[along] - candidates[along]
    else:
        candidates['Distance'] = candidates[along] - position[along]
    candidates['Distance_aux'] = position[across] - candidates[across]

    if towards_smaller:
        is_ahead = candidates[along] < position[along] - correction
    else:
        is_ahead = candidates[along] > position[along] + correction

    is_near = (
        (candidates['Distance'] < 0.01)
        & (candidates['Distance'] > 0)
        & (candidates['Distance_aux'].abs() < .01)
    )
    candidates = candidates[is_ahead & is_near]
    candidates = candidates.sort_values(['Distance']).reset_index(drop=True)
    return candidates

In [None]:
def check_inc_eligibility(an_inc, a_station, vicinity_km, timestamp, td):
    """Tell whether an incident is both near a station and near a moment in time.

    True when the Vincenty distance between the incident and the station is
    below ``vicinity_km`` kilometres AND the incident timestamp falls in
    ``[timestamp - td, timestamp + td)``.
    """
    # NOTE(review): geopy removed `vincenty` in 2.0 (replacement: `geodesic`);
    # confirm the pinned geopy version.
    station_position = a_station[['Latitude', 'Longitude']]
    incident_position = (an_inc.Latitude, an_inc.Longitude)
    is_near = vincenty(incident_position, station_position).kilometers < vicinity_km

    occurred_at = an_inc.Timestamp
    is_concurrent = (occurred_at >= timestamp - td) & (occurred_at < timestamp + td)
    return is_near & is_concurrent

In [None]:
def check_inc_time(an_inc, timestamp, td_bf, td_aft):
    """Return whether the incident happened in ``[timestamp - td_bf, timestamp + td_aft)``."""
    occurred_at = an_inc.Timestamp
    not_too_early = occurred_at >= timestamp - td_bf
    not_too_late = occurred_at < timestamp + td_aft
    return not_too_early & not_too_late

In [None]:
def check_inc_location(an_inc, a_station, vicinity_km):
    """Return whether the incident lies closer than ``vicinity_km`` km to the station.

    The distance is the Vincenty distance between the incident's and the
    station's (Latitude, Longitude).
    """
    # NOTE(review): geopy removed `vincenty` in 2.0 (replacement: `geodesic`);
    # confirm the pinned geopy version.
    incident_position = (an_inc.Latitude, an_inc.Longitude)
    station_position = a_station[['Latitude', 'Longitude']]
    separation_km = vincenty(incident_position, station_position).kilometers
    return separation_km < vicinity_km

In [None]:
def obtain_time_series_clear(station, stations, full_series, incs, vicinity_km, mins_before, mins_after, chrcteristics=['Avg Occupancy', 'Total Flow']):#, 'Avg Speed']):
    """Try to cut one incident-free window for ``station`` at a random time of day.

    A window centre is drawn at random among the station's readings (one
    ``np.random.randint`` call per invocation, so results depend on the global
    NumPy seed). The window is rejected when any incident in ``incs`` falls in
    ``[centre - mins_before, centre + mins_after)`` AND within ``vicinity_km`` of
    the station, or when the station has no nearby neighbour ahead of it.

    Parameters
    ----------
    station : pandas.Series
        Station row (``ID``, ``Latitude``, ``Longitude``, ``Lanes``, ``Fwy``, ``Dir``).
    stations : pandas.DataFrame
        All candidate neighbour stations.
    full_series : pandas.DataFrame
        Readings with ``Station``, ``Timestamp`` and the measure columns.
        Assumes at least 284 rows for ``station.ID`` (callers filter on this).
    incs : pandas.DataFrame
        Incidents with ``Timestamp``, ``Latitude``, ``Longitude``.
    vicinity_km : float
        Exclusion radius around the station.
    mins_before, mins_after : int
        Window extent in minutes.
    chrcteristics : list of str
        Measure columns to extract (the default list is never mutated).

    Returns
    -------
    pandas.Series
        Station metadata followed by ``'<measure>_<k>'`` / ``'Next <measure>_<k>'``
        readings, or an empty Series when the window was rejected.
    """
    TICK_SIZE = 5 # minutes
    TICKS_COUNT = 284
    ticks_before = int(mins_before/TICK_SIZE)
    ticks_after = int(mins_after/TICK_SIZE)
    # Draw the centre so the whole window fits inside the day's readings.
    wnd_centre_nmb = np.random.randint(TICKS_COUNT-ticks_before-ticks_after)+ticks_before
    timestamp = full_series[full_series.Station==station.ID]['Timestamp'].iloc[wnd_centre_nmb]
    timedelta_bf = dt.timedelta(minutes = mins_before)
    timedelta_aft = dt.timedelta(minutes = mins_after)

    # Cheap time filter first; the geodesic distance is only computed for the
    # incidents that overlap the window in time.
    incs_nearby = incs[incs.apply(check_inc_time, axis=1, args=(timestamp, timedelta_bf, timedelta_aft))]
    if incs_nearby.size > 0:
        incs_nearby = incs_nearby[incs_nearby.apply(check_inc_location, axis=1, args=(station, vicinity_km))]
    if incs_nearby.size > 0:
        # An incident happened close by during the window: not incident-free.
        # Explicit dtype: a dtype-less empty Series is deprecated.
        return pd.Series(dtype=object)

    next_st = get_next_neighbour_full_data(stations, station)
    if next_st.size == 0:
        return pd.Series(dtype=object)
    next_st = next_st.iloc[0]

    in_window = (full_series.Timestamp >= timestamp-timedelta_bf) & (full_series.Timestamp < timestamp+timedelta_aft)
    test = full_series[(full_series.Station==station.ID) & in_window]
    test_next = full_series[(full_series.Station==next_st.ID) & in_window]

    header = pd.Series({
        'Timestamp': timestamp,
        'Latitude_Prev': station['Latitude'],
        'Longitude_Prev': station['Longitude'],
        'Latitude_Next': next_st['Latitude'],
        'Longitude_Next': next_st['Longitude'],
        'ID_Prev': station['ID'],
        'ID_Next': next_st['ID'],
        'Lanes_Prev': station['Lanes'],
        'Lanes_Next': next_st['Lanes'],
    }, dtype=object)

    pieces = [header]
    for chrct in chrcteristics:
        pieces.append(test[chrct].reset_index(drop=True).add_prefix(chrct + '_'))
        pieces.append(test_next[chrct].reset_index(drop=True).add_prefix('Next ' + chrct + '_'))
    # pd.concat replaces Series.append, which was removed in pandas 2.0.
    return pd.concat(pieces, axis=0)

In [None]:
def create_accident_free_windows(path, vicinity_km=3, mins_before=40, mins_after=40, stations_per_day=120):
    """Sample incident-free measurement windows for one month folder.

    For every daily series file in ``<path>/series/smoothed`` (sorted by name)
    the incident file at the same position in ``<path>/inc/light`` is used. Up
    to ``stations_per_day`` active stations are drawn at random and one window
    per station is attempted; windows overlapping an incident within
    ``vicinity_km`` km, or lacking a neighbour station, are dropped.

    The global NumPy RNG is re-seeded with 14 on every call so that the sample
    is reproducible.

    The concatenated result is written to
    ``<path>/result/accident_free_windows.csv`` and returned.

    Parameters
    ----------
    path : str
        Month folder.
    vicinity_km : float
        Radius in which an incident disqualifies a window.
    mins_before, mins_after : int
        Window extent in minutes.
    stations_per_day : int
        Number of stations sampled per day (previously hard-coded to 120);
        clamped to the number of active stations.

    Returns
    -------
    pandas.DataFrame
        One row per accepted window; empty when none was accepted.
    """
    np.random.seed(14)
    # Stations with fewer rows than this in a daily series file are discarded.
    min_rows_per_station = 284
    srs_path = path + '/series/smoothed'
    inc_path = path + '/inc/light'
    st_path = path + '/stations'
    dest_path = path + '/result'
    inc_cols = ['Timestamp', 'Latitude', 'Longitude', 'Freeway']
    st_cols = ['Latitude', 'Longitude', 'ID', 'Lanes', 'Dir', 'Fwy']
    measures = ['Avg Occupancy', 'Total Flow', 'Avg Speed']

    # Loop-invariant work hoisted out of the per-day loop.
    inc_files = sorted(Path(inc_path).iterdir())
    stations_file = st_path + '/' + sorted(Path(st_path).iterdir())[0].name
    stations_all = pd.read_csv(stations_file)[st_cols]

    daily_frames = []
    for i, child in enumerate(sorted(Path(srs_path).iterdir())):
        srs_file = pd.read_csv(srs_path + '/' + child.name, parse_dates=['Timestamp'])
        short_stations = srs_file.groupby('Station').filter(
            lambda x: len(x) < min_rows_per_station).Station.unique()
        srs_file = srs_file[~srs_file.Station.isin(short_stations)]
        if srs_file.size == 0:
            continue

        # NOTE(review): series and incident files are paired purely by sorted
        # position - confirm both folders hold one file per day.
        incs = pd.read_csv(inc_path + '/' + inc_files[i].name, parse_dates=['Timestamp'])[inc_cols]
        st_active = stations_all[stations_all.ID.isin(srs_file.Station.unique())]
        srs_file = srs_file.merge(st_active[['ID', 'Lanes']], how='left', left_on='Station', right_on='ID')
        srs_file['Total Flow'] = srs_file['Total Flow']/srs_file['Lanes']  # flow per lane

        # np.random.choice(..., replace=False) raises when asked for more
        # samples than there are stations, so clamp the request.
        n_draw = min(stations_per_day, len(st_active))
        if n_draw == 0:
            continue
        chosen_ids = np.random.choice(st_active.ID, n_draw, replace=False)
        stations_rdm = st_active[st_active.ID.isin(chosen_ids)]

        windows = stations_rdm.apply(
            obtain_time_series_clear, axis=1,
            args=(st_active, srs_file, incs, vicinity_km, mins_before, mins_after, measures))
        # Rejected stations come back as all-NaN rows.
        windows = windows.dropna(axis=0, how='any')
        daily_frames.append(windows)

    # Single concat instead of growing the frame day by day.
    result = pd.concat(daily_frames, axis=0) if daily_frames else pd.DataFrame()
    Path(dest_path).mkdir(parents=True, exist_ok=True)
    result.to_csv(dest_path + '/accident_free_windows.csv', index=False)
    print(path + ' complete')
    return result

In [None]:
# Sample incident-free windows month by month; the last month's frame stays
# bound to accdnt_free_windows after the loop.
dir_list = [
    '../data/PeMS/Incidents/work_folder/Months/' + month + '/'
    for month in ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                  'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')
]
for a_dir in dir_list:
    accdnt_free_windows = create_accident_free_windows(a_dir)

In [None]:
# Stack the twelve monthly incident-free window files into one yearly frame.
dir_list = [
    '../data/PeMS/Incidents/work_folder/Months/' + month + '/result/accident_free_windows.csv'
    for month in ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                  'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')
]
# Read everything first and concatenate once: growing a frame with pd.concat
# inside the loop copies all earlier rows on every iteration.
accdnt_free_windows = pd.concat(
    [pd.read_csv(a_file, parse_dates=['Timestamp']) for a_file in dir_list]
)

In [None]:
# Column layout of the yearly incident-free file: station metadata first, then
# 16 readings per measure for the previous station, then the same for the next
# station.
col_order = [
    'ID_Prev', 'ID_Next', 'Timestamp',
    'Latitude_Next', 'Longitude_Next',
    'Latitude_Prev', 'Longitude_Prev',
    'Lanes_Prev', 'Lanes_Next',
]
col_order += [
    '{}{}_{}'.format(station_prefix, measure, tick)
    for station_prefix in ('', 'Next ')
    for measure in ('Avg Occupancy', 'Total Flow', 'Avg Speed')
    for tick in range(16)
]



In [None]:
# Keep only the columns listed in col_order, in that order, and persist the
# yearly incident-free table.
accdnt_free_windows = accdnt_free_windows.loc[:, col_order]
accdnt_free_windows.to_csv(
    '../data/PeMS/Incidents/work_folder/year_accdnt_free_wndw.csv',
    index=False,
)

# Merging of two window classes

In [None]:
# Reload both yearly tables and attach the class label:
# y = 1 for windows around an incident, y = 0 for incident-free windows.
accdnt_windows = pd.read_csv(
    '../data/PeMS/Incidents/work_folder/year_accdnt_wndw.csv',
    parse_dates=['Timestamp'],
).assign(y=1)

# Incident-free windows are not given an IncidentID column.
accdnt_free_windows = pd.read_csv(
    '../data/PeMS/Incidents/work_folder/year_accdnt_free_wndw.csv',
    parse_dates=['Timestamp'],
).assign(y=0)

In [None]:
# Columns shared by both window classes, followed by the class label 'y'.
common_cols = [
    'ID_Prev', 'ID_Next', 'Timestamp',
    'Latitude_Next', 'Longitude_Next',
    'Latitude_Prev', 'Longitude_Prev',
    'Lanes_Prev', 'Lanes_Next',
]
common_cols += [
    '{}{}_{}'.format(station_prefix, measure, tick)
    for station_prefix in ('', 'Next ')
    for measure in ('Avg Occupancy', 'Total Flow', 'Avg Speed')
    for tick in range(16)
]
common_cols.append('y')

In [None]:
# Stack both classes on their shared columns and renumber the rows from 0.
dataset = pd.concat(
    [accdnt_windows[common_cols], accdnt_free_windows[common_cols]],
    axis=0,
    ignore_index=True,
)

In [None]:
# Hour of day of each window.
dataset['Hour'] = dataset.Timestamp.dt.hour

# Drop windows whose hour is before 6 or after 21.
outside_hours = (dataset['Hour'] < 6) | (dataset['Hour'] > 21)
dataset = dataset[~outside_hours].reset_index(drop=True)

# Seed the global RNG so the shuffle is reproducible. The previous
# `np.random.seed=442` *assigned* 442 over the function instead of calling it:
# the shuffle stayed unseeded and every later np.random.seed(...) call in the
# same kernel failed with "'int' object is not callable".
np.random.seed(442)
dataset = dataset.reindex(index=np.random.permutation(dataset.index))

# NOTE(review): this flag is 1 when both stations have the SAME number of lanes
# and 0 when the width changes - the name suggests the opposite; confirm intent.
dataset['Width_change'] = (dataset['Lanes_Prev'] == dataset['Lanes_Next']).astype(np.int64)

dataset.reset_index(drop=True, inplace=True)

In [None]:
# Persist the shuffled, labelled dataset (no index column).
dataset.to_csv(
    path_or_buf='../data/PeMS/Incidents/work_folder/windows.csv',
    index=False,
)