In [2]:
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely.geometry import Polygon
import os
import time
import datetime as dt
import matplotlib.pyplot as plt
from tqdm import tqdm
import seaborn as sns
from mpl_toolkits.axes_grid1 import make_axes_locatable
#CAN_PATH = '/data2/jiang/Toyota/CAN data/0701ver/'
#NEW_CAN_PATH = '/data2/jiang/Toyota/CAN_day_data/'
OCT_CAN_PATH = '/data2/jiang/Toyota/CAN_day_data_202110/'
NOV_CAN_PATH = '/data2/jiang/Toyota/CAN_day_data_202111/'
MAY_CAN_PATH = '/data2/jiang/Toyota/CAN_day_data_1122ver/'
CAN_AGG_PATH = '/data2/jiang/workToyota/data/CAN_Aggregated/'
#NEW_ACC_PATH = '/data2/jiang/Toyota/JARTIC_data_0721ver/'
#NEW_LINK_PATH = '/data2/jiang/workToyota/data/'
#INT_CAN_PATH = '/data2/jiang/workToyota/data/ACC_CAN_data/'
#DAYS = [date.strftime('%Y-%m-%d') for date in pd.date_range(start='2021-05-10', end='2021-06-01', freq='1D')]
SAMPLING_RATE = '10min'      #Defining time-interval for aggregation
#SPEED_DROP_THRESHOLD = 0.75
LAT_MIN = 35.36
LAT_MAX = 35.90
LON_MIN = 139.537
LON_MAX = 139.947

In [4]:
#Function to extract linkids in give rectangular coordinates
def clip_CAN(mask):
    #df = pd.read_csv(os.path.join(NEW_LINK_PATH, 'link_connect_all.csv'))
    #df = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df['start_lon'], df['start_lat'])).set_crs(epsg=4326)
    dft = df[(df['mmlatitude']>=LAT_MIN) & (df['mmlatitude']<=LAT_MAX) & (df['mmlongitude']>=LON_MIN) & 
            (df['mmlongitude']<=LON_MAX)]
    #df = gpd.clip(df, mask)
    return dft

In [281]:
#Function to compute the average speed profile for given linkid for given accident and show the plot for the same
def plot_speed(df_acc_adverse, df_can_group, link_neighbor_list_adv):
    
    #Conversion into sampling rate points
    accident_time_list = df_acc_adverse.start_time.values
    accident_time_list = pd.to_datetime(accident_time_list)
    
    linkid_list = df_acc_adverse.coord_start_upstream_nearestlink.values
    
    #Plotting accident profiles row by row
    for accident_time, linkid, linkid_neighbor, end_time, idx in zip(accident_time_list, linkid_list, link_neighbor_list_adv, 
                                                      df_acc_adverse.end_time.values, df_acc['Unnamed: 0'].values):
        
        
        ##Plotting speed chart for accident linkid for +-2.5 days
        start_time = accident_time
        accident_time = accident_time.floor(SAMPLING_RATE)
        
        #Creating time-slices +-2.5 days before/after the accident
        timeslices = pd.date_range(accident_time-dt.timedelta(days=2,hours=12), accident_time+dt.timedelta(days=2,hours=12), 
                               freq=SAMPLING_RATE)
    
        #Extracting relevant GPS Points
        tmp=df_can_group[df_can_group.linkid==linkid].set_index('gps_timestamp').reindex(timeslices).reset_index()
        tmp['speed_typea'] = tmp['speed_typea'].interpolate(limit=15,limit_direction='both')
        #tmp.fillna(0)
        
        #Plotting the accidents profile
        data = tmp['speed_typea'].values
        accident_time_str = accident_time.strftime('%Y-%m-%d %H:%M:%S')
        accident_time_hour = accident_time_str[-8:]
    
        ticks = [date.strftime('%Y-%m-%d %H:%M:%S') for date in timeslices]
        all_hour_int = [i for i, x in enumerate(ticks) if accident_time_hour in x]
        all_hour_str = [x for x in ticks if accident_time_hour in x]
    
        fig, (ax0, ax1) = plt.subplots(2, 1, figsize=(25,22), gridspec_kw={'height_ratios':[10,10]}) 
        ax0.plot(data)
        
        for i,hour in zip(all_hour_int, all_hour_str):
            if hour == accident_time_str:
                ax0.axvline(x=i, label=f'{hour}_accident', c='r')
            else:
                ax0.axvline(x=i, label=hour, c='g')
        ax0.legend(loc="upper left")
        ax0.set_title(f"link id {linkid} accident time{start_time.strftime('%Y-%m-%d %H:%M:%S')}", fontsize=12, y=1)
        ax0.set_xticks(np.arange(data.shape[0]))
        ax0.set_xticklabels(ticks, rotation=90)
        ax0.set_xlabel('Time')
        ax0.set_ylabel('Value')
        
        
        ##Plotting Heatmap like graph for the day of the accident
        DAY = accident_time.strftime('%Y-%m-%d')
        inds = pd.date_range(pd.Timestamp(DAY), pd.Timestamp(DAY)+pd.Timedelta('1D'),freq=SAMPLING_RATE,closed='left')
        
        #Getting data in speed-linkid matrix format 
        spd = df_can_group[df_can_group.linkid.isin(linkid_neighbor) & 
                                (df_can_group.gps_timestamp.dt.date==accident_time.date())]
        spd = spd.pivot(index='gps_timestamp',columns='linkid',values='speed_typea')
        spd = spd.reindex(columns=linkid_neighbor)
        spd = spd.reindex(inds)
        spd.interpolate(axis=1,limit=15,limit_direction='both',inplace=True)
        
        
        inds = pd.date_range(pd.Timestamp(DAY), pd.Timestamp(DAY)+pd.Timedelta('1D'),freq=SAMPLING_RATE,closed='left')
        cols = linkid_neighbor
        sns.set_theme(style="white")

        st = accident_time.floor(SAMPLING_RATE)
        st_ind = inds.indexer_between_time(st.time(),(st+pd.Timedelta('2min')).time())
        et = pd.Timestamp(end_time).ceil(SAMPLING_RATE)
        et_ind = inds.indexer_between_time(et.time(),(et+pd.Timedelta('2min')).time())
        
        ax1.set_yticks(np.arange(len(cols)))
        ax1.set_yticklabels(cols)
        ax1.set_xticks(np.arange(inds.shape[0]))
        ax1.set_xticklabels(inds, rotation=90)
        ax1.axhline(y=cols.index(linkid), label=f'accident', c='r')
        ax1.axvline(x=st_ind, label=f'accident', c='r')
        ax1.axvline(x=et_ind, label=f'accident', c='r')
        
        #im = ax1.imshow(spd.values.T, interpolation='bilinear',cmap='RdYlGn')
        im = ax1.imshow(spd.values.T, cmap='RdYlGn')
        
        divider = make_axes_locatable(ax1)
        cax = divider.append_axes("right", size="5%", pad=0.05)

        plt.colorbar(im, cax=cax)

        fig.tight_layout()
        
        fig.savefig(NEW_LINK_PATH+'Adverse_Accidents_Plots/'+str(idx)+': linkid='+str(linkid)+
                            ',accident-time='+start_time.strftime('%Y-%m-%d %H:%M:%S'))

# Main Function

In [4]:
#Main1 (Aggregating the avg speed for each linkid and saving in monthly files (without reindexing))

mask = Polygon([(LON_MIN,LAT_MAX), (LON_MAX,LAT_MAX), (LON_MAX,LAT_MIN), (LON_MIN,LAT_MIN)])
month_list = ['MAY', 'OCT', 'NOV']
i = 0

#Reading the files
for PATH_VAR in [MAY_CAN_PATH, OCT_CAN_PATH, NOV_CAN_PATH]:
    files = [os.path.join(PATH_VAR+filename) for filename in os.listdir(PATH_VAR)]
    files.sort()
    
    tik = time.time()
    df_month = []
    for filename in files:
        #Reading the file, dropping NA values, and clipping it
        df = pd.read_csv(filename, compression='gzip')
        df = df.dropna(subset=['speed_typea'])
        df = clip_CAN(mask)
        
        #Sampling and getting avg speed after aggregation
        df['gps_timestamp'] = pd.to_datetime(df['gps_timestamp'])
        df['gps_timestamp'] = df['gps_timestamp'].dt.floor(SAMPLING_RATE)    
        df = df.groupby(['linkid','gps_timestamp'], as_index=False)['speed_typea'].mean()[['linkid',"gps_timestamp",
                                                                                         "speed_typea"]]

        #Reindexing timestamps
        #current_date = df['gps_timestamp'].iloc[0].date()
        #timeslices = pd.date_range(current_date, current_date+dt.timedelta(hours=24), freq=SAMPLING_RATE)[:-1]
        #mux = pd.MultiIndex.from_product([df.linkid.unique(), timeslices],names=['linkid', 'gps_timestamp'])
        #df = df.set_index(['linkid', 'gps_timestamp']).reindex(mux).reset_index()
        
        #Appending the one day result
        df_month.append(df)
    
    del df
    df_month = pd.concat(df_month)
    df_month.to_csv(CAN_AGG_PATH+month_list[i]+'_CAN.csv.gz', compression='gzip', index=False)
    i += 1
    
    tok = time.time()
    print(tok-tik)

KeyboardInterrupt: 

In [28]:
#Main2
#Getting common list of all linkids

files = [os.path.join(CAN_AGG_PATH+filename) for filename in os.listdir(CAN_AGG_PATH)]
files.sort()
files = files[2:-1:2]

linkid_list = []

for filename in files:
    tik = time.time()
    print('Processing '+filename)
    df = pd.read_csv(filename, compression='gzip')
    linkid_list = linkid_list + df.linkid.unique().tolist()
    tok = time.time()
    print(tok-tik)
#del df
linkid_list = np.unique(np.array(linkid_list))

Processing /data2/jiang/workToyota/data/CAN_Aggregated/MAY_CAN.csv.gz
129.92215299606323
Processing /data2/jiang/workToyota/data/CAN_Aggregated/NOV_CAN.csv.gz
153.91129755973816
Processing /data2/jiang/workToyota/data/CAN_Aggregated/OCT_CAN.csv.gz
156.29844331741333


In [17]:
#Main3
#Extracting capital linkids only and doing the reindexing (monthly files)

df_capital_link = pd.read_csv('/data2/jiang/Toyota/graph_data/capital_graph_link_info.csv')
capital_linkid_list = df_capital_link['link_id'].unique()

files = [os.path.join(CAN_AGG_PATH+filename) for filename in os.listdir(CAN_AGG_PATH)]
for filename in files:
    tik = time.time()
    print('Processing '+filename)
    df = pd.read_csv(filename, compression='gzip')
    df.drop(['Unnamed: 0'], axis=1, inplace=True)
    df = df[df['linkid'].isin(capital_linkid_list)]
    print(len(df))
    
    df['gps_timestamp'] = pd.to_datetime(df['gps_timestamp'])
    first_date = df['gps_timestamp'].iloc[0].date()
    last_date = df['gps_timestamp'].iloc[-1].date()
    timeslices = pd.date_range(first_date, last_date+dt.timedelta(days=1), freq=SAMPLING_RATE)[:-1]
    
    mux = pd.MultiIndex.from_product([capital_linkid_list, timeslices],names=['linkid', 'gps_timestamp'])
    df = df.set_index(['linkid', 'gps_timestamp']).reindex(mux).reset_index()
    df.to_csv(filename[:-7]+'_CAPITAL.csv.gz', compression='gzip', index=False)
    
    print(df.linkid.nunique())
    print(len(df))
    print(df.speed_typea.isna().sum())
    tok = time.time()
    print(tok-tik)

Processing /data2/jiang/workToyota/data/CAN_Aggregated/OCT_CAN.csv.gz
7237824
2841
12682224
5444400
364.59753036499023
Processing /data2/jiang/workToyota/data/CAN_Aggregated/MAY_CAN.csv.gz
6478303
2841
12682224
6203921
330.33218908309937
Processing /data2/jiang/workToyota/data/CAN_Aggregated/NOV_CAN.csv.gz
7224913
2841
12273120
5048207
349.8731253147125


In [85]:
#Main4
#Doing the reindexing for all linkids (day-wise file)

files = [os.path.join(CAN_AGG_PATH,filename) for filename in os.listdir(CAN_AGG_PATH)]
files.sort()
files = files[2:-1:2]

for filename in files:
    tik = time.time()
    print('Processing '+filename)
    df = pd.read_csv(filename, compression='gzip')
    df.drop(['Unnamed: 0'], axis=1, inplace=True)
    
    df['gps_timestamp'] = pd.to_datetime(df['gps_timestamp'])
    first_date = df['gps_timestamp'].iloc[0].date()
    last_date = df['gps_timestamp'].iloc[-1].date()
    dayslices = pd.date_range(first_date, last_date, freq='1D')
    
    for day in dayslices:
        df_tmp = df[df['gps_timestamp'].dt.date==day]
        timeslices = pd.date_range(day, day+dt.timedelta(days=1), freq=SAMPLING_RATE)[:-1]
        #mux = pd.MultiIndex.from_product([linkid_list, timeslices],names=['linkid', 'gps_timestamp'])
        #df_tmp = df_tmp.set_index(['linkid', 'gps_timestamp']).reindex(mux).reset_index()
        y = pd.DataFrame([], index=pd.MultiIndex.from_product([linkid_list, timeslices],names=['linkid', 'gps_timestamp']))
        y = y.merge(df_tmp, on=['linkid', 'gps_timestamp'], how='left').reset_index().drop(['index'],axis=1)
        y.to_csv(os.path.join(CAN_AGG_PATH,'CAN_Daywise_Reindexed',day.strftime('%Y%m'),day.strftime('%Y%m%d')+
                                   '_CAN_Reindexed.csv.gz'), compression='gzip', index=False)

    #print(df.linkid.nunique())
    #print(len(df))
    #print(df.speed_typea.isna().sum())
    tok = time.time()
    print(tok-tik)

Processing /data2/jiang/workToyota/data/CAN_Aggregated/MAY_CAN.csv.gz
22719.928786754608
Processing /data2/jiang/workToyota/data/CAN_Aggregated/NOV_CAN.csv.gz
22124.38141465187
Processing /data2/jiang/workToyota/data/CAN_Aggregated/OCT_CAN.csv.gz
22883.279091358185


In [190]:
#Main5
#Creating Accident Tensor based on reindexed Capital linkids (monthly files)
#/data2/jiang/Toyota/JARTIC_data_202105/vics_regulation_202105_shutokou.csv
#/data2/jiang/Toyota/JARTIC_data_202110/vics_regulation_202110_C01_2.csv
#/data2/jiang/Toyota/JARTIC_data_202111/vics_regulation_202111_C01.csv

df_capital_link = pd.read_csv('/data2/jiang/Toyota/graph_data/capital_graph_link_info.csv')
capital_linkid_list = df_capital_link['link_id'].unique()

files = ['/data2/jiang/Toyota/JARTIC_data_202105/vics_regulation_202105_shutokou.csv',
            '/data2/jiang/Toyota/JARTIC_data_202110/vics_regulation_202110_C01_2.csv',
            '/data2/jiang/Toyota/JARTIC_data_202111/vics_regulation_202111_C01.csv']

for i,filename in enumerate(files):
    tik = time.time()
    print('Processing '+filename)
    
    if i==1:
        df = pd.read_csv(filename, encoding='shift-jis')
    else:
        df = pd.read_csv(filename)
    
    df['start_time'] = pd.to_datetime(df['start_time'])
    df['end_time'] = pd.to_datetime(df['end_time'])
    df['start_time'] = df['start_time'].dt.floor(SAMPLING_RATE)
    df['end_time'] = df['end_time'].dt.floor(SAMPLING_RATE)
    
    df.drop(['Unnamed: 0'], axis=1, inplace=True)
    df = df[df['coord_start_upstream_nearestlink'].isin(capital_linkid_list)]
    
    #first_date = df['start_time'].iloc[0].date()
    month = df['start_time'].iloc[0].month
    year = df['start_time'].iloc[0].year
    first_date = pd.to_datetime(str(year)+'-'+str(month)+'-01')
    last_date = df['start_time'].max().date()
    timeslices = pd.date_range(first_date, last_date+dt.timedelta(days=1), freq=SAMPLING_RATE)[:-1]
    print(timeslices[0], timeslices[-1])
    
    #mux = pd.MultiIndex.from_product([capital_linkid_list, timeslices],names=['linkid', 'gps_timestamp'])
    y = pd.DataFrame([], index=pd.MultiIndex.from_product([capital_linkid_list, timeslices],
                                                          names=['linkid', 'gps_timestamp'])).reset_index()
    y['accident_flag'] = 0
    #df['accident_flag'] = 1
    for _,row in df.iterrows():
        #y.loc[(row['coord_start_upstream_nearestlink'],row['start_time']):(row['coord_start_upstream_nearestlink'],
        #                                                                   row['end_time']), 'accident_flag'] = 1
        y['accident_flag'] = np.where((y['linkid']==row['coord_start_upstream_nearestlink']) & 
                (y['gps_timestamp']>=row['start_time']) & (y['gps_timestamp']<=row['end_time']), 1, y['accident_flag'])
        
    y.to_csv(CAN_AGG_PATH+'ACCIDENT_'+first_date.strftime('%Y-%m')+'_CAPITAL.csv.gz', compression='gzip', index=False)
    
    #print(df.linkid.nunique())
    #print(len(df))
    #print(df.speed_typea.isna().sum())
    tok = time.time()
    print(tok-tik)

Processing /data2/jiang/Toyota/JARTIC_data_202105/vics_regulation_202105_shutokou.csv


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


2021-05-01 00:00:00 2021-05-31 23:50:00
6383.675914049149
Processing /data2/jiang/Toyota/JARTIC_data_202110/vics_regulation_202110_C01_2.csv
2021-10-01 00:00:00 2021-10-31 23:50:00
5606.243230819702
Processing /data2/jiang/Toyota/JARTIC_data_202111/vics_regulation_202111_C01.csv
2021-11-01 00:00:00 2021-11-30 23:50:00
6630.13353061676


In [22]:
len(capital_linkid_list)

2841

In [130]:
#Main6
#Filling NaN values => (2) followed by (1)
##(1): xxInterpolationxx Forward Fill based on last xxand nextxx available values (but it wouldn't be very accurate)
##(2): Taking average of speed for that time period across all days (wherever data is available) for that particular linkid 
##     - combined with weekday and accident_flag information (there is possibility of having no value available)

can_files = [CAN_AGG_PATH+'MAY_CAN_CAPITAL_Reindexed.csv.gz', CAN_AGG_PATH+'OCT_CAN_CAPITAL_Reindexed.csv.gz',
             CAN_AGG_PATH+'NOV_CAN_CAPITAL_Reindexed.csv.gz']

acc_files = [CAN_AGG_PATH+'ACCIDENT_2021-05_CAPITAL.csv.gz', CAN_AGG_PATH+'ACCIDENT_2021-10_CAPITAL.csv.gz',
             CAN_AGG_PATH+'ACCIDENT_2021-11_CAPITAL.csv.gz']

for can_file,acc_file in zip(can_files,acc_files):
    tik = time.time()
    print('Processing '+can_file)
    df = pd.read_csv(can_file)
    df['accident_flag'] = pd.read_csv(acc_file)['accident_flag']
    
    df['gps_timestamp'] = pd.to_datetime(df['gps_timestamp'])
    df['weekday'] = df['gps_timestamp'].dt.weekday
    df['weekday'] = np.where(df['weekday']>=5, 0, 1)
    df['interval'] = df.index%144
    df['time'] = df['gps_timestamp'].dt.time
    
    #Method (2)
    df_mean = df[df.accident_flag==0].groupby(by=['linkid','time','weekday']).speed_typea.mean().reset_index(name='speed_avg')
    df = df.merge(df_mean, on=['linkid','time','weekday'], how='left')
    df['speed_typea'] = np.where(df['accident_flag']==0, df['speed_typea'].fillna(df['speed_avg']), df['speed_typea'])
    
    #Method (1)
    #if pd.isna(df['speed_typea'].iloc[0]):
    #    df.loc[0, 'speed_typea'] = -2
    df['speed_typea'] = np.where(df['speed_typea'].isna() & (df['linkid']!=df['linkid'].shift()), -2, df['speed_typea'])
    df['speed_typea'] = np.where(df['speed_typea'].isna() & (df['interval']==138), -2, df['speed_typea'])
    df['speed_typea'] = np.where(df['accident_flag']==0, df['speed_typea'].fillna(method='ffill'), df['speed_typea'])
    df['speed_typea'] = np.where(((df['speed_typea']==-2) | (df['speed_typea'].isna())) & (df['accident_flag']==1), 
                                         -1, df['speed_typea'])
    df['speed_typea'] = np.where(df['speed_typea']==-2, 100, df['speed_typea'])
    
    #Saving File
    df[['linkid','gps_timestamp','speed_typea','accident_flag']].to_csv(can_file[:-7]+'_FilledNA.csv.gz',compression='gzip',
                                                                       index=False)
    tok = time.time()
    print(tok-tik)

Processing /data2/jiang/workToyota/data/CAN_Aggregated/MAY_CAN_CAPITAL_Reindexed.csv.gz
144.00749135017395
Processing /data2/jiang/workToyota/data/CAN_Aggregated/OCT_CAN_CAPITAL_Reindexed.csv.gz
149.11396408081055
Processing /data2/jiang/workToyota/data/CAN_Aggregated/NOV_CAN_CAPITAL_Reindexed.csv.gz
144.20460200309753


In [60]:
len(df)

12682224

In [38]:
df.linkid.nunique()*144

409104

In [16]:
#Main7
#Creating Real Accident (filtered from cause column) Tensor based on reindexed Capital linkids (monthly files)
#/data2/jiang/Toyota/JARTIC_data_202105/vics_accident_202105.csv
#/data2/jiang/Toyota/JARTIC_data_202110/vics_accident_202110.csv
#/data2/jiang/Toyota/JARTIC_data_202111/vics_accident_202111.csv

df_capital_link = pd.read_csv('/data2/jiang/Toyota/graph_data/capital_graph_link_info.csv')
capital_linkid_list = df_capital_link['link_id'].unique()

files = ['/data2/jiang/Toyota/JARTIC_data_202105/vics_accident_202105.csv', 
         '/data2/jiang/Toyota/JARTIC_data_202110/vics_accident_202110.csv',
         '/data2/jiang/Toyota/JARTIC_data_202111/vics_accident_202111.csv']

for filename in files:
    tik = time.time()
    print('Processing '+filename)
    
    df = pd.read_csv(filename)
    
    df['start_time'] = pd.to_datetime(df['start_time'])
    df['end_time'] = pd.to_datetime(df['end_time'])
    df['start_time'] = df['start_time'].dt.floor(SAMPLING_RATE)
    df['end_time'] = df['end_time'].dt.floor(SAMPLING_RATE)
    
    df.drop(['Unnamed: 0'], axis=1, inplace=True)
    df = df[df['coord_start_upstream_nearestlink'].isin(capital_linkid_list)]
    
    #first_date = df['start_time'].iloc[0].date()
    month = df['start_time'].iloc[0].month
    year = df['start_time'].iloc[0].year
    first_date = pd.to_datetime(str(year)+'-'+str(month)+'-01')
    last_date = df['start_time'].max().date()
    timeslices = pd.date_range(first_date, last_date+dt.timedelta(days=1), freq=SAMPLING_RATE)[:-1]
    print(timeslices[0], timeslices[-1])
    
    #mux = pd.MultiIndex.from_product([capital_linkid_list, timeslices],names=['linkid', 'gps_timestamp'])
    y = pd.DataFrame([], index=pd.MultiIndex.from_product([capital_linkid_list, timeslices],
                                                          names=['linkid', 'gps_timestamp'])).reset_index()
    y['real_accident_flag'] = 0
    #df['accident_flag'] = 1
    for _,row in df.iterrows():
        #y.loc[(row['coord_start_upstream_nearestlink'],row['start_time']):(row['coord_start_upstream_nearestlink'],
        #                                                                   row['end_time']), 'accident_flag'] = 1
        y['real_accident_flag'] = np.where((y['linkid']==row['coord_start_upstream_nearestlink']) & 
                (y['gps_timestamp']>=row['start_time']) & (y['gps_timestamp']<=row['end_time']), 1, y['real_accident_flag'])
        
    y.to_csv(CAN_AGG_PATH+'REAL_ACCIDENT_'+first_date.strftime('%Y-%m')+'_CAPITAL.csv.gz', compression='gzip', index=False)
    
    #print(df.linkid.nunique())
    #print(len(df))
    #print(df.speed_typea.isna().sum())
    tok = time.time()
    print(tok-tik)

Processing /data2/jiang/Toyota/JARTIC_data_202105/vics_accident_202105.csv
2021-05-01 00:00:00 2021-05-31 23:50:00
238.3499116897583
Processing /data2/jiang/Toyota/JARTIC_data_202110/vics_accident_202110.csv
2021-10-01 00:00:00 2021-10-31 23:50:00
296.89824318885803
Processing /data2/jiang/Toyota/JARTIC_data_202111/vics_accident_202111.csv
2021-11-01 00:00:00 2021-11-30 23:50:00
335.2698199748993


In [23]:
#Main8
#Merging the FilledNA files with real_accident_flag data
#columns -> linkid, gps_timestamp, speed_typea, accident_flag, realaccident_flag
#or you can rename them into:
#linkid, gps_timestamp, speed_typea, incident_flag, accident_flag
#※here the incident_flag corresponds to the original accident_flag, accident_flag is the newly added realaccident flag.
#if realaccident_flag=1, accident_flag must =1, but not vise versa. 
#Because the realaccident data is a pure subset of accident data (incident data).

can_files = [CAN_AGG_PATH+'MAY_CAN_CAPITAL_Reindexed_FilledNA.csv.gz', CAN_AGG_PATH+'OCT_CAN_CAPITAL_Reindexed_FilledNA.csv.gz',
             CAN_AGG_PATH+'NOV_CAN_CAPITAL_Reindexed_FilledNA.csv.gz']

acc_files = [CAN_AGG_PATH+'REAL_ACCIDENT_2021-05_CAPITAL.csv.gz', CAN_AGG_PATH+'REAL_ACCIDENT_2021-10_CAPITAL.csv.gz',
             CAN_AGG_PATH+'REAL_ACCIDENT_2021-11_CAPITAL.csv.gz']

for can_file,acc_file in zip(can_files,acc_files):
    tik = time.time()
    print('Processing '+can_file)
    df = pd.read_csv(can_file)
    df['real_accident_flag'] = pd.read_csv(acc_file)['real_accident_flag']
    
    #Saving File
    df.to_csv(can_file[:-7]+'_IncidentAccident.csv.gz',compression='gzip', index=False)
    tok = time.time()
    print(tok-tik)

Processing /data2/jiang/workToyota/data/CAN_Aggregated/MAY_CAN_CAPITAL_Reindexed_FilledNA.csv.gz
121.92135000228882
Processing /data2/jiang/workToyota/data/CAN_Aggregated/OCT_CAN_CAPITAL_Reindexed_FilledNA.csv.gz
126.28425812721252
Processing /data2/jiang/workToyota/data/CAN_Aggregated/NOV_CAN_CAPITAL_Reindexed_FilledNA.csv.gz
122.5358817577362
