In [1]:
import os
import pandas as pd
import numpy as np
import datetime
#import folium 
import plotly.express as px
import plotly
import plotly.graph_objects as go
import argparse
import pyarrow
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
from functools import reduce
from datetime import datetime, timedelta, date
from numpy import percentile
import warnings
warnings.filterwarnings('ignore')

In [8]:
## This function is defined to:
# (1) read each individual vehicle file in a given date folder
# (2) calculate total time moving, time moving above 80km/h, and time moving above 100km/h
#     for every individual vehicle file whose data is not empty

def time_mov(folder):
    """Build hourly moving-time statistics for every file in one date folder.

    Every entry of ``folder`` is read with ``pd.read_parquet``. Files with no
    rows are skipped. Each non-empty file is passed through ``data_process``
    and, per hour bucket (``day_hr``), the ``time_delta`` seconds are summed
    over the rows where:

    * ``speed > 0``    -> ``time_mov_s``
    * ``speed >= 80``  -> ``time_spd80_s``
    * ``speed > 100``  -> ``time_spd100_s``

    Parameters
    ----------
    folder : str
        Path of the directory holding the individual vehicle parquet files.

    Returns
    -------
    pandas.DataFrame
        The per-file hourly tables stacked on top of each other (original
        row indices are kept). An empty DataFrame is returned when the
        folder contains no non-empty files.
    """
    hourly_frames = []

    # Sorted so the row order of the result does not depend on the arbitrary
    # order in which the OS lists the directory.
    for file_name in sorted(os.listdir(folder)):
        vehicle_raw = pd.read_parquet(os.path.join(folder, file_name))
        if len(vehicle_raw) == 0:
            continue

        vehicle_clean = data_process(vehicle_raw)

        # NOTE(review): the 80 threshold is inclusive (>=) while the 100
        # threshold is strict (>) -- confirm this asymmetry is intended.
        hourly_stats = vehicle_clean.groupby(['day_hr']).apply(
            lambda x: pd.Series({'time_mov_s': x[x['speed'] > 0]['time_delta'].sum(),
                                 'time_spd80_s': x[x['speed'] >= 80]['time_delta'].sum(),
                                 'time_spd100_s': x[x['speed'] > 100]['time_delta'].sum()})
        ).reset_index()

        # The left merge (on the columns the two frames share) re-attaches the
        # remaining per-row columns; drop_duplicates then keeps the first row
        # for each hour/date/statistics combination.
        hourly = hourly_stats.merge(vehicle_clean, how='left').drop_duplicates(
            ['day_hr', 'date', 'time_mov_s', 'time_spd80_s', 'time_spd100_s'])

        hourly_frames.append(hourly)

    # pd.concat raises on an empty list, so keep the "nothing to aggregate"
    # result an empty DataFrame.
    if not hourly_frames:
        return pd.DataFrame()

    # Concatenate once instead of re-copying the accumulated frame per file.
    return pd.concat(hourly_frames)

In [2]:
## This function is defined to calculate the time difference in seconds between consecutive data entries
def data_process(df):
    """Order the records by time and add date, hour-bucket and time-gap columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``time_str`` column holding timestamps in
        ``"%Y-%m-%d %H:%M:%S"`` format. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame sorted by ``time_str`` (converted to datetime) with three
        added columns:

        * ``date``       -- calendar date of ``time_str``
        * ``day_hr``     -- ``time_str`` floored to the hour
        * ``time_delta`` -- seconds elapsed since the previous row
          (NaN for the first row)
    """
    # Parse before sorting: ordering the raw values would compare strings
    # lexicographically, which is only chronological for zero-padded input.
    parsed_time = pd.to_datetime(df['time_str'], format="%Y-%m-%d %H:%M:%S")
    df = df.assign(time_str=parsed_time).sort_values('time_str')

    df['date'] = df['time_str'].dt.date
    df['day_hr'] = df['time_str'].dt.floor('h')
    # Vectorized equivalent of applying total_seconds() to each gap.
    df['time_delta'] = df['time_str'].diff().dt.total_seconds()

    return df

In [3]:
## Filepaths
# Dropbox project root for each known OS login. Add an entry here to run the
# notebook under another account.
_dropbox_roots = {
    'wb575963': "C:/Users/wb575963/Dropbox/PSV Rider Feedback",
}

_login = os.getlogin()
if _login not in _dropbox_roots:
    # Without this check an unknown user would only hit a NameError on
    # `dropbox` further down, with no hint about what to configure.
    raise RuntimeError(
        "No Dropbox root configured for OS user {!r}; "
        "add it to _dropbox_roots.".format(_login)
    )

dropbox = _dropbox_roots[_login]
# Folder the aggregated parquet outputs are written to.
dropbox_outputs = dropbox + "/Data/Sensor Data/FinalData"


## Data path
data_dir = os.path.join(dropbox, "Data")
sensor_dir = os.path.join(data_dir, "Sensor Data")
sensor_raw_dir = os.path.join(sensor_dir, "RawData")
sensor_final_dir = os.path.join(sensor_dir, "FinalData")
# One sub-folder per date, each holding the individual vehicle files.
sensor_indiv_dir = os.path.join(sensor_raw_dir, "sensor_tracing_individual_data")

In [5]:
## Set the period over which the aggregated data is built:
## 122 consecutive days starting on 2022-09-01 (i.e. through 2022-12-31).
start = datetime.strptime("2022-9-1", "%Y-%m-%d")
date_generated = pd.date_range(start, periods=122)
# DatetimeIndex.to_native_types() was removed in pandas 2.0; strftime yields
# the same 'YYYY-MM-DD' strings, which match the per-date folder names.
date_range = date_generated.strftime("%Y-%m-%d").tolist()
date_range

['2022-09-01',
 '2022-09-02',
 '2022-09-03',
 '2022-09-04',
 '2022-09-05',
 '2022-09-06',
 '2022-09-07',
 '2022-09-08',
 '2022-09-09',
 '2022-09-10',
 '2022-09-11',
 '2022-09-12',
 '2022-09-13',
 '2022-09-14',
 '2022-09-15',
 '2022-09-16',
 '2022-09-17',
 '2022-09-18',
 '2022-09-19',
 '2022-09-20',
 '2022-09-21',
 '2022-09-22',
 '2022-09-23',
 '2022-09-24',
 '2022-09-25',
 '2022-09-26',
 '2022-09-27',
 '2022-09-28',
 '2022-09-29',
 '2022-09-30',
 '2022-10-01',
 '2022-10-02',
 '2022-10-03',
 '2022-10-04',
 '2022-10-05',
 '2022-10-06',
 '2022-10-07',
 '2022-10-08',
 '2022-10-09',
 '2022-10-10',
 '2022-10-11',
 '2022-10-12',
 '2022-10-13',
 '2022-10-14',
 '2022-10-15',
 '2022-10-16',
 '2022-10-17',
 '2022-10-18',
 '2022-10-19',
 '2022-10-20',
 '2022-10-21',
 '2022-10-22',
 '2022-10-23',
 '2022-10-24',
 '2022-10-25',
 '2022-10-26',
 '2022-10-27',
 '2022-10-28',
 '2022-10-29',
 '2022-10-30',
 '2022-10-31',
 '2022-11-01',
 '2022-11-02',
 '2022-11-03',
 '2022-11-04',
 '2022-11-05',
 '2022-11-

In [6]:
## A function that builds a panel dataset covering the whole period
def time_moving_panel(d_range):
    """Stack the hourly moving-time tables of several date folders.

    For every entry ``d`` of ``d_range`` the folder
    ``os.path.join(sensor_indiv_dir, d)`` is printed (as a progress trace)
    and aggregated with ``time_mov``.

    Parameters
    ----------
    d_range : iterable of str
        Names of the date sub-folders under ``sensor_indiv_dir``.

    Returns
    -------
    pandas.DataFrame
        The per-folder results concatenated in the order of ``d_range``
        (row indices are not reset). An empty DataFrame is returned when
        ``d_range`` is empty.
    """
    daily_frames = []
    for day in d_range:
        day_folder = os.path.join(sensor_indiv_dir, day)
        print(day_folder)
        daily_frames.append(time_mov(day_folder))

    # pd.concat raises on an empty list; keep the empty-input result a frame.
    if not daily_frames:
        return pd.DataFrame()

    # Concatenate once instead of re-copying the growing panel per day.
    return pd.concat(daily_frames)
    

In [None]:
# Build the hourly panel for every date in date_range; time_moving_panel
# prints each date folder path as it is processed.
time_dayhr_s = time_moving_panel(date_range)

C:/Users/wb575963/Dropbox/PSV Rider Feedback\Data\Sensor Data\RawData\sensor_tracing_individual_data\2022-09-01


In [None]:
# Keep only the hour bucket, the three duration measures, reg_no and date.
hourly_keep_cols = ['day_hr', 'time_mov_s', 'time_spd80_s', 'time_spd100_s', 'reg_no', 'date']
time_dayhr_s_clean = time_dayhr_s[hourly_keep_cols]

In [None]:
# Write the hourly table to the outputs folder as a gzip-compressed parquet file.
hourly_out_path = os.path.join(dropbox_outputs, 'time_moving_dayhr.gz.parquet')
time_dayhr_s_clean.to_parquet(hourly_out_path, compression='gzip', index=False)

### Aggregated to daily level

In [None]:
# Sum the hourly durations into one row per (reg_no, date). The built-in
# grouped sum replaces a per-group Python lambda and yields the same columns:
# reg_no, date, time_mov_s, time_spd80_s, time_spd100_s.
duration_cols = ['time_mov_s', 'time_spd80_s', 'time_spd100_s']
time_day_s = (
    time_dayhr_s_clean
    .groupby(['reg_no', 'date'], as_index=False)[duration_cols]
    .sum()
)
time_day_s

In [None]:
# Write the daily table to the outputs folder as a gzip-compressed parquet file.
daily_out_path = os.path.join(dropbox_outputs, 'time_moving_day.gz.parquet')
time_day_s.to_parquet(daily_out_path, compression='gzip', index=False)