In [1]:
from sqlalchemy import create_engine
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
import feather
import os
import torch


# connect to db
user = 'root'
pswd = 'Curry5566'
host = '127.0.0.1'
port = '3306'
db = 'transport'
engine = create_engine(f"mysql+pymysql://{user}:{pswd}@{host}:{port}/{db}?charset=utf8")


def getEndDate(startDate: str, days: int) -> str:
    startDate += ' 00:00:00'
    end = str((datetime.strptime(startDate, '%Y-%m-%d %H:%M:%S') + timedelta(days=days)).replace(microsecond=0))
    return end

def getRollingMean(startDate: str, endDate: str) -> pd.DataFrame:
    """ Get the rolling mean from db
        ```text
        ---
        @Params
        startDate: The date for start, format='%Y-%m-%d'
        endDate: The date for end, format='%Y-%m-%d'

        ---
        @Returns
        DataFrame
        ```
    """
    sql  = " SELECT "
    sql += " 	STAC.VDID, STAC.RoadName, STAC.`Start`, STAC.`End`, "
    sql += " 	STAC.RoadDirection, DYMC.Speed, DYMC.Occupancy, DYMC.Volume, "
    sql += " 	STAC.LocationMile, DYMC.DataCollectTime "
    sql += " FROM ( "
    sql += " 	SELECT "
    sql += " 		VDSTC.id, VDSTC.VDID, ROAD.RoadName, SEC.`Start`, SEC.`End`, "
    sql += " 		VDSTC.RoadDirection, VDSTC.LocationMile "
    sql += " 	FROM vd_static_n5 VDSTC "
    sql += " 	JOIN road_info ROAD ON VDSTC.RoadInfoID = ROAD.id "
    sql += " 	JOIN section_info SEC ON ROAD.id = SEC.RoadInfoID "
    sql += " 	AND VDSTC.LocationMile >= SEC.StartKM "
    sql += " 	AND VDSTC.LocationMile <= SEC.EndKM "
    sql += " 	WHERE VDSTC.Mainlane = 1 "
    sql += " ) STAC JOIN ( "
    sql += " 	SELECT "
    sql += " 		VdStaticID, "
    sql += " 		CASE "
    sql += " 			WHEN MIN(Speed) = -99 THEN -99 "
    sql += " 			ELSE AVG(Speed) "
    sql += " 		END AS Speed,  "
    sql += " 		CASE "
    sql += " 			WHEN MIN(Occupancy) = -99 THEN -99 "
    sql += " 			ELSE AVG(Occupancy) "
    sql += " 		END AS Occupancy,  "
    sql += " 		CASE "
    sql += " 			WHEN MIN(Volume) = -99 THEN -99 "
    sql += " 			ELSE AVG(Volume) "
    sql += " 		END AS Volume, "
    sql += " 		MAX(DataCollectTime) AS DataCollectTime, "
    sql += " 		(UNIX_TIMESTAMP(DataCollectTime)-UNIX_TIMESTAMP(%(start)s)) DIV 300 "
    sql += " 	FROM vd_dynamic_detail_n5_202301 "
    sql += " 	WHERE id BETWEEN ( "
    sql += " 		SELECT id FROM vd_dynamic_detail_n5_202301 "
    sql += " 		WHERE DataCollectTime = %(start)s "
    sql += " 		ORDER BY id LIMIT 1 "
    sql += " 	) AND ( "
    sql += " 		SELECT id FROM vd_dynamic_detail_n5_202301 "
    sql += " 		WHERE DataCollectTime < %(end)s "
    sql += " 		ORDER BY id DESC LIMIT 1 "
    sql += " 	) "
    sql += " 	GROUP BY VdStaticID, (UNIX_TIMESTAMP(DataCollectTime)-UNIX_TIMESTAMP(%(start)s)) DIV 300 "
    sql += " ) DYMC ON STAC.id = DYMC.VdStaticID "
    sql += " ORDER BY STAC.RoadDirection, STAC.LocationMile, DYMC.DataCollectTime; "

    df = pd.read_sql(sql, con=engine, params={'start': startDate, 'end': endDate})
    engine.dispose()
    return df.sort_values(by=['RoadDirection','DataCollectTime','LocationMile']).reset_index(drop=True)

def getRollingMeanDaily(selectDate: str) -> pd.DataFrame:
    sql  = " SELECT "
    sql += " 	STAC.VDID, STAC.RoadName, STAC.`Start`, STAC.`End`, "
    sql += " 	STAC.RoadDirection, DYMC.Speed, DYMC.Occupancy, DYMC.Volume, "
    sql += " 	STAC.LocationMile, DYMC.DataCollectTime "
    sql += " FROM ( "
    sql += " 	SELECT "
    sql += " 		VDSTC.id, VDSTC.VDID, ROAD.RoadName, SEC.`Start`, SEC.`End`, "
    sql += " 		VDSTC.RoadDirection, VDSTC.LocationMile "
    sql += " 	FROM fwy_n5.vd_static_2023 VDSTC "
    sql += " 	JOIN transport.road_info ROAD ON VDSTC.RoadInfoID = ROAD.id "
    sql += " 	JOIN transport.section_info SEC ON ROAD.id = SEC.RoadInfoID "
    sql += " 	AND VDSTC.LocationMile >= SEC.StartKM "
    sql += " 	AND VDSTC.LocationMile <= SEC.EndKM "
    sql += " 	WHERE VDSTC.Mainlane = 1 "
    sql += " ) STAC JOIN ( "
    sql += " 	SELECT "
    sql += " 		VdStaticID, "
    sql += " 		CASE "
    sql += " 			WHEN MIN(Speed) = -99 THEN -99 "
    sql += " 			ELSE AVG(Speed) "
    sql += " 		END AS Speed,  "
    sql += " 		CASE "
    sql += " 			WHEN MIN(Occupancy) = -99 THEN -99 "
    sql += " 			ELSE AVG(Occupancy) "
    sql += " 		END AS Occupancy,  "
    sql += " 		CASE "
    sql += " 			WHEN MIN(Volume) = -99 THEN -99 "
    sql += " 			ELSE AVG(Volume) "
    sql += " 		END AS Volume, "
    sql += " 		MAX(DataCollectTime) AS DataCollectTime, "
    sql += " 		(UNIX_TIMESTAMP(DataCollectTime)-UNIX_TIMESTAMP(%(selectDate)s)) DIV 300 "
    sql += " 	FROM fwy_n5.vd_dynamic_detail_{} ".format(selectDate.replace('-',''))
    sql += " 	GROUP BY VdStaticID, (UNIX_TIMESTAMP(DataCollectTime)-UNIX_TIMESTAMP(%(selectDate)s)) DIV 300 "
    sql += " ) DYMC ON STAC.id = DYMC.VdStaticID "
    sql += " ORDER BY STAC.RoadDirection, STAC.LocationMile, DYMC.DataCollectTime; "

    df = pd.read_sql(sql, con=engine, params={'selectDate': selectDate})
    engine.dispose()
    return df.sort_values(by=['RoadDirection','DataCollectTime','LocationMile']).reset_index(drop=True)

def groupVDs(df: pd.DataFrame, each: int) -> dict:
    """ Get the dict of VD groups
        ```text
        ---
        @Params
        df: DataFrame which is referenced by.
        each: The quantity of VDs would be considered as a group.

        ---
        @Returns
        vdGroups: The keys are the VDs we focus on, and the values are the collections of VDs which are correlated corresponding to the keys.
        ```
    """
    vdGroups = {}
    lb = each // 2
    ub = each - (each // 2)
    for vdid in df['VDID'].unique():
        vdGroups.setdefault(f"{vdid}", [])
    for no, vdid in enumerate(df['VDID'].unique()):
        startIdx = max(no-lb, 0)
        endIdx = min(no+ub, len(df['VDID'].unique())-1)
        vdGroups[f"{vdid}"] += list(df['VDID'].unique()[startIdx:no]) + list(df['VDID'].unique()[no:endIdx])

    delList = []
    for k in vdGroups.keys():
        if (len(vdGroups[k]) != each):
            delList.append(k)
    for k in delList:
        del vdGroups[k]
    
    return vdGroups

def genArrLists(df: pd.DataFrame, startDate: str, endDate: str, vdGroups: dict, groupKey: str,
                each: int, timeWindow: int = 30) -> tuple:
    """ # NOTE: This function has been deprecated.
        Generate array lists for each traffic flow data (speed, volume, and occupancy)
        ```text
        ---
        @Params
        startDate: The date for start, format='%Y-%m-%d'
        endDate: The date for end, format='%Y-%m-%d'
        vdGroups: Can get it from groupVDs(),
        groupKey: The key of vdGroups,
        each: The quantity of VDs would be considered as a group,
        timeWindow: The length of period we consider, and the default value is 30 (minutes).

        ---
        @Returns
        speeds: list,
        vols: list,
        occs: list
        ```
    """
    freq5 = pd.date_range(startDate, endDate, freq='5min')
    speeds, vols, occs = [], [], []
    speed, vol, occ = [], [], []
    for dtStart, dtEnd in zip(freq5[:-1], freq5[1:]):
        print(f"dtStart: {dtStart}")
        tmpDf = df.loc[(df['VDID'].isin(vdGroups[f"{groupKey}"])) &\
                       (df['DataCollectTime']>dtStart) &\
                       (df['DataCollectTime']<dtEnd)].sort_values(by='LocationMile')
        if (len(speed) < timeWindow//5) and (len(vol) < timeWindow//5):
            if (tmpDf[['Speed']].shape[0]>0) and (tmpDf[['Volume']].shape[0]>0):
                speed.append(tmpDf[['Speed']].to_numpy())
                vol.append(tmpDf[['Volume']].to_numpy())
                occ.append(tmpDf[['Occupancy']].to_numpy())
            else:
                speed.append(np.array([[-99.] for _ in range(each)]))
                vol.append(np.array([[-99.] for _ in range(each)]))
                occ.append(np.array([[-99.] for _ in range(each)]))
        else:
            speeds.append(np.concatenate(speed, axis=1))
            vols.append(np.concatenate(vol, axis=1))
            occs.append(np.concatenate(occ, axis=1))
            
            speed.clear()
            vol.clear()
            occ.clear()
    
    return speeds, vols, occs

def genArrLists_(df: pd.DataFrame, vdGroups: dict, groupKey: str, each: int, timeWindow: int = 30) -> tuple:
    """ Generate array lists for each traffic flow data (speed, volume, and occupancy)
        ```text
        ---
        @Params
        df: 
        vdGroups: The outpur of groupVDs(),
        groupKey: The key of vdGroups,
        each: The quantity of VDs would be considered as a group,
        timeWindow: The length of period we consider, and the default value is 30 (minutes).

        ---
        @Returns
        speeds: list with each item as a tuple, all of them are represented (X,y).
        vols: list with each item as a tuple, all of them are represented (X,y).
        occs: list with each item as a tuple, all of them are represented (X,y).
        ```
    """
    speeds, vols, occs = [], [], []
    tmpDf = df.loc[(df['VDID'].isin(vdGroups[f"{groupKey}"]))].sort_values(by=['LocationMile', 'DataCollectTime'])

    indices = [x for x in range(0, tmpDf.shape[0]+1, tmpDf.shape[0]//each)]
    speedMatx = np.zeros((each, tmpDf.shape[0]//each))
    volMatx = np.zeros((each, tmpDf.shape[0]//each))
    occMatx = np.zeros((each, tmpDf.shape[0]//each))
    for i, j, k in zip(range(each), indices[:-1], indices[1:]):
        speedMatx[i] += tmpDf.iloc[j:k,:]['Speed'].to_numpy()
        volMatx[i] += tmpDf.iloc[j:k,:]['Volume'].to_numpy()
        occMatx[i] += tmpDf.iloc[j:k,:]['Occupancy'].to_numpy()

    sliceLen = int((timeWindow / 5) + 1)
    for x in range(speedMatx.shape[1]//sliceLen*sliceLen-(sliceLen-1)):
        speeds.append((speedMatx[:,x:x+sliceLen][:,:-1], speedMatx[:,x:x+sliceLen][:,[-1]]))
        vols.append((volMatx[:,x:x+sliceLen][:,:-1], volMatx[:,x:x+sliceLen][:,[-1]]))
        occs.append((occMatx[:,x:x+sliceLen][:,:-1], occMatx[:,x:x+sliceLen][:,[-1]]))
    
    return speeds, vols, occs

def genTensors(speeds: list, vols: list) -> list:
    """ Generate torch.Tensors.
        The sizes of the tensors are `[batch, 2, each, 6]`, and `each` depends on how many VDs regarded as a group.
    """
    dataCollection = []
    for s, v in zip(speeds, vols):
        s = torch.tensor(s, dtype=torch.float).unsqueeze(0).unsqueeze(0)
        v = torch.tensor(v, dtype=torch.float).unsqueeze(0).unsqueeze(0)
        dataCollection.append(torch.concat([s, v], dim=1))
    return dataCollection

In [16]:
# # 取得一年份資料
# firstDate = list(map(lambda x: datetime.strftime(x, '%Y-%m-%d'), list(pd.date_range('2023-01-01', '2023-12-31', freq='MS'))))
# lastDate = list(map(lambda x: datetime.strftime(x, '%Y-%m-%d'), list(pd.date_range('2023-01-01', '2023-12-31', freq='ME'))))
# for first, last in zip(firstDate, lastDate):
#     dataframes = []
#     dateList = list(map(lambda x: datetime.strftime(x, '%Y-%m-%d'), list(pd.date_range(first, last))))
#     for date in dateList:
#         print(date)
#         dataframes.append(getRollingMeanDaily(date))
#     dataframes = pd.concat(dataframes).reset_index(drop=True)
#     display(dataframes)
#     feather.write_dataframe(dataframes, dest=f"./dataset/{date[:7].replace('-','')}.feather")

['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05', '2023-01-06', '2023-01-07', '2023-01-08', '2023-01-09', '2023-01-10', '2023-01-11', '2023-01-12', '2023-01-13', '2023-01-14', '2023-01-15', '2023-01-16', '2023-01-17', '2023-01-18', '2023-01-19', '2023-01-20', '2023-01-21', '2023-01-22', '2023-01-23', '2023-01-24', '2023-01-25', '2023-01-26', '2023-01-27', '2023-01-28', '2023-01-29', '2023-01-30', '2023-01-31']
['2023-02-01', '2023-02-02', '2023-02-03', '2023-02-04', '2023-02-05', '2023-02-06', '2023-02-07', '2023-02-08', '2023-02-09', '2023-02-10', '2023-02-11', '2023-02-12', '2023-02-13', '2023-02-14', '2023-02-15', '2023-02-16', '2023-02-17', '2023-02-18', '2023-02-19', '2023-02-20', '2023-02-21', '2023-02-22', '2023-02-23', '2023-02-24', '2023-02-25', '2023-02-26', '2023-02-27', '2023-02-28']
['2023-03-01', '2023-03-02', '2023-03-03', '2023-03-04', '2023-03-05', '2023-03-06', '2023-03-07', '2023-03-08', '2023-03-09', '2023-03-10', '2023-03-11', '2023-03-12', '20

In [2]:
dataframes = []
dateList = list(map(lambda x: datetime.strftime(x, '%Y-%m-%d'), list(pd.date_range('2023-08-01', '2023-08-31'))))
for date in dateList:
    print(date)
    dataframes.append(getRollingMeanDaily(date))
dataframes = pd.concat(dataframes).reset_index(drop=True)
display(dataframes)
feather.write_dataframe(dataframes, dest='./dataset/202308.feather')

2023-08-01
2023-08-02
2023-08-03
2023-08-04
2023-08-05
2023-08-06
2023-08-07
2023-08-08
2023-08-09
2023-08-10
2023-08-11
2023-08-12
2023-08-13
2023-08-14
2023-08-15
2023-08-16
2023-08-17
2023-08-18
2023-08-19
2023-08-20
2023-08-21
2023-08-22
2023-08-23
2023-08-24
2023-08-25
2023-08-26
2023-08-27
2023-08-28
2023-08-29
2023-08-30
2023-08-31


Unnamed: 0,VDID,RoadName,Start,End,RoadDirection,Speed,Occupancy,Volume,LocationMile,DataCollectTime
0,VD-N5-N-0.178-M-LOOP,國道5號,南港系統交流道,石碇交流道,N,-99.0000,-99.0000,-99.0000,0.178,2023-08-01 00:04:00
1,VD-N5-N-0.706-M-LOOP,國道5號,南港系統交流道,石碇交流道,N,37.2000,4.1000,4.8000,0.706,2023-08-01 00:04:00
2,VD-N5-N-1.068-M-LOOP,國道5號,南港系統交流道,石碇交流道,N,-99.0000,-99.0000,-99.0000,1.068,2023-08-01 00:04:00
3,VD-N5-N-2.068-M-PS-LOOP,國道5號,南港系統交流道,石碇交流道,N,-99.0000,-99.0000,-99.0000,2.068,2023-08-01 00:04:00
4,VD-N5-N-3.198-M-LOOP,國道5號,南港系統交流道,石碇交流道,N,45.3000,3.7000,4.7000,3.198,2023-08-01 00:04:00
...,...,...,...,...,...,...,...,...,...,...
930610,VD-N5-S-44.202-M-LOOP,國道5號,宜蘭交流道,羅東交流道,S,105.0000,1.8333,2.5000,44.202,2023-08-31 23:57:00
930611,VD-N5-S-46.566-M-LOOP,國道5號,宜蘭交流道,羅東交流道,S,66.8333,0.8333,0.8333,46.566,2023-08-31 23:57:00
930612,VD-N5-S-48.040-M-LOOP,國道5號,羅東交流道,蘇澳交流道,S,62.3333,0.8333,0.8333,48.040,2023-08-31 23:57:00
930613,VD-N5-S-51.138-M-IMAGE,國道5號,羅東交流道,蘇澳交流道,S,76.3333,14.5000,1.0000,51.138,2023-08-31 23:57:00


In [10]:
# df0731 = getRollingMeanDaily('2023-07-31')
df0731.tail(150)

Unnamed: 0,VDID,RoadName,Start,End,RoadDirection,Speed,Occupancy,Volume,LocationMile,DataCollectTime
30090,VD-N5-S-0.191-M-LOOP,國道5號,南港系統交流道,石碇交流道,S,90.6000,2.9000,4.1000,0.191,2023-07-31 23:49:00
30091,VD-N5-S-1.072-M-LOOP,國道5號,南港系統交流道,石碇交流道,S,90.7000,2.8000,3.7000,1.072,2023-07-31 23:49:00
30092,VD-N5-S-2.050-M-PS-LOOP,國道5號,南港系統交流道,石碇交流道,S,83.3000,3.4000,4.4000,2.050,2023-07-31 23:49:00
30093,VD-N5-S-3.178-M-LOOP,國道5號,南港系統交流道,石碇交流道,S,83.5000,3.2000,4.1000,3.178,2023-07-31 23:49:00
30094,VD-N5-S-3.506-M-LOOP,國道5號,南港系統交流道,石碇交流道,S,85.5000,3.2000,4.1000,3.506,2023-07-31 23:49:00
...,...,...,...,...,...,...,...,...,...,...
30235,VD-N5-S-44.202-M-LOOP,國道5號,宜蘭交流道,羅東交流道,S,106.3333,3.3333,3.8333,44.202,2023-07-31 23:57:00
30236,VD-N5-S-46.566-M-LOOP,國道5號,宜蘭交流道,羅東交流道,S,62.5000,0.8333,0.8333,46.566,2023-07-31 23:57:00
30237,VD-N5-S-48.040-M-LOOP,國道5號,羅東交流道,蘇澳交流道,S,31.1667,0.5000,0.6667,48.040,2023-07-31 23:57:00
30238,VD-N5-S-51.138-M-IMAGE,國道5號,羅東交流道,蘇澳交流道,S,75.5000,10.5000,1.1667,51.138,2023-07-31 23:57:00


In [2]:
df = []
for filename in os.listdir('./dataset')[1:]:
    df.append(feather.read_dataframe(f"./dataset/{filename}"))
df = pd.concat(df).reset_index(drop=True)
df

Unnamed: 0,VDID,RoadName,Start,End,RoadDirection,Speed,Occupancy,Volume,LocationMile,DataCollectTime
0,VD-N5-N-0.178-M-LOOP,國道5號,南港系統交流道,石碇交流道,N,-99.0000,-99.0000,-99.0000,0.178,2023-03-01 00:04:00
1,VD-N5-N-0.706-M-LOOP,國道5號,南港系統交流道,石碇交流道,N,-99.0000,-99.0000,-99.0000,0.706,2023-03-01 00:04:00
2,VD-N5-N-1.068-M-LOOP,國道5號,南港系統交流道,石碇交流道,N,-99.0000,-99.0000,-99.0000,1.068,2023-03-01 00:04:00
3,VD-N5-N-2.068-M-PS-LOOP,國道5號,南港系統交流道,石碇交流道,N,89.7000,5.0000,6.6000,2.068,2023-03-01 00:04:00
4,VD-N5-N-3.198-M-LOOP,國道5號,南港系統交流道,石碇交流道,N,93.1000,5.5000,5.8000,3.198,2023-03-01 00:04:00
...,...,...,...,...,...,...,...,...,...,...
2770525,VD-N5-S-44.202-M-LOOP,國道5號,宜蘭交流道,羅東交流道,S,87.3333,1.8333,2.6667,44.202,2023-05-31 23:57:00
2770526,VD-N5-S-46.566-M-LOOP,國道5號,宜蘭交流道,羅東交流道,S,52.6667,0.6667,1.0000,46.566,2023-05-31 23:57:00
2770527,VD-N5-S-48.040-M-LOOP,國道5號,羅東交流道,蘇澳交流道,S,50.0000,0.6667,0.6667,48.040,2023-05-31 23:57:00
2770528,VD-N5-S-51.138-M-IMAGE,國道5號,羅東交流道,蘇澳交流道,S,30.0000,3.0000,0.3333,51.138,2023-05-31 23:57:00


In [None]:
# # TODO: main
# if __name__ == '__main__':
#     startDate = '2023-01-01'
#     endDate = getEndDate(startDate, days=10)
#     # df = getRollingMean(startDate, endDate)
#     df = feather.read_dataframe('./20230101-20230110.feather').sort_values(by=['RoadDirection','DataCollectTime','LocationMile']).reset_index(drop=True)
    
#     # Northbound data
#     northDf = df.loc[df['RoadDirection']=='N'].reset_index(drop=True)
#     each = 3
#     vdGroups = groupVDs(northDf, each)    
#     speedDataset, volDataset, occDataset = [], [], []
#     for groupKey in vdGroups.keys():
#         speeds, vols, occs = genArrLists_(northDf, vdGroups, groupKey, each, timeWindow=30)
#         speedDataset.append(speeds)
#         volDataset.append(vols)
#         occDataset.append(occs)

#     # Southbound data
#     southDf = df.loc[df['RoadDirection']=='S'].reset_index(drop=True)
#     each = 3
#     vdGroups = groupVDs(southDf, each)    
#     speedDataset, volDataset, occDataset = [], [], []
#     for groupKey in vdGroups.keys():
#         speeds, vols, occs = genArrLists_(southDf, vdGroups, groupKey, each, timeWindow=30)
#         speedDataset.append(speeds)
#         volDataset.append(vols)
#         occDataset.append(occs)

In [117]:
# This cell is actually same as df.sort_values()

# groupDf = df.groupby(['RoadDirection','DataCollectTime','LocationMile']).agg({
#     'VDID': 'max',
#     'RoadName': 'max',
#     'Start': 'max',
#     'End': 'max',
#     'Speed': 'max',
#     'Occupancy': 'max',
#     'Volume': 'max',
# }).reset_index().sort_values(by=['RoadDirection','DataCollectTime','LocationMile'])
# groupDf

In [146]:
northDf = df.loc[df['RoadDirection']=='N'].reset_index(drop=True)
each = 3
print(f"grouping VDs: {datetime.strftime(datetime.now().replace(microsecond=0), '%Y-%m-%d %H:%M:%S')}")
vdGroups = groupVDs(northDf, each)
print(f"grouping VDs done: {datetime.strftime(datetime.now().replace(microsecond=0), '%Y-%m-%d %H:%M:%S')}")
# TODO: Declare lists for collecting speeds, vols, and occs

for groupKey in vdGroups.keys():
    print(f"[{groupKey}]: {datetime.strftime(datetime.now().replace(microsecond=0), '%Y-%m-%d %H:%M:%S')}")
    speeds, vols, occs = genArrLists_(northDf, vdGroups, groupKey, each, timeWindow=30)
    # tensors = genTensors(speeds, vols)
print(f"Done: {datetime.strftime(datetime.now().replace(microsecond=0), '%Y-%m-%d %H:%M:%S')}")

grouping VDs: 2024-03-13 22:42:16
grouping VDs done: 2024-03-13 22:42:28
[VD-N5-N-0.706-M-LOOP]: 2024-03-13 22:42:28
[VD-N5-N-1.068-M-LOOP]: 2024-03-13 22:42:28
[VD-N5-N-2.068-M-PS-LOOP]: 2024-03-13 22:42:29
[VD-N5-N-3.198-M-LOOP]: 2024-03-13 22:42:29
[VD-N5-N-3.943-M-LOOP]: 2024-03-13 22:42:29
[VD-N5-N-5.883-M-LOOP]: 2024-03-13 22:42:29
[VD-N5-N-7.107-M-LOOP]: 2024-03-13 22:42:30
[VD-N5-N-8.011-M-LOOP]: 2024-03-13 22:42:30
[VD-N5-N-9.028-M-IMAGE]: 2024-03-13 22:42:30
[VD-N5-N-9.840-M-LOOP]: 2024-03-13 22:42:31
[VD-N5-N-10.866-M-PS-LOOP]: 2024-03-13 22:42:31
[VD-N5-N-11.903-M-PS-LOOP]: 2024-03-13 22:42:31
[VD-N5-N-12.922-M-LOOP]: 2024-03-13 22:42:31
[VD-N5-N-13.707-M-LOOP]: 2024-03-13 22:42:32
[VD-N5-N-14.550-M-LOOP]: 2024-03-13 22:42:32
[VD-N5-N-15.488-M-LOOP]: 2024-03-13 22:42:32
[VD-N5-N-16.196-M-LOOP]: 2024-03-13 22:42:33
[VD-N5-N-16.900-M-PS-LOOP]: 2024-03-13 22:42:33
[VD-N5-N-17.608-M-LOOP]: 2024-03-13 22:42:33
[VD-N5-N-18.313-M-PS-LOOP]: 2024-03-13 22:42:33
[VD-N5-N-19.012-M-LOO

In [133]:
groupKey

'VD-N5-N-0.706-M-LOOP'

In [135]:
speeds[0]

(array([[-99. ,  79.1,  77.5,  89.4,  87.3,  89.4],
        [-99. ,  79. ,  79.9,  83. ,  81.1,  81.4],
        [-99. ,  78.2,  84.8,  90.6,  88.5,  89.7]]),
 array([[94.6],
        [75.6],
        [90.4]]))

In [134]:
northDf

Unnamed: 0,VDID,RoadName,Start,End,RoadDirection,Speed,Occupancy,Volume,LocationMile,DataCollectTime
0,VD-N5-N-0.178-M-LOOP,國道5號,南港系統交流道,石碇交流道,N,-99.0000,-99.0000,-99.0000,0.178,2023-03-01 00:04:00
1,VD-N5-N-0.706-M-LOOP,國道5號,南港系統交流道,石碇交流道,N,-99.0000,-99.0000,-99.0000,0.706,2023-03-01 00:04:00
2,VD-N5-N-1.068-M-LOOP,國道5號,南港系統交流道,石碇交流道,N,-99.0000,-99.0000,-99.0000,1.068,2023-03-01 00:04:00
3,VD-N5-N-2.068-M-PS-LOOP,國道5號,南港系統交流道,石碇交流道,N,89.7000,5.0000,6.6000,2.068,2023-03-01 00:04:00
4,VD-N5-N-3.198-M-LOOP,國道5號,南港系統交流道,石碇交流道,N,93.1000,5.5000,5.8000,3.198,2023-03-01 00:04:00
...,...,...,...,...,...,...,...,...,...,...
1451225,VD-N5-N-45.230-M-LOOP,國道5號,宜蘭交流道,羅東交流道,N,51.1667,0.5000,0.8333,45.230,2023-05-31 23:57:00
1451226,VD-N5-N-47.198-M-IMAGE,國道5號,羅東交流道,蘇澳交流道,N,38.3333,5.6667,4.8333,47.198,2023-05-31 23:57:00
1451227,VD-N5-N-49.070-M-LOOP,國道5號,羅東交流道,蘇澳交流道,N,98.6667,1.0000,1.3333,49.070,2023-05-31 23:57:00
1451228,VD-N5-N-51.095-M-LOOP,國道5號,羅東交流道,蘇澳交流道,N,54.0000,0.5000,0.5000,51.095,2023-05-31 23:57:00


In [4]:
groupDf = df.sort_values(by=['RoadDirection','DataCollectTime','LocationMile']).reset_index(drop=True)
groupDf

Unnamed: 0,VDID,RoadName,Start,End,RoadDirection,Speed,Occupancy,Volume,LocationMile,DataCollectTime
0,VD-N5-N-0.178-M-LOOP,國道5號,南港系統交流道,石碇交流道,N,-99.0000,-99.0000,-99.0000,0.178,2023-03-01 00:04:00
1,VD-N5-N-0.706-M-LOOP,國道5號,南港系統交流道,石碇交流道,N,-99.0000,-99.0000,-99.0000,0.706,2023-03-01 00:04:00
2,VD-N5-N-1.068-M-LOOP,國道5號,南港系統交流道,石碇交流道,N,-99.0000,-99.0000,-99.0000,1.068,2023-03-01 00:04:00
3,VD-N5-N-2.068-M-PS-LOOP,國道5號,南港系統交流道,石碇交流道,N,89.7000,5.0000,6.6000,2.068,2023-03-01 00:04:00
4,VD-N5-N-3.198-M-LOOP,國道5號,南港系統交流道,石碇交流道,N,93.1000,5.5000,5.8000,3.198,2023-03-01 00:04:00
...,...,...,...,...,...,...,...,...,...,...
2770525,VD-N5-S-44.202-M-LOOP,國道5號,宜蘭交流道,羅東交流道,S,87.3333,1.8333,2.6667,44.202,2023-05-31 23:57:00
2770526,VD-N5-S-46.566-M-LOOP,國道5號,宜蘭交流道,羅東交流道,S,52.6667,0.6667,1.0000,46.566,2023-05-31 23:57:00
2770527,VD-N5-S-48.040-M-LOOP,國道5號,羅東交流道,蘇澳交流道,S,50.0000,0.6667,0.6667,48.040,2023-05-31 23:57:00
2770528,VD-N5-S-51.138-M-IMAGE,國道5號,羅東交流道,蘇澳交流道,S,30.0000,3.0000,0.3333,51.138,2023-05-31 23:57:00


In [5]:
northDf = groupDf.loc[groupDf['RoadDirection']=='N'].reset_index(drop=True)
northDf

Unnamed: 0,VDID,RoadName,Start,End,RoadDirection,Speed,Occupancy,Volume,LocationMile,DataCollectTime
0,VD-N5-N-0.178-M-LOOP,國道5號,南港系統交流道,石碇交流道,N,-99.0000,-99.0000,-99.0000,0.178,2023-03-01 00:04:00
1,VD-N5-N-0.706-M-LOOP,國道5號,南港系統交流道,石碇交流道,N,-99.0000,-99.0000,-99.0000,0.706,2023-03-01 00:04:00
2,VD-N5-N-1.068-M-LOOP,國道5號,南港系統交流道,石碇交流道,N,-99.0000,-99.0000,-99.0000,1.068,2023-03-01 00:04:00
3,VD-N5-N-2.068-M-PS-LOOP,國道5號,南港系統交流道,石碇交流道,N,89.7000,5.0000,6.6000,2.068,2023-03-01 00:04:00
4,VD-N5-N-3.198-M-LOOP,國道5號,南港系統交流道,石碇交流道,N,93.1000,5.5000,5.8000,3.198,2023-03-01 00:04:00
...,...,...,...,...,...,...,...,...,...,...
1451225,VD-N5-N-45.230-M-LOOP,國道5號,宜蘭交流道,羅東交流道,N,51.1667,0.5000,0.8333,45.230,2023-05-31 23:57:00
1451226,VD-N5-N-47.198-M-IMAGE,國道5號,羅東交流道,蘇澳交流道,N,38.3333,5.6667,4.8333,47.198,2023-05-31 23:57:00
1451227,VD-N5-N-49.070-M-LOOP,國道5號,羅東交流道,蘇澳交流道,N,98.6667,1.0000,1.3333,49.070,2023-05-31 23:57:00
1451228,VD-N5-N-51.095-M-LOOP,國道5號,羅東交流道,蘇澳交流道,N,54.0000,0.5000,0.5000,51.095,2023-05-31 23:57:00


In [6]:
vdGroups = groupVDs(northDf, each=3)
vdGroups

{'VD-N5-N-0.706-M-LOOP': ['VD-N5-N-0.178-M-LOOP',
  'VD-N5-N-0.706-M-LOOP',
  'VD-N5-N-1.068-M-LOOP'],
 'VD-N5-N-1.068-M-LOOP': ['VD-N5-N-0.706-M-LOOP',
  'VD-N5-N-1.068-M-LOOP',
  'VD-N5-N-2.068-M-PS-LOOP'],
 'VD-N5-N-2.068-M-PS-LOOP': ['VD-N5-N-1.068-M-LOOP',
  'VD-N5-N-2.068-M-PS-LOOP',
  'VD-N5-N-3.198-M-LOOP'],
 'VD-N5-N-3.198-M-LOOP': ['VD-N5-N-2.068-M-PS-LOOP',
  'VD-N5-N-3.198-M-LOOP',
  'VD-N5-N-3.943-M-LOOP'],
 'VD-N5-N-3.943-M-LOOP': ['VD-N5-N-3.198-M-LOOP',
  'VD-N5-N-3.943-M-LOOP',
  'VD-N5-N-5.883-M-LOOP'],
 'VD-N5-N-5.883-M-LOOP': ['VD-N5-N-3.943-M-LOOP',
  'VD-N5-N-5.883-M-LOOP',
  'VD-N5-N-7.107-M-LOOP'],
 'VD-N5-N-7.107-M-LOOP': ['VD-N5-N-5.883-M-LOOP',
  'VD-N5-N-7.107-M-LOOP',
  'VD-N5-N-8.011-M-LOOP'],
 'VD-N5-N-8.011-M-LOOP': ['VD-N5-N-7.107-M-LOOP',
  'VD-N5-N-8.011-M-LOOP',
  'VD-N5-N-9.028-M-IMAGE'],
 'VD-N5-N-9.028-M-IMAGE': ['VD-N5-N-8.011-M-LOOP',
  'VD-N5-N-9.028-M-IMAGE',
  'VD-N5-N-9.840-M-LOOP'],
 'VD-N5-N-9.840-M-LOOP': ['VD-N5-N-9.028-M-IMAGE',
  'VD-N

In [14]:
len(vdGroups.keys())

52

In [9]:
start = '2023-03-01'
end = '2023-03-11'
each = 3
# vdGroups = groupVDs(northDf, each)
groupKey = 'VD-N5-N-3.198-M-LOOP'

speeds, vols, occs = genArrLists(northDf, start, end, vdGroups, groupKey, each)

dtStart: 2023-03-01 00:00:00
dtStart: 2023-03-01 00:05:00
dtStart: 2023-03-01 00:10:00
dtStart: 2023-03-01 00:15:00
dtStart: 2023-03-01 00:20:00
dtStart: 2023-03-01 00:25:00
dtStart: 2023-03-01 00:30:00
dtStart: 2023-03-01 00:35:00
dtStart: 2023-03-01 00:40:00
dtStart: 2023-03-01 00:45:00
dtStart: 2023-03-01 00:50:00
dtStart: 2023-03-01 00:55:00
dtStart: 2023-03-01 01:00:00
dtStart: 2023-03-01 01:05:00
dtStart: 2023-03-01 01:10:00
dtStart: 2023-03-01 01:15:00
dtStart: 2023-03-01 01:20:00
dtStart: 2023-03-01 01:25:00
dtStart: 2023-03-01 01:30:00
dtStart: 2023-03-01 01:35:00
dtStart: 2023-03-01 01:40:00
dtStart: 2023-03-01 01:45:00
dtStart: 2023-03-01 01:50:00
dtStart: 2023-03-01 01:55:00
dtStart: 2023-03-01 02:00:00
dtStart: 2023-03-01 02:05:00
dtStart: 2023-03-01 02:10:00
dtStart: 2023-03-01 02:15:00
dtStart: 2023-03-01 02:20:00
dtStart: 2023-03-01 02:25:00
dtStart: 2023-03-01 02:30:00
dtStart: 2023-03-01 02:35:00
dtStart: 2023-03-01 02:40:00
dtStart: 2023-03-01 02:45:00
dtStart: 2023-

In [136]:
start = '2023-03-01'
end = '2023-05-31'
each = 3
# vdGroups = groupVDs(northDf, each)
groupKey = 'VD-N5-N-0.706-M-LOOP'

timeWindow = 30
# freq5 = pd.date_range(start, end, freq='5min')
# speeds, vols, occs = [], [], []
# tmpDf = df.loc[(df['VDID'].isin(vdGroups[f"{groupKey}"]))].sort_values(by=['LocationMile', 'DataCollectTime'])

# indices = [x for x in range(0, tmpDf.shape[0]+1, tmpDf.shape[0]//each)]
# speedMatx = np.zeros((each, tmpDf.shape[0]//each))
# volMatx = np.zeros((each, tmpDf.shape[0]//each))
# occMatx = np.zeros((each, tmpDf.shape[0]//each))
# for i, j, k in zip(range(each), indices[:-1], indices[1:]):
#     speedMatx[i] += tmpDf.iloc[j:k,:]['Speed'].to_numpy()
#     volMatx[i] += tmpDf.iloc[j:k,:]['Volume'].to_numpy()
#     occMatx[i] += tmpDf.iloc[j:k,:]['Occupancy'].to_numpy()

# for x in range(speedMatx.shape[1]//7*7-6):
#     speeds.append((speedMatx[:,x:x+7][:,:-1], speedMatx[:,x:x+7][:,[-1]]))
#     vols.append((volMatx[:,x:x+7][:,:-1], volMatx[:,x:x+7][:,[-1]]))
#     occs.append((occMatx[:,x:x+7][:,:-1], occMatx[:,x:x+7][:,[-1]]))












s, v, o = [], [], []
tmpDf = df.loc[(df['VDID'].isin(vdGroups[f"{groupKey}"]))].sort_values(by=['LocationMile', 'DataCollectTime'])

indices = [x for x in range(0, tmpDf.shape[0]+1, tmpDf.shape[0]//each)]
speedMatx = np.zeros((each, tmpDf.shape[0]//each))
volMatx = np.zeros((each, tmpDf.shape[0]//each))
occMatx = np.zeros((each, tmpDf.shape[0]//each))
for i, j, k in zip(range(each), indices[:-1], indices[1:]):
    speedMatx[i] += tmpDf.iloc[j:k,:]['Speed'].to_numpy()
    volMatx[i] += tmpDf.iloc[j:k,:]['Volume'].to_numpy()
    occMatx[i] += tmpDf.iloc[j:k,:]['Occupancy'].to_numpy()

sliceLen = int((timeWindow / 5) + 1)
for x in range(speedMatx.shape[1]//sliceLen*sliceLen-(sliceLen-1)):
    s.append((speedMatx[:,x:x+sliceLen][:,:-1], speedMatx[:,x:x+sliceLen][:,[-1]]))
    v.append((volMatx[:,x:x+sliceLen][:,:-1], volMatx[:,x:x+sliceLen][:,[-1]]))
    o.append((occMatx[:,x:x+sliceLen][:,:-1], occMatx[:,x:x+sliceLen][:,[-1]]))

In [140]:
speeds[1]

(array([[79.1, 77.5, 89.4, 87.3, 89.4, 94.6],
        [79. , 79.9, 83. , 81.1, 81.4, 75.6],
        [78.2, 84.8, 90.6, 88.5, 89.7, 90.4]]),
 array([[84.5],
        [79.9],
        [85.8]]))

In [141]:
s[1]

(array([[79.1, 77.5, 89.4, 87.3, 89.4, 94.6],
        [79. , 79.9, 83. , 81.1, 81.4, 75.6],
        [78.2, 84.8, 90.6, 88.5, 89.7, 90.4]]),
 array([[84.5],
        [79.9],
        [85.8]]))

In [124]:
v[2]

(array([[5.8, 5. , 4.5, 5.2, 3.7, 3.9],
        [6.2, 4.8, 4.6, 5.2, 3.2, 4. ],
        [5.4, 4.9, 3.8, 4.9, 3.1, 4. ]]),
 array([[3.9],
        [5. ],
        [4.4]]))

In [125]:
volMatx[:,2:9]

array([[5.8, 5. , 4.5, 5.2, 3.7, 3.9, 3.9],
       [6.2, 4.8, 4.6, 5.2, 3.2, 4. , 5. ],
       [5.4, 4.9, 3.8, 4.9, 3.1, 4. , 4.4]])

In [101]:
[x for x in range(0, speedMatx.shape[1]+1, 7)][-2]

26376

In [99]:
(speedMatx.shape[1]//7)*7

26383

In [102]:
occMatx[:,26376:26383]

array([[2.5, 2. , 2. , 2.4, 2.1, 1.6, 1.4],
       [2.5, 1.6, 2.6, 2.4, 2.1, 1.4, 2.2],
       [2.2, 1.7, 1.9, 1.9, 2. , 1.7, 1.4]])

In [112]:
occMatx[:,0:7][:,[-1]]

array([[  5. , -99. ,   4.3,   3.6,   3.5,   3.9],
       [  5.5, -99. ,   4.7,   4.5,   3.3,   3.7],
       [-99. ,   3.6,   4. ,   3.5,   2.5,   3.3]])

In [113]:
tmpL = []
for x in range(occMatx.shape[1]//7*7-6):
    tmpL.append((occMatx[:,x:x+7][:,:-1], occMatx[:,x:x+7][:,[-1]]))
    # print(f"occMatx[:,{x}:{x+7}]")

In [89]:
occMatx[:,0:9]

array([[  5. , -99. ,   4.3,   3.6,   3.5,   3.9,   2.7,   2.7,   2.8],
       [  5.5, -99. ,   4.7,   4.5,   3.3,   3.7,   2.3,   2.8,   3.6],
       [-99. ,   3.6,   4. ,   3.5,   2.5,   3.3,   2.2,   2.6,   3. ]])

In [90]:
occMatx[:,0:7]

array([[  5. , -99. ,   4.3,   3.6,   3.5,   3.9,   2.7],
       [  5.5, -99. ,   4.7,   4.5,   3.3,   3.7,   2.3],
       [-99. ,   3.6,   4. ,   3.5,   2.5,   3.3,   2.2]])

In [95]:
occMatx[:,26383:26386]

array([[1.5   , 0.8   , 2.3333],
       [0.8   , 0.8   , 2.3333],
       [0.9   , 0.5   , 2.3333]])

In [79]:
speedMatx.shape

(3, 26386)

In [80]:
26386/7

3769.4285714285716

In [69]:
tmpDf.iloc[26386:52772,:]

Unnamed: 0,VDID,RoadName,Start,End,RoadDirection,Speed,Occupancy,Volume,LocationMile,DataCollectTime
4,VD-N5-N-3.198-M-LOOP,國道5號,南港系統交流道,石碇交流道,N,93.1000,5.5000,5.8000,3.198,2023-03-01 00:04:00
59,VD-N5-N-3.198-M-LOOP,國道5號,南港系統交流道,石碇交流道,N,-99.0000,-99.0000,-99.0000,3.198,2023-03-01 00:09:00
114,VD-N5-N-3.198-M-LOOP,國道5號,南港系統交流道,石碇交流道,N,88.6000,4.7000,6.2000,3.198,2023-03-01 00:14:00
169,VD-N5-N-3.198-M-LOOP,國道5號,南港系統交流道,石碇交流道,N,93.0000,4.5000,4.8000,3.198,2023-03-01 00:19:00
224,VD-N5-N-3.198-M-LOOP,國道5號,南港系統交流道,石碇交流道,N,88.6000,3.3000,4.6000,3.198,2023-03-01 00:24:00
...,...,...,...,...,...,...,...,...,...,...
2755859,VD-N5-N-3.198-M-LOOP,國道5號,南港系統交流道,石碇交流道,N,86.9000,1.4000,1.9000,3.198,2023-05-31 23:39:00
2755914,VD-N5-N-3.198-M-LOOP,國道5號,南港系統交流道,石碇交流道,N,63.7000,2.2000,3.1000,3.198,2023-05-31 23:44:00
2755969,VD-N5-N-3.198-M-LOOP,國道5號,南港系統交流道,石碇交流道,N,67.1000,0.8000,1.2000,3.198,2023-05-31 23:49:00
2756024,VD-N5-N-3.198-M-LOOP,國道5號,南港系統交流道,石碇交流道,N,53.3000,0.8000,1.1000,3.198,2023-05-31 23:54:00


In [67]:
volMatx

array([[  6.6   , -99.    ,   5.8   , ...,   2.    ,   1.2   ,   2.8333],
       [  5.8   , -99.    ,   6.2   , ...,   1.2   ,   1.1   ,   3.1667],
       [-99.    ,   5.1   ,   5.4   , ...,   1.3   ,   0.6   ,   3.3333]])

In [48]:
zeroMatx = np.zeros((each, tmpDf.shape[0]))
zeroMatx[0]

array([0., 0., 0., ..., 0., 0., 0.])

In [11]:
len(speeds)

411

In [12]:
tensorList = genTensors(speeds, vols)
tensorList[0].shape

torch.Size([1, 2, 3, 6])