In [2]:
from sqlalchemy import create_engine
from datetime import datetime
from datetime import timedelta
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import pandas as pd
import numpy as np
import feather
import os
import torch


# connect to db
# Connection settings for the MySQL database. Every value can be overridden
# through an environment variable; when a variable is unset the previous
# hard-coded value is used, so existing runs behave exactly as before.
# NOTE(review): the fallback password is a credential committed to source —
# rotate it and drop the fallback once TRANSPORT_DB_PASSWORD is set everywhere.
from urllib.parse import quote

user = os.getenv('TRANSPORT_DB_USER', 'root')
pswd = os.getenv('TRANSPORT_DB_PASSWORD', 'Curry5566')
host = os.getenv('TRANSPORT_DB_HOST', '127.0.0.1')
port = os.getenv('TRANSPORT_DB_PORT', '3306')
db = os.getenv('TRANSPORT_DB_NAME', 'transport')
# Percent-escape user and password so characters such as '@', ':' or '/' are
# not parsed as URL delimiters. This leaves the default values unchanged.
engine = create_engine(
    f"mysql+pymysql://{quote(user, safe='')}:{quote(pswd, safe='')}@{host}:{port}/{db}?charset=utf8"
)


def getEndDate(startDate: str, days: int) -> str:
    """ Get the timestamp that lies `days` days after midnight of `startDate`
        ```text
        ---
        @Params
        startDate: The date for start, format='%Y-%m-%d'
        days: The number of days added to startDate.

        ---
        @Returns
        The shifted timestamp as text, format='%Y-%m-%d %H:%M:%S'
        ```
    """
    midnight = datetime.strptime(startDate + ' 00:00:00', '%Y-%m-%d %H:%M:%S')
    shifted = midnight + timedelta(days=days)
    # Drop any sub-second part so the text form never carries microseconds.
    return shifted.replace(microsecond=0).isoformat(sep=' ')

def getRollingMean(startDate: str, endDate: str) -> pd.DataFrame:
    """ Get the 5-minute bucketed mean of mainlane VD readings from db
        ```text
        The static VD table (vd_static_n5, Mainlane = 1) is joined with road_info
        and section_info, then with the dynamic readings of
        vd_dynamic_detail_n5_202301 averaged per VD in 300-second buckets that
        are counted from `startDate`.
        Only dynamic rows whose id lies between the first row stamped exactly
        `startDate` and the last row stamped before `endDate` are read.
        The module-level `engine` is disposed after the query.

        NOTE(review): the dynamic table name is fixed to ..._202301, so dates
        outside that table presumably return nothing - confirm.

        ---
        @Params
        startDate: The date for start, format='%Y-%m-%d'
        endDate: The date for end, format='%Y-%m-%d'

        ---
        @Returns
        DataFrame with the columns VDID, RoadName, Start, End, RoadDirection,
        Speed, Occupancy, Volume, LocationMile and DataCollectTime, sorted by
        RoadDirection, DataCollectTime, LocationMile with a fresh index.
        ```
    """
    # Outer select: static attributes (STAC) next to the aggregated readings (DYMC).
    sql  = " SELECT "
    sql += " 	STAC.VDID, STAC.RoadName, STAC.`Start`, STAC.`End`, "
    sql += " 	STAC.RoadDirection, DYMC.Speed, DYMC.Occupancy, DYMC.Volume, "
    sql += " 	STAC.LocationMile, DYMC.DataCollectTime "
    sql += " FROM ( "
    # STAC: mainlane VDs, each matched to the section whose [StartKM, EndKM] contains it.
    sql += " 	SELECT "
    sql += " 		VDSTC.id, VDSTC.VDID, ROAD.RoadName, SEC.`Start`, SEC.`End`, "
    sql += " 		VDSTC.RoadDirection, VDSTC.LocationMile "
    sql += " 	FROM vd_static_n5 VDSTC "
    sql += " 	JOIN road_info ROAD ON VDSTC.RoadInfoID = ROAD.id "
    sql += " 	JOIN section_info SEC ON ROAD.id = SEC.RoadInfoID "
    sql += " 	AND VDSTC.LocationMile >= SEC.StartKM "
    sql += " 	AND VDSTC.LocationMile <= SEC.EndKM "
    sql += " 	WHERE VDSTC.Mainlane = 1 "
    sql += " ) STAC JOIN ( "
    # DYMC: per-VD averages per bucket. A bucket whose minimum equals -99 is
    # reported as -99 instead of being averaged (presumably -99 marks a
    # missing reading - TODO confirm).
    sql += " 	SELECT "
    sql += " 		VdStaticID, "
    sql += " 		CASE "
    sql += " 			WHEN MIN(Speed) = -99 THEN -99 "
    sql += " 			ELSE AVG(Speed) "
    sql += " 		END AS Speed,  "
    sql += " 		CASE "
    sql += " 			WHEN MIN(Occupancy) = -99 THEN -99 "
    sql += " 			ELSE AVG(Occupancy) "
    sql += " 		END AS Occupancy,  "
    sql += " 		CASE "
    sql += " 			WHEN MIN(Volume) = -99 THEN -99 "
    sql += " 			ELSE AVG(Volume) "
    sql += " 		END AS Volume, "
    # The latest timestamp inside a bucket becomes the bucket's DataCollectTime.
    sql += " 		MAX(DataCollectTime) AS DataCollectTime, "
    # Bucket number: whole 300-second steps elapsed since the start timestamp.
    sql += " 		(UNIX_TIMESTAMP(DataCollectTime)-UNIX_TIMESTAMP(%(start)s)) DIV 300 "
    sql += " 	FROM vd_dynamic_detail_n5_202301 "
    # Restrict by id range: from the first row stamped exactly at start ...
    sql += " 	WHERE id BETWEEN ( "
    sql += " 		SELECT id FROM vd_dynamic_detail_n5_202301 "
    sql += " 		WHERE DataCollectTime = %(start)s "
    sql += " 		ORDER BY id LIMIT 1 "
    sql += " 	) AND ( "
    # ... up to the last row stamped before end.
    sql += " 		SELECT id FROM vd_dynamic_detail_n5_202301 "
    sql += " 		WHERE DataCollectTime < %(end)s "
    sql += " 		ORDER BY id DESC LIMIT 1 "
    sql += " 	) "
    sql += " 	GROUP BY VdStaticID, (UNIX_TIMESTAMP(DataCollectTime)-UNIX_TIMESTAMP(%(start)s)) DIV 300 "
    sql += " ) DYMC ON STAC.id = DYMC.VdStaticID "
    sql += " ORDER BY STAC.RoadDirection, STAC.LocationMile, DYMC.DataCollectTime; "

    # The dates travel as bound parameters; they are never formatted into the SQL text.
    df = pd.read_sql(sql, con=engine, params={'start': startDate, 'end': endDate})
    engine.dispose()
    # Re-sort client-side: the returned order differs from the ORDER BY above.
    return df.sort_values(by=['RoadDirection','DataCollectTime','LocationMile']).reset_index(drop=True)

def getRollingMeanDaily(selectDate: str) -> pd.DataFrame:
    """ Get the 5-minute bucketed mean of mainlane VD readings of one day from db
        ```text
        The static VD table (fwy_n5.vd_static_2023, Mainlane = 1) is joined with
        transport.road_info and transport.section_info, then with the readings
        of the per-day table fwy_n5.vd_dynamic_detail_<YYYYMMDD>, averaged per
        VD in 300-second buckets counted from midnight of `selectDate`.
        A bucket whose minimum equals -99 is reported as -99 instead of being
        averaged. The module-level `engine` is disposed after the query.

        ---
        @Params
        selectDate: The day to read, format='%Y-%m-%d'

        ---
        @Returns
        DataFrame with the columns VDID, RoadName, Start, End, RoadDirection,
        Speed, Occupancy, Volume, LocationMile and DataCollectTime, sorted by
        RoadDirection, DataCollectTime, LocationMile with a fresh index.

        ---
        @Raises
        ValueError: selectDate is not a date in the '%Y-%m-%d' format.
        ```
    """
    # A table name cannot be sent as a bound parameter, so it has to be
    # formatted into the statement. Parse the date first: only the eight
    # digits produced by strftime ever reach the SQL text.
    try:
        parsedDate = datetime.strptime(selectDate, '%Y-%m-%d')
    except (TypeError, ValueError) as err:
        raise ValueError(f"selectDate must be formatted as '%Y-%m-%d', got {selectDate!r}") from err
    tableSuffix = parsedDate.strftime('%Y%m%d')
    # Zero-padded form, so '2023-1-5' and '2023-01-05' address the same day.
    normalizedDate = parsedDate.strftime('%Y-%m-%d')

    sql  = " SELECT "
    sql += " 	STAC.VDID, STAC.RoadName, STAC.`Start`, STAC.`End`, "
    sql += " 	STAC.RoadDirection, DYMC.Speed, DYMC.Occupancy, DYMC.Volume, "
    sql += " 	STAC.LocationMile, DYMC.DataCollectTime "
    sql += " FROM ( "
    # STAC: mainlane VDs, each matched to the section whose [StartKM, EndKM] contains it.
    sql += " 	SELECT "
    sql += " 		VDSTC.id, VDSTC.VDID, ROAD.RoadName, SEC.`Start`, SEC.`End`, "
    sql += " 		VDSTC.RoadDirection, VDSTC.LocationMile "
    sql += " 	FROM fwy_n5.vd_static_2023 VDSTC "
    sql += " 	JOIN transport.road_info ROAD ON VDSTC.RoadInfoID = ROAD.id "
    sql += " 	JOIN transport.section_info SEC ON ROAD.id = SEC.RoadInfoID "
    sql += " 	AND VDSTC.LocationMile >= SEC.StartKM "
    sql += " 	AND VDSTC.LocationMile <= SEC.EndKM "
    sql += " 	WHERE VDSTC.Mainlane = 1 "
    sql += " ) STAC JOIN ( "
    # DYMC: per-VD averages per 300-second bucket of the selected day.
    sql += " 	SELECT "
    sql += " 		VdStaticID, "
    sql += " 		CASE "
    sql += " 			WHEN MIN(Speed) = -99 THEN -99 "
    sql += " 			ELSE AVG(Speed) "
    sql += " 		END AS Speed,  "
    sql += " 		CASE "
    sql += " 			WHEN MIN(Occupancy) = -99 THEN -99 "
    sql += " 			ELSE AVG(Occupancy) "
    sql += " 		END AS Occupancy,  "
    sql += " 		CASE "
    sql += " 			WHEN MIN(Volume) = -99 THEN -99 "
    sql += " 			ELSE AVG(Volume) "
    sql += " 		END AS Volume, "
    sql += " 		MAX(DataCollectTime) AS DataCollectTime, "
    sql += " 		(UNIX_TIMESTAMP(DataCollectTime)-UNIX_TIMESTAMP(%(selectDate)s)) DIV 300 "
    sql += " 	FROM fwy_n5.vd_dynamic_detail_{} ".format(tableSuffix)
    sql += " 	GROUP BY VdStaticID, (UNIX_TIMESTAMP(DataCollectTime)-UNIX_TIMESTAMP(%(selectDate)s)) DIV 300 "
    sql += " ) DYMC ON STAC.id = DYMC.VdStaticID "
    sql += " ORDER BY STAC.RoadDirection, STAC.LocationMile, DYMC.DataCollectTime; "

    df = pd.read_sql(sql, con=engine, params={'selectDate': normalizedDate})
    engine.dispose()
    return df.sort_values(by=['RoadDirection','DataCollectTime','LocationMile']).reset_index(drop=True)

def groupVDs(df: pd.DataFrame, each: int) -> dict:
    """ Get the dict of VD groups
        ```text
        The distinct VDIDs are taken in their order of first appearance in df.
        For the VD at position `no`, the group is the window of `each`
        consecutive VDs that starts `each // 2` positions before it. VDs whose
        window would run past either end of the list get no group.

        ---
        @Params
        df: DataFrame which is referenced by; only its 'VDID' column is read.
        each: The quantity of VDs would be considered as a group.

        ---
        @Returns
        vdGroups: The keys are the VDs we focus on, and the values are the collections of VDs which are correlated corresponding to the keys.
        ```
    """
    # Computed once; the distinct list does not change while grouping.
    vdids = df['VDID'].unique()
    total = len(vdids)
    lb = each // 2
    ub = each - lb

    vdGroups = {}
    for no, vdid in enumerate(vdids):
        startIdx = max(no - lb, 0)
        # Slice ends are exclusive, so the clamp is `total`, not `total - 1`;
        # otherwise the second-to-last VD loses its right-hand neighbour and
        # is discarded although its window is complete.
        endIdx = min(no + ub, total)
        members = list(vdids[startIdx:endIdx])
        # Windows cut short by a list boundary are incomplete and are skipped.
        if len(members) == each:
            vdGroups[f"{vdid}"] = members

    return vdGroups

def genArrLists(df: pd.DataFrame, startDate: str, endDate: str, vdGroups: dict, groupKey: str,
                each: int, timeWindow: int = 30) -> tuple:
    """ # NOTE: This function has been deprecated.
        Generate array lists for each traffic flow data (speed, volume, and occupancy)
        ```text
        The range startDate..endDate is walked in 5-minute slots. For every slot
        the rows of the group's VDs are collected as one column; once
        timeWindow // 5 columns are buffered they are concatenated side by side
        into one array per measure and the buffers are emptied.
        A slot without rows contributes a column of `each` values of -99.
        One line with the slot start is printed per slot.

        NOTE(review): the slot that triggers a flush is not stored itself, and
        columns still buffered when the loop ends are never returned - confirm
        whether both are intended.

        ---
        @Params
        df: DataFrame which is referenced by; the columns VDID, DataCollectTime,
            LocationMile, Speed, Volume and Occupancy are read.
        startDate: The date for start, format='%Y-%m-%d'
        endDate: The date for end, format='%Y-%m-%d'
        vdGroups: Can get it from groupVDs(),
        groupKey: The key of vdGroups,
        each: The quantity of VDs would be considered as a group,
        timeWindow: The length of period we consider, and the default value is 30 (minutes).

        ---
        @Returns
        speeds: list,
        vols: list,
        occs: list
        ```
    """
    # Boundaries of the 5-minute slots; consecutive pairs form one slot each.
    freq5 = pd.date_range(startDate, endDate, freq='5min')
    # Finished windows (one array per window) ...
    speeds, vols, occs = [], [], []
    # ... and the columns of the window that is currently being filled.
    speed, vol, occ = [], [], []
    for dtStart, dtEnd in zip(freq5[:-1], freq5[1:]):
        print(f"dtStart: {dtStart}")
        # Rows of the group's VDs strictly inside the slot (both bounds are
        # exclusive), ordered along the road by LocationMile.
        tmpDf = df.loc[(df['VDID'].isin(vdGroups[f"{groupKey}"])) &\
                       (df['DataCollectTime']>dtStart) &\
                       (df['DataCollectTime']<dtEnd)].sort_values(by='LocationMile')
        if (len(speed) < timeWindow//5) and (len(vol) < timeWindow//5):
            if (tmpDf[['Speed']].shape[0]>0) and (tmpDf[['Volume']].shape[0]>0):
                # Double brackets keep a 2-D frame, so each entry is a column vector.
                # NOTE(review): concatenation below needs every column to have the
                # same number of rows; nothing here checks it equals `each`.
                speed.append(tmpDf[['Speed']].to_numpy())
                vol.append(tmpDf[['Volume']].to_numpy())
                occ.append(tmpDf[['Occupancy']].to_numpy())
            else:
                # No rows in this slot: fill the column with -99 for every VD.
                speed.append(np.array([[-99.] for _ in range(each)]))
                vol.append(np.array([[-99.] for _ in range(each)]))
                occ.append(np.array([[-99.] for _ in range(each)]))
        else:
            # Window complete: join the buffered columns along axis 1.
            speeds.append(np.concatenate(speed, axis=1))
            vols.append(np.concatenate(vol, axis=1))
            occs.append(np.concatenate(occ, axis=1))
            
            # Start the next window; tmpDf of the current slot is not appended.
            speed.clear()
            vol.clear()
            occ.clear()
    
    return speeds, vols, occs

def genSamples(df: pd.DataFrame, vdGroups: dict, groupKey: str, each: int, timeWindow: int = 30) -> tuple:
    """ Generate samples for each traffic data (speed, volume, and occupancy)
        ```text
        The rows of the group's VDs are sorted by LocationMile, then by
        DataCollectTime, and cut into `each` consecutive runs of equal length;
        every run becomes one row of an (each, n) matrix per measure.
        A window of timeWindow / 5 + 1 columns then slides over the matrix one
        column at a time: all columns but the last form X, the last one is y.
        Window starts stop at the largest multiple of the window length, so a
        few trailing columns may never be used as y.

        NOTE(review): the equal-length cut assumes every VD of the group has
        the same number of rows; otherwise rows of neighbouring VDs are mixed
        silently - confirm against the data.

        When the group has fewer rows than `each`, three empty lists are
        returned.

        ---
        @Params
        df: DataFrame which is referenced by; the columns VDID, LocationMile,
            DataCollectTime, Speed, Volume and Occupancy are read.
        vdGroups: The output of groupVDs(),
        groupKey: The key of vdGroups,
        each: The quantity of VDs would be considered as a group,
        timeWindow: The length of period we consider, and the default value is 30 (minutes).

        ---
        @Returns
        speeds: list with each item as a tuple, all of them are represented (X,y).
        vols: list with each item as a tuple, all of them are represented (X,y).
        occs: list with each item as a tuple, all of them are represented (X,y).
        ```
    """
    speeds, vols, occs = [], [], []
    tmpDf = df.loc[df['VDID'].isin(vdGroups[f"{groupKey}"])].sort_values(by=['LocationMile', 'DataCollectTime'])

    # Number of time steps available per VD.
    perVd = tmpDf.shape[0] // each
    if perVd == 0:
        # Nothing to cut: previously this ended in
        # "ValueError: range() arg 3 must not be zero".
        return speeds, vols, occs

    def toMatrix(column: str) -> np.ndarray:
        # One row per VD, one column per time step. np.array copies, so the
        # samples never alias the caller's DataFrame.
        values = np.array(tmpDf[column].to_numpy()[:each * perVd], dtype=float)
        return values.reshape(each, perVd)

    sliceLen = int((timeWindow / 5) + 1)
    windowStarts = perVd // sliceLen * sliceLen - (sliceLen - 1)
    for matx, samples in ((toMatrix('Speed'), speeds),
                          (toMatrix('Volume'), vols),
                          (toMatrix('Occupancy'), occs)):
        for x in range(windowStarts):
            window = matx[:, x:x + sliceLen]
            # X keeps every column but the last; y is the last column, kept 2-D.
            samples.append((window[:, :-1], window[:, [-1]]))

    return speeds, vols, occs

def genTensors(speeds: list, vols: list) -> list:
    """ Generate torch.Tensors.
        Speed and volume arrays are paired by position. Each pair becomes one float
        tensor of size `[1, 2, *shape]`: index 0 of the second axis holds the
        speed array, index 1 the volume array.
    """
    def asPlane(matrix) -> torch.Tensor:
        return torch.tensor(matrix, dtype=torch.float)

    # Stacking yields [2, *shape]; the leading unsqueeze adds the size-1 axis.
    return [torch.stack([asPlane(s), asPlane(v)]).unsqueeze(0) for s, v in zip(speeds, vols)]

In [None]:
# # Fetch one full year of data
# firstDate = list(map(lambda x: datetime.strftime(x, '%Y-%m-%d'), list(pd.date_range('2023-01-01', '2023-12-31', freq='MS'))))
# lastDate = list(map(lambda x: datetime.strftime(x, '%Y-%m-%d'), list(pd.date_range('2023-01-01', '2023-12-31', freq='ME'))))
# for first, last in zip(firstDate, lastDate):
#     dataframes = []
#     dateList = list(map(lambda x: datetime.strftime(x, '%Y-%m-%d'), list(pd.date_range(first, last))))
#     for date in dateList:
#         print(date)
#         dataframes.append(getRollingMeanDaily(date))
#     dataframes = pd.concat(dataframes).reset_index(drop=True)
#     display(dataframes)
#     feather.write_dataframe(dataframes, dest=f"./dataset/{date[:7].replace('-','')}.feather")

In [5]:
# Download every day of November 2023, one query per day, and store the
# combined result as a single feather file.
dateList = [datetime.strftime(day, '%Y-%m-%d') for day in pd.date_range('2023-11-01', '2023-11-30')]
dataframes = []
for date in dateList:
    print(date)
    dailyDf = getRollingMeanDaily(date)
    dataframes.append(dailyDf)
# ignore_index renumbers the rows 0..n-1 across all days.
dataframes = pd.concat(dataframes, ignore_index=True)
display(dataframes)
feather.write_dataframe(dataframes, dest='./dataset/202311.feather')

2023-11-01
2023-11-02
2023-11-03
2023-11-04
2023-11-05
2023-11-06
2023-11-07
2023-11-08
2023-11-09
2023-11-10
2023-11-11
2023-11-12
2023-11-13
2023-11-14
2023-11-15
2023-11-16
2023-11-17
2023-11-18
2023-11-19
2023-11-20
2023-11-21
2023-11-22
2023-11-23
2023-11-24
2023-11-25
2023-11-26
2023-11-27
2023-11-28
2023-11-29
2023-11-30


Unnamed: 0,VDID,RoadName,Start,End,RoadDirection,Speed,Occupancy,Volume,LocationMile,DataCollectTime
0,VD-N5-N-0.178-M-LOOP,國道5號,南港系統交流道,石碇交流道,N,68.3000,1.7000,2.2000,0.178,2023-11-01 00:04:00
1,VD-N5-N-0.706-M-LOOP,國道5號,南港系統交流道,石碇交流道,N,67.7000,1.6000,2.2000,0.706,2023-11-01 00:04:00
2,VD-N5-N-1.068-M-LOOP,國道5號,南港系統交流道,石碇交流道,N,-99.0000,-99.0000,-99.0000,1.068,2023-11-01 00:04:00
3,VD-N5-N-2.068-M-PS-LOOP,國道5號,南港系統交流道,石碇交流道,N,-99.0000,-99.0000,-99.0000,2.068,2023-11-01 00:04:00
4,VD-N5-N-3.198-M-LOOP,國道5號,南港系統交流道,石碇交流道,N,79.3000,2.0000,2.6000,3.198,2023-11-01 00:04:00
...,...,...,...,...,...,...,...,...,...,...
896170,VD-N5-S-44.202-M-LOOP,國道5號,宜蘭交流道,羅東交流道,S,66.1667,2.8333,3.5000,44.202,2023-11-30 23:57:00
896171,VD-N5-S-46.566-M-LOOP,國道5號,宜蘭交流道,羅東交流道,S,30.1667,0.3333,0.5000,46.566,2023-11-30 23:57:00
896172,VD-N5-S-48.040-M-LOOP,國道5號,羅東交流道,蘇澳交流道,S,31.3333,0.3333,0.6667,48.040,2023-11-30 23:57:00
896173,VD-N5-S-51.138-M-IMAGE,國道5號,羅東交流道,蘇澳交流道,S,78.3333,10.5000,2.0000,51.138,2023-11-30 23:57:00


In [2]:
# Read the files of ./dataset and stack them into one DataFrame.
# NOTE(review): the first directory entry is skipped, but os.listdir() returns
# entries in arbitrary order, so which file is skipped (and the row order of
# the result) presumably depends on the file system - confirm.
df = []
for filename in os.listdir('./dataset')[1:]:
    monthly = feather.read_dataframe(f"./dataset/{filename}")
    df.append(monthly)
# ignore_index renumbers the rows 0..n-1 across all files.
df = pd.concat(df, ignore_index=True)
df

Unnamed: 0,VDID,RoadName,Start,End,RoadDirection,Speed,Occupancy,Volume,LocationMile,DataCollectTime
0,VD-N5-N-0.178-M-LOOP,國道5號,南港系統交流道,石碇交流道,N,-99.0000,-99.0000,-99.0000,0.178,2023-03-01 00:04:00
1,VD-N5-N-0.706-M-LOOP,國道5號,南港系統交流道,石碇交流道,N,-99.0000,-99.0000,-99.0000,0.706,2023-03-01 00:04:00
2,VD-N5-N-1.068-M-LOOP,國道5號,南港系統交流道,石碇交流道,N,-99.0000,-99.0000,-99.0000,1.068,2023-03-01 00:04:00
3,VD-N5-N-2.068-M-PS-LOOP,國道5號,南港系統交流道,石碇交流道,N,89.7000,5.0000,6.6000,2.068,2023-03-01 00:04:00
4,VD-N5-N-3.198-M-LOOP,國道5號,南港系統交流道,石碇交流道,N,93.1000,5.5000,5.8000,3.198,2023-03-01 00:04:00
...,...,...,...,...,...,...,...,...,...,...
5540005,VD-N5-S-44.202-M-LOOP,國道5號,宜蘭交流道,羅東交流道,S,105.0000,1.8333,2.5000,44.202,2023-08-31 23:57:00
5540006,VD-N5-S-46.566-M-LOOP,國道5號,宜蘭交流道,羅東交流道,S,66.8333,0.8333,0.8333,46.566,2023-08-31 23:57:00
5540007,VD-N5-S-48.040-M-LOOP,國道5號,羅東交流道,蘇澳交流道,S,62.3333,0.8333,0.8333,48.040,2023-08-31 23:57:00
5540008,VD-N5-S-51.138-M-IMAGE,國道5號,羅東交流道,蘇澳交流道,S,76.3333,14.5000,1.0000,51.138,2023-08-31 23:57:00


In [None]:
# # TODO: main
# if __name__ == '__main__':
#     # # read feather files to get dataframes
#     # startDate = '2023-01-01'
#     # endDate = getEndDate(startDate, days=10)
#     # # df = getRollingMean(startDate, endDate)
#     # df = feather.read_dataframe('./20230101-20230110.feather').sort_values(by=['RoadDirection','DataCollectTime','LocationMile']).reset_index(drop=True)
    
#     # Northbound data
#     northDf = df.loc[df['RoadDirection']=='N'].reset_index(drop=True)
#     each = 3
#     vdGroups = groupVDs(northDf, each)    
#     speedDataset, volDataset, occDataset = [], [], []
#     for groupKey in vdGroups.keys():
#         speeds, vols, occs = genSamples(northDf, vdGroups, groupKey, each, timeWindow=30)
#         speedDataset.append(speeds)
#         volDataset.append(vols)
#         occDataset.append(occs)

#     # Southbound data
#     southDf = df.loc[df['RoadDirection']=='S'].reset_index(drop=True)
#     each = 3
#     vdGroups = groupVDs(southDf, each)    
#     speedDataset, volDataset, occDataset = [], [], []
#     for groupKey in vdGroups.keys():
#         speeds, vols, occs = genSamples(southDf, vdGroups, groupKey, each, timeWindow=30)
#         speedDataset.append(speeds)
#         volDataset.append(vols)
#         occDataset.append(occs)

In [3]:
# Build the (X, y) sample lists of both driving directions. Groups of EACH
# neighbouring VDs are formed per direction and their samples are collected
# in the same three lists, northbound first.
EACH = 3
speedDataset, volDataset, occDataset = [], [], []

# Northbound data
northDf = df.loc[df['RoadDirection'].eq('N')].reset_index(drop=True)
print(f"northDf start grouping: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
vdGroups = groupVDs(northDf, each=EACH)
print(f"northDf end grouping: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
for groupKey in vdGroups:
    print(groupKey)
    speeds, vols, occs = genSamples(northDf, vdGroups, groupKey, each=EACH, timeWindow=30)
    speedDataset.extend(speeds)
    volDataset.extend(vols)
    occDataset.extend(occs)

# Southbound data (appended to the lists filled above)
southDf = df.loc[df['RoadDirection'].eq('S')].reset_index(drop=True)
print(f"southDf start grouping: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
vdGroups = groupVDs(southDf, each=EACH)
print(f"southDf end grouping: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
# speedDataset, volDataset, occDataset = [], [], []
for groupKey in vdGroups:
    print(groupKey)
    speeds, vols, occs = genSamples(southDf, vdGroups, groupKey, each=EACH, timeWindow=30)
    speedDataset.extend(speeds)
    volDataset.extend(vols)
    occDataset.extend(occs)

northDf start grouping: 2024-03-16 13:43:45
northDf end grouping: 2024-03-16 13:44:09
VD-N5-N-0.706-M-LOOP
VD-N5-N-1.068-M-LOOP
VD-N5-N-2.068-M-PS-LOOP
VD-N5-N-3.198-M-LOOP
VD-N5-N-3.943-M-LOOP
VD-N5-N-5.883-M-LOOP
VD-N5-N-7.107-M-LOOP
VD-N5-N-8.011-M-LOOP
VD-N5-N-9.028-M-IMAGE
VD-N5-N-9.840-M-LOOP
VD-N5-N-10.866-M-PS-LOOP
VD-N5-N-11.903-M-PS-LOOP
VD-N5-N-12.922-M-LOOP
VD-N5-N-13.707-M-LOOP
VD-N5-N-14.550-M-LOOP
VD-N5-N-15.488-M-LOOP
VD-N5-N-16.196-M-LOOP
VD-N5-N-16.900-M-PS-LOOP
VD-N5-N-17.608-M-LOOP
VD-N5-N-18.313-M-PS-LOOP
VD-N5-N-19.012-M-LOOP
VD-N5-N-19.689-M-PS-LOOP
VD-N5-N-20.412-M-LOOP
VD-N5-N-21.055-M-PS-LOOP
VD-N5-N-21.808-M-LOOP
VD-N5-N-22.510-M-PS-LOOP
VD-N5-N-23.209-M-LOOP
VD-N5-N-23.911-M-PS-LOOP
VD-N5-N-24.677-M-LOOP
VD-N5-N-25.310-M-PS-LOOP
VD-N5-N-26.007-M-LOOP
VD-N5-N-26.705-M-PS-LOOP
VD-N5-N-27.468-M-LOOP
VD-N5-N-27.779-M-LOOP
VD-N5-N-28.420-M-LOOP
VD-N5-N-29.000-M-LOOP
VD-N5-N-30.100-M-LOOP
VD-N5-N-30.551-M-LOOP
VD-N5-N-31.200-M-LOOP
VD-N5-N-31.540-M-LOOP
VD-N5-N-32

In [28]:
# Keep only the X part (index 0) of every (X, y) sample.
s = [sample[0] for sample in speedDataset]
v = [sample[0] for sample in volDataset]

In [33]:
torch.tensor(speedDataset[0][0], dtype=torch.float).unsqueeze(0).unsqueeze(0)

torch.Size([1, 1, 3, 6])

In [39]:
s1 = torch.tensor(speedDataset[0][1], dtype=torch.float).unsqueeze(0)
v1 = torch.tensor(volDataset[0][1], dtype=torch.float).unsqueeze(0)
torch.concat([s1,v1], dim=1)

tensor([[[94.6000],
         [75.6000],
         [90.4000],
         [ 3.8000],
         [ 3.3000],
         [ 3.3000]]])

In [44]:
class CNNDataset(Dataset):
    """ Dataset that pairs speed and volume samples by position.
        ```text
        ---
        @Params
        speedCollection: sequence of (X, y) pairs for speed.
        volCollection: sequence of (X, y) pairs for volume.

        ---
        @Items
        dataset[idx] is a tuple (feature, label) of float tensors:
        feature: size [1, 2, *X.shape]; index 0 of the second axis is the speed
                 X, index 1 the volume X.
        label: the speed y followed by the volume y along axis 1, behind a
               leading axis of size 1.
        ```
    """
    def __init__(self, speedCollection, volCollection) -> None:
        # Split every (X, y) pair once, so item access is a plain list lookup.
        self.speedFeature = [sample[0] for sample in speedCollection]
        self.volFeature = [sample[0] for sample in volCollection]
        self.speedLabels = [sample[1] for sample in speedCollection]
        self.volLabels = [sample[1] for sample in volCollection]

    def __len__(self) -> int:
        # The speed samples define the length.
        return len(self.speedFeature)
    
    def __getitem__(self, idx) -> tuple:
        f1 = torch.tensor(self.speedFeature[idx], dtype=torch.float).unsqueeze(0).unsqueeze(0)
        f2 = torch.tensor(self.volFeature[idx], dtype=torch.float).unsqueeze(0).unsqueeze(0)
        l1 = torch.tensor(self.speedLabels[idx], dtype=torch.float).unsqueeze(0)
        l2 = torch.tensor(self.volLabels[idx], dtype=torch.float).unsqueeze(0)
        # Speed and volume share axis 1 in both the feature and the label.
        feature = torch.concat([f1, f2], dim=1)
        label = torch.concat([l1, l2], dim=1)
        return feature, label

In [45]:
dataset = CNNDataset(speedDataset, volDataset)

In [46]:
dataset[0]

(tensor([[[[-99.0000,  79.1000,  77.5000,  89.4000,  87.3000,  89.4000],
           [-99.0000,  79.0000,  79.9000,  83.0000,  81.1000,  81.4000],
           [-99.0000,  78.2000,  84.8000,  90.6000,  88.5000,  89.7000]],
 
          [[-99.0000,   5.8000,   5.5000,   5.1000,   4.9000,   4.5000],
           [-99.0000,   4.4000,   6.3000,   4.7000,   4.7000,   4.9000],
           [-99.0000,   4.1000,   6.3000,   4.8000,   4.6000,   5.1000]]]]),
 tensor([[[94.6000],
          [75.6000],
          [90.4000],
          [ 3.8000],
          [ 3.3000],
          [ 3.3000]]]))

In [None]:
# This cell is actually same as df.sort_values()

# groupDf = df.groupby(['RoadDirection','DataCollectTime','LocationMile']).agg({
#     'VDID': 'max',
#     'RoadName': 'max',
#     'Start': 'max',
#     'End': 'max',
#     'Speed': 'max',
#     'Occupancy': 'max',
#     'Volume': 'max',
# }).reset_index().sort_values(by=['RoadDirection','DataCollectTime','LocationMile'])
# groupDf

In [None]:
# Northbound-only run: group the VDs, then generate the arrays of every group
# while printing a timestamp per step.
northDf = df.loc[df['RoadDirection'].eq('N')].reset_index(drop=True)
each = 3
print(f"grouping VDs: {datetime.now().replace(microsecond=0).strftime('%Y-%m-%d %H:%M:%S')}")
vdGroups = groupVDs(northDf, each)
print(f"grouping VDs done: {datetime.now().replace(microsecond=0).strftime('%Y-%m-%d %H:%M:%S')}")
# TODO: Declare lists for collecting speeds, vols, and occs

# NOTE(review): genArrLists_ is not defined in the visible part of this
# notebook; its arguments match genSamples() - confirm which one is meant.
for groupKey in vdGroups:
    print(f"[{groupKey}]: {datetime.now().replace(microsecond=0).strftime('%Y-%m-%d %H:%M:%S')}")
    speeds, vols, occs = genArrLists_(northDf, vdGroups, groupKey, each, timeWindow=30)
    # tensors = genTensors(speeds, vols)
print(f"Done: {datetime.now().replace(microsecond=0).strftime('%Y-%m-%d %H:%M:%S')}")

In [None]:
groupKey

In [None]:
speeds[0]

In [None]:
northDf

In [None]:
groupDf = df.sort_values(by=['RoadDirection','DataCollectTime','LocationMile']).reset_index(drop=True)
groupDf

In [None]:
northDf = groupDf.loc[groupDf['RoadDirection']=='N'].reset_index(drop=True)
northDf

In [None]:
vdGroups = groupVDs(northDf, each=3)
vdGroups

In [None]:
len(vdGroups.keys())

In [None]:
start = '2023-03-01'
end = '2023-03-11'
each = 3
# vdGroups = groupVDs(northDf, each)
groupKey = 'VD-N5-N-3.198-M-LOOP'

speeds, vols, occs = genArrLists(northDf, start, end, vdGroups, groupKey, each)

In [None]:
start = '2023-03-01'
end = '2023-05-31'
each = 3
# vdGroups = groupVDs(northDf, each)
groupKey = 'VD-N5-N-0.706-M-LOOP'

timeWindow = 30
# freq5 = pd.date_range(start, end, freq='5min')
# speeds, vols, occs = [], [], []
# tmpDf = df.loc[(df['VDID'].isin(vdGroups[f"{groupKey}"]))].sort_values(by=['LocationMile', 'DataCollectTime'])

# indices = [x for x in range(0, tmpDf.shape[0]+1, tmpDf.shape[0]//each)]
# speedMatx = np.zeros((each, tmpDf.shape[0]//each))
# volMatx = np.zeros((each, tmpDf.shape[0]//each))
# occMatx = np.zeros((each, tmpDf.shape[0]//each))
# for i, j, k in zip(range(each), indices[:-1], indices[1:]):
#     speedMatx[i] += tmpDf.iloc[j:k,:]['Speed'].to_numpy()
#     volMatx[i] += tmpDf.iloc[j:k,:]['Volume'].to_numpy()
#     occMatx[i] += tmpDf.iloc[j:k,:]['Occupancy'].to_numpy()

# for x in range(speedMatx.shape[1]//7*7-6):
#     speeds.append((speedMatx[:,x:x+7][:,:-1], speedMatx[:,x:x+7][:,[-1]]))
#     vols.append((volMatx[:,x:x+7][:,:-1], volMatx[:,x:x+7][:,[-1]]))
#     occs.append((occMatx[:,x:x+7][:,:-1], occMatx[:,x:x+7][:,[-1]]))












# Module-level replay of the genSamples() steps for `groupKey`, so that the
# intermediate matrices stay available for inspection in later cells.
s, v, o = [], [], []
tmpDf = df.loc[df['VDID'].isin(vdGroups[f"{groupKey}"])].sort_values(by=['LocationMile', 'DataCollectTime'])

# Cut the sorted rows into `each` consecutive runs of equal length, one run per
# matrix row (assumes every VD has the same number of rows - TODO confirm).
perVd = tmpDf.shape[0] // each
indices = list(range(0, tmpDf.shape[0] + 1, perVd))
speedMatx = np.zeros((each, perVd))
volMatx = np.zeros((each, perVd))
occMatx = np.zeros((each, perVd))
for i, j, k in zip(range(each), indices, indices[1:]):
    chunk = tmpDf.iloc[j:k]
    speedMatx[i] += chunk['Speed'].to_numpy()
    volMatx[i] += chunk['Volume'].to_numpy()
    occMatx[i] += chunk['Occupancy'].to_numpy()

# Slide a window of sliceLen columns: all but the last column are X, the last is y.
sliceLen = int((timeWindow / 5) + 1)
for x in range(speedMatx.shape[1] // sliceLen * sliceLen - (sliceLen - 1)):
    for matx, samples in ((speedMatx, s), (volMatx, v), (occMatx, o)):
        window = matx[:, x:x + sliceLen]
        samples.append((window[:, :-1], window[:, [-1]]))

In [None]:
speeds[1]

In [None]:
s[1]

In [None]:
v[2]

In [None]:
volMatx[:,2:9]

In [None]:
[x for x in range(0, speedMatx.shape[1]+1, 7)][-2]

In [None]:
(speedMatx.shape[1]//7)*7

In [None]:
occMatx[:,26376:26383]

In [None]:
occMatx[:,0:7][:,[-1]]

In [None]:
tmpL = []
for x in range(occMatx.shape[1]//7*7-6):
    tmpL.append((occMatx[:,x:x+7][:,:-1], occMatx[:,x:x+7][:,[-1]]))
    # print(f"occMatx[:,{x}:{x+7}]")

In [None]:
occMatx[:,0:9]

In [None]:
occMatx[:,0:7]

In [None]:
occMatx[:,26383:26386]

In [None]:
speedMatx.shape

In [None]:
26386/7

In [None]:
tmpDf.iloc[26386:52772,:]

In [None]:
volMatx

In [None]:
zeroMatx = np.zeros((each, tmpDf.shape[0]))
zeroMatx[0]

In [None]:
len(speeds)

In [None]:
tensorList = genTensors(speeds, vols)
tensorList[0].shape