In [17]:
import tensorflow as tf
from keras.src.models import Sequential
from keras.src.layers import Dense
from keras.src.callbacks import TensorBoard
from keras.api.models import save_model, load_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import os
import pandas as pd

In [18]:
# Filesystem layout used by the notebook; every path hangs off data_path.
data_path = "../data"
csv_data = data_path + "/data.csv"            # raw input CSV
filtered_data = data_path + "/filtered.csv"   # cached, column-reduced copy
segment_path = data_path + "/segments"        # directory of per-segment CSVs
# Template string: the "{}" placeholder is filled with a segment ID via str.format.
segment_data = f"{segment_path}/segment_{{}}.csv"

In [25]:
# Column labels assigned positionally to the raw CSV; every column is first
# read as text (str).
headers = dict.fromkeys(
    [
        "RequestID",
        "Boro",
        "Yr",
        "M",
        "D",
        "HH",
        "MM",
        "Vol",
        "SegmentID",
        "WktGeom",
        "street",
        "fromSt",
        "toSt",
        "Direction",
    ],
    str,
)

# Columns removed from the raw data before it is cached.
garbage_headers = ["RequestID", "Boro", "WktGeom", "street", "fromSt", "toSt", "Direction"]

# Columns that are kept, all typed as integers.
needed_headers = dict.fromkeys(["Yr", "M", "D", "HH", "MM", "Vol", "SegmentID"], int)

# Build the filtered CSV once; when the file already exists this cell does
# nothing and the cached copy is reused.
if not os.path.exists(filtered_data):
    # exist_ok=True makes this a no-op when the directory is already present.
    os.makedirs(data_path, exist_ok=True)

    # header=0 consumes the file's first line as the header and replaces it
    # with `names`, so that line never shows up as a data row.
    ds = pd.read_csv(csv_data, header=0, names=list(headers.keys()), dtype=headers)

    # Remove the unwanted columns in a single call.
    ds = ds.drop(columns=garbage_headers)

    # astype returns a new frame, so the result must be assigned for the
    # conversion to take effect. Non-integer text raises here, before
    # anything is written to disk.
    ds = ds.astype(needed_headers)

    # Written without a header line and without the index column.
    ds.to_csv(filtered_data, header=False, index=False)

In [26]:
# Load the filtered CSV as a header-less file; column labels and integer
# dtypes both come from needed_headers.
ds = pd.read_csv(
    filtered_data,
    header=None,
    names=[*needed_headers.keys()],
    dtype=needed_headers,
)

In [27]:
# Write one CSV per SegmentID, but only when the segment directory does not
# exist yet; an existing directory is taken to mean the split was already done.
if not os.path.exists(segment_path):
    # makedirs also creates a missing parent directory.
    os.makedirs(segment_path)
    # A single groupby pass replaces one full-frame boolean scan per segment.
    # sort=False keeps groups in order of first appearance, matching the
    # order that Series.unique() would give.
    for segmentID, segment_rows in ds.groupby("SegmentID", sort=False):
        segment_rows.to_csv(segment_data.format(segmentID), header=False, index=False)

In [41]:
segmentID = 83624  # Largest Segment

# Read that segment's header-less CSV with integer columns, then discard the
# SegmentID column.
ds = pd.read_csv(
    segment_data.format(segmentID),
    header=None,
    names=list(needed_headers.keys()),
    dtype=needed_headers,
).drop(columns="SegmentID")

In [42]:
# Replace each date/time column with integer category codes. lookup_hash keeps,
# per column, the category values that the codes index into.
categories = ["Yr", "M", "D", "HH", "MM"]
lookup_hash = {}

for c in categories:
    # Convert once and take both the categories and the codes from the result.
    as_category = ds[c].astype("category")
    lookup_hash[c] = as_category.cat.categories
    ds[c] = as_category.cat.codes

print(ds)

       Yr  M   D  HH  MM  Vol
0       0  2  24  15   0   85
1       0  2  24  15   1   92
2       0  2  24  15   2   85
3       0  2  24  15   3   98
4       0  2  24  16   0   94
...    .. ..  ..  ..  ..  ...
11312   2  1  21   4   3   18
11313   2  1  21   5   0   14
11314   2  1  21   5   1   24
11315   2  1  21   5   2   23
11316   2  1  21   5   3   29

[11317 rows x 6 columns]
