In [38]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import os
import pandas as pd
import numpy as np

In [39]:
data_path = "../data"
csv_data = f"{data_path}/data.csv"
filtered_data = f"{data_path}/filtered.csv"
segment_path = f"{data_path}/segments"
segment_data = segment_path + "/segment_{}.csv"

In [67]:
learning_rate = 0.0000001
epochs = 100
batch_size = 32

In [68]:
headers = {
    "RequestID": str,
    "Boro": str,
    "Yr": str,
    "M": str,
    "D": str,
    "HH": str,
    "MM": str,
    "Vol": str,
    "SegmentID": str,
    "WktGeom": str,
    "street": str,
    "fromSt": str,
    "toSt": str,
    "Direction": str
}

garbage_headers = [
    "RequestID",
    "Boro",
    "WktGeom",
    "street",
    "fromSt",
    "toSt",
    "Direction"
]

needed_headers = {
    "Yr": int,
    "M": int,
    "D": int,
    "HH": int,
    "MM": int,
    "Vol": int,
    "SegmentID": int,
}

if not os.path.exists(filtered_data):
    os.mkdir(data_path) if not os.path.exists(data_path) else ...
    ds = pd.read_csv(csv_data, header=None, names=list(headers.keys()), dtype=headers)
    for h in garbage_headers:
        ds.drop(columns=h, inplace=True)
    ds.drop(ds.index[0], axis="index", inplace=True)
    for column in needed_headers.keys():
        ds[column].astype(int, copy=False)
    ds.to_csv(filtered_data, header=None, index=None)

In [69]:
ds = pd.read_csv(filtered_data, header=None, names=list(needed_headers.keys()), dtype=needed_headers)

In [70]:
if not os.path.exists(segment_path):
    os.mkdir(segment_path)
    for segmentID in ds["SegmentID"].unique():
        ds.loc[ds["SegmentID"] == segmentID].to_csv(segment_data.format(segmentID), header=None, index=None)

In [71]:
segmentID = 83624  # Largest Segment

ds = pd.read_csv(segment_data.format(segmentID), header=None, names=list(needed_headers.keys()), dtype=needed_headers)
ds.drop(columns="SegmentID", inplace=True)

In [72]:
lookup_hash = {}

categories = ["Yr", "M", "D", "HH", "MM"]

dep = ds["Vol"]
ds.drop(columns="Vol", inplace=True)  # independent variable

for c in categories:
    lookup_hash[c] = ds[c].astype("category").cat.categories
    ds[c] = ds[c].astype("category", copy=False).cat.codes

In [73]:
mms = MinMaxScaler()
dep = mms.fit_transform(dep.values.reshape(-1, 1))

In [74]:
ind_train, ind_test, dep_train, dep_test = train_test_split(ds, dep, test_size=0.2)

In [75]:
def mse_loss(actual, predicted):
    return np.mean(np.square(actual - predicted))

In [76]:
def sgd_update(ind_batch, dep_batch, weights, bias):
    # Forward pass
    predictions = np.dot(ind_batch, weights) + bias
    # Calculate gradients
    d_loss_d_predictions = 2 * (predictions - dep_batch)  # Derivative of MSE loss w.r.t. predictions
    de_predictions_d_weights = ind_batch  # Derivative of predictions w.r.t weights
    d_loss_d_bias = np.sum(d_loss_d_predictions)  # Derivative of MSE loss w.r.t. bias

    # Update weights and bias using SGD
    weights -= learning_rate * np.mean(de_predictions_d_weights, axis=0)
    bias -= learning_rate * d_loss_d_bias

    return weights, bias

In [77]:
weights = np.random.rand(ds.shape[1])
bias = 0

print(weights, bias)

[0.61548608 0.51541087 0.22364216 0.5686094  0.73364524] 0


In [78]:
ind_batch = ind_train[:batch_size]
dep_batch = dep_train[:batch_size]

weights, bias = sgd_update(ind_batch, dep_batch, weights, bias)

In [79]:
for epoch in range(epochs):
    for i in range(0, len(ind_train), batch_size):
        ind_batch = ind_train[i:i+batch_size]
        dep_batch = dep_train[i:i+batch_size]
        weights, bias = sgd_update(ind_batch, dep_batch, weights, bias)
    train_loss = mse_loss(dep_train, np.dot(ind_train, weights) + bias)
    print(f"Epoch {epoch + 1}/{epochs}, Training Loss: {train_loss}")

Epoch 1/100, Training Loss: 150.12518662632414
Epoch 2/100, Training Loss: 135.4399210899621
Epoch 3/100, Training Loss: 122.3712063484896
Epoch 4/100, Training Loss: 110.74147753565921
Epoch 5/100, Training Loss: 100.39263540949348
Epoch 6/100, Training Loss: 91.18391459748905
Epoch 7/100, Training Loss: 82.98998517639647
Epoch 8/100, Training Loss: 75.69926205344387
Epoch 9/100, Training Loss: 69.21239940948321
Epoch 10/100, Training Loss: 63.44094995255555
Epoch 11/100, Training Loss: 58.30617094615749
Epoch 12/100, Training Loss: 53.737960949877404
Epoch 13/100, Training Loss: 49.673912967550905
Epoch 14/100, Training Loss: 46.058471263278484
Epoch 15/100, Training Loss: 42.842180499611906
Epoch 16/100, Training Loss: 39.98101709366088
Epoch 17/100, Training Loss: 37.435793792490585
Epoch 18/100, Training Loss: 35.17162945383927
Epoch 19/100, Training Loss: 33.157476895116126
Epoch 20/100, Training Loss: 31.365702454609917
Epoch 21/100, Training Loss: 29.771711604379384
Epoch 22/10

In [59]:
dep_predict = np.dot(ind_test, weights) + bias
test_loss = mse_loss(dep_test, dep_predict)
print(f"Test Loss: {test_loss}")

Test Loss: 18.496367652096474


In [61]:
print(dep_predict)
print(dep_test)

[-5.1667832   9.39725581  0.28987171 ...  4.18322071 -0.66517954
 -2.46795435]
[[0.2020202 ]
 [0.34343434]
 [0.42424242]
 ...
 [0.25252525]
 [0.21212121]
 [0.32323232]]
