In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import os
import pandas as pd
import numpy as np
from ..constants import Paths, Headers, Constants

In [98]:
learning_rate = 0.000001
epochs = 100
batch_size = 32

In [99]:
if not os.path.exists(Paths.filtered_csv):
    os.mkdir(Paths.data_dir) if not os.path.exists(Paths.data_dir) else ...
    ds = pd.read_csv(Paths.data_csv, header=None, names=list(Headers.headers.keys()), dtype=Headers.headers)
    for h in Headers.garbage_headers:
        ds.drop(columns=h, inplace=True)
    ds.drop(ds.index[0], axis="index", inplace=True)
    for column in Headers.needed_headers.keys():
        ds[column].astype(int, copy=False)
    ds.to_csv(Paths.filtered_csv, header=None, index=None)

In [100]:
ds = pd.read_csv(Paths.filtered_csv, header=None, names=list(Headers.needed_headers.keys()), dtype=Headers.needed_headers)

In [101]:
if not os.path.exists(Paths.segments_dir):
    os.mkdir(Paths.segments_dir)
    for segmentID in ds["SegmentID"].unique():
        ds.loc[ds["SegmentID"] == segmentID].to_csv(Paths.segment_csv.format(segmentID), header=None, index=None)

In [102]:
segmentID = 83624  # Largest Segment

ds = pd.read_csv(Paths.segment_csv.format(segmentID), header=None, names=list(Headers.needed_headers.keys()), dtype=Headers.needed_headers)
ds.drop(columns="SegmentID", inplace=True)

In [103]:
lookup_hash = {}

dep = ds["Vol"]
ds.drop(columns="Vol", inplace=True)  # independent variable

for c in Constants.categories:
    lookup_hash[c] = ds[c].astype("category").cat.categories
    ds[c] = ds[c].astype("category", copy=False).cat.codes

In [104]:
dep_max = dep.max()
dep_min = dep.min()

mms = MinMaxScaler()
dep = mms.fit_transform(dep.values.reshape(-1, 1))

In [105]:
ind_train, ind_test, dep_train, dep_test = train_test_split(ds, dep, test_size=0.2)

In [106]:
def mse_loss(actual, predicted):
    return np.mean(np.square(actual - predicted))

In [107]:
def sgd_update(ind_batch, dep_batch, weights, bias):
    # Forward pass
    predictions = np.dot(ind_batch, weights) + bias
    # Calculate gradients
    d_loss_d_predictions = 2 * (predictions - dep_batch)  # Derivative of MSE loss w.r.t. predictions
    de_predictions_d_weights = ind_batch  # Derivative of predictions w.r.t weights
    d_loss_d_bias = np.sum(d_loss_d_predictions)  # Derivative of MSE loss w.r.t. bias

    # Update weights and bias using SGD
    weights -= learning_rate * np.mean(de_predictions_d_weights, axis=0)
    bias -= learning_rate * d_loss_d_bias

    return weights, bias

In [108]:
weights = np.random.rand(ds.shape[1])
bias = 0

print(weights, bias)

[0.20428375 0.09480842 0.10924289 0.16628508 0.69468387] 0


In [109]:
ind_batch = ind_train[:batch_size]
dep_batch = dep_train[:batch_size]

weights, bias = sgd_update(ind_batch, dep_batch, weights, bias)

In [110]:
for epoch in range(epochs):
    for i in range(0, len(ind_train), batch_size):
        ind_batch = ind_train[i:i+batch_size]
        dep_batch = dep_train[i:i+batch_size]
        weights, bias = sgd_update(ind_batch, dep_batch, weights, bias)
    train_loss = mse_loss(dep_train, np.dot(ind_train, weights) + bias)
    print(f"Epoch {epoch + 1}/{epochs}, Training Loss: {train_loss}")

Epoch 1/100, Training Loss: 8.857815945567538
Epoch 2/100, Training Loss: 4.120075227096472
Epoch 3/100, Training Loss: 2.6807697870151346
Epoch 4/100, Training Loss: 2.230701626900895
Epoch 5/100, Training Loss: 2.066407992832806
Epoch 6/100, Training Loss: 1.9788821725841557
Epoch 7/100, Training Loss: 1.9090780819881545
Epoch 8/100, Training Loss: 1.8421193924724637
Epoch 9/100, Training Loss: 1.7753803072773051
Epoch 10/100, Training Loss: 1.7091798393457849
Epoch 11/100, Training Loss: 1.6442580213504954
Epoch 12/100, Training Loss: 1.5812054383421612
Epoch 13/100, Training Loss: 1.5204081283678283
Epoch 14/100, Training Loss: 1.4620996540285665
Epoch 15/100, Training Loss: 1.4064162812272065
Epoch 16/100, Training Loss: 1.3534360457489347
Epoch 17/100, Training Loss: 1.3032031954616585
Epoch 18/100, Training Loss: 1.2557426844846957
Epoch 19/100, Training Loss: 1.2110685440705842
Epoch 20/100, Training Loss: 1.1691886506805582
Epoch 21/100, Training Loss: 1.1301074216956128
Epoch

In [111]:
dep_predict = np.dot(ind_test, weights) + bias
test_loss = mse_loss(dep_test, dep_predict)
print(f"Test Loss: {test_loss}")

Test Loss: 7.2343398131883765


In [137]:
def convert_to_original(normalized_val, max_origin, min_origin):
    return float("{:.2f}".format((normalized_val * (max_origin - min_origin)) + min_origin))

In [138]:
predicted_original_value = [convert_to_original(i, dep_max, dep_min) for i in dep_predict]
actual_original_value = [convert_to_original(i[0], dep_max, dep_min) for i in dep_test]

print(pd.DataFrame({"Actual": actual_original_value, "Predicted": predicted_original_value}))


      Actual  Predicted
0       25.0     777.44
1       42.0    -468.10
2       91.0      80.93
3        6.0     322.96
4       68.0    -858.46
...      ...        ...
2259    54.0      91.50
2260    19.0     353.58
2261    23.0    -273.63
2262    36.0    -143.61
2263    82.0    -138.81

[2264 rows x 2 columns]


In [114]:
print(dep_max)

198
