## Preliminaries and Utils

In [1]:
import numpy as np
import pandas as pd
import warnings
import time

from sklearn.cluster import KMeans
from sklearn.preprocessing import  MinMaxScaler, StandardScaler

from modules.prediction_models import OnlineDecisionTreeRegressor
# from modules.prediction_models import OnlineRidgePolynomialRegressor
# from modules.prediction_models import OnlineKNNRegressor

from modules.utils import Metrics, PrintSummary, ShowPlots

In [2]:
# Suppress selected warning categories so they do not flood the cell outputs.
for ignored_category in (DeprecationWarning, FutureWarning):
    warnings.filterwarnings('ignore', category=ignored_category)

# Feature-name mismatch messages to suppress (each pattern is matched against
# the start of the warning text).
for ignored_message in (
    "X does not have valid feature names",
    "X has feature names, but PolynomialFeatures was fitted without feature names",
    "X has feature names, but DecisionTreeRegressor was fitted without feature names",
):
    warnings.filterwarnings("ignore", message=ignored_message)

# Disable pandas' chained-assignment check.
pd.set_option("mode.chained_assignment", None)

In [3]:
# Instantiate the project's reporting helpers once, in the same order as before;
# `metrics` is the object the evaluation cells call to print results.
summary, plots, metrics = PrintSummary(), ShowPlots(), Metrics()

## Data Loading

In [4]:
# Parquet files holding the train and test splits (paths are relative to the
# directory the notebook is run from).
data_train, data_test = (
    "datasets/pm100/job_data_train.parquet",
    "datasets/pm100/job_data_test.parquet",
)

In [5]:
# Read each split with the pyarrow engine and store 'Desired QoS' as integers.
df_train = (
    pd.read_parquet(data_train, engine="pyarrow")
    .copy()
    .astype({'Desired QoS': int})
)
df_test = (
    pd.read_parquet(data_test, engine="pyarrow")
    .copy()
    .astype({'Desired QoS': int})
)

In [6]:
# Number of rows in the test split; handed to metrics.print by the evaluation cells.
len_test = df_test.shape[0]
len_test

66756

## Data Preparation

In [13]:
# Fit a min-max scaler on the training run times only. reshape(-1, 1) turns the
# 1-D column into the single-feature 2-D array that the scaler's fit() expects.
train_run_times = df_train['Run Time'].values.reshape(-1, 1)

scaler = MinMaxScaler()
scaler.fit(train_run_times)

In [7]:
# Feature set 1: columns describing the job request itself.
fs1 = [
    "User ID",
    "Requested Number of Nodes",
    "Requested Number of CPU",
    "Requested Number of GPU",
    "Total Requested Memory",
    "Desired QoS",
    "Requested Time",
]

# Feature set 2: every fs1 column followed by the run-time history columns.
fs2 = fs1 + [
    "Prev Run Time 1",
    "Prev Run Time 2",
    "Prev Run Time 3",
    "Avg Run Time 2",
    "Avg Run Time 3",
    "Avg Run Time All",
]

# Column the regressors are trained to predict.
target_name = "Run Time"

In [8]:
# Target series for both splits; shared by the SET 1 and SET 2 experiments.
y_train, y_test = df_train[target_name], df_test[target_name]

## SET 1: job-request features only (`fs1`)

In [9]:
# Restrict both splits to the fs1 columns.
X_train, X_test = (split[fs1] for split in (df_train, df_test))

In [10]:
# Fit the project's online decision-tree regressor on the training split
# before the streaming evaluation starts.
# NOTE(review): batch_size=50 is forwarded to the project class — presumably the
# number of streamed samples buffered between incremental refits; TODO confirm.
regressor = OnlineDecisionTreeRegressor(batch_size=50)
regressor.fit(X_train, y_train)

In [11]:
# Streaming evaluation: every test job is predicted first, and only afterwards
# is its true run time handed to the model through partial_fit.
# The timed window spans the whole loop, so it covers the online updates as
# well as the predictions.
y_pred = []

start = time.time()
for position in range(len(X_test)):
    job_features = X_test.iloc[[position]]   # list index keeps a one-row DataFrame
    observed_run_time = y_test.iloc[position]
    y_pred.append(regressor.predict(job_features)[0])
    regressor.partial_fit(job_features, observed_run_time)   # online update
end = time.time()

# Collect the per-job predictions into an array for the metrics and export cells.
y_pred = np.array(y_pred)

In [14]:
# Print the timing and error report for this run through the project's Metrics
# helper; start/end are the timestamps taken around the streaming loop.
metrics.print(scaler, y_test, y_pred, start, end, len_test)

-------------------------------------------
                 METRICS
-------------------------------------------
Inference time: 71.97202682495117
Latency:        0.00107813570053555
-------------------------------------------
MAE:            4298.521301456049
MAE (hh:mm:ss): 01:11:38
MAE (Scaled):   0.049762923147210564
EA:             0.610410208924922
MAPE:           6890.28383302326
-------------------------------------------


In [15]:
# Store this run's predictions as a column of the shared predictions file:
# read the CSV, add (or overwrite) the column, then write the file back.
predictions_csv = "predictions/pm100/predictions.csv"

df = pd.read_csv(predictions_csv).assign(**{"pred_runtime_dt_fs1": y_pred})

df.to_csv(predictions_csv, index=False)

## SET 2: job-request features plus run-time history (`fs2`)

In [16]:
# Restrict both splits to the fs2 columns.
X_train, X_test = (split[fs2] for split in (df_train, df_test))

In [17]:
# Create a fresh online decision-tree regressor (replacing the previous one) and
# fit it on the current training features before the streaming evaluation.
# NOTE(review): batch_size=50 is forwarded to the project class — presumably the
# number of streamed samples buffered between incremental refits; TODO confirm.
regressor = OnlineDecisionTreeRegressor(batch_size=50)
regressor.fit(X_train, y_train)

In [18]:
# Streaming evaluation: every test job is predicted first, and only afterwards
# is its true run time handed to the model through partial_fit.
# The timed window spans the whole loop, so it covers the online updates as
# well as the predictions.
y_pred = []

start = time.time()
for position in range(len(X_test)):
    job_features = X_test.iloc[[position]]   # list index keeps a one-row DataFrame
    observed_run_time = y_test.iloc[position]
    y_pred.append(regressor.predict(job_features)[0])
    regressor.partial_fit(job_features, observed_run_time)   # online update
end = time.time()

# Collect the per-job predictions into an array for the metrics and export cells.
y_pred = np.array(y_pred)

In [19]:
# Print the timing and error report for this run through the project's Metrics
# helper; start/end are the timestamps taken around the streaming loop.
metrics.print(scaler, y_test, y_pred, start, end, len_test)

-------------------------------------------
                 METRICS
-------------------------------------------
Inference time: 73.97897696495056
Latency:        0.0011081996669205848
-------------------------------------------
MAE:            3783.693615555156
MAE (hh:mm:ss): 01:03:03
MAE (Scaled):   0.04380288973784621
EA:             0.5749107959347742
MAPE:           6018.522725255227
-------------------------------------------


In [20]:
# Store this run's predictions as a column of the shared predictions file:
# read the CSV, add (or overwrite) the column, then write the file back.
predictions_csv = "predictions/pm100/predictions.csv"

df = pd.read_csv(predictions_csv).assign(**{"pred_runtime_dt_fs2": y_pred})

df.to_csv(predictions_csv, index=False)