## Preliminaries and Utils

In [1]:
import numpy as np
import pandas as pd
import warnings
import time

from sklearn.cluster import KMeans
from sklearn.preprocessing import  MinMaxScaler, StandardScaler

from sklearn.metrics import f1_score
from tqdm import tqdm

# from modules.prediction_models import OnlineDecisionTreeRegressor
# from modules.prediction_models import OnlineRidgePolynomialRegressor
from modules.prediction_models import OnlineKNNClassifier

from modules.utils import Metrics, PrintSummary, ShowPlots

In [2]:
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings("ignore", message="X does not have valid feature names")
warnings.filterwarnings("ignore", message="X has feature names, but PolynomialFeatures was fitted without feature names")
warnings.filterwarnings("ignore", message="X has feature names, but DecisionTreeRegressor was fitted without feature names")
pd.options.mode.chained_assignment = None

In [3]:
# Helper objects from modules.utils, each built with its default constructor
# (constructed left to right: PrintSummary, ShowPlots, Metrics).
summary, plots, metrics = PrintSummary(), ShowPlots(), Metrics()

In [4]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

def evaluate_classifier(y_true, y_pred, label=""):
    """Print a short classification report and return its headline scores.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels, in the same order as ``y_true``.
    label : str, optional
        Text appended to the "Evaluation" heading of the printed report.

    Returns
    -------
    tuple
        ``(accuracy, macro_f1)`` as computed by ``accuracy_score`` and
        ``f1_score(..., average="macro")``.
    """
    scores = (
        accuracy_score(y_true, y_pred),
        f1_score(y_true, y_pred, average="macro"),
    )
    matrix = confusion_matrix(y_true, y_pred)

    # Heading and scalar scores go out as one text block; the matrix is
    # printed on its own so its native formatting is kept.
    report_lines = [
        f"\nEvaluation {label}",
        f"Accuracy : {scores[0]:.4f}",
        f"Macro-F1 : {scores[1]:.4f}",
        "Confusion Matrix:",
    ]
    print("\n".join(report_lines))
    print(matrix)

    return scores

## Data Loading

In [5]:
# Relative paths of the train and test parquet files under datasets/pm100.
data_train, data_test = (
    'datasets/pm100/job_data_train.parquet',
    'datasets/pm100/job_data_test.parquet',
)

In [6]:
# Read both splits with the pyarrow engine and cast the 'Desired QoS' column
# to int; astype() hands back a new frame, so the loaded data is not shared.
_qos_dtype = {'Desired QoS': int}

df_train = pd.read_parquet(data_train, engine="pyarrow").astype(_qos_dtype)
df_test = pd.read_parquet(data_test, engine="pyarrow").astype(_qos_dtype)

In [7]:
# Number of rows in the test split; the bare name echoes it as the cell output.
len_test = df_test.shape[0]
len_test

66756

## Data Preparation

In [8]:
# Min-max scaler fitted on the training 'Run Time' column, passed as a
# single-column 2-D array.
# NOTE(review): `scaler` is not referenced again in the cells visible here;
# confirm it is still needed.
scaler = MinMaxScaler()
scaler.fit(df_train[['Run Time']].values)

In [9]:
# Feature set 1: user ID plus the requested-resource columns.
fs1 = [
    "User ID",
    "Requested Number of Nodes",
    "Requested Number of CPU",
    "Requested Number of GPU",
    "Total Requested Memory",
    "Desired QoS",
    "Requested Time",
]

# Feature set 2: everything in feature set 1, followed by the "(NH7)"
# previous-duration and average-duration columns.
fs2 = [
    *fs1,
    "Prev Duration (NH7) 1",
    "Prev Duration (NH7) 2",
    "Prev Duration (NH7) 3",
    "Avg Duration (NH7) 2",
    "Avg Duration (NH7) 3",
    "Avg Duration (NH7) All",
]

# Column holding the class label the classifiers are trained to predict.
target_name = "Duration (H7)"

In [10]:
# Labels for the final fit (train) and for the held-out evaluation (test).
# Note the asymmetry: y_train is the column's underlying array, while y_test
# stays a pandas Series.
y_train, y_test = df_train[target_name].values, df_test[target_name]

## SET 1

In [11]:
# Feature matrices for the final fit and the test evaluation on feature set 1.
# They are cast to float exactly like the validation matrices below, so the
# models used for tuning k and the final model see identically typed inputs.
X_train = df_train[fs1].astype(float).values
X_test  = df_test[fs1].astype(float).values

# Size of the validation slice: 10% of the training rows, rounded down.
val_size = int(len(df_train) * 0.1)

# Consecutive split (no shuffling): the last `val_size` rows are held out.
# An explicit split position is used instead of negative slicing because,
# with val_size == 0, `iloc[:-0]` is empty and `iloc[-0:]` returns every row.
split_at = len(df_train) - val_size
df_train_sub = df_train.iloc[:split_at]
df_val       = df_train.iloc[split_at:]

# Targets
y_train_sub = df_train_sub[target_name].values
y_val       = df_val[target_name].values

# Features
X_train_sub = df_train_sub[fs1].astype(float).values
X_val       = df_val[fs1].astype(float).values

In [12]:
# Two-stage search for k, scored by macro-F1 on the validation slice:
# a coarse grid first, then every integer k around the coarse winner.
coarse_k = list(range(3, 54, 10))
coarse_scores = {}

for k in tqdm(coarse_k, desc="Coarse k search"):
    clf = OnlineKNNClassifier(k=k)
    clf.partial_fit(X_train_sub, y_train_sub)
    y_val_pred = clf.predict(X_val)
    coarse_scores[k] = f1_score(y_val, y_val_pred, average="macro")

coarse_results = list(coarse_scores.items())

# Iterating the dict follows insertion order, so ties go to the smallest k tried.
best_k_coarse = max(coarse_scores, key=coarse_scores.get)

# Refinement window: from 10 below the coarse winner (never under 3) up to
# and including 10 above it.
refine_range = range(max(3, best_k_coarse - 10), best_k_coarse + 11)
refined_scores = {}

for k in tqdm(refine_range, desc="Refining k"):
    clf = OnlineKNNClassifier(k=k)
    clf.partial_fit(X_train_sub, y_train_sub)
    y_val_pred = clf.predict(X_val)
    refined_scores[k] = f1_score(y_val, y_val_pred, average="macro")

refined_results = list(refined_scores.items())

k_opt = max(refined_scores, key=refined_scores.get)
best_f1 = refined_scores[k_opt]

print(f"Optimal k = {k_opt}, Validation F1 = {best_f1}")

Coarse k search: 100%|███████████████████████████████████████████████████████████████████| 6/6 [00:13<00:00,  2.23s/it]
Refining k: 100%|██████████████████████████████████████████████████████████████████████| 21/21 [00:52<00:00,  2.49s/it]

Optimal k = 31, Validation F1 = 0.3272687763444247





In [13]:
# Final model for feature set 1: a fresh classifier with the tuned k, fitted on
# the full training matrix (the rows held out for validation are included).
classifier = OnlineKNNClassifier(k=k_opt)
classifier.partial_fit(X_train, y_train)

In [14]:
# Time the test-set prediction. perf_counter() is a monotonic, high-resolution
# clock intended for measuring intervals, whereas time.time() follows the
# system clock and can jump if that clock is adjusted.
start = time.perf_counter()
y_pred = classifier.predict(X_test)
end = time.perf_counter()

# Elapsed prediction time in seconds.
predict_seconds = end - start

In [15]:
# Print accuracy, macro-F1 and the confusion matrix for the test predictions;
# the returned (accuracy, macro-F1) tuple is echoed as the cell output.
evaluate_classifier(y_test, y_pred, label="Online k-NN Classification")


Evaluation Online k-NN Classification
Accuracy : 0.5832
Macro-F1 : 0.3873
Confusion Matrix:
[[  643    30   225   244   536   124    20]
 [ 2065  1457   595  2487  1180    83   707]
 [ 1136   111   640   829   472    92    77]
 [ 1589   248   533  1911  8671   110   215]
 [  344   178   358  1078  3883   159   893]
 [  206    64    18   112   412  1079    91]
 [  107    33   205   313   844    33 29316]]


(0.5831535742105578, 0.38731361078154636)

In [16]:
# Store the feature-set-1 predictions as a column of the shared predictions
# file, then write the whole table back to the same path.
predictions_csv = "predictions/pm100/predictions.csv"

df = pd.read_csv(predictions_csv)
df["pred_runtime_knn_c7_fs1"] = y_pred
df.to_csv(predictions_csv, index=False)

## SET 2

In [17]:
# Feature matrices for the final fit and the test evaluation on feature set 2.
# They are cast to float exactly like the validation matrices below, so the
# models used for tuning k and the final model see identically typed inputs.
X_train = df_train[fs2].astype(float).values
X_test  = df_test[fs2].astype(float).values

# Size of the validation slice: 10% of the training rows, rounded down.
val_size = int(len(df_train) * 0.1)

# Consecutive split (no shuffling): the last `val_size` rows are held out.
# An explicit split position is used instead of negative slicing because,
# with val_size == 0, `iloc[:-0]` is empty and `iloc[-0:]` returns every row.
split_at = len(df_train) - val_size
df_train_sub = df_train.iloc[:split_at]
df_val       = df_train.iloc[split_at:]

# Targets
y_train_sub = df_train_sub[target_name].values
y_val       = df_val[target_name].values

# Features
X_train_sub = df_train_sub[fs2].astype(float).values
X_val       = df_val[fs2].astype(float).values

In [18]:
# Two-stage search for k, scored by macro-F1 on the validation slice:
# a coarse grid first, then every integer k around the coarse winner.
coarse_k = list(range(3, 54, 10))
coarse_scores = {}

for k in tqdm(coarse_k, desc="Coarse k search"):
    clf = OnlineKNNClassifier(k=k)
    clf.partial_fit(X_train_sub, y_train_sub)
    y_val_pred = clf.predict(X_val)
    coarse_scores[k] = f1_score(y_val, y_val_pred, average="macro")

coarse_results = list(coarse_scores.items())

# Iterating the dict follows insertion order, so ties go to the smallest k tried.
best_k_coarse = max(coarse_scores, key=coarse_scores.get)

# Refinement window: from 10 below the coarse winner (never under 3) up to
# and including 10 above it.
refine_range = range(max(3, best_k_coarse - 10), best_k_coarse + 11)
refined_scores = {}

for k in tqdm(refine_range, desc="Refining k"):
    clf = OnlineKNNClassifier(k=k)
    clf.partial_fit(X_train_sub, y_train_sub)
    y_val_pred = clf.predict(X_val)
    refined_scores[k] = f1_score(y_val, y_val_pred, average="macro")

refined_results = list(refined_scores.items())

# The selected score is the highest validation macro-F1, so it is named
# accordingly (it is a maximised F1, not a minimised error).
k_opt = max(refined_scores, key=refined_scores.get)
best_f1 = refined_scores[k_opt]

print(f"Optimal k = {k_opt}, Validation f1 = {best_f1}")

Coarse k search: 100%|███████████████████████████████████████████████████████████████████| 6/6 [00:47<00:00,  7.87s/it]
Refining k: 100%|██████████████████████████████████████████████████████████████████████| 21/21 [03:39<00:00, 10.47s/it]

Optimal k = 48, Validation f1 = 0.6188152979513021





In [19]:
# Final model for feature set 2: a fresh classifier with the tuned k, fitted on
# the full training matrix (the rows held out for validation are included).
classifier = OnlineKNNClassifier(k=k_opt)
classifier.partial_fit(X_train, y_train)

In [20]:
# Time the test-set prediction. perf_counter() is a monotonic, high-resolution
# clock intended for measuring intervals, whereas time.time() follows the
# system clock and can jump if that clock is adjusted.
start = time.perf_counter()
y_pred = classifier.predict(X_test)
end = time.perf_counter()

# Elapsed prediction time in seconds.
predict_seconds = end - start

In [21]:
# Print accuracy, macro-F1 and the confusion matrix for the test predictions;
# the returned (accuracy, macro-F1) tuple is echoed as the cell output.
evaluate_classifier(y_test, y_pred, label="Online k-NN Classification")


Evaluation Online k-NN Classification
Accuracy : 0.7420
Macro-F1 : 0.6055
Confusion Matrix:
[[  781   217   316   141   143   217     7]
 [   57  6164   617  1189   459    65    23]
 [  164  1223  1074   499   297    92     8]
 [   61  3712   440  5619  3322    82    41]
 [   55   499   236  1159  4455    98   391]
 [  148    30    58    90   100  1547     9]
 [   18   141    48   127   597    24 29896]]


(0.742045658817185, 0.6054546813995605)

In [None]:
# Store the feature-set-2 predictions as a column of the shared predictions
# file, then write the whole table back to the same path.
predictions_csv = "predictions/pm100/predictions.csv"

df = pd.read_csv(predictions_csv)
df["pred_runtime_knn_c7_fs2"] = y_pred
df.to_csv(predictions_csv, index=False)