In [None]:
import pandas as pd
from datetime import datetime

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, Lasso, ElasticNet, PoissonRegressor,BayesianRidge, ARDRegression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, GridSearchCV

In [None]:
# Load the raw measurements. The path is relative, so it is resolved against
# the kernel's working directory. It contains no placeholders, hence a plain
# string literal rather than an f-string.
df = pd.read_csv("../data/new/dataset2022_08_03-05_35_12_PM.csv")

In [None]:
# Derive the two regression targets. For each resource kind the total is the
# sum over the w1..w4 column groups of `<group>_num * <group>_<resource>`.
for resource in ("cpu", "mem"):
    per_group = [df[f"w{k}_num"] * df[f"w{k}_{resource}"] for k in range(1, 5)]
    df[f"tot_{resource}"] = per_group[0] + per_group[1] + per_group[2] + per_group[3]

In [None]:
# Columns kept for modelling: six input columns followed by the two derived totals.
all_cols = [
    "time",
    "nr_query_cols",
    "nr_query_conds",
    "nr_data_rows",
    "nr_data_cols",
    "nr_users",
    "tot_cpu",
    "tot_mem",
]

In [None]:
# Drop every column that is not listed in `all_cols` (raises KeyError if one is missing).
df = df.loc[:, all_cols]

In [None]:
# Feature columns (model inputs).
X_cols = [
    "time",
    "nr_query_cols",
    "nr_query_conds",
    "nr_data_rows",
    "nr_data_cols",
    "nr_users",
]
# Target columns (model outputs); the order fixes which prediction column is CPU and which is memory.
y_cols = ["tot_cpu", "tot_mem"]

In [None]:
# Feature matrix: all rows, only the input columns.
X = df.loc[:, X_cols]

In [None]:
# Target matrix: all rows, the two total-resource columns.
y = df.loc[:, y_cols]

In [None]:
# Hold out 30% of the rows for final evaluation; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, random_state=0, test_size=0.3)
X_train, X_test, y_train, y_test = split_parts

In [None]:
def __perform_grid_search(model, hyperparams):
    """Run an exhaustive 5-fold grid search and score the refitted winner.

    The search is fitted on the notebook-level ``X_train`` / ``y_train`` and
    the result is scored on the notebook-level ``X_test`` / ``y_test``; those
    four names are read from the global namespace, not passed in.

    Parameters
    ----------
    model : estimator
        Estimator handed to ``GridSearchCV``.
    hyperparams : dict
        Parameter grid handed to ``GridSearchCV``.

    Returns
    -------
    tuple
        ``(search, score)`` where ``search`` is the fitted ``GridSearchCV``
        and ``score`` is ``search.score(X_test, y_test)``. The search is
        configured with ``scoring="neg_root_mean_squared_error"``, so values
        closer to zero are better.
    """
    search = GridSearchCV(
        estimator=model,
        param_grid=hyperparams,
        scoring="neg_root_mean_squared_error",
        cv=5,
        refit=True,   # refit the best candidate on all training data so it can predict
        n_jobs=-1,    # use every available core
    ).fit(X_train, y_train)
    return search, search.score(X_test, y_test)

In [None]:
# ARD regression wrapped in MultiOutputRegressor so both target columns are handled.
print(datetime.now().strftime("%H:%M:%S"))  # start timestamp
ard = MultiOutputRegressor(ARDRegression())
# scikit-learn 1.3 renamed ``n_iter`` to ``max_iter`` and 1.5 removed the old
# name. Use whichever iteration-cap parameter this installation exposes so the
# grid stays valid on both old and new releases.
ard_iter_param = "max_iter" if "max_iter" in ARDRegression().get_params() else "n_iter"
ard_hyperparams = {f"estimator__{ard_iter_param}": [50, 100, 200, 300, 400, 500],
                   "estimator__fit_intercept": [True, False],
                   "estimator__alpha_1": [1e-6, 1e-4, 1e-2, 1e-8, 1e-10, 1e-12],
                   "estimator__alpha_2": [1e-6, 1e-4, 1e-2, 1e-8, 1e-10, 1e-12],
                   "estimator__lambda_1": [1e-6, 1e-4, 1e-2, 1e-8, 1e-10, 1e-12],
                   "estimator__lambda_2": [1e-6, 1e-4, 1e-2, 1e-8, 1e-10, 1e-12],}
ard_grid, ard_score = __perform_grid_search(ard, ard_hyperparams)
print("ARD: ", ard_score)
print(datetime.now().strftime("%H:%M:%S"))  # end timestamp
print(ard_grid.best_params_)

In [None]:
# Bayesian ridge regression wrapped in MultiOutputRegressor so both target columns are handled.
print(datetime.now().strftime("%H:%M:%S"))  # start timestamp
bayes = MultiOutputRegressor(BayesianRidge())
# scikit-learn 1.3 renamed ``n_iter`` to ``max_iter`` and 1.5 removed the old
# name. Use whichever iteration-cap parameter this installation exposes so the
# grid stays valid on both old and new releases.
bayes_iter_param = "max_iter" if "max_iter" in BayesianRidge().get_params() else "n_iter"
bayes_hyperparams = {f"estimator__{bayes_iter_param}": [50, 100, 200, 300, 400, 500],
                     "estimator__fit_intercept": [True, False],
                     "estimator__alpha_1": [1e-6, 1e-4, 1e-2, 1e-8, 1e-10, 1e-12],
                     "estimator__alpha_2": [1e-6, 1e-4, 1e-2, 1e-8, 1e-10, 1e-12],
                     "estimator__lambda_1": [1e-6, 1e-4, 1e-2, 1e-8, 1e-10, 1e-12],
                     "estimator__lambda_2": [1e-6, 1e-4, 1e-2, 1e-8, 1e-10, 1e-12],}
bayes_grid, bayes_score = __perform_grid_search(bayes, bayes_hyperparams)
print("Bayes: ", bayes_score)
print(datetime.now().strftime("%H:%M:%S"))  # end timestamp
print(bayes_grid.best_params_)

In [None]:
# Poisson regressor wrapped in MultiOutputRegressor so both target columns are handled.
print(f"{datetime.now():%H:%M:%S}")  # start timestamp
poisson = MultiOutputRegressor(estimator=PoissonRegressor())
poisson_hyperparams = dict(
    estimator__alpha=[1, 0.1, 0.01, 0.001, 0.0001, 0],
    estimator__fit_intercept=[True, False],
    estimator__max_iter=[100, 200, 300, 400, 500, 1000, 2000],
)
poisson_grid, poisson_score = __perform_grid_search(model=poisson, hyperparams=poisson_hyperparams)
print("Poisson: ", poisson_score)
print(f"{datetime.now():%H:%M:%S}")  # end timestamp
print(poisson_grid.best_params_)

In [None]:
# Ridge regression, searched over regularisation strength, intercept and solver.
print(f"{datetime.now():%H:%M:%S}")  # start timestamp
ridge = Ridge()
ridge_hyperparams = dict(
    alpha=[1, 0.1, 0.01, 0.001, 0.0001, 0],
    fit_intercept=[True, False],
    solver=["svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"],
)
ridge_grid, ridge_score = __perform_grid_search(model=ridge, hyperparams=ridge_hyperparams)
print("Ridge: ", ridge_score)
print(f"{datetime.now():%H:%M:%S}")  # end timestamp
print(ridge_grid.best_params_)

In [None]:
# Lasso regression, searched over regularisation strength, intercept and iteration cap.
print(f"{datetime.now():%H:%M:%S}")  # start timestamp
lasso = Lasso()
lasso_hyperparams = dict(
    alpha=[1, 0.1, 0.01, 0.001, 0.0001, 0],
    fit_intercept=[True, False],
    max_iter=[100, 200, 300, 400, 500, 1000, 2000],
)
lasso_grid, lasso_score = __perform_grid_search(model=lasso, hyperparams=lasso_hyperparams)
print("Lasso: ", lasso_score)
print(f"{datetime.now():%H:%M:%S}")  # end timestamp
print(lasso_grid.best_params_)

In [None]:
# Elastic net, searched over regularisation strength, L1/L2 mix, intercept and iteration cap.
print(f"{datetime.now():%H:%M:%S}")  # start timestamp
elastic_net = ElasticNet()
elastic_net_hyperparams = dict(
    alpha=[1, 0.1, 0.01, 0.001, 0.0001, 0],
    fit_intercept=[True, False],
    l1_ratio=[1, 0.1, 0.01, 0.001, 0.0001, 0],
    max_iter=[100, 200, 300, 400, 500, 1000, 2000],
)
elastic_net_grid, elastic_net_score = __perform_grid_search(
    model=elastic_net, hyperparams=elastic_net_hyperparams
)
print("Elastic Net: ", elastic_net_score)
print(f"{datetime.now():%H:%M:%S}")  # end timestamp
print(elastic_net_grid.best_params_)

In [None]:
# k-nearest-neighbours regression.
print(datetime.now().strftime("%H:%M:%S"))  # start timestamp
neigh = KNeighborsRegressor()
# "minkowski" with the default p=2 is the Euclidean metric, so listing both
# evaluated every candidate twice. Only "euclidean" is kept, which halves the
# grid without changing which models are compared.
# NOTE(review): n_neighbors up to 199 presumes every CV training fold has at
# least 199 rows -- confirm against the dataset size.
# NOTE(review): "algorithm" and "leaf_size" are documented as speed/memory
# knobs; consider dropping them from the search as well.
neigh_hyperparams = {"n_neighbors": range(1, 200), "algorithm": ["auto", "brute"],
                     "leaf_size": [1, 5, 10, 15, 30, 50, 100], "weights": ["uniform", "distance"],
                     "metric": ["euclidean"]}
neigh_grid, neigh_score = __perform_grid_search(neigh, neigh_hyperparams)
print("KNN: ", neigh_score)
print(datetime.now().strftime("%H:%M:%S"))  # end timestamp
print(neigh_grid.best_params_)

In [None]:
# Single decision tree.
print(datetime.now().strftime("%H:%M:%S"))  # start timestamp
# "sqrt"/"log2" pick a random feature subset at each split, so seed the tree
# to make the search repeatable (same seed as the train/test split).
tree = DecisionTreeRegressor(random_state=0)
tree_hyperparams = {"criterion": ["squared_error", "friedman_mse", "poisson"],
                    "max_depth": [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 20, 50, 100, None],
                    "min_samples_split": [2, 5, 10],
                    # None means "consider every feature", which is what "auto"
                    # meant for regressors. "auto" was deprecated in
                    # scikit-learn 1.1 and removed in 1.3, where it makes those
                    # fits fail.
                    "max_features": [None, "sqrt", "log2"]}
tree_grid, tree_score = __perform_grid_search(tree, tree_hyperparams)
print("Decision Tree: ", tree_score)
print(datetime.now().strftime("%H:%M:%S"))  # end timestamp
print(tree_grid.best_params_)

In [None]:
# Random forest.
print(datetime.now().strftime("%H:%M:%S"))  # start timestamp
# Bootstrapping and feature subsampling are random, so seed the forest to make
# the search repeatable (same seed as the train/test split).
forest = RandomForestRegressor(random_state=0)
forest_hyperparams = {"criterion": ["squared_error", "absolute_error"],
                      "max_depth": [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 20, 50, 100, None],
                      "min_samples_split": [2, 5, 10],
                      "n_estimators": [50, 100, 200, 250, 300, 350, 400, 500, 1000],
                      # None means "consider every feature", which is what
                      # "auto" meant for regressors. "auto" was deprecated in
                      # scikit-learn 1.1 and removed in 1.3, where it makes
                      # those fits fail.
                      "max_features": [None, "sqrt", "log2"]
                      }
forest_grid, forest_score = __perform_grid_search(forest, forest_hyperparams)
print("Random Forest: ", forest_score)
print(datetime.now().strftime("%H:%M:%S"))  # end timestamp
print(forest_grid.best_params_)

In [None]:
# Support-vector regression wrapped in MultiOutputRegressor so both target columns are handled.
print(datetime.now().strftime("%H:%M:%S"))  # start timestamp
svr = MultiOutputRegressor(SVR())
# Only the RBF kernel is searched. `degree` is used solely by the polynomial
# kernel and `cache_size` only sizes the kernel cache (a speed knob), so
# varying them re-fitted each (C, gamma) pair 20 times with identical results.
# They are left at the SVR defaults, cutting the grid from 900 to 45
# candidates; `best_params_` therefore no longer reports those two keys.
svr_hyperparams = {"estimator__C": [1, 2, 3, 4, 5], "estimator__kernel": ["rbf"],
                   "estimator__gamma": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]}
svr_grid, svr_score = __perform_grid_search(svr, svr_hyperparams)
print("SVR: ", svr_score)
print(datetime.now().strftime("%H:%M:%S"))  # end timestamp
print(svr_grid.best_params_)

In [None]:
# Gradient-boosted trees (XGBoost).
print(f"{datetime.now():%H:%M:%S}")  # start timestamp
xgb = XGBRegressor()
xgb_hyperparams = dict(
    n_estimators=[50, 100, 200, 250, 300, 350, 400, 500, 1000],
    max_depth=[3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 20, 50, 100, None],
    colsample_bytree=[0.1, 0.3, 0.5, 0.7, 0.9],
    min_child_weight=[1, 3, 5, 7, 9, 11, 25, 50],
    eta=[0.1, 0.3, 0.5, 0.7, 0.9],
)
xgb_grid, xgb_score = __perform_grid_search(model=xgb, hyperparams=xgb_hyperparams)
print("XGBoost: ", xgb_score)
print(f"{datetime.now():%H:%M:%S}")  # end timestamp
print(xgb_grid.best_params_)

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def get_metrics(actual, model, data):
    """Print per-target error metrics for ``model`` evaluated on ``data``.

    Output, in order: ``model.estimator``; then one line each for MSE, MAE,
    R^2 and adjusted R^2, every line holding the value for target column 0
    followed by the value for target column 1; then a blank line.

    Adjusted R^2 is ``1 - (1 - R^2) * (n - 1) / (n - 1 - p)`` with
    ``n = data.shape[0]`` and ``p = data.shape[1]``.

    Parameters
    ----------
    actual : object indexable with ``.iloc[:, k]``
        True target values; column 0 is treated as CPU, column 1 as memory.
    model : object with ``predict`` and an ``estimator`` attribute
        ``model.predict(data)`` must return something indexable as ``[:, k]``.
    data : object with a 2-element ``shape``
        Features passed to ``model.predict``.
    """
    predictions = model.predict(data)
    # Pair up truth and prediction for each of the two target columns.
    targets = [(actual.iloc[:, col], predictions[:, col]) for col in (0, 1)]

    print(model.estimator)
    for metric in (mean_squared_error, mean_absolute_error, r2_score):
        print(*(metric(truth, guess) for truth, guess in targets))

    # Degrees-of-freedom correction shared by both adjusted R^2 values.
    n_rows, n_feats = data.shape[0], data.shape[1]
    correction = (n_rows - 1) / (n_rows - 1 - n_feats)
    print(*((1 - ((1 - r2_score(truth, guess)) * correction)) for truth, guess in targets))
    print()

In [None]:
# Every fitted grid search, in the order the models were tuned above.
grids = (
    ard_grid,
    bayes_grid,
    poisson_grid,
    ridge_grid,
    lasso_grid,
    elastic_net_grid,
    neigh_grid,
    tree_grid,
    forest_grid,
    svr_grid,
    xgb_grid,
)

In [None]:
# Report held-out test-set metrics for each tuned model.
for fitted_grid in grids:
    get_metrics(actual=y_test, model=fitted_grid, data=X_test)