In [14]:
from typing import Dict, Tuple, Union
import warnings

import matplotlib.pyplot as plt
from modAL.disagreement import (
    max_std_sampling,
)
from modAL.models import ActiveLearner, CommitteeRegressor
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RationalQuadratic, ExpSineSquared
from sklearn.gaussian_process.kernels import (
    RBF,
    ConstantKernel as C,
    WhiteKernel as Wht,
    Matern as matk,
    RationalQuadratic as rq
)
from sklearn.linear_model import SGDRegressor, LinearRegression, BayesianRidge
from sklearn.metrics import (
    r2_score,
    mean_squared_error,
    mean_absolute_percentage_error,
    mean_absolute_error,
)
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR

from ubend_pl.models.model_list import GS_x, GS_xy, GS_y

In [2]:
def fit_and_validate_model(
    model: Union[SGDRegressor, SVR, GradientBoostingRegressor],
    train_X: np.ndarray,
    train_y: np.ndarray,
    val_X: np.ndarray,
    val_y: np.ndarray,
) -> Tuple[
    Union[SGDRegressor, SVR, GradientBoostingRegressor], Dict[str, float]
]:
    """Fit ``model`` on the training data and score it on the validation data.

    The four validation metrics are printed and also returned.

    Args:
        model: Estimator exposing ``fit`` and ``predict``.
        train_X: Training features passed to ``model.fit``.
        train_y: Training targets passed to ``model.fit``.
        val_X: Validation features passed to ``model.predict``.
        val_y: Validation targets the predictions are compared against.

    Returns:
        A ``(model, metrics)`` tuple, where ``model`` is the value returned by
        ``model.fit`` and ``metrics`` maps ``"r2"``, ``"mape"``, ``"mae"`` and
        ``"mse"`` to the corresponding validation score.
    """
    model = model.fit(train_X, train_y)
    val_preds = model.predict(val_X)
    metrics = {
        "r2": r2_score(val_y, val_preds),
        "mape": mean_absolute_percentage_error(val_y, val_preds),
        "mae": mean_absolute_error(val_y, val_preds),
        "mse": mean_squared_error(val_y, val_preds),
    }
    print(f"Validation R2 score: {metrics['r2']}")
    print(f"MAPE: {metrics['mape']}")
    print(f"MAE: {metrics['mae']}")
    print(f"MSE: {metrics['mse']}")
    return model, metrics


def get_test_score(
    model: Union[SGDRegressor, SVR, GradientBoostingRegressor],
    X: np.ndarray,
    y: np.ndarray,
) -> None:
    """Print the R2 score and MSE of ``model`` on ``(X, y)``.

    Args:
        model: Fitted estimator exposing ``predict``.
        X: Features to predict on.
        y: Reference targets.

    Returns:
        None. The scores are only printed. The R2 line is labelled
        "Validation R2 score" regardless of which split is passed in.
    """
    # Predict once and reuse the result for both metrics.
    preds = model.predict(X)
    print(f"Validation R2 score: {r2_score(y, preds)}")
    print(f"MSE: {mean_squared_error(y, preds)}")
    return None


def GP_regression(n_feature: int) -> GaussianProcessRegressor:
    """Create an unfitted Gaussian-process regressor for ``n_feature`` inputs.

    The kernel is ``Constant * Matern(nu=1.5) + White``. The Matern term gets
    one length scale (initial value 1.0, bounds [1e-8, 1e8]) per feature.

    Args:
        n_feature: Number of input features; sets how many length scales the
            Matern term receives.

    Returns:
        A ``GaussianProcessRegressor`` with 10 optimizer restarts and
        ``normalize_y=False``.
    """
    hyper_bounds = (1e-8, 1e8)
    length_scales = [1.0 for _ in range(n_feature)]
    length_scale_bounds = [[1e-8, 1e8] for _ in range(n_feature)]

    amplitude = C(1.0, hyper_bounds)
    matern = matk(length_scales, length_scale_bounds, 1.5)  # Matern kernel
    noise = Wht(1.0, hyper_bounds)

    return GaussianProcessRegressor(
        kernel=amplitude * matern + noise,
        n_restarts_optimizer=10,
        normalize_y=False,
    )


def GP_regression_std(
    GP_regression: GaussianProcessRegressor, X: np.ndarray
) -> np.intp:
    """Return the index of the row of ``X`` with the largest predictive std.

    Args:
        GP_regression: Fitted regressor whose ``predict(X, return_std=True)``
            returns a ``(mean, std)`` pair.
        X: Candidate points to evaluate.

    Returns:
        Position in ``X`` of the candidate with the highest predicted standard
        deviation.
    """
    # predict(..., return_std=True) yields (mean, std). Taking argmax over the
    # whole tuple would index into the stacked mean/std array, so only the
    # std component is used here.
    _, std = GP_regression.predict(X, return_std=True)
    return np.argmax(std)

In [3]:
# Read the train/validation CSVs and split each into feature columns and the
# "pt_loss" target column.
train_data = pd.read_csv("../data/processed/norm_train.csv")
val_data = pd.read_csv("../data/processed/norm_val.csv")

train_X = train_data.drop(columns="pt_loss")
train_y = train_data["pt_loss"]

val_X = val_data.drop(columns="pt_loss")
val_y = val_data["pt_loss"]

In [4]:
# Convert features and targets to NumPy arrays. np.asarray passes an existing
# ndarray through unchanged, so re-running this cell is safe. The previous
# ``.values`` access raised AttributeError on the second run.
train_X, val_X = np.asarray(train_X), np.asarray(val_X)
train_y, val_y = np.asarray(train_y), np.asarray(val_y)

In [38]:
# Active-learning budget: size of the random initial labelled set and the
# number of query/teach iterations.
n_start_points, n_query = 10, 80

In [39]:
# Pick n_start_points distinct row indices at random and use those rows as the
# initial labelled set (targets as a column vector).
# NOTE(review): no seed is set in this cell, so the draw differs between runs
# unless one is set elsewhere.
initial_X_i = np.random.choice(
    len(train_X), size=n_start_points, replace=False
)
initial_X, initial_y = (
    train_X[initial_X_i],
    train_y[initial_X_i].reshape(-1, 1),
)

In [40]:
# Active learner around a GP regressor; new points are chosen with
# max_std_sampling.
gp_estimator = GP_regression(train_X.shape[1])
regressor = ActiveLearner(
    estimator=gp_estimator,
    query_strategy=max_std_sampling,
    X_training=initial_X,
    y_training=initial_y,
)



In [41]:
# Query/teach loop: ask the learner for one point, then teach it that point's
# label.
# NOTE(review): the full train_X is offered as the pool on every iteration;
# rows already taught are not removed here. Confirm this is intended.
for i in range(n_query):
    query_idx, query_instance = regressor.query(train_X)
    queried_X = train_X[query_idx].reshape(1, -1)
    queried_y = train_y[query_idx].reshape(1, -1)
    regressor.teach(queried_X, queried_y)



In [42]:
# Score the active learner on the validation split.
val_preds = regressor.predict(val_X)

r2 = r2_score(val_y, val_preds)
mape = mean_absolute_percentage_error(val_y, val_preds)
mae = mean_absolute_error(val_y, val_preds)
mse = mean_squared_error(val_y, val_preds)

print(
    f"Validation R2 score: {r2}",
    f"MAPE: {mape}",
    f"MAE: {mae}",
    f"MSE: {mse}",
    sep="\n",
)

Validation R2 score: 0.8843144212351908
MAPE: 0.13287648160379206
MAE: 0.020446188582337956
MSE: 0.0018584916080713193


In [37]:
# NOTE(review): this cell repeats the previous cell's evaluation code. Its
# saved execution count (37) is lower than those of the cells above, so the
# stored output likely comes from an earlier run. Confirm whether the cell is
# still needed.
val_preds = regressor.predict(val_X)

r2 = r2_score(val_y, val_preds)
mape = mean_absolute_percentage_error(val_y, val_preds)
mae = mean_absolute_error(val_y, val_preds)
mse = mean_squared_error(val_y, val_preds)

print(
    f"Validation R2 score: {r2}",
    f"MAPE: {mape}",
    f"MAE: {mae}",
    f"MSE: {mse}",
    sep="\n",
)

Validation R2 score: 0.8618382464317226
MAPE: 0.16115961764019812
MAE: 0.025691885936280655
MSE: 0.0022195718974193316


In [26]:
# Committee setup: three GP learners, each with its own Matern smoothness and
# its own random initial set drawn from a separate block of 100 training rows.
n_initial = 10
# Take the feature count from the data instead of hard-coding 7, so the
# per-feature length scales always match train_X. This mirrors the other
# cells, which pass train_X.shape[1] to GP_regression.
n_feature = train_X.shape[1]
cmean = [1.0] * n_feature
cbound = [[1e-8, 1e8]] * n_feature


def _matern_kernel(nu):
    """Return Constant * Matern(nu) + White with the shared bounds."""
    return C(1.0, (1e-8, 1e8)) * matk(cmean, cbound, nu) + Wht(
        1.0, (1e-8, 1e8)
    )


# Kept as a module-level name in case other cells refer to it.
kernel = _matern_kernel(1.5)  # Matern kernel

# NOTE(review): scikit-learn documents nu values other than 0.5, 1.5, 2.5 and
# inf as considerably more expensive to evaluate. Confirm nu=2.0 is intended.
kernels = [_matern_kernel(nu) for nu in (1.5, 2.5, 2.0)]

# One random initial set per learner, drawn from rows [0, 100), [100, 200)
# and [200, 300) respectively. The draws happen in the same order as before.
initial_idx = [
    np.random.choice(
        range(start, start + 100), size=n_initial, replace=False
    )
    for start in (0, 100, 200)
]

learner_list = [
    ActiveLearner(
        estimator=GaussianProcessRegressor(member_kernel),
        X_training=train_X[idx],
        y_training=train_y[idx].reshape(-1, 1),
    )
    for idx, member_kernel in zip(initial_idx, kernels)
]



In [27]:
# Wrap the learners in learner_list into a committee that queries with
# max_std_sampling.
committee = CommitteeRegressor(
    learner_list=learner_list, query_strategy=max_std_sampling
)

In [28]:
# Run n_queries query/teach rounds on the committee.
# NOTE(review): the full train_X is offered as the pool on every round; rows
# already taught are not removed here. Confirm this is intended.
n_queries = 50
for idx in range(n_queries):
    query_idx, query_instance = committee.query(train_X)
    queried_X = train_X[query_idx]
    queried_y = train_y[query_idx].reshape(-1, 1)
    committee.teach(queried_X, queried_y)



In [29]:
# Score the committee on the validation split.
val_preds = committee.predict(val_X)

r2 = r2_score(val_y, val_preds)
mape = mean_absolute_percentage_error(val_y, val_preds)
mae = mean_absolute_error(val_y, val_preds)
mse = mean_squared_error(val_y, val_preds)

print(
    f"Validation R2 score: {r2}",
    f"MAPE: {mape}",
    f"MAE: {mae}",
    f"MSE: {mse}",
    sep="\n",
)

Validation R2 score: 0.8682709289044478
MAPE: 0.1256560243583271
MAE: 0.02130494705289656
MSE: 0.0021162306986234804


In [30]:
# NOTE(review): this cell repeats the previous cell's committee evaluation
# code. Confirm whether the duplicate is still needed.
val_preds = committee.predict(val_X)

r2 = r2_score(val_y, val_preds)
mape = mean_absolute_percentage_error(val_y, val_preds)
mae = mean_absolute_error(val_y, val_preds)
mse = mean_squared_error(val_y, val_preds)

print(
    f"Validation R2 score: {r2}",
    f"MAPE: {mape}",
    f"MAE: {mae}",
    f"MSE: {mse}",
    sep="\n",
)

Validation R2 score: 0.8682709289044478
MAPE: 0.1256560243583271
MAE: 0.02130494705289656
MSE: 0.0021162306986234804


In [9]:
def query_strategy_gsx(regressor, X, n_instances=1):
    """Query strategy that delegates the choice of the next point to ``GS_x``.

    Args:
        regressor: Learner whose ``estimator.X_train_`` holds the inputs the
            estimator was fitted on.
        X: Candidate pool to select from.
        n_instances: Accepted for signature compatibility; not used.

    Returns:
        ``(query_idx, X[query_idx])``, where ``query_idx`` is whatever
        ``GS_x(labelled_inputs, X)`` returns.
    """
    query_idx = GS_x(regressor.estimator.X_train_, X)
    return query_idx, X[query_idx]

In [10]:
# GS_x run: random initial labelled set plus a GP-backed active learner that
# queries with query_strategy_gsx.
n_start_points, n_query = 10, 80

# NOTE(review): no seed is set in this cell, so the draw differs between runs
# unless one is set elsewhere.
initial_X_i = np.random.choice(
    len(train_X), size=n_start_points, replace=False
)
initial_X, initial_y = (
    train_X[initial_X_i],
    train_y[initial_X_i].reshape(-1, 1),
)

gp_estimator = GP_regression(train_X.shape[1])
regressor = ActiveLearner(
    estimator=gp_estimator,
    query_strategy=query_strategy_gsx,
    X_training=initial_X,
    y_training=initial_y,
)



In [11]:
# Query/teach loop for the GS_x learner: one point per iteration.
# NOTE(review): the full train_X is offered as the pool on every iteration;
# rows already taught are not removed here. Confirm this is intended.
for i in range(n_query):
    query_idx, query_instance = regressor.query(train_X)
    queried_X = train_X[query_idx].reshape(1, -1)
    queried_y = train_y[query_idx].reshape(1, -1)
    regressor.teach(queried_X, queried_y)



In [12]:
# Score the GS_x learner on the validation split.
val_preds = regressor.predict(val_X)

r2 = r2_score(val_y, val_preds)
mape = mean_absolute_percentage_error(val_y, val_preds)
mae = mean_absolute_error(val_y, val_preds)
mse = mean_squared_error(val_y, val_preds)

print(
    f"Validation R2 score: {r2}",
    f"MAPE: {mape}",
    f"MAE: {mae}",
    f"MSE: {mse}",
    sep="\n",
)

Validation R2 score: 0.875651256044403
MAPE: 0.12241806296891647
MAE: 0.023534148212323653
MSE: 0.001997665565433343


In [18]:
def query_strategy_gsy(regressor, X, n_instances=1):
    """Query strategy that delegates the choice of the next point to ``GS_y``.

    Args:
        regressor: Learner whose ``estimator`` exposes ``y_train_`` and
            ``predict``.
        X: Candidate pool to select from.
        n_instances: Accepted for signature compatibility; not used.

    Returns:
        ``(query_idx, X[query_idx])``, where ``query_idx`` is whatever
        ``GS_y(labels, preds)`` returns. ``preds`` are the estimator's
        predictions on ``X`` reshaped to a column.
    """
    estimator = regressor.estimator
    labels = estimator.y_train_
    preds = estimator.predict(X, return_std=False).reshape(-1, 1)
    query_idx = GS_y(labels, preds)
    return query_idx, X[query_idx]

In [19]:
# GS_y run: random initial set, n_query single-point queries with
# query_strategy_gsy, then validation metrics.
n_start_points, n_query = 10, 80

# NOTE(review): no seed is set in this cell, so the draw differs between runs
# unless one is set elsewhere.
initial_X_i = np.random.choice(
    len(train_X), size=n_start_points, replace=False
)
initial_X, initial_y = (
    train_X[initial_X_i],
    train_y[initial_X_i].reshape(-1, 1),
)

gp_estimator = GP_regression(train_X.shape[1])
regressor = ActiveLearner(
    estimator=gp_estimator,
    query_strategy=query_strategy_gsy,
    X_training=initial_X,
    y_training=initial_y,
)

# Installs a global "ignore" filter for all warnings.
warnings.filterwarnings("ignore")

# NOTE(review): the full train_X is offered as the pool on every iteration;
# rows already taught are not removed here. Confirm this is intended.
for i in range(n_query):
    query_idx, query_instance = regressor.query(train_X)
    queried_X = train_X[query_idx].reshape(1, -1)
    queried_y = train_y[query_idx].reshape(1, -1)
    regressor.teach(queried_X, queried_y)

val_preds = regressor.predict(val_X)

r2 = r2_score(val_y, val_preds)
mape = mean_absolute_percentage_error(val_y, val_preds)
mae = mean_absolute_error(val_y, val_preds)
mse = mean_squared_error(val_y, val_preds)

print(
    f"Validation R2 score: {r2}",
    f"MAPE: {mape}",
    f"MAE: {mae}",
    f"MSE: {mse}",
    sep="\n",
)

(10, 1) (777, 1)
(11, 1) (777, 1)
(12, 1) (777, 1)
(13, 1) (777, 1)
(14, 1) (777, 1)
(15, 1) (777, 1)
(16, 1) (777, 1)
(17, 1) (777, 1)
(18, 1) (777, 1)
(19, 1) (777, 1)
(20, 1) (777, 1)
(21, 1) (777, 1)
(22, 1) (777, 1)
(23, 1) (777, 1)
(24, 1) (777, 1)
(25, 1) (777, 1)
(26, 1) (777, 1)
(27, 1) (777, 1)
(28, 1) (777, 1)
(29, 1) (777, 1)
(30, 1) (777, 1)
(31, 1) (777, 1)
(32, 1) (777, 1)
(33, 1) (777, 1)
(34, 1) (777, 1)
(35, 1) (777, 1)
(36, 1) (777, 1)
(37, 1) (777, 1)
(38, 1) (777, 1)
(39, 1) (777, 1)
(40, 1) (777, 1)
(41, 1) (777, 1)
(42, 1) (777, 1)
(43, 1) (777, 1)
(44, 1) (777, 1)
(45, 1) (777, 1)
(46, 1) (777, 1)
(47, 1) (777, 1)
(48, 1) (777, 1)
(49, 1) (777, 1)
(50, 1) (777, 1)
(51, 1) (777, 1)
(52, 1) (777, 1)
(53, 1) (777, 1)
(54, 1) (777, 1)
(55, 1) (777, 1)
(56, 1) (777, 1)
(57, 1) (777, 1)
(58, 1) (777, 1)
(59, 1) (777, 1)
(60, 1) (777, 1)
(61, 1) (777, 1)
(62, 1) (777, 1)
(63, 1) (777, 1)
(64, 1) (777, 1)
(65, 1) (777, 1)
(66, 1) (777, 1)
(67, 1) (777, 1)
(68, 1) (777, 

In [21]:
def query_strategy_gsxy(regressor, X, n_instances=1):
    """Query strategy that delegates the choice of the next point to ``GS_xy``.

    Labelled inputs and targets are read from the fitted estimator's
    ``X_train_`` / ``y_train_`` attributes.

    Args:
        regressor: Learner whose ``estimator`` exposes ``X_train_``,
            ``y_train_`` and ``predict``.
        X: Candidate pool to select from.
        n_instances: Accepted for signature compatibility; not used.

    Returns:
        ``(query_idx, X[query_idx])``, where ``query_idx`` is whatever
        ``GS_xy(labeled, X, labels, preds)`` returns.
    """
    estimator = regressor.estimator
    labeled, labels = estimator.X_train_, estimator.y_train_
    preds = estimator.predict(X, return_std=False).reshape(-1, 1)
    query_idx = GS_xy(labeled, X, labels, preds)
    return query_idx, X[query_idx]

In [22]:
# GS_xy run with a GP estimator: random initial set, n_query single-point
# queries with query_strategy_gsxy, then validation metrics.
n_start_points, n_query = 10, 80

# NOTE(review): no seed is set in this cell, so the draw differs between runs
# unless one is set elsewhere.
initial_X_i = np.random.choice(
    len(train_X), size=n_start_points, replace=False
)
initial_X, initial_y = (
    train_X[initial_X_i],
    train_y[initial_X_i].reshape(-1, 1),
)

gp_estimator = GP_regression(train_X.shape[1])
regressor = ActiveLearner(
    estimator=gp_estimator,
    query_strategy=query_strategy_gsxy,
    X_training=initial_X,
    y_training=initial_y,
)

# Installs a global "ignore" filter for all warnings.
warnings.filterwarnings("ignore")

# NOTE(review): the full train_X is offered as the pool on every iteration;
# rows already taught are not removed here. Confirm this is intended.
for i in range(n_query):
    query_idx, query_instance = regressor.query(train_X)
    queried_X = train_X[query_idx].reshape(1, -1)
    queried_y = train_y[query_idx].reshape(1, -1)
    regressor.teach(queried_X, queried_y)

val_preds = regressor.predict(val_X)

r2 = r2_score(val_y, val_preds)
mape = mean_absolute_percentage_error(val_y, val_preds)
mae = mean_absolute_error(val_y, val_preds)
mse = mean_squared_error(val_y, val_preds)

print(
    f"Validation R2 score: {r2}",
    f"MAPE: {mape}",
    f"MAE: {mae}",
    f"MSE: {mse}",
    sep="\n",
)

Validation R2 score: 0.8842471936747524
MAPE: 0.17334480285795253
MAE: 0.026259198836941156
MSE: 0.0018595716204483147


In [26]:
def query_strategy_gsxy(regressor, X, n_instances=1):
    """Query strategy that delegates the choice of the next point to ``GS_xy``.

    This redefines (and, once run, shadows) the earlier function of the same
    name. It reads the labelled data from the learner's ``X_training`` /
    ``y_training`` attributes rather than from attributes of the estimator.

    Args:
        regressor: Learner exposing ``X_training``, ``y_training`` and an
            ``estimator`` with ``predict``.
        X: Candidate pool to select from.
        n_instances: Accepted for signature compatibility; not used.

    Returns:
        ``(query_idx, X[query_idx])``, where ``query_idx`` is whatever
        ``GS_xy(labeled, X, labels, preds)`` returns.
    """
    labeled, labels = regressor.X_training, regressor.y_training
    preds = regressor.estimator.predict(X).reshape(-1, 1)
    query_idx = GS_xy(labeled, X, labels, preds)
    return query_idx, X[query_idx]

In [28]:
# GS_xy run with a gradient-boosting estimator: random initial set, n_query
# single-point queries with query_strategy_gsxy, then validation metrics.
n_start_points, n_query = 10, 100

# NOTE(review): no seed is set in this cell, so the draw differs between runs
# unless one is set elsewhere.
initial_X_i = np.random.choice(
    len(train_X), size=n_start_points, replace=False
)
initial_X, initial_y = (
    train_X[initial_X_i],
    train_y[initial_X_i].reshape(-1, 1),
)

regressor = ActiveLearner(
    estimator=GradientBoostingRegressor(),
    query_strategy=query_strategy_gsxy,
    X_training=initial_X,
    y_training=initial_y,
)

# Installs a global "ignore" filter for all warnings.
warnings.filterwarnings("ignore")

# NOTE(review): the full train_X is offered as the pool on every iteration;
# rows already taught are not removed here. Confirm this is intended.
for i in range(n_query):
    query_idx, query_instance = regressor.query(train_X)
    queried_X = train_X[query_idx].reshape(1, -1)
    queried_y = train_y[query_idx].reshape(1, -1)
    regressor.teach(queried_X, queried_y)

val_preds = regressor.predict(val_X)

r2 = r2_score(val_y, val_preds)
mape = mean_absolute_percentage_error(val_y, val_preds)
mae = mean_absolute_error(val_y, val_preds)
mse = mean_squared_error(val_y, val_preds)

print(
    f"Validation R2 score: {r2}",
    f"MAPE: {mape}",
    f"MAE: {mae}",
    f"MSE: {mse}",
    sep="\n",
)

Validation R2 score: 0.8897665539178857
MAPE: 0.20102425990161032
MAE: 0.02696590135195065
MSE: 0.001770902965259755
