In [1]:
import numpy as np  # For array operations
import pandas as pd  # For Dataframe operations (similar to Excel spreadsheets)
from scipy.stats import norm

# For plotting figures
import matplotlib.pyplot as plt  
from matplotlib.ticker import (MultipleLocator, FormatStrFormatter,
                              AutoMinorLocator)

# Machine learning-related functions
from sklearn.preprocessing import StandardScaler, MinMaxScaler # For normalizing inputs
from sklearn.decomposition import PCA # Principal component analysis
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C ,WhiteKernel as Wht,Matern as matk

from tqdm.auto import tqdm  # progress bar

from warnings import filterwarnings # disable warnings
filterwarnings('ignore')

from sklearn.metrics import (
    r2_score,
    mean_squared_error,
    mean_absolute_percentage_error,
    mean_absolute_error,
)

from IPython import display
%matplotlib inline 

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
def preprocess_features(train_data: np.ndarray) -> StandardScaler:
    """Fit a ``StandardScaler`` on the training features and return it.

    Parameters
    ----------
    train_data : np.ndarray
        Feature matrix the scaler is fitted on.

    Returns
    -------
    StandardScaler
        The fitted scaler. Call ``transform`` on it to standardize any split
        with the statistics learned from ``train_data``.
    """
    # ``fit`` returns the estimator itself, so construction and fitting chain.
    return StandardScaler().fit(train_data)

In [28]:
def gpregression(Xtrain, Ytrain, Nfeature=None, nu=2.0, n_restarts_optimizer=5, random_state=0):
    """Fit a Gaussian-process regressor with an anisotropic Matern kernel.

    The kernel is ``Constant * Matern + White``: a signal-variance factor, a
    Matern term with one length scale per input feature, and an additive
    white-noise term. Every hyperparameter starts at 1.0 and is optimized
    within the bounds ``(1e-3, 1e3)``.

    Parameters
    ----------
    Xtrain : array-like of shape (n_samples, n_features)
        Training inputs.
    Ytrain : array-like of shape (n_samples,)
        Training targets.
    Nfeature : int, optional
        Number of Matern length scales (one per feature). When omitted it is
        taken from the second dimension of ``Xtrain``.
    nu : float, default=2.0
        Smoothness parameter of the Matern kernel. The default keeps the value
        this notebook has always used. NOTE: per the scikit-learn docs, values
        other than 0.5, 1.5, 2.5 and ``inf`` are considerably more expensive
        to evaluate.
    n_restarts_optimizer : int, default=5
        Number of additional hyperparameter-optimizer restarts.
    random_state : int, RandomState instance or None, default=0
        Seed for the optimizer restarts, so repeated fits on the same data
        give the same model. Pass ``None`` for unseeded restarts.

    Returns
    -------
    GaussianProcessRegressor
        The regressor fitted on ``(Xtrain, Ytrain)``.
    """
    if Nfeature is None:
        # One length scale per column of the training matrix.
        Nfeature = np.asarray(Xtrain).shape[1]

    length_scale = [1.0] * Nfeature
    length_scale_bounds = [(1e-3, 1e3)] * Nfeature

    # Keyword arguments make the Matern configuration explicit.
    kernel = (
        C(1.0, (1e-3, 1e3))
        * matk(length_scale=length_scale, length_scale_bounds=length_scale_bounds, nu=nu)
        + Wht(1.0, (1e-3, 1e3))
    )
    gp = GaussianProcessRegressor(
        kernel=kernel,
        n_restarts_optimizer=n_restarts_optimizer,
        normalize_y=False,
        random_state=random_state,
    )
    gp.fit(Xtrain, Ytrain)
    return gp

In [19]:
# Load the training and validation tables.
train_data = pd.read_csv("../data/processed/train.csv")
val_data = pd.read_csv("../data/processed/val.csv")

# Split each table into the regression target ("pt_loss") and the remaining
# feature columns.
train_y = train_data["pt_loss"]
train_X = train_data.drop(columns="pt_loss")
val_y = val_data["pt_loss"]
val_X = val_data.drop(columns="pt_loss")

In [20]:
# Learn the standardization statistics from the training features.
scaler = preprocess_features(train_X.to_numpy())

In [21]:
# Apply the same fitted scaler to both splits so they share one feature scale.
val_Xs = scaler.transform(val_X.to_numpy())
train_Xs = scaler.transform(train_X.to_numpy())

In [26]:
# Fit the GP on the standardized training set. The number of length scales is
# read from the data rather than hard-coded, so it cannot drift out of sync
# with the feature columns.
gpr = gpregression(train_Xs, train_y.values, Nfeature=train_Xs.shape[1])

In [23]:
# Predict on the standardized validation features with the current `gpr`
# and report the regression metrics.
val_preds = gpr.predict(val_Xs)

mae = mean_absolute_error(y_true=val_y, y_pred=val_preds)
mse = mean_squared_error(y_true=val_y, y_pred=val_preds)
mape = mean_absolute_percentage_error(y_true=val_y, y_pred=val_preds)
r2 = r2_score(y_true=val_y.values, y_pred=val_preds)

print(f"Validation R2 score: {r2}")
print(f"MAPE: {mape}")
print(f"MAE: {mae}")
print(f"MSE: {mse}")

Validation R2 score: 0.9226938036092356
MAPE: 0.09044659426946486
MAE: 0.01675632162442941
MSE: 0.001241925906220674


In [24]:
# Refit on only the first 100 standardized training rows, then score on the
# full validation split. The feature count is taken from the data instead of
# the hard-coded 7.
gpr = gpregression(train_Xs[:100], train_y.values[:100], Nfeature=train_Xs.shape[1])
val_preds = gpr.predict(val_Xs)
r2 = r2_score(val_y.values, val_preds)
mape = mean_absolute_percentage_error(val_y, val_preds)
mse = mean_squared_error(val_y, val_preds)
mae = mean_absolute_error(val_y, val_preds)
print(f"Validation R2 score: {r2}")
print(f"MAPE: {mape}")
print(f"MAE: {mae}")
print(f"MSE: {mse}")

Validation R2 score: 0.8632565080844814
MAPE: 0.1470761607920514
MAE: 0.025328323370975656
MSE: 0.002196787489822078


In [15]:
# Refit on the raw (unscaled) training features for comparison with the
# standardized fit. The feature count is taken from the data instead of the
# hard-coded 7.
gpr = gpregression(train_X.values, train_y.values, Nfeature=train_X.shape[1])

In [16]:
# Predict on the raw (unscaled) validation features with the current `gpr`
# and report the regression metrics.
val_preds = gpr.predict(val_X.values)

mae = mean_absolute_error(y_true=val_y, y_pred=val_preds)
mse = mean_squared_error(y_true=val_y, y_pred=val_preds)
mape = mean_absolute_percentage_error(y_true=val_y, y_pred=val_preds)
r2 = r2_score(y_true=val_y.values, y_pred=val_preds)

print(f"Validation R2 score: {r2}")
print(f"MAPE: {mape}")
print(f"MAE: {mae}")
print(f"MSE: {mse}")

Validation R2 score: 0.902779078840072
MAPE: 0.10556638610821935
MAE: 0.01828669581607512
MSE: 0.0015618564390988071


In [29]:
# Refit on only the first 100 raw (unscaled) training rows, then score on the
# full validation split. The feature count is taken from the data instead of
# the hard-coded 7.
gpr = gpregression(train_X.values[:100], train_y.values[:100], Nfeature=train_X.shape[1])
val_preds = gpr.predict(val_X.values)
r2 = r2_score(val_y.values, val_preds)
mape = mean_absolute_percentage_error(val_y, val_preds)
mse = mean_squared_error(val_y, val_preds)
mae = mean_absolute_error(val_y, val_preds)
print(f"Validation R2 score: {r2}")
print(f"MAPE: {mape}")
print(f"MAE: {mae}")
print(f"MSE: {mse}")

Validation R2 score: 0.8109405567971341
MAPE: 0.21667114145034735
MAE: 0.032739606515975093
MSE: 0.0030372445067979852
