In [None]:
import pandas as pd

# Read the gzip-compressed CSV into a DataFrame.
# NOTE(review): the path is absolute and rooted at "/" - confirm this is where the file lives
# on the machine that runs the notebook.
combined_data = pd.read_csv("/ChemBL2599_combined_data.csv.gz")
# Show (n_rows, n_columns) as the cell output.
combined_data.shape

(7572, 251)

In [3]:
# Columns that are not used as model inputs: identifiers, free text, assay/target/document
# metadata, and alternative encodings of the measured activity (the target kept for
# modelling is "pIC50_m").
non_feature_columns = [
    "Molecule ChEMBL ID", "Molecule Name", "Molecule Max Phase", "#RO5 Violations",
    "Compound Key", "Smiles", "Standard Type", "Standard Relation", "Standard Value", "Standard Units",
    "pChEMBL Value", "Data Validity Comment", "Comment", "Uo Units", "Ligand Efficiency BEI", "Ligand Efficiency LE",
    "Ligand Efficiency LLE", "Ligand Efficiency SEI", "Potential Duplicate", "Assay ChEMBL ID", "Assay Description",
    "Assay Type", "BAO Format ID", "BAO Label", "Assay Organism", "Assay Tissue ChEMBL ID", "Assay Tissue Name",
    "Assay Cell Type", "Assay Subcellular Fraction", "Assay Parameters", "Assay Variant Accession",
    "Assay Variant Mutation", "Target ChEMBL ID", "Target Name", "Target Organism", "Target Type", "Document ChEMBL ID",
    "Source ID", "Source Description", "Document Journal", "Document Year", "Cell ChEMBL ID", "Properties", "Action Type",
    "Standard Text Value", "Value", "IC50_m", "Converted Smiles",
]

# "Unnamed: 0" is the name pandas assigns to a header-less first CSV column, i.e. a row index
# that was written to disk together with the data. It carries no chemical information and
# must not reach the model as a feature, so it is dropped whenever it is present.
leaked_index_columns = [col for col in ("Unnamed: 0",) if col in combined_data.columns]

machine_learning_df = combined_data.drop(columns=non_feature_columns + leaked_index_columns)
machine_learning_df.head()

Unnamed: 0,Molecular Weight,AlogP,pIC50_m,converted_smile_0,converted_smile_1,converted_smile_2,converted_smile_3,converted_smile_4,converted_smile_5,converted_smile_6,...,converted_smile_190,converted_smile_191,converted_smile_192,converted_smile_193,converted_smile_194,converted_smile_195,converted_smile_196,converted_smile_197,converted_smile_198,converted_smile_199
0,369.47,2.52,6.499997,1.854377,808.395217,19.388541,15.774469,15.774469,12.935561,9.373808,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.61484
1,355.45,2.01,7.500038,1.858084,808.121772,18.681434,15.119768,15.119768,12.435561,8.847099,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.644743
2,375.86,2.36,6.799998,1.858084,813.693554,18.681434,14.497733,15.253662,12.435561,8.536081,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.630094
3,367.46,1.88,6.599998,1.582716,853.946965,18.802754,15.241088,15.241088,13.097357,9.554206,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.634156
4,359.39,3.25,6.657577,1.610247,1135.828414,18.802754,14.523392,14.523392,13.097357,8.296359,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.433853


In [4]:
from abc import ABC, abstractmethod
from typing import Iterable, Union
from sklearn.utils.validation import check_array, column_or_1d
from inspect import isclass
from typing import List, Tuple, Union

from sklearn.exceptions import NotFittedError


def check_is_fitted(applicability_domain,
                    attributes: Union[str, List[str], Tuple[str]],
                    msg: Union[str, None] = None,
                    all_or_any=all):
    """Raise unless ``applicability_domain`` is an instance exposing the given attributes.

    :param applicability_domain: object to inspect; must be an instance (not a class)
        that has a ``fit`` attribute
    :param attributes: attribute name, or list/tuple of attribute names, whose presence
        (tested with ``hasattr``) marks the instance as fitted
    :param msg: error message template; ``%(name)s`` is replaced by the instance's
        class name. A default message is used when omitted.
    :param all_or_any: ``all`` to require every attribute, ``any`` to require at least one
    :raises TypeError: if a class is passed, or the object has no ``fit`` attribute
    :raises NotFittedError: if the required attributes are missing
    """
    if isclass(applicability_domain):
        raise TypeError("{} is a class, not an instance.".format(applicability_domain))
    if not hasattr(applicability_domain, "fit"):
        # Wrap the argument in a one-element tuple: a bare tuple argument would otherwise be
        # unpacked by the % operator and fail with an unrelated formatting error.
        raise TypeError("%s is not an estimator instance." % (applicability_domain,))
    if msg is None:
        msg = (
            "This %(name)s instance is not fitted yet. Call 'fit' with "
            "appropriate arguments before using this applicability domain."
        )
    if not isinstance(attributes, (list, tuple)):
        attributes = [attributes]
    is_fitted = all_or_any([hasattr(applicability_domain, attr) for attr in attributes])
    if not is_fitted:
        raise NotFittedError(msg % {"name": type(applicability_domain).__name__})

class ApplicabilityDomain(ABC):
    """Abstract base class for applicability domains.

    Subclasses implement ``_fit`` and ``_contains``; this class validates the inputs,
    records the training shape and tracks whether ``fit`` has completed.
    """

    def __init__(self):
        # Flipped to True at the end of a successful fit().
        self.fitted_ = False

    def fit(self, X):
        """Fit the applicability domain to the training matrix ``X``.

        :param X: 2D array-like of shape (n_samples, n_features)
        :return: the fitted instance, so calls can be chained
        """
        X = check_array(X)
        # A failed re-fit must not leave the instance flagged as usable.
        self.fitted_ = False
        self.num_points, self.num_dims = X.shape
        self._fit(X)
        self.fitted_ = True
        return self

    @abstractmethod
    def _fit(self, X):
        """Subclass hook: learn the domain from the validated 2D array ``X``."""
        pass

    def contains(self, sample) -> Union[bool, Iterable[bool]]:
        """Tell whether ``sample`` lies inside the fitted domain.

        :param sample: one sample (1D) or several samples (2D, one per row)
        :return: whatever the subclass ``_contains`` returns for the validated sample(s)
        :raises NotFittedError: if ``fit`` has not completed successfully
        :raises ValueError: if the number of features differs from the training data
        """
        check_is_fitted(self, 'fitted_')
        # check_is_fitted only tests that the attribute exists, and __init__ always creates
        # it; the flag's value has to be checked explicitly.
        if not self.fitted_:
            raise NotFittedError(
                "This %(name)s instance is not fitted yet. Call 'fit' with "
                "appropriate arguments before using this applicability domain."
                % {"name": type(self).__name__}
            )
        try:
            sample = column_or_1d(sample)
        except ValueError:
            # Not 1D-compatible: validate it as a 2D matrix instead.
            sample = check_array(sample, accept_large_sparse=False)
        if sample.ndim == 1 and sample.shape[0] != self.num_dims:
            raise ValueError('sample must have the same number of features as the applicability domain; '
                             f'{sample.shape[0]} and {self.num_dims} respectively')
        elif sample.ndim == 2 and sample.shape[1] != self.num_dims:
            raise ValueError('sample must have the same number of features as the applicability domain; '
                             f'{sample.shape[1]} and {self.num_dims} respectively')
        return self._contains(sample)

    @abstractmethod
    def _contains(self, sample):
        """Subclass hook: membership test for validated 1D or 2D ``sample``."""
        pass

In [5]:
from math import floor
from typing import Union, Tuple, Optional
import numpy as np
import scipy
from numpy.random import RandomState
from scipy.spatial.distance import cdist, _METRICS as dist_fns
from scipy.stats import f as Fdistrib
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor, NearestNeighbors
from sklearn.neighbors._kde import KernelDensity
from sklearn.preprocessing import RobustScaler, MinMaxScaler, MaxAbsScaler, StandardScaler
from sklearn.utils.extmath import stable_cumsum

class TopKatApplicabilityDomain(ApplicabilityDomain):
    """Applicability domain based on an eigen-decomposition of range-scaled training data.

    Training features are scaled with their per-feature minimum and maximum, a column of
    ones is prepended, and the eigenvectors of ``S.T @ S`` define the projected space
    (called ``OPS`` in the code). A sample is inside the domain when its
    eigenvalue-weighted squared projection is below ``5 * num_dims / (2 * num_points)``.
    """

    def __init__(self):
        super().__init__()

    def _to_s_space(self, values):
        """Scale ``values`` with the per-feature extrema stored by ``_fit``."""
        span = self.X_max_ - self.X_min_
        # Features that were constant in training have a zero span; divide by 1 instead.
        return (2 * values - self.X_max_ - self.X_min_) / np.where(span != 0, span, 1)

    def _fit(self, X):
        """Store the feature extrema and the eigen-decomposition of the scaled data."""
        self.X_min_ = X.min(axis=0)
        self.X_max_ = X.max(axis=0)
        scaled = self._to_s_space(X)
        # Prepend a column of ones.
        S = np.c_[np.ones(scaled.shape[0]), scaled]
        values, vectors = np.linalg.eig(S.T.dot(S))
        # Keep only the real parts of the decomposition.
        self.eigen_val = np.real(values)
        self.eigen_vec = np.real(vectors)
        OPS = S.dot(self.eigen_vec)
        self.OPS_min_, self.OPS_max_ = OPS.min(axis=0), OPS.max(axis=0)

    def _contains(self, sample):
        """Return a bool for a 1D sample, or a boolean array (one entry per row) for 2D."""
        single = sample.ndim == 1
        scaled = self._to_s_space(sample)
        if single:
            Ssample = np.c_[1, scaled.reshape((1, -1))]
        else:
            Ssample = np.c_[np.ones((sample.shape[0], 1)), scaled]
        OPS_sample = Ssample.dot(self.eigen_vec)
        # Reciprocal eigenvalues; entries for exactly-zero eigenvalues stay 0.
        inverse_eigen = np.divide(np.ones_like(self.eigen_val, dtype=float),
                                  self.eigen_val,
                                  out=np.zeros_like(self.eigen_val),
                                  where=self.eigen_val != 0)
        dOPS = (OPS_sample * OPS_sample).dot(inverse_eigen)
        if single and isinstance(dOPS, np.ndarray):
            # Collapse the one-element result to a plain Python scalar.
            dOPS = dOPS.item()
        limit = (5 * (self.num_dims)) / (2 * self.num_points)
        return dOPS < limit


class LeverageApplicabilityDomain(ApplicabilityDomain):
    """Applicability domain based on the leverage of a sample.

    After standard scaling, the leverage of a sample ``x`` is ``x @ (X.T @ X)^+ @ x`` where
    ``X`` is the scaled training matrix. A sample is inside the domain when its leverage is
    below ``3 * (num_dims + 1) / num_points``.
    """

    def __init__(self):
        super().__init__()
        self.scaler = StandardScaler()

    def _fit(self, X):
        """Fit the scaler and store the (pseudo-)inverse of ``X.T @ X`` and the threshold."""
        X = self.scaler.fit_transform(X)
        # The pseudo-inverse equals the inverse for a full-rank matrix but stays defined
        # when features are constant or collinear, where a plain inverse raises
        # LinAlgError or returns numerically meaningless values.
        self.var_covar = np.linalg.pinv(X.T.dot(X))
        self.threshold = 3 * (self.num_dims + 1) / self.num_points

    def _contains(self, sample):
        """Return a bool for a 1D sample, or a boolean array (one entry per row) for 2D."""
        single = sample.ndim == 1
        scaled = self.scaler.transform(sample.reshape(1, -1) if single else sample)
        # Row-wise quadratic form x_i @ V @ x_i. Only the diagonal of
        # scaled @ V @ scaled.T is needed, so the full n x n product is never built.
        h = (scaled.dot(self.var_covar) * scaled).sum(axis=1)
        if single:
            return bool(h[0] < self.threshold)
        return h < self.threshold

class KNNApplicabilityDomain(ApplicabilityDomain):
    """Applicability domain based on the mean distance to the k nearest training samples.

    A sample is inside the domain when its mean k-nearest-neighbour distance does not
    exceed a threshold: either ``hard_threshold`` or, when that is not given, the
    ``alpha`` quantile of the training samples' own non-zero mean distances.
    """

    def __init__(self, k: int = 5,
                 alpha: float = 0.95,
                 hard_threshold: float = None,
                 scaling: Optional[str] = 'robust',
                 dist: str = 'euclidean',
                 scaler_kwargs=None,
                 njobs: int=1):
        """
        :param k: number of nearest neighbours to average over
        :param alpha: fraction (0 to 1) of the sorted training distances used as threshold
        :param hard_threshold: fixed distance threshold; a falsy value (None or 0) means
            the threshold is derived from ``alpha``
        :param scaling: one of 'robust', 'minmax', 'maxabs', 'standard' or None
        :param dist: name of a scipy distance metric
        :param scaler_kwargs: extra keyword arguments passed to the scaler constructor
        :param njobs: number of parallel jobs for the neighbour search
        :raises ValueError: if ``alpha`` is outside [0, 1] or ``scaling`` is unknown
        :raises NotImplementedError: if ``dist`` is not a known metric name
        """
        super().__init__()
        if scaler_kwargs is None:
            scaler_kwargs = {}
        if alpha > 1 or alpha < 0:
            raise ValueError('alpha must lie between 0 and 1')
        scaling_methods = ('robust', 'minmax', 'maxabs', 'standard', None)
        if scaling not in scaling_methods:
            raise ValueError(f'scaling method must be one of {scaling_methods}')
        scaler_classes = {
            'robust': RobustScaler,
            'minmax': MinMaxScaler,
            'maxabs': MaxAbsScaler,
            'standard': StandardScaler,
        }
        # scaling has been validated above, so the lookup cannot fail.
        self.scaler = None if scaling is None else scaler_classes[scaling](**scaler_kwargs)
        if dist not in dist_fns.keys():
            raise NotImplementedError('distance type is not available')
        self.dist = dist
        self.k = k
        self.alpha = alpha
        self.hard_threshold = hard_threshold
        self.nn = NearestNeighbors(n_neighbors=k, metric=dist, n_jobs=njobs)

    def _fit(self, X):
        """Fit scaler and neighbour index, then derive the distance threshold.

        :raises ValueError: if no ``hard_threshold`` is set and every training sample has
            a zero mean neighbour distance, so no threshold can be derived
        """
        self.X_norm = self.scaler.fit_transform(X) if self.scaler is not None else X
        self.nn.fit(self.X_norm)
        # Query k+1 neighbours and drop the first column: each training sample's nearest
        # neighbour is itself at distance 0.
        self.kNN_dist = self.nn.kneighbors(self.X_norm, return_distance=True, n_neighbors=self.k+1)[0][:, 1:].mean(axis=1)
        if self.hard_threshold:
            self.threshold_ = self.hard_threshold
        else:
            kNN_train_distance_sorted_ = np.trim_zeros(np.sort(self.kNN_dist))
            n_distances = kNN_train_distance_sorted_.shape[0]
            if n_distances == 0:
                raise ValueError('cannot derive a distance threshold: all training samples have a '
                                 'zero mean distance to their nearest neighbours; provide hard_threshold')
            # Clamp at 0: for small alpha, floor(n * alpha) - 1 would be -1 and wrap around
            # to the largest distance instead of the smallest.
            position = max(floor(n_distances * self.alpha) - 1, 0)
            self.threshold_ = kNN_train_distance_sorted_[position]
        return self

    def _contains(self, sample):
        """Return a boolean array with one entry per sample (length 1 for a 1D sample)."""
        if self.scaler is not None:
            if sample.ndim == 1:
                sample = self.scaler.transform(sample.reshape((1, len(sample))))
            else:
                sample = self.scaler.transform(sample)
        kNN_sample_dist = self.nn.kneighbors(sample, return_distance=True)[0].mean(axis=1)
        norm_dist = kNN_sample_dist / self.threshold_
        # A user-supplied threshold is exclusive; the quantile-derived one is inclusive.
        if self.hard_threshold:
            return norm_dist < 1
        return norm_dist <= 1

In [15]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.feature_selection import RFECV

# Target and descriptor matrix.
y = machine_learning_df["pIC50_m"]
X = machine_learning_df.drop(columns=["pIC50_m"])

# Hold out 20 % of the rows for the final evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

base_rf = RandomForestRegressor(random_state=42, n_jobs=-1)

# Recursive feature elimination with 3-fold CV: 10 features are removed per step and at
# least 20 are kept.
rfecv = RFECV(
    estimator=base_rf,
    step=10,
    min_features_to_select=20,
    cv=KFold(n_splits=3, shuffle=True, random_state=42),
    scoring="r2",
    n_jobs=-1,
)
rfecv.fit(X_train, y_train)

# Reduce both splits to the selected features.
X_train_rfe, X_test_rfe = rfecv.transform(X_train), rfecv.transform(X_test)

print(f"Optimal number of features: {rfecv.n_features_}")

# Hyperparameter candidates for the randomized search.
param_dist = dict(
    n_estimators=[200, 300, 400],
    max_depth=[None, 10, 15],
    min_samples_split=[5, 10, 15],
    min_samples_leaf=[2, 4, 6],
    max_features=["sqrt", 0.6, 0.7],
    bootstrap=[True],
)

random_search = RandomizedSearchCV(
    estimator=base_rf,
    param_distributions=param_dist,
    n_iter=50,
    scoring="r2",
    cv=3,
    verbose=1,
    n_jobs=-1,
    random_state=42,
)
random_search.fit(X_train_rfe, y_train)

best_rf = random_search.best_estimator_
print("Best parameters:", random_search.best_params_)

# Predictions of the tuned model on both splits.
y_pred_train, y_pred_test = best_rf.predict(X_train_rfe), best_rf.predict(X_test_rfe)

r2_train, mse_train, mae_train = (
    metric(y_train, y_pred_train)
    for metric in (r2_score, mean_squared_error, mean_absolute_error)
)
r2_test, mse_test, mae_test = (
    metric(y_test, y_pred_test)
    for metric in (r2_score, mean_squared_error, mean_absolute_error)
)

print(f"Training R2: {r2_train}, Test R2: {r2_test}")
print(f"Training MSE: {mse_train}, Test MSE: {mse_test}")
print(f"Training MAE: {mae_train}, Test MAE: {mae_test}")


Optimal number of features: 202
Fitting 3 folds for each of 50 candidates, totalling 150 fits
Best parameters: {'n_estimators': 200, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 0.6, 'max_depth': None, 'bootstrap': True}
Training R2: 0.8913321283602652, Test R2: 0.7065308558535759
Training MSE: 0.14556093844107282, Test MSE: 0.38190521338608413
Training MAE: 0.24938482548944252, Test MAE: 0.4205877601271368


In [18]:
# Relies on X_train_rfe / X_test_rfe, y_train / y_test and y_pred_train / y_pred_test
# produced by the Random Forest cell above.

ad = TopKatApplicabilityDomain()
ad.fit(X_train_rfe)

# Boolean masks: True where a row falls inside the applicability domain.
inside_ad_train, inside_ad_test = ad.contains(X_train_rfe), ad.contains(X_test_rfe)

# Observed and predicted values restricted to the in-domain rows.
y_train_inside, y_pred_train_inside = y_train[inside_ad_train], y_pred_train[inside_ad_train]
y_test_inside, y_pred_test_inside = y_test[inside_ad_test], y_pred_test[inside_ad_test]

r2_train, mse_train, mae_train = (
    metric(y_train_inside, y_pred_train_inside)
    for metric in (r2_score, mean_squared_error, mean_absolute_error)
)
r2_test, mse_test, mae_test = (
    metric(y_test_inside, y_pred_test_inside)
    for metric in (r2_score, mean_squared_error, mean_absolute_error)
)

print(f"Training R2: {r2_train}, Test R2: {r2_test}")
print(f"Training MSE: {mse_train}, Test MSE: {mse_test}")
print(f"Training MAE: {mae_train}, Test MAE: {mae_test}")

Training R2: 0.8886791572793092, Test R2: 0.6937782699111775
Training MSE: 0.14150102277951823, Test MSE: 0.3769613519391465
Training MAE: 0.24540610670821436, Test MAE: 0.41639559667792


In [19]:
# Relies on X_train_rfe / X_test_rfe, y_train / y_test and y_pred_train / y_pred_test
# produced by the Random Forest cell above.

ad = KNNApplicabilityDomain()
ad.fit(X_train_rfe)

# Boolean masks: True where a row falls inside the applicability domain.
inside_ad_train, inside_ad_test = ad.contains(X_train_rfe), ad.contains(X_test_rfe)

# Observed and predicted values restricted to the in-domain rows.
y_train_inside, y_pred_train_inside = y_train[inside_ad_train], y_pred_train[inside_ad_train]
y_test_inside, y_pred_test_inside = y_test[inside_ad_test], y_pred_test[inside_ad_test]

r2_train, mse_train, mae_train = (
    metric(y_train_inside, y_pred_train_inside)
    for metric in (r2_score, mean_squared_error, mean_absolute_error)
)
r2_test, mse_test, mae_test = (
    metric(y_test_inside, y_pred_test_inside)
    for metric in (r2_score, mean_squared_error, mean_absolute_error)
)

print(f"Training R2: {r2_train}, Test R2: {r2_test}")
print(f"Training MSE: {mse_train}, Test MSE: {mse_test}")
print(f"Training MAE: {mae_train}, Test MAE: {mae_test}")

Training R2: 0.8912425702876093, Test R2: 0.7080209269562807
Training MSE: 0.14858592901048454, Test MSE: 0.39223213957288783
Training MAE: 0.25218144710096424, Test MAE: 0.4274065223053353


In [20]:
import numpy as np
from scipy.stats import zscore

# Keep only the features retained by the recursive feature elimination.
selected_features = X.columns[rfecv.support_]

X_selected = X[selected_features]

X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

# Drop training rows whose target lies 3 or more standard deviations from the training
# mean. The test split is left untouched.
z_scores = np.abs(zscore(y_train))
mask = (z_scores < 3)
X_train_clean = X_train[mask]
y_train_clean = y_train[mask]

print(f"Removed {len(X_train) - len(X_train_clean)} outliers from training set.")

# Take the hyperparameters from the randomized search instead of a hand-copied set, so this
# model stays in sync whenever the search is rerun.
final_rf = RandomForestRegressor(**random_search.best_params_, random_state=42, n_jobs=-1)

final_rf.fit(X_train_clean, y_train_clean)

y_pred_train = final_rf.predict(X_train_clean)
y_pred_test = final_rf.predict(X_test)

r2_train = r2_score(y_train_clean, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
mse_train = mean_squared_error(y_train_clean, y_pred_train)
mse_test = mean_squared_error(y_test, y_pred_test)
mae_train = mean_absolute_error(y_train_clean, y_pred_train)
mae_test = mean_absolute_error(y_test, y_pred_test)

print(f"Training R2: {r2_train}, Test R2: {r2_test}")
print(f"Training MSE: {mse_train}, Test MSE: {mse_test}")
print(f"Training MAE: {mae_train}, Test MAE: {mae_test}")

Removed 74 outliers from training set.
Training R2: 0.8778325608824171, Test R2: 0.6441102280123767
Training MSE: 0.142587117545208, Test MSE: 0.463136115069882
Training MAE: 0.2485212298809906, Test MAE: 0.44564335475910405


In [21]:
# Relies on X_train_clean / X_test, y_train_clean / y_test and y_pred_train / y_pred_test
# produced by the Random Forest cell above.

ad = TopKatApplicabilityDomain()
ad.fit(X_train_clean)

# Boolean masks: True where a row falls inside the applicability domain.
inside_ad_train, inside_ad_test = ad.contains(X_train_clean), ad.contains(X_test)

# Observed and predicted values restricted to the in-domain rows.
y_train_inside, y_pred_train_inside = y_train_clean[inside_ad_train], y_pred_train[inside_ad_train]
y_test_inside, y_pred_test_inside = y_test[inside_ad_test], y_pred_test[inside_ad_test]

r2_train, mse_train, mae_train = (
    metric(y_train_inside, y_pred_train_inside)
    for metric in (r2_score, mean_squared_error, mean_absolute_error)
)
r2_test, mse_test, mae_test = (
    metric(y_test_inside, y_pred_test_inside)
    for metric in (r2_score, mean_squared_error, mean_absolute_error)
)

print(f"Training R2: {r2_train}, Test R2: {r2_test}")
print(f"Training MSE: {mse_train}, Test MSE: {mse_test}")
print(f"Training MAE: {mae_train}, Test MAE: {mae_test}")

Training R2: 0.8767775242659804, Test R2: 0.6673665648018905
Training MSE: 0.13845495121040607, Test MSE: 0.38162543959826384
Training MAE: 0.24443978631089297, Test MAE: 0.4189405124278786


In [22]:
# Relies on X_train_clean / X_test, y_train_clean / y_test and y_pred_train / y_pred_test
# produced by the Random Forest cell above.

ad = KNNApplicabilityDomain()
ad.fit(X_train_clean)

# Boolean masks: True where a row falls inside the applicability domain.
inside_ad_train, inside_ad_test = ad.contains(X_train_clean), ad.contains(X_test)

# Observed and predicted values restricted to the in-domain rows.
y_train_inside, y_pred_train_inside = y_train_clean[inside_ad_train], y_pred_train[inside_ad_train]
y_test_inside, y_pred_test_inside = y_test[inside_ad_test], y_pred_test[inside_ad_test]

r2_train, mse_train, mae_train = (
    metric(y_train_inside, y_pred_train_inside)
    for metric in (r2_score, mean_squared_error, mean_absolute_error)
)
r2_test, mse_test, mae_test = (
    metric(y_test_inside, y_pred_test_inside)
    for metric in (r2_score, mean_squared_error, mean_absolute_error)
)

print(f"Training R2: {r2_train}, Test R2: {r2_test}")
print(f"Training MSE: {mse_train}, Test MSE: {mse_test}")
print(f"Training MAE: {mae_train}, Test MAE: {mae_test}")

Training R2: 0.8773465353709234, Test R2: 0.6442096970151443
Training MSE: 0.14571388509549324, Test MSE: 0.4779534037295207
Training MAE: 0.2516757570766046, Test MAE: 0.45428817647449576


In [23]:
# Keep only the features retained by the recursive feature elimination.
selected_features = X.columns[rfecv.support_]

X_selected = X[selected_features]

X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

# Interquartile range of the training target.
Q1 = y_train.quantile(0.25)
Q3 = y_train.quantile(0.75)
IQR = Q3 - Q1

# Keep training rows whose target lies within 1.5 * IQR of the quartiles. The test split
# is left untouched.
mask = (y_train >= (Q1 - 1.5 * IQR)) & (y_train <= (Q3 + 1.5 * IQR))
X_train_clean = X_train[mask]
y_train_clean = y_train[mask]

print(f"Removed {len(X_train) - len(X_train_clean)} outliers from training set using IQR.")

# Take the hyperparameters from the randomized search instead of a hand-copied set, so this
# model stays in sync whenever the search is rerun.
final_rf = RandomForestRegressor(**random_search.best_params_, random_state=42, n_jobs=-1)

final_rf.fit(X_train_clean, y_train_clean)

y_pred_train = final_rf.predict(X_train_clean)
y_pred_test = final_rf.predict(X_test)

r2_train = r2_score(y_train_clean, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
mse_train = mean_squared_error(y_train_clean, y_pred_train)
mse_test = mean_squared_error(y_test, y_pred_test)
mae_train = mean_absolute_error(y_train_clean, y_pred_train)
mae_test = mean_absolute_error(y_test, y_pred_test)

print(f"Training R2: {r2_train}, Test R2: {r2_test}")
print(f"Training MSE: {mse_train}, Test MSE: {mse_test}")
print(f"Training MAE: {mae_train}, Test MAE: {mae_test}")

Removed 93 outliers from training set using IQR.
Training R2: 0.8788297516353866, Test R2: 0.6295325894404422
Training MSE: 0.1377237829368992, Test MSE: 0.482106682437953
Training MAE: 0.24684874366445994, Test MAE: 0.4499569013762883


In [24]:
# Relies on X_train_clean / X_test, y_train_clean / y_test and y_pred_train / y_pred_test
# produced by the Random Forest cell above.

ad = TopKatApplicabilityDomain()
ad.fit(X_train_clean)

# Boolean masks: True where a row falls inside the applicability domain.
inside_ad_train, inside_ad_test = ad.contains(X_train_clean), ad.contains(X_test)

# Observed and predicted values restricted to the in-domain rows.
y_train_inside, y_pred_train_inside = y_train_clean[inside_ad_train], y_pred_train[inside_ad_train]
y_test_inside, y_pred_test_inside = y_test[inside_ad_test], y_pred_test[inside_ad_test]

r2_train, mse_train, mae_train = (
    metric(y_train_inside, y_pred_train_inside)
    for metric in (r2_score, mean_squared_error, mean_absolute_error)
)
r2_test, mse_test, mae_test = (
    metric(y_test_inside, y_pred_test_inside)
    for metric in (r2_score, mean_squared_error, mean_absolute_error)
)

print(f"Training R2: {r2_train}, Test R2: {r2_test}")
print(f"Training MSE: {mse_train}, Test MSE: {mse_test}")
print(f"Training MAE: {mae_train}, Test MAE: {mae_test}")

Training R2: 0.8783735046328097, Test R2: 0.6610809508694999
Training MSE: 0.13370224371954362, Test MSE: 0.38936250499707353
Training MAE: 0.2429276653807333, Test MAE: 0.42126439932481247


In [25]:
# Relies on X_train_clean / X_test, y_train_clean / y_test and y_pred_train / y_pred_test
# produced by the Random Forest cell above.

ad = KNNApplicabilityDomain()
ad.fit(X_train_clean)

# Boolean masks: True where a row falls inside the applicability domain.
inside_ad_train, inside_ad_test = ad.contains(X_train_clean), ad.contains(X_test)

# Observed and predicted values restricted to the in-domain rows.
y_train_inside, y_pred_train_inside = y_train_clean[inside_ad_train], y_pred_train[inside_ad_train]
y_test_inside, y_pred_test_inside = y_test[inside_ad_test], y_pred_test[inside_ad_test]

r2_train, mse_train, mae_train = (
    metric(y_train_inside, y_pred_train_inside)
    for metric in (r2_score, mean_squared_error, mean_absolute_error)
)
r2_test, mse_test, mae_test = (
    metric(y_test_inside, y_pred_test_inside)
    for metric in (r2_score, mean_squared_error, mean_absolute_error)
)

print(f"Training R2: {r2_train}, Test R2: {r2_test}")
print(f"Training MSE: {mse_train}, Test MSE: {mse_test}")
print(f"Training MAE: {mae_train}, Test MAE: {mae_test}")

Training R2: 0.8784330931901223, Test R2: 0.6293987483826379
Training MSE: 0.1405745140706707, Test MSE: 0.49784979565471277
Training MAE: 0.2498505991910728, Test MAE: 0.45844278504563746
