In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### (6 points) Create your class that implements the Gradient Boosting concept, based on the locally weighted regression method (Lowess class), and that allows a user-prescribed number of boosting steps. The class you develop should have all the mainstream useful options, including “fit,” “is_fitted”,  and “predict,” methods.  Show applications with real data for regression, 10-fold cross-validations and compare the effect of different scalers, such as the “StandardScaler”, “MinMaxScaler”, and the “QuantileScaler”.  In the case of the “Concrete” data set, determine a choice of hyperparameters that yield lower MSEs for your method when compared to the eXtream Gradient Boosting library.


In [17]:
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler, MinMaxScaler, QuantileTransformer
from scipy.spatial import Delaunay
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import mean_squared_error as mse
from scipy import linalg
from scipy.interpolate import interp1d, LinearNDInterpolator, NearestNDInterpolator
from sklearn.decomposition import PCA
from scipy.spatial.distance import cdist
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from sklearn.metrics import r2_score
import warnings
from sklearn import linear_model
warnings.filterwarnings('ignore', category=DeprecationWarning)

# the following line(s) are necessary if you want to make SKlearn compliant functions
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted

#The above is the imports straight from the notebooks

data = pd.read_csv('drive/MyDrive/Gelila_WM/DATA_441/Data_Sets/concrete.csv')

x = data.loc[:,'cement':'age'].values
y = data['strength'].values

# Gaussian Kernel
def Gaussian(x):
  return np.where(np.abs(x)>4,0,1/(np.sqrt(2*np.pi))*np.exp(-1/2*x**2))

# this is the correct vectorized version
def Tricubic(x):
  return np.where(np.abs(x)>1,0,(1-np.abs(x)**3)**3)

# Epanechnikov Kernel
def Epanechnikov(x):
  return np.where(np.abs(x)>1,0,3/4*(1-np.abs(x)**2))

# Quartic Kernel
def Quartic(x):
  return np.where(np.abs(x)>1,0,15/16*(1-np.abs(x)**2)**2)

def weight_function(u,v,kern=Gaussian,tau=0.5):
    return kern(cdist(u, v, metric='euclidean')/(2*tau))

#The function below is called The Lowess Class - Version without Triangulation
#This is from Efficient_Applications_with_Distances, Weights and Boosting.ipynb

#This is the Lowess Class I will be using without Triangulation!!!!
class Lowess:
    def __init__(self, kernel = Gaussian, tau=0.05):
        self.kernel = kernel
        self.tau = tau

    def fit(self, x, y):
        kernel = self.kernel
        tau = self.tau
        self.xtrain_ = x
        self.yhat_ = y

    def predict(self, x_new):
        check_is_fitted(self)
        x = self.xtrain_
        y = self.yhat_
        lm = linear_model.Ridge(alpha=0.0001)
        w = weight_function(x,x_new,self.kernel,self.tau)

        if np.isscalar(x_new):
          lm.fit(np.diag(w)@(x.reshape(-1,1)),np.diag(w)@(y.reshape(-1,1)))
          yest = lm.predict([[x_new]])[0][0]
        else:
          n = len(x_new)
          yest_test = []
          #Looping through all x-points
          for i in range(n):
            lm.fit(np.diag(w[:,i])@x,np.diag(w[:,i])@y)
            yest_test.append(lm.predict([x_new[i]]))
        return np.array(yest_test).flatten()


class GradientBoostingLowess(BaseEstimator, RegressorMixin):
    def __init__(self, n_estimators=100, learning_rate=0.1, base_estimator=None, kernel=Gaussian, tau=0.05):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.base_estimator = base_estimator
        self.kernel = kernel
        self.tau = tau
        self.estimators_ = []

    def fit(self, X, y):
        X, y = check_X_y(X, y)
        self.X_ = X
        self.y_ = y
        self.is_fitted_ = True

        for _ in range(self.n_estimators):
            # Compute pseudo-residuals
            residuals = y - self.predict(X)

            # Fit a base estimator to the pseudo-residuals
            estimator = self.base_estimator
            est = estimator.fit(X, residuals)

            # Add the estimator to the list of estimators
            self.estimators_.append(est)

            # Update predictions with a scaled version of the new estimator
            y_pred = self.predict(X)
            y += self.learning_rate * est.predict(X)

        return self

    def predict(self, X):
        check_is_fitted(self)
        y_pred = np.zeros(len(X))

        for est in self.estimators_:
            y_pred += self.learning_rate * est.predict(X)

        return y_pred

    def is_fitted(self):
        return hasattr(self, 'is_fitted_') and self.is_fitted_

from sklearn.model_selection import cross_val_score

#Dictionary of scalers
scalers = {
    'StandardScaler': StandardScaler(),
    'MinMaxScaler': MinMaxScaler(),
    'QuantileScaler': QuantileTransformer(output_distribution='uniform')
}

# Iterate through different scalers
for scaler_name, scaler in scalers.items():
    print(f"\nWorking with: {scaler_name}")

    # Scale the data using the current scaler
    x_scaled = scaler.fit_transform(x)

    mse_lwr = []
    mse_rf = []
    kf = KFold(n_splits=10, shuffle=True, random_state=1234)
    model_rf = XGBRegressor(objective='reg:squarederror', n_estimators=50, reg_lambda=1, alpha=1, gamma=0.1, max_depth=3)
    model_1 = Lowess(kernel=Epanechnikov, tau=0.4)
    model_2 = Lowess(kernel=Epanechnikov, tau=0.4)

    for idxtrain, idxtest in kf.split(x):
        xtrain = x_scaled[idxtrain]
        ytrain = y[idxtrain].ravel()
        ytest = y[idxtest].ravel()
        xtest = x_scaled[idxtest]

        # Fit Lowess model on training data
        model_1.fit(xtrain, ytrain)
        yhat_train = model_1.predict(xtrain)
        residuals_train = ytrain - yhat_train

        # Fit second Lowess model on residuals
        model_2.fit(xtrain, residuals_train)

        # Predict using both models on test data
        residuals_hat = model_2.predict(xtest)
        yhat_lw = model_1.predict(xtest) + residuals_hat

        # Fit XGBRegressor on training data
        model_rf.fit(xtrain, ytrain)

        # Predict using XGBRegressor on test data
        yhat_rf = model_rf.predict(xtest)

        # Calculate MSE for Locally Weighted Regression and XGBRegressor
        mse_lwr.append(mse(ytest, yhat_lw))
        mse_rf.append(mse(ytest, yhat_rf))

    print('The Cross-validated Mean Squared Error for Locally Weighted Regression is: ' + str(np.mean(mse_lwr)))
    print('The Cross-validated Mean Squared Error for XGBRegressor: ' + str(np.mean(mse_rf)))



Working with: StandardScaler
The Cross-validated Mean Squared Error for Locally Weighted Regression is: 156.99172896515552
The Cross-validated Mean Squared Error for XGBRegressor: 23.24156551207661

Working with: MinMaxScaler
The Cross-validated Mean Squared Error for Locally Weighted Regression is: 45.41168269427634
The Cross-validated Mean Squared Error for XGBRegressor: 23.24156551207661

Working with: QuantileScaler
The Cross-validated Mean Squared Error for Locally Weighted Regression is: 25.586964609565683
The Cross-validated Mean Squared Error for XGBRegressor: 23.24156551207661


Based on this, I have realized now that the choice of scaling method can significantly impact the performance of Locally Weighted Regression. It seems that **MinMaxScaler and QuantileScaler** perform much better than Lowess in this context. However, the XGBRegressor's performance remains relatively consistent across different scaling methods, indicating that it's quite stubborn to changes in scaling!

### (3 points) Based on the Usearch library, create your own class that computes the k_Nearest Neighbors for Regression.

In [50]:
!pip install usearch



In [51]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from usearch.index import Index, MetricKind, search


x = data.drop(columns=['strength']).values
y = data['strength'].values

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)


knn_regressor = KNNRegressor(k=3)
knn_regressor.fit(x_train, y_train)


y_pred = knn_regressor.predict(x_test)


mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)


print(f'Adjusted Mean Squared Error with k=3: {mse:.4f}')
print(f'Adjusted R-squared with k=3: {r2:.4f}')


Adjusted Mean Squared Error with k=3: 80.7602
Adjusted R-squared with k=3: 0.6866
