In [2]:
#Import libraries
import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
from sklearn.linear_model import Ridge

In [46]:
#Define Ridge Regression class
class RidgeRegression:
    """Linear regression with an L2 penalty, trained by mini-batch gradient descent.

    The model is ``y ~ coef[0] + x . coef[1:]``. After ``fit`` the learned
    parameters are available as ``self.coef`` (intercept first).

    Parameters
    ----------
    C : float
        Inverse regularization strength. It is stored inverted in ``self.C``,
        so a larger ``C`` means a weaker penalty. Must be non-zero.
    batch_size : int
        Number of rows drawn (without replacement) for every gradient step.
        Values larger than the training set are clamped to the full set.
    learning_rate : float
        Step size of each gradient update.
    max_iter : int
        Upper bound on the number of gradient steps.
    tol : float
        Training stops early once the largest absolute value of the doubled
        residuals of the current batch is no greater than ``tol``.
    random_state : int or None
        Seed for the initial coefficients and the batch sampling. ``None``
        (default) keeps using NumPy's global random state.
    """

    def __init__(self, C = 1.0, batch_size = 1, learning_rate = 0.01, max_iter = 1000, tol = 1e-3,
                 random_state = None):
        # Kept under the historical attribute name; note that it holds 1 / C.
        self.C = 1.0 / C
        self.batch_size = batch_size
        self.lr = learning_rate
        self.max_iter = max_iter
        self.tol = tol
        self.random_state = random_state

    def fit(self, xdata, ydata):
        """Estimate ``self.coef`` from the training data.

        Parameters
        ----------
        xdata : array-like
            Feature matrix of shape (n_samples, n_features); a 1-D input is
            treated as n_samples rows of a single feature.
        ydata : array-like
            Targets, one per row of ``xdata``.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If the data are empty, the lengths disagree, or ``batch_size < 1``.
        """
        xdata = self.__as_2d(xdata)
        # Flatten so that a column-vector target cannot broadcast against the
        # 1-D predictions into an (n, n) residual matrix.
        ydata = self.__to_numpy(ydata).ravel()

        n_samples = xdata.shape[0]
        if n_samples == 0:
            raise ValueError("cannot fit on an empty dataset")
        if ydata.shape[0] != n_samples:
            raise ValueError("xdata and ydata must contain the same number of samples")
        if self.batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        # The module-level functions share the method names of RandomState, so
        # the unseeded path still honours a global np.random.seed(...).
        rng = np.random if self.random_state is None else np.random.RandomState(self.random_state)
        self.coef = rng.random_sample(xdata.shape[1] + 1)

        for _ in range(self.max_iter):
            x_batch, y_batch = self.__get_batch(xdata, ydata, rng)
            error = self.__loss_function(x_batch, y_batch)
            # This criterion looks at residual size, not gradient size, so it
            # only triggers when the batch is fitted almost exactly.
            if np.max(np.abs(error)) <= self.tol:
                break
            self.__update_coef(x_batch, error)
        return self

    def predict(self, xdata):
        """Return the model output for every row of ``xdata`` (requires a prior ``fit``)."""
        xdata = self.__as_2d(xdata)
        # Prepend a column of ones so that coef[0] acts as the intercept.
        x_pred = np.c_[np.ones(xdata.shape[0]), xdata]
        return np.dot(x_pred, self.coef)

    def __get_batch(self, xdata, ydata, rng):
        """Draw one mini-batch of distinct rows using ``rng``."""
        # Sampling without replacement cannot return more rows than exist.
        size = min(self.batch_size, len(xdata))
        batch = rng.choice(len(xdata), size = size, replace = False)
        return xdata[batch], ydata[batch]

    def __loss_function(self, xdata, ydata):
        """Return ``2 * (prediction - target)``: the squared-error derivative w.r.t. the prediction."""
        return 2 * (self.predict(xdata) - ydata)

    def __update_coef(self, x_batch, error):
        """Apply one gradient step computed from the batch and the L2 penalty."""
        x = np.c_[np.ones(x_batch.shape[0]), x_batch]
        penalty = 2 * self.C * self.coef
        # The intercept is not regularized; shrinking it would bias every
        # prediction toward zero.
        penalty[0] = 0.0
        # Average over the rows actually drawn, which can be fewer than
        # self.batch_size when the batch was clamped.
        self.coef -= self.lr * (np.dot(x.T, error) + penalty) / x.shape[0]

    def __as_2d(self, data):
        """Convert ``data`` to an array; a 1-D input becomes a single-feature column."""
        data = self.__to_numpy(data)
        return data.reshape(-1, 1) if data.ndim == 1 else data

    def __to_numpy(self, data):
        """Return ``data`` as a NumPy array (lists, tuples, DataFrames, ...)."""
        return np.asarray(data)

In [10]:
#Define cross-validation function
def cross_val(estimator, xdata, ydata, cv):
    """Score ``estimator`` with K-fold cross-validation.

    The data are split into ``cv`` folds with ``KFold``. For each split the
    estimator is re-fitted on the training rows and its predictions on the
    held-out rows are scored with ``r2_score``.

    ``xdata`` and ``ydata`` are indexed directly with the integer index
    arrays produced by ``KFold.split``.

    Returns a list with one R^2 value per fold, in split order.
    """
    splitter = KFold(n_splits = cv)
    fold_scores = []
    for fit_rows, eval_rows in splitter.split(xdata):
        # Slice every piece up front, then train and evaluate on this fold.
        x_fit, y_fit = xdata[fit_rows], ydata[fit_rows]
        x_eval, y_eval = xdata[eval_rows], ydata[eval_rows]
        estimator.fit(x_fit, y_fit)
        predicted = estimator.predict(x_eval)
        fold_scores.append(r2_score(y_eval, predicted))
    return fold_scores

In [11]:
#Read dataset
# The raw table (tab-separated, no header row) gets its own name so that
# `xdata` always refers to the scaled feature array and never to the DataFrame.
airfoil_raw = pd.read_csv('airfoil_self_noise.dat', sep = '\t', header = None)

# Column 5 is used as the regression target, columns 0-4 as the features.
ydata = airfoil_raw.iloc[:, 5].values
# NOTE(review): the scaler is fitted on the full dataset before cross-validation,
# so every fold's min/max already include its held-out rows. Consider fitting the
# scaler inside each fold if an unbiased score is required.
xdata = MinMaxScaler().fit_transform(airfoil_raw.iloc[:, :5].values)

In [47]:
#Cross-validate implemented algorithm
# 10-fold evaluation of the hand-written estimator; `score` holds one R^2 per fold.
score = cross_val(
    estimator = RidgeRegression(C = 1.0,
                                batch_size = 1200,
                                learning_rate = 0.01,
                                max_iter = 50000,
                                tol = 1e-8),
    xdata = xdata,
    ydata = ydata,
    cv = 10,
)
print('Mean r2 score (Implemented Ridge Regression): %5.3f' % np.mean(score))

Mean r2 score (Implemented Ridge Regression): 0.395


In [29]:
#Cross-validate sklearn Ridge Regression
# Reference run: sklearn's Ridge with default settings on the same data and fold count.
score = cross_val(
    estimator = Ridge(),
    xdata = xdata,
    ydata = ydata,
    cv = 10,
)
print('Mean r2 score (SkLearn Ridge Regression): %5.3f' % np.mean(score))

Mean r2 score (SkLearn Ridge Regression): 0.399
