In [2]:
import numpy as np
import pandas as pd

In [3]:
def get_data(path):
    df = pd.read_csv(path, skiprows=72, header=None, sep=r'\s+', index_col=0)
    df.columns = [f'A{i}' for i in range(1, 16)] + ['B']
    df = df.reset_index(drop=True)
    return df.iloc[:, :-1], df.iloc[:, -1]

In [4]:
def normalize_and_add_ones(X):
    X = np.array(X)
    X_max = np.max(X,axis=0)
    X_min = np.min(X,axis=0)
    X_normalized = (X-X_min) / (X_max-X_min)
    ones = np.ones(X_normalized.shape[0])
    return np.column_stack((ones,X_normalized))

In [5]:
class RidgeRegression:
    def __init__(self):
        pass
    def fit(self,X_train,y_train,LAMBDA):
        W = np.linalg.inv(X_train.T.dot(X_train) + LAMBDA*np.eye(X_train.shape[1])).dot(X_train.T.dot(y_train))
        return W

    def predict(self,W,X_new):
        return np.array(X_new).dot(W)

    def computeRSS(self,Y_new,Y_pred):
        return np.mean((Y_new-Y_pred)**2)

    def getTheBestLAMBDA(self,X_train,y_train):

        def crossValidation(num_folds,LAMBDA):
            row_ids = np.arange(X_train.shape[0])
            valid_ids = np.split(row_ids[:len(row_ids)-len(row_ids) % num_folds],num_folds)
            valid_ids[-1] = np.append(valid_ids[-1],row_ids[len(row_ids)-len(row_ids) % num_folds:])
            train_ids = [[k for k in row_ids if k not in valid_ids[i]] for i in range(num_folds)]
            avg_RSS = 0
            for i in range(num_folds):
                W = self.fit(X_train[train_ids[i]],y_train[train_ids[i]],LAMBDA)
                y_pred = self.predict(W,X_train[valid_ids[i]])
                avg_RSS+=self.computeRSS(y_train[valid_ids[i]],y_pred)
                return avg_RSS / num_folds

        def rangeScan(best_LAMBDA,min_RSS,LAMBDA_values):
            for current_LAMBDA in LAMBDA_values:
                avg_RSS = crossValidation(num_folds=2,LAMBDA=current_LAMBDA)
                if avg_RSS < min_RSS:
                    best_LAMBDA = current_LAMBDA
                    min_RSS = avg_RSS
            return best_LAMBDA,min_RSS

        best_LAMBDA, min_RSS = rangeScan(best_LAMBDA=0,min_RSS=1000**2,LAMBDA_values=range(50))

        LAMBDA_values = np.arange(max(0,(best_LAMBDA-1)*1000,(best_LAMBDA+1)*1000,1))*1.0/1000
        best_LAMBDA,min_RSS = rangeScan(best_LAMBDA=best_LAMBDA,min_RSS=min_RSS,LAMBDA_values=LAMBDA_values)
        
        return best_LAMBDA

In [6]:
X,y = get_data('Data/DeathRate.txt')

In [7]:
X = normalize_and_add_ones(X)

In [8]:
X_train,y_train = X[:50],y[:50]
X_test,y_test = X[50:],y[50:]

In [9]:
ridge_reg = RidgeRegression()
best_LAMBDA = ridge_reg.getTheBestLAMBDA(X_train,y_train)

In [10]:
best_LAMBDA

0.018

In [11]:
W_learned = ridge_reg.fit(X_train,y_train,best_LAMBDA)

In [12]:
y_pred = ridge_reg.predict(W_learned,X_test)

In [13]:
ridge_reg.computeRSS(y_test,y_pred)

1416.5610862599851