In [1]:
# Setup
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm, kstest, anderson, shapiro
# for comparison only
from sklearn.linear_model import LinearRegression, Ridge, ElasticNet

In [2]:
# import data
data = pd.read_csv('https://raw.githubusercontent.com/dvasiliu/AML/main/Data%20Sets/Advertising.csv?raw=true')

In [3]:
# here the goal is to predict the Sales (cont. variable) by using values from TV, Radio and Newspaper advertising
x = data.loc[:,['TV','Radio','Newspaper']].values
y = data['Sales'].values

In [4]:
def zscore(x):
    return (x- np.mean(x,axis=0))/np.std(x,axis=0)

In [5]:
xscaled = zscore(x)

## Demonstrate the Elastic Net Regularization

In [9]:
def MSE(w,b,x,y):
    predictions = x@w + b
    errors = y-predictions
    return np.mean(errors**2)

In [28]:
def gradient_enet(w,b,x,y,alpha,lam):
    predictions = x@w + b
    errors = y - predictions
    return -2/len(y)*errors@x + alpha*(lam*np.sign(w) + (1-lam)*np.array(w)), -2/len(y)*sum(errors)

In [30]:
# we want the "optimal" value for the weights
maxiter = 2000
learning_rate = 0.01
# we want some stopping criteria, for example if the weights are no longer significantly updated
# we measure the difference between w_new and w_old vs epsilon -> tolerance for deciding convergence.
tol = 1e-6
w_old = [1,2,3]
b_old = 0
alpha = 0.01
lam = 0.5

In [33]:
# how to implement the gradient descent
for it in range(maxiter):
    gw, gb =  gradient_enet(w_old,b_old,xscaled,y,alpha,lam)
    w_new = w_old - learning_rate*gw
    b_new = b_old - learning_rate*gb

    if max(abs(w_new-w_old))<tol or abs(b_new-b_old)<tol:
        print("The algorithm has converged!")
        break

    w_old = w_new
    b_old = b_new
    if (it+1)%100==0:
        print('The MSE is : '+str(MSE(w_old,b_old,xscaled,y)))
w_trained = w_old
b_trained = b_old

The MSE is : 2.877187046786579
The MSE is : 2.7880588407635423
The MSE is : 2.7847385787653525
The MSE is : 2.7844645780848447
The MSE is : 2.7844156083395437
The algorithm has converged!


In [34]:
w_trained

array([ 3.90718497,  2.78024098, -0.01482842])

In [35]:
b_trained

np.float64(14.022450086450641)

In [19]:
# to do k-fold CV from scratch
# shuffle the data
ind = np.random.permutation(len(x))
# subdivide into k folds
k = 5
folds = np.array_split(ind,k)

# the actual CV loop

for fold in folds:
    testind = ind[fold]
    trainind = np.delete(ind,testind)
    ####

In [24]:
errors@x

array([460192.19099444,  64893.70172274,  74493.40650196])