<a href="https://colab.research.google.com/github/erazocar/regressions_ml_hw4/blob/main/Ridge_Lasso_HW4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Lasso and Ridge Regression Analysis**
# **Required Libraries**

In [None]:
#importing required libraries
import numpy as np
import os
import pandas as pd

#scikit-learn: dataset loaders and linear models (Ridge, Lasso)
import sklearn as sk
from sklearn import datasets
from sklearn import linear_model as lm

#Visualization libraries
import matplotlib.pyplot as plt

## **Function Definition**

In [None]:
#Reader for a whitespace-delimited 'index:value' data file; returns a DataFrame.
def _feature_value(cell):
  """Return the numeric part of an 'index:value' token; pass other cells through."""
  if isinstance(cell, str):
    return float(cell.split(':')[1])
  return cell


def reader(path):
  """Read a whitespace-delimited 'label index:value ...' file into a DataFrame.

  Args:
    path: Anything accepted by pandas.read_csv (file path or file-like object).

  Returns:
    A DataFrame with integer column labels. Columns that pandas already
    parsed as numbers (e.g. the label column) are left untouched; every
    other column has the text after the first ':' of each token converted
    to float.

  Note:
    Values are taken by position: the index before ':' is ignored, so the
    file is assumed to list every feature on every row.
  """
  #sep=r'\s+' is the supported spelling of the deprecated delim_whitespace=True.
  data = pd.read_csv(path, sep=r'\s+', header=None)
  for column in data.columns:
    if pd.api.types.is_numeric_dtype(data[column]):
      continue
    #Convert a whole column at once so it ends up with a float dtype instead
    #of an object column patched cell by cell.
    data[column] = data[column].map(_feature_value).astype(float)
  return data

In [None]:
#Creating dataset and labels for the Boston housing data
def boston_split(sets, testing_size, random_state=None):
  """Split a labelled DataFrame into training and testing parts.

  Args:
    sets: DataFrame whose column 0 holds the labels and whose remaining
      columns hold the features.
    testing_size: Forwarded as test_size to train_test_split.
    random_state: Optional seed forwarded to train_test_split so the split
      can be reproduced; the default None keeps the split random.

  Returns:
    A list [X_train, y_train, X_test, y_test].
  """
  #Imported here so the split does not depend on sklearn.model_selection
  #having been loaded as a side effect of another sklearn import.
  from sklearn.model_selection import train_test_split

  #Separating data into features and labels
  variables = sets.drop(0, axis=1)
  labels = sets[0]

  X_train, X_test, y_train, y_test = train_test_split(
      variables, labels, test_size=testing_size, random_state=random_state)
  return [X_train, y_train, X_test, y_test]

In [None]:
#Optimal solution function Ridge for BS data
def optim_ridge(data):
  """Fit Ridge on a DataFrame whose column 0 is the label.

  The penalty is 1 * (2 * number of non-missing labels).

  Returns:
    The tuple (coef_, intercept_) of the fitted model.
  """
  penalty = 1 * (2*data[0].count())
  model = lm.Ridge(alpha = penalty)
  features = data.drop(0, axis=1)
  targets = data[0]
  model.fit(features, targets)
  return model.coef_, model.intercept_

In [None]:
#Optimal solution function Lasso for BS data
def optim_lasso(data):
  """Fit Lasso with alpha = 1 on a DataFrame whose column 0 is the label.

  Returns:
    The tuple (coef_, intercept_) of the fitted model.
  """
  model = lm.Lasso(alpha = 1)
  features = data.drop(0, axis=1)
  targets = data[0]
  model.fit(features, targets)
  return model.coef_, model.intercept_

In [None]:
#Optimal solution function Ridge for E2006 data
def optim_ridge_e20(data):
  """Fit Ridge on a sequence whose item 0 is the features and item 1 the labels.

  The penalty is 1 * (2 * number of labels).

  Returns:
    The tuple (coef_, intercept_) of the fitted model.
  """
  penalty = 1 * (2*len(data[1]))
  model = lm.Ridge(alpha = penalty)
  model.fit(data[0], data[1])
  return model.coef_, model.intercept_

In [None]:
#Optimal solution function Lasso for E2006 data
def optim_lasso_e20(data):
  """Fit Lasso (alpha = 1) on a sequence: item 0 features, item 1 labels.

  Returns:
    The tuple (coef_, intercept_) of the fitted model.
  """
  model = lm.Lasso(alpha = 1)
  model.fit(data[0], data[1])
  return model.coef_, model.intercept_

In [None]:
#Ridge model generator (also plotter)
def ridge_model(X_train, y_train, X_test, y_test, alpha):
  """Fit Ridge for one alpha, plot test predictions and report the test RMSE.

  Args:
    X_train, y_train: Training features and labels.
    X_test, y_test: Held-out features and labels used for the plot and error.
    alpha: Regularisation strength; it is multiplied by 2 * len(y_train)
      before being handed to lm.Ridge.

  Returns:
    A list [rmse, intercept, coef_0, coef_1, ...]. The first entry is the
    root mean squared error on the test set, which is what the callers'
    comments, plot titles and printed messages describe (the previous
    implementation returned the residual sum of squares).
  """
  #Fitting the model
  ridgereg = lm.Ridge(alpha=alpha*(2*len(y_train)))
  ridgereg.fit(X_train, y_train)
  y_pred = ridgereg.predict(X_test)

  #Predicted vs. true test labels
  plt.scatter(y_test, y_pred)
  plt.tight_layout()
  plt.title('Plot for alpha: %.3g'%alpha)
  plt.show()

  #Plain arrays avoid any index alignment and keep the reduction vectorised.
  residuals = np.asarray(y_pred) - np.asarray(y_test)
  rmse = float(np.sqrt(np.mean(residuals**2)))
  return [rmse, ridgereg.intercept_, *ridgereg.coef_]

In [None]:
#Lasso model generator (also plotter)
def lasso_model(X_train, y_train, X_test, y_test, alpha):
  """Fit Lasso for one alpha, plot test predictions and report the test RMSE.

  Args:
    X_train, y_train: Training features and labels.
    X_test, y_test: Held-out features and labels used for the plot and error.
    alpha: Regularisation strength, passed to lm.Lasso unchanged.

  Returns:
    A list [rmse, intercept, coef_0, coef_1, ...]. The first entry is the
    root mean squared error on the test set, which is what the callers'
    comments, plot titles and printed messages describe (the previous
    implementation returned the residual sum of squares).
  """
  #Fitting the model
  lassoreg = lm.Lasso(alpha=alpha)
  lassoreg.fit(X_train, y_train)
  y_pred = lassoreg.predict(X_test)

  #Predicted vs. true test labels
  plt.scatter(y_test, y_pred)
  plt.tight_layout()
  plt.title('Plot for alpha: %.3g'%alpha)
  plt.show()

  #Plain arrays avoid any index alignment and keep the reduction vectorised.
  residuals = np.asarray(y_pred) - np.asarray(y_test)
  rmse = float(np.sqrt(np.mean(residuals**2)))
  return [rmse, lassoreg.intercept_, *lassoreg.coef_]

In [None]:
#Creating dataset and labels for the E2006 dataset
def e2006_split(training_set, testing_set):
  """Arrange two (features, labels) sequences as [X_train, y_train, X_test, y_test].

  Only items 0 and 1 of each argument are read; the objects are returned
  as they are, without copying.
  """
  return [
      training_set[0],
      training_set[1],
      testing_set[0],
      testing_set[1],
  ]

In [None]:
#Coefficient matrix generator
def coeff_matrices(alphas,model, X_train, y_train, X_test, y_test):
  """Run model once per alpha and stack the returned rows into a DataFrame.

  Args:
    alphas: Sequence of regularisation strengths.
    model: Callable invoked as model(X_train, y_train, X_test, y_test, alpha)
      that returns one row of values.
    X_train, y_train, X_test, y_test: Passed through to model unchanged.

  Returns:
    A DataFrame with one row per alpha, indexed 'alpha_<value>' with the
    value formatted as %.2g.
  """
  row_labels = ['alpha_%.2g'%value for value in alphas]
  rows = [model(X_train, y_train, X_test, y_test, value) for value in alphas]
  return pd.DataFrame(rows, index=row_labels)

In [None]:
#Plot RMSE for the datasets
def plot_rmse(alphas, matrix, name):
  """Plot column 0 of matrix against alphas and show the figure.

  Args:
    alphas: x-axis values.
    matrix: Object indexable with 0; matrix[0] supplies the y-axis values.
    name: String appended to the plot title.
  """
  errors = matrix[0]
  plt.plot(alphas, errors)
  plt.tight_layout()
  heading = 'Alpha vs RMSE for ' + name
  plt.title(heading)
  plt.show()

In [None]:
#Cross-validated alpha selection for ridge regression (RidgeCV)
def ridge_kfold(alphas,X_train, y_train):
  """Select the best ridge alpha by cross-validation with lm.RidgeCV.

  Args:
    alphas: Candidate regularisation strengths on the same scale that
      ridge_model expects (i.e. before the 2 * n scaling).
    X_train, y_train: Training features and labels; y_train may be a pandas
      Series or a numpy array.

  Returns:
    A tuple (cv_mse, best_alpha). cv_mse is the stored per-sample
    cross-validation error averaged over axis 0 (one value per candidate);
    best_alpha is the entry of alphas that RidgeCV selected, on the
    caller's unscaled scale.
  """
  #Scale element-wise: a plain list times an integer is list repetition, not
  #scaling, and len() works for numpy labels, which have no .count().
  candidates = np.asarray(alphas, dtype=float)
  scaled = candidates * (2 * len(y_train))

  #Newer scikit-learn names the option store_cv_results; older releases
  #only accept store_cv_values.
  try:
    regressor = lm.RidgeCV(alphas=scaled, store_cv_results=True)
  except TypeError:
    regressor = lm.RidgeCV(alphas=scaled, store_cv_values=True)
  regressor.fit(X_train, y_train)

  cv_errors = getattr(regressor, 'cv_results_', None)
  if cv_errors is None:
    cv_errors = regressor.cv_values_
  cv_mse = np.mean(cv_errors, axis=0)

  #Map the selected (scaled) alpha back to the candidate it came from so a
  #caller such as ridge_model does not apply the 2 * n factor twice.
  best_index = int(np.argmin(np.abs(scaled - regressor.alpha_)))
  return cv_mse, float(candidates[best_index])

In [None]:
#Function calculation Kfold CV for lasso regression
def lasso_kfold(alphas, X_train, y_train):
  """Return the alpha chosen by lm.LassoCV among the given candidates.

  Args:
    alphas: Candidate regularisation strengths handed to LassoCV.
    X_train, y_train: Training features and labels used for the fit.
  """
  fitted = lm.LassoCV(alphas=alphas).fit(X_train, y_train)
  return fitted.alpha_

## **Preprocessing**

In [None]:
#Paths to the Boston housing data files for problem 1 (original and scaled).
#Please set these to the local paths where the data files are located before running.
#Data for problem 1
path1 = '/content/drive/MyDrive/MachineLearning/homework-4/house.txt'
path2 = '/content/drive/MyDrive/MachineLearning/homework-4/house_scale.txt'

#Load both files into DataFrames with the reader helper.
BostonHouse = reader(path1)
BostonHouse_scaled = reader(path2)

In [None]:
#Data for problem 2: paths to the E2006 test and training files.
path3 = '/content/drive/MyDrive/MachineLearning/homework-4/E2006test.txt'
path4 = '/content/drive/MyDrive/MachineLearning/homework-4/E2006train.txt'

#Reading the data from the E2006 files with sklearn's svmlight loader.
#Each result is indexed later as [0] for the features and [1] for the labels.
E2006_test = datasets.load_svmlight_file(path3)
E2006_train = datasets.load_svmlight_file(path4)

## **Problem 1**

### **Question 1**


In [None]:
#Optimal solutions: fit ridge and lasso on the full scaled Boston data.
#Each helper returns the coefficient vector and the intercept.
BHS_ridge_opt_coef, BHS_ridge_opt_int = optim_ridge(BostonHouse_scaled)
BHS_lasso_opt_coef, BHS_lasso_opt_int = optim_lasso(BostonHouse_scaled)

In [None]:
#Show the fitted w vector and intercept for both lasso and ridge with alpha = 1
print('ridge cost vector w: '+ str(BHS_ridge_opt_coef), 'ridge best intercept:' +str(BHS_ridge_opt_int))
print()
print('lasso cost vector w: '+str(BHS_lasso_opt_coef), 'lasso best intercept:'+str(BHS_lasso_opt_int))

### **Question 2**

In [None]:
#Values of alphas used throughout the experiment.
alpha_use = [0, 0.001, 0.01, 0.1, 1, 10, 100]

#Hold out exactly 106 examples for testing, leaving 400 for training
#(assuming the 506-row data set the 400/106 figures imply). An integer test
#size is used because 21% of 506 is 106.26, which is not a whole number of rows.
#Used in both ridge and lasso regressions.
BHS_splitted = boston_split(BostonHouse_scaled, 106)

In [None]:
#Ridge coefficient table on the scaled Boston split, one row per alpha:
#column 0 is the test error reported by ridge_model, column 1 the intercept,
#and the remaining columns the coefficients.
BHS_matrix_ridge = coeff_matrices(alpha_use, ridge_model, *BHS_splitted)
BHS_matrix_ridge

In [None]:
#Lasso coefficient table on the scaled Boston split, one row per alpha:
#column 0 is the test error reported by lasso_model, column 1 the intercept,
#and the remaining columns the coefficients.
BHS_matrix_lasso = coeff_matrices(alpha_use, lasso_model, *BHS_splitted)
BHS_matrix_lasso

In [None]:
#Plotting alpha against the test error (column 0 of each matrix) for the
#Ridge and Lasso regressions
plot_rmse(alpha_use, BHS_matrix_ridge, 'Ridge')
plot_rmse(alpha_use, BHS_matrix_lasso, 'Lasso')

### **Question 3**

In [None]:
#Cross-validate on the training part of the scaled split (its first two
#entries) to pick the best alpha for each model.
#Same alphas as before, except alpha = 0, which is skipped.
BHS_ridge_mse, BHS_ridge_alpha = ridge_kfold(alpha_use[1:], *BHS_splitted[:2])
BHS_lasso_alpha = lasso_kfold(alpha_use[1:], *BHS_splitted[:2])

In [None]:
#Reporting the best alphas selected on the training part of the scaled split.
print('Ridge best alpha for BH scaled data: '+str(BHS_ridge_alpha))
print('Lasso best alpha for BH scaled data: '+str(BHS_lasso_alpha))

In [None]:
#Refit ridge with only the selected alpha and print column 0 of the result,
#i.e. the test error reported by ridge_model.
BHS_bestalpha_matrix_ridge = coeff_matrices([BHS_ridge_alpha], ridge_model, *BHS_splitted)
print('RMSE for best alpha in ridge on BH scaled data: '+str(BHS_bestalpha_matrix_ridge[0]))

In [None]:
#Refit lasso with only the selected alpha and print column 0 of the result,
#i.e. the test error reported by lasso_model.
BHS_bestalpha_matrix_lasso = coeff_matrices([BHS_lasso_alpha], lasso_model, *BHS_splitted)
print('RMSE for best alpha in lasso on BH scaled data: '+str(BHS_bestalpha_matrix_lasso[0]))

### **Question 4**

In [None]:
#Hold out exactly 106 examples of the original (unscaled) data for testing,
#leaving 400 for training (assuming the 506-row data set the 400/106 figures
#imply). An integer test size is used because 21% of 506 is 106.26, which is
#not a whole number of rows.
#Used in both ridge and lasso regressions.
BH_splitted = boston_split(BostonHouse, 106)

In [None]:
#Cross-validate on the training part of the original-data split (its first
#two entries). Same alphas as before, except alpha = 0, which is skipped.
BH_ridge_mse, BH_ridge_alpha = ridge_kfold(alpha_use[1:], *BH_splitted[:2])
BH_lasso_alpha = lasso_kfold(alpha_use[1:], *BH_splitted[:2])

In [None]:
#Reporting the best alphas selected on the training part of the original BH data.
print('Ridge best alpha for BH original data: '+str(BH_ridge_alpha))
print('Lasso best alpha for BH original data: '+str(BH_lasso_alpha))

In [None]:
#Refit ridge on the original BH data with only the selected alpha and print
#column 0 of the result, i.e. the test error reported by ridge_model.
BH_bestalpha_matrix_ridge = coeff_matrices([BH_ridge_alpha], ridge_model, *BH_splitted)
print('RMSE for best alpha in ridge on BH original data: '+str(BH_bestalpha_matrix_ridge[0]))

In [None]:
#Refit lasso on the original BH data with only the selected alpha and print
#column 0 of the result, i.e. the test error reported by lasso_model.
BH_bestalpha_matrix_lasso = coeff_matrices([BH_lasso_alpha], lasso_model, *BH_splitted)
print('RMSE for best alpha in lasso on BH original data: '+str(BH_bestalpha_matrix_lasso[0]))

## **Problem 2**

### **Question 1**

In [None]:
#Data arranged as [X_train, y_train, X_test, y_test] from the training and testing sets
e2006_data = e2006_split(E2006_train, E2006_test)

In [None]:
#Sanity check: shapes of the feature matrices and lengths of the label vectors
print(e2006_data[0].shape,len(e2006_data[1]),e2006_data[2].shape,len(e2006_data[3]))

In [None]:
#Optimal solutions: fit ridge and lasso on the E2006 training entries of e2006_data.
#Each helper returns the coefficient vector and the intercept.
e2006_ridge_opt_coef, e2006_ridge_opt_int = optim_ridge_e20(e2006_data)
e2006_lasso_opt_coef, e2006_lasso_opt_int = optim_lasso_e20(e2006_data)

In [None]:
#Show the fitted w vector and intercept for both lasso and ridge with alpha = 1
print('ridge cost vector w:'+ str(e2006_ridge_opt_coef), 'ridge best intercept:' +str(e2006_ridge_opt_int))
print()
print('lasso cost vector w:'+str(e2006_lasso_opt_coef), 'lasso best intercept:'+str(e2006_lasso_opt_int))

### **Question 2**

In [None]:
#Ridge coefficient table for the E2006 data, one row per alpha: column 0 is
#the test error reported by ridge_model, column 1 the intercept, and the
#remaining columns the coefficients.
e2006_matrix_ridge = coeff_matrices(alpha_use, ridge_model, *e2006_data)
e2006_matrix_ridge

In [None]:
#Lasso coefficient table for the E2006 data, one row per alpha: column 0 is
#the test error reported by lasso_model, column 1 the intercept, and the
#remaining columns the coefficients.
e2006_matrix_lasso = coeff_matrices(alpha_use, lasso_model, *e2006_data)
e2006_matrix_lasso

In [None]:
#Plotting alpha against the test error (column 0 of each matrix) for the
#Ridge and Lasso regressions on the E2006 data
plot_rmse(alpha_use, e2006_matrix_ridge, 'Ridge')
plot_rmse(alpha_use, e2006_matrix_lasso, 'Lasso')

### **Question 3**

In [None]:
#Cross-validate on the E2006 training data to pick the best alpha for each model.
#Same alphas as before, except alpha = 0, which is skipped.
#The split lives in e2006_data and the lasso helper is lasso_kfold; the
#previously referenced e2006_splitted / e2006_kfold names were never defined.
e2006_ridge_mse, e2006_ridge_alpha = ridge_kfold(alpha_use[1:],e2006_data[0], e2006_data[1])
e2006_lasso_alpha = lasso_kfold(alpha_use[1:], e2006_data[0], e2006_data[1])

In [None]:
#Reporting the best alphas selected on the E2006 training dataset.
print('Ridge best alpha for e2006 data: '+str(e2006_ridge_alpha))
print('Lasso best alpha for e2006 data: '+str(e2006_lasso_alpha))

In [None]:
#Refit ridge on the E2006 data with only the selected alpha and print column 0
#of the result. The split is read from e2006_data (e2006_splitted was never
#defined).
e2006_bestalpha_matrix_ridge= coeff_matrices([e2006_ridge_alpha], ridge_model, e2006_data[0], e2006_data[1], e2006_data[2], e2006_data[3])
print('RMSE for best alpha in ridge on e2006 data: '+str(e2006_bestalpha_matrix_ridge[0]))

In [None]:
#Refit lasso on the E2006 data with only the selected alpha and print column 0
#of the result. The split is read from e2006_data (e2006_splitted was never
#defined).
e2006_bestalpha_matrix_lasso= coeff_matrices([e2006_lasso_alpha], lasso_model, e2006_data[0], e2006_data[1], e2006_data[2], e2006_data[3])
print('RMSE for best alpha in lasso on e2006 data: '+str(e2006_bestalpha_matrix_lasso[0]))