# Simple Linear versus Ridge Regression 

## Step 1:  Getting, understanding, and preprocessing the dataset

We first import the standard libraries and some libraries that will help us scale the data and perform some "feature engineering" by transforming the data into $\Phi_2({\bf x})$

In [None]:
import numpy as np
import sklearn
from sklearn.datasets import load_boston
from sklearn.preprocessing import PolynomialFeatures
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import sklearn.linear_model
from sklearn.model_selection import KFold

###  Importing the dataset

In [None]:
# Load the Boston housing dataset shipped with scikit-learn into `boston_data`.
# Later cells read its .data, .target and .feature_names attributes.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2 -
# confirm the installed scikit-learn version still provides it.
boston_data = load_boston()
# print(boston_data.data)

In [None]:
# Feature matrix X and raw target vector y, taken straight from the dataset.
X, y = boston_data.data, boston_data.target

# Rows are examples, columns are features.
n_examples, n_features = X.shape

# Rank-2 (column) version of the targets: one row per example.
Y = y.reshape(n_examples, 1)

# Summary of the dataset: feature count, feature names, example count,
# followed by the first two rows of X.
print('The number of features is: ', n_features)
print('The features: ', boston_data.feature_names)
print('The number of examples in our dataset: ', n_examples)
print(X[:2])


The number of features is:  13
The features:  ['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']
The number of examples in our dataset:  506
[[6.3200e-03 1.8000e+01 2.3100e+00 0.0000e+00 5.3800e-01 6.5750e+00
  6.5200e+01 4.0900e+00 1.0000e+00 2.9600e+02 1.5300e+01 3.9690e+02
  4.9800e+00]
 [2.7310e-02 0.0000e+00 7.0700e+00 0.0000e+00 4.6900e-01 6.4210e+00
  7.8900e+01 4.9671e+00 2.0000e+00 2.4200e+02 1.7800e+01 3.9690e+02
  9.1400e+00]]


We will also create polynomial features for the dataset to test linear and ridge regression on data with d = 1 and data with d = 2. Feel free to increase the degree and see what effect it has on the training and test error. 

In [None]:
# Build the degree-2 polynomial expansion of X using PolynomialFeatures(degree=2).
# X_2 receives the transformed feature matrix.
# y_2 is bound to the same array object as Y (plain assignment - no copy is made).

pfeatures = PolynomialFeatures(degree=2)
  
# Fit the transformer on X and expand X into its degree-2 polynomial features.
X_2 = pfeatures.fit_transform(X)
y_2 = Y

In [None]:
# Print the shapes of X_2 and y_2 - should be (506, 105) and (506, 1) respectively
print(X_2.shape)
print(y_2.shape)


(506, 105)
(506, 1)


# Your code goes here

In [None]:
# Define the get_coeff_ridge_normaleq function. Use the normal equation method.
# Return w values

def get_coeff_ridge_normaleq(X_train, y_train, alpha):
    """Return the ridge-regression weights from the regularised normal equations.

    Solves ``(X^T X + alpha * I) w = X^T y`` for ``w``. Every column of
    ``X_train`` is penalised equally; no intercept column is added or
    treated specially here.

    Args:
        X_train: 2-D array of training features, one row per example.
        y_train: Training targets, either 1-D (n,) or 2-D (n, k).
        alpha: Regularisation strength multiplying the identity matrix.

    Returns:
        Weight array ``w`` with shape (d,) for 1-D targets or (d, k) for
        2-D targets, where d is the number of columns of ``X_train``.

    Raises:
        numpy.linalg.LinAlgError: If ``X^T X + alpha * I`` is singular.
    """
    n_features = X_train.shape[1]
    regularised_gram = np.dot(X_train.T, X_train) + alpha * np.identity(n_features)
    # Solve the linear system directly instead of forming an explicit inverse:
    # same solution, but cheaper and numerically better behaved.
    return np.linalg.solve(regularised_gram, np.dot(X_train.T, y_train))

In [None]:
# Define the get_coeff_linear_normaleq function. Use the normal equation method.
# Return w values

def get_coeff_linear_normaleq(X_train, y_train):
    """Return least-squares weights via the normal equations.

    Computes ``pinv(X^T X) X^T y``. The pseudo-inverse is used for the Gram
    matrix, so a rank-deficient ``X_train`` still yields a solution.

    Args:
        X_train: 2-D array of training features, one row per example.
        y_train: Training targets, 1-D (n,) or 2-D (n, k).

    Returns:
        Weight array with one entry (or row) per column of ``X_train``.
    """
    X_t = X_train.T
    gram_pinv = np.linalg.pinv(X_t.dot(X_train))
    projector = gram_pinv.dot(X_t)
    return projector.dot(y_train)

In [None]:
# Define the evaluate_err function.
# Return the train_error and test_error values

def evaluate_err(X_train, X_test, y_train, y_test, w):
    """Return the mean squared error of weights ``w`` on the train and test sets.

    Predictions are the plain product ``X . w`` (no intercept is added here).

    Args:
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        y_train: Training targets, shaped like ``np.dot(X_train, w)``.
        y_test: Test targets, shaped like ``np.dot(X_test, w)``.
        w: Weight array.

    Returns:
        Tuple ``(train_error, test_error)`` of mean squared errors.
    """
    def _mean_squared_error(features, targets):
        # Average of the squared residuals between targets and predictions.
        residuals = targets - np.dot(features, w)
        return np.mean(residuals ** 2)

    return _mean_squared_error(X_train, y_train), _mean_squared_error(X_test, y_test)

In [None]:
# Finish writing the k_fold_cross_validation function. 
# Returns the average training error and average test error from the k-fold cross validation
# Sklearns K-Folds cross-validator: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html

def k_fold_cross_validation(k, X, y, alpha=None):
    """Run shuffled k-fold cross-validation for linear or ridge regression.

    For every fold the targets are centred on the training-fold mean (so no
    intercept term is needed), the features are standardised with statistics
    fitted on the training fold only, the weights are obtained from the normal
    equations, and the train/test mean squared errors are printed.

    Args:
        k: Number of folds passed to ``KFold``.
        X: 2-D feature array, indexable by integer index arrays.
        y: Target array with one entry (or row) per row of ``X``.
        alpha: Ridge regularisation strength. ``None`` (the default) selects
            un-regularised linear regression.

    Returns:
        Tuple ``(test_average, train_average)``: the test and training mean
        squared errors averaged over all folds. Note the order - test first -
        which matches how the callers unpack the result.
    """
    kf = KFold(n_splits=k, random_state=21, shuffle=True)
    # Decided once up front so the footer label never depends on loop state.
    model_name = "Linear" if alpha is None else "Ridge"
    train_errors = []
    test_errors = []
    print("*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*")
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Centre the targets on the training mean so no intercept is required.
        # Out-of-place subtraction also works for integer-typed targets,
        # where an in-place ``-=`` with a float mean would raise.
        y_train_mean = np.mean(y_train)
        y_train = y_train - y_train_mean
        y_test = y_test - y_train_mean

        # Standardise features using statistics from the training fold only.
        scaler = preprocessing.StandardScaler().fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

        if alpha is None:
            w = get_coeff_linear_normaleq(X_train, y_train)
        else:
            w = get_coeff_ridge_normaleq(X_train, y_train, alpha)

        # Evaluate once per fold and keep both errors.
        train_error, test_error = evaluate_err(X_train, X_test, y_train, y_test, w)
        train_errors.append(train_error)
        test_errors.append(test_error)
        print("<<Training Error>> {:.3f}".format(train_error), " <<Testing Error>> {:.3f}".format(test_error))

    train_average = np.mean(train_errors)
    test_average = np.mean(test_errors)
    print("\n<<Average Training Error>> {:.3f}".format(train_average), " <<Average Testing Error>> {:.3f}".format(test_average))
    print("*-*-*-*-*-*-*-*-*-*-* {} *-*-*-*-*-*-*-*-*-*-*\n".format(model_name))
    # Return the fold averages (previously the last fold's errors were returned).
    return test_average, train_average
    


In [None]:
# print the error for the both linear regression and ridge regression
# the error should include both training error and testing error

In [None]:
# test the various polynomial regressions (requirement 6) asked in the question, and the various regularization lambdas (requirement 4).

In [None]:
# LINEAR (original features): 10-fold cross-validation with alpha omitted,
# which selects the un-regularised solver inside k_fold_cross_validation.
# The returned pair is unpacked in (test, train) order, matching the function's return statement.
test, train = k_fold_cross_validation(10, X, y)

*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
<<Training Error>> 20.244  <<Testing Error>> 37.104
<<Training Error>> 22.585  <<Testing Error>> 17.124
<<Training Error>> 21.210  <<Testing Error>> 29.085
<<Training Error>> 22.104  <<Testing Error>> 20.751
<<Training Error>> 22.726  <<Testing Error>> 15.481
<<Training Error>> 22.086  <<Testing Error>> 21.169
<<Training Error>> 23.218  <<Testing Error>> 10.543
<<Training Error>> 21.192  <<Testing Error>> 29.642
<<Training Error>> 22.361  <<Testing Error>> 18.710
<<Training Error>> 20.335  <<Testing Error>> 36.751

<<Average Training Error>> 21.806  <<Average Testing Error>> 23.636
*-*-*-*-*-*-*-*-*-*-* Linear *-*-*-*-*-*-*-*-*-*-*



In [None]:
# RIDGE with different lambda values:
# sweep 13 log-spaced values from 1e1 to 1e7 and print each lambda
# before running its 10-fold cross-validation.
for i in np.logspace(1, 7, num = 13):
  print("lambda: ", i)
  k_fold_cross_validation(10, X, y, i)

##########################################
# Best model performance when lambda = 10 with the average training error of 21.893 and the average testing error of 23.689.

lambda:  10.0
*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
<<Training Error>> 20.332  <<Testing Error>> 37.032
<<Training Error>> 22.671  <<Testing Error>> 17.227
<<Training Error>> 21.307  <<Testing Error>> 28.983
<<Training Error>> 22.188  <<Testing Error>> 21.102
<<Training Error>> 22.810  <<Testing Error>> 15.281
<<Training Error>> 22.168  <<Testing Error>> 21.004
<<Training Error>> 23.311  <<Testing Error>> 10.152
<<Training Error>> 21.283  <<Testing Error>> 30.047
<<Training Error>> 22.441  <<Testing Error>> 18.600
<<Training Error>> 20.417  <<Testing Error>> 37.457

<<Average Training Error>> 21.893  <<Average Testing Error>> 23.689
*-*-*-*-*-*-*-*-*-*-* Ridge *-*-*-*-*-*-*-*-*-*-*

lambda:  31.622776601683793
*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
<<Training Error>> 20.733  <<Testing Error>> 37.184
<<Training Error>> 23.066  <<Testing Error>> 17.637
<<Training Error>> 21.725  <<Testing Error>> 29.418
<<Training Error>> 22.567  <<Testing Error>> 21.850
<<Trai

In [None]:
# LINEAR (degree = 2): 10-fold cross-validation on the polynomial features X_2
# with alpha omitted (un-regularised solver).
# The returned pair is unpacked in (test, train) order, matching the function's return statement.
test_2, train_2 = k_fold_cross_validation(10, X_2, y_2)

<<Training Error>> 4.893  <<Testing Error>> 19.750
<<Training Error>> 6.022  <<Testing Error>> 18.438
<<Training Error>> 6.067  <<Testing Error>> 8.606
<<Training Error>> 6.154  <<Testing Error>> 7.242
<<Training Error>> 6.035  <<Testing Error>> 8.004
<<Training Error>> 5.960  <<Testing Error>> 9.025
<<Training Error>> 5.969  <<Testing Error>> 7.999
<<Training Error>> 5.882  <<Testing Error>> 10.427
<<Training Error>> 5.594  <<Testing Error>> 13.269
<<Training Error>> 5.513  <<Testing Error>> 15.790

*-*-*-*-*-*-*-*-*-*-*-* Linear *-*-*-*-*-*-*-*-*-*-*-*

<<Average Training Error>> 5.809  <<Average Testing Error>> 11.855


In [None]:
# RIDGE (degree = 2) with different lambda values:
# sweep 13 log-spaced values from 1e1 to 1e7 on the polynomial features.
# The lambda label is printed BEFORE the cross-validation output it describes,
# consistent with the degree-1 sweep, so each result block is clearly labelled.
for i in np.logspace(1, 7, num = 13):
    print("lambda: ", i)
    k_fold_cross_validation(10, X_2, y_2, i)

##########################################
# Best model performance when lambda = 10 with the average training error of 10.049 and the average testing error of 13.476.

<<Training Error>> 9.184  <<Testing Error>> 21.366
<<Training Error>> 10.419  <<Testing Error>> 9.051
<<Training Error>> 10.066  <<Testing Error>> 14.244
<<Training Error>> 10.573  <<Testing Error>> 6.474
<<Training Error>> 10.001  <<Testing Error>> 16.944
<<Training Error>> 10.230  <<Testing Error>> 11.701
<<Training Error>> 10.424  <<Testing Error>> 7.712
<<Training Error>> 10.165  <<Testing Error>> 12.238
<<Training Error>> 10.149  <<Testing Error>> 10.188
<<Training Error>> 9.279  <<Testing Error>> 24.843

*-*-*-*-*-*-*-*-*-*-*-* Ridge *-*-*-*-*-*-*-*-*-*-*-*

<<Average Training Error>> 10.049  <<Average Testing Error>> 13.476
lambda:  10.0
<<Training Error>> 11.827  <<Testing Error>> 24.896
<<Training Error>> 13.135  <<Testing Error>> 10.934
<<Training Error>> 12.595  <<Testing Error>> 17.894
<<Training Error>> 13.391  <<Testing Error>> 8.274
<<Training Error>> 12.670  <<Testing Error>> 17.654
<<Training Error>> 12.852  <<Testing Error>> 14.925
<<Training Error>> 13.276  <<Testing

In [None]:
##########################################
################# REPORT #################
##########################################

# Based on the resulting average errors from each ridge and linear regression run,
# linear regression has the best model performance with the lowest training/testing errors when degree = 2
# (no regularization, so no lambda applies), with an average training error of 5.809 and an average testing error of 11.855.

# If I were given a choice of predicting future housing prices using one of the models, 
# I would choose to use a linear regression with polynomial transformation of degree 2
# because it produces both the lowest training and testing errors as presented.