In [1]:
import pandas as pd
import matplotlib as mat
import numpy as np
import seaborn as sns

from sklearn.linear_model import Lasso, LinearRegression, Ridge, ElasticNet # These are all the classifier we will use, we are going to compare the results
from sklearn.model_selection import train_test_split # this to split the data in trainibg and testing
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, PolynomialFeatures # this is for scaling and encoding the data and PolynomialFeatures is to generate a new feature matrix consisting of all polynomial combinations
from sklearn.metrics import mean_squared_error, r2_score # This is to obtain the score of the model
from sklearn.model_selection import GridSearchCV # this will be used to find the best parameter for each model


In [2]:
# Load the housing dataset used throughout the notebook and preview the first rows.
DATA_PATH = '/content/drive/MyDrive/Input/Housing.csv'  # Google Drive mount path; adjust when running elsewhere
data = pd.read_csv(DATA_PATH)
data.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [3]:
# One-hot encode every object-dtype (categorical) column.
# drop='first' removes the first category of each feature so the dummies are not perfectly collinear.
col = [name for name, dtype in data.dtypes.items() if dtype == "O"]
encoder = OneHotEncoder(drop='first', dtype="int", sparse_output=False)

dummy_values = encoder.fit_transform(data[col])
dummy_frame = pd.DataFrame(dummy_values, columns=encoder.get_feature_names_out())

# Numeric columns are kept as-is; the encoded dummies are appended to their right.
encoded_data = pd.concat([data.drop(columns=col), dummy_frame], axis=1)

In [4]:
# Names of the non-object (numeric) columns of the raw data; these are the columns scaled later.
# Note that the list is built from every column of `data`, so a numeric target column is included too.
num_col = [name for name, dtype in data.dtypes.items() if dtype != 'O']

In [5]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
train, test = train_test_split(
    encoded_data,
    test_size=0.3,
    random_state=50,
)

In [6]:
# Min-max scale the numeric columns so they share a common range.
# The scaler learns its min/max from the training rows only and is then applied
# unchanged to the test rows, so no test information leaks into the fit.
scaler = MinMaxScaler()
scaler.fit(train[num_col])
train[num_col] = scaler.transform(train[num_col])
test[num_col] = scaler.transform(test[num_col])

In [7]:
# Separate the dependent variable (price) from the independent variables.
# Training partition
x_train, y_train = train.drop(columns=['price']), train['price']

# Testing partition
x_test, y_test = test.drop(columns=['price']), test['price']

1 - Regularized Regression
* Ridge Regression (L2 penalty)
* Lasso Regression (L1 penalty)
* Elastic Net (combination of L1 & L2)

2 - Polynomial Regression -
Captures non-linearity by adding polynomial terms.

Lasso - Lasso uses an L1 penalty.
It adds a penalty equal to the absolute value of the coefficients and can shrink some coefficients completely to zero, effectively selecting only the most important features.

Use Lasso if you wish to get rid of some unimportant columns and keep only the important ones.

In [8]:
# Tune Lasso with a 5-fold cross-validated grid search over the penalty strength
# (alpha) and the coordinate update order (selection).
# NOTE(review): the recorded run picked alpha=0.01, the smallest value in this grid,
# so the optimum may lie below the searched range - consider adding smaller alphas.
ls_para = {'alpha' : [0.01, 0.1, 0.2, 0.5, 1, 2, 3, 4, 5, 6, 7, 9, 10],
        'selection' : ['cyclic', 'random']}
ls_hy = GridSearchCV(estimator = Lasso(random_state = 50), param_grid= ls_para, cv = 5, n_jobs = -1)
ls_hy.fit(x_train, y_train)

# best_score_ is the mean cross-validated score (R^2 for regressors) of the best candidate.
print("The best parameters for Lasso regression on this data is", ls_hy.best_params_)
print("The score for the best parameters is ", round(ls_hy.best_score_*100, 3))

# GridSearchCV (refit=True by default) has already refit the best configuration on the
# whole training set, so reuse that estimator instead of training an identical model again.
ls = ls_hy.best_estimator_
ls_y_pred = ls.predict(x_test)

# Evaluate on both partitions: R^2 on train/test and MSE on test (in scaled price units).
ls_train_score = ls.score(x_train, y_train)
ls_test_score = r2_score(y_test, ls_y_pred)
ls_mse = round(mean_squared_error(y_test, ls_y_pred), 3)

The best parameters for Lasso regression on this data is {'alpha': 0.01, 'selection': 'cyclic'}
The score for the best parameters is  40.25


Ridge uses an L2 penalty: it adds a penalty equal to the square of the magnitude of the coefficients.
It doesn't shrink coefficients to zero, but it makes them smaller and reduces the impact of less important features.



In [9]:
# Tune Ridge with a 5-fold cross-validated grid search over the penalty strength (alpha).
ri_para = {'alpha' : [0.01, 0.1, 0.2, 0.5, 1, 2, 3, 4, 5, 6, 9, 10, 15]}
ri_hy = GridSearchCV(Ridge(random_state = 50), ri_para, cv = 5, n_jobs = -1)
ri_hy.fit(x_train, y_train)

# best_score_ is the mean cross-validated score (R^2 for regressors) of the best candidate.
print("The best parameters for Ridge regression on this data is", ri_hy.best_params_)
print("The score for the best parameters is ", round(ri_hy.best_score_*100, 3))

# GridSearchCV (refit=True by default) has already refit the best configuration on the
# whole training set, so reuse that estimator instead of training an identical model again.
ri = ri_hy.best_estimator_
ri_y_pred = ri.predict(x_test)

# Evaluate on both partitions: R^2 on train/test and MSE on test (in scaled price units).
ri_train_score = ri.score(x_train, y_train)
ri_test_score = r2_score(y_test, ri_y_pred)
ri_mse = round(mean_squared_error(y_test, ri_y_pred), 3)


The best parameters for Ridge regression on this data is {'alpha': 1}
The score for the best parameters is  61.668


Elastic Net Regression
Combines Ridge and Lasso by applying both L1 & L2 penalties.
It is useful when there are many correlated variables and when Ridge and Lasso on their own are not enough.

In [10]:
# Tune ElasticNet with a 5-fold cross-validated grid search over the penalty strength
# (alpha) and the coordinate update order (selection). l1_ratio is not searched, so the
# estimator's default L1/L2 mix is used.
# NOTE(review): the recorded run picked alpha=0.01, the smallest value in this grid,
# so the optimum may lie below the searched range - consider adding smaller alphas.
en_para = {'alpha' : [0.01, 0.1, 0.2, 0.5, 1, 2, 3, 4, 5, 6, 7, 9, 10],
        'selection' : ['cyclic', 'random']}
en_hy = GridSearchCV(ElasticNet(random_state = 50), en_para, cv = 5, n_jobs= -1)
en_hy.fit(x_train, y_train)

# best_score_ is the mean cross-validated score (R^2 for regressors) of the best candidate.
print("The best parameters for ElasticNet regression on this data :", en_hy.best_params_)
print("The score for the best parameters is ", round(en_hy.best_score_*100, 3))

# GridSearchCV (refit=True by default) has already refit the best configuration on the
# whole training set, so reuse that estimator instead of training an identical model again.
en = en_hy.best_estimator_
en_y_pred = en.predict(x_test)

# Evaluate on both partitions: R^2 on train/test and MSE on test (in scaled price units).
en_train_score = en.score(x_train, y_train)
en_test_score = r2_score(y_test, en_y_pred)
en_mse = round(mean_squared_error(y_test, en_y_pred), 3)


The best parameters for ElasticNet regression on this data : {'alpha': 0.01, 'selection': 'random'}
The score for the best parameters is  53.463


Now, let's compare the scores of all 3 models.


In [11]:
# Side-by-side comparison of the three regularized models: one row per model,
# with train/test R^2 and the test-set mean squared error.
result = pd.DataFrame(
    [
        ("Lasso", ls_train_score, ls_test_score, ls_mse),
        ("Ridge", ri_train_score, ri_test_score, ri_mse),
        ("ElasticNet", en_train_score, en_test_score, en_mse),
    ],
    columns=["Models", "Training_score", "Testing_score", "Mean_squared_error"],
)
result

Unnamed: 0,Models,Training_score,Testing_score,Mean_squared_error
0,Lasso,0.445791,0.453936,0.015
1,Ridge,0.670938,0.687742,0.008
2,ElasticNet,0.582632,0.598462,0.011


Checking the coefficients of all the models


In [12]:
# Lasso: list the features whose coefficient was shrunk exactly to zero,
# i.e. the columns the model treats as not important.
feature = x_train.columns
ls_columns_coef = ls.coef_
ls_zero_mask = ls_columns_coef == 0
ls_not_imp_features = feature[ls_zero_mask]
print(ls_not_imp_features)

Index(['area', 'bedrooms', 'guestroom_yes', 'basement_yes',
       'hotwaterheating_yes', 'furnishingstatus_semi-furnished'],
      dtype='object')


In [13]:
# Ridge: same check as for Lasso. Ridge shrinks coefficients but does not drive
# them exactly to zero, so this is expected to print an empty Index.
feature = x_train.columns
ri_columns_coef = ri.coef_
ri_zero_mask = ri_columns_coef == 0
ri_not_imp_features = feature[ri_zero_mask]
print(ri_not_imp_features)

Index([], dtype='object')


In [14]:
# Elastic Net: list the features whose coefficient is exactly zero.
feature = x_train.columns
en_columns_coef = en.coef_
en_zero_mask = en_columns_coef == 0
en_not_imp_features = feature[en_zero_mask]
print(en_not_imp_features)

Index(['bedrooms', 'hotwaterheating_yes', 'furnishingstatus_semi-furnished'], dtype='object')


# Polynomial regression

PolynomialFeatures(degree=2, *, interaction_only=False, include_bias=True, order='C')

Here, `degree` controls the highest polynomial degree to generate. If `interaction_only` is True, only interaction terms (products of distinct features) are generated; if it is False, all polynomial terms up to the specified degree are generated, including powers of individual features. Use True if you wish to avoid multicollinearity. `include_bias` adds a constant column to the output: if you are going to fit an OLS model on the polynomial data, set include_bias = True; if you use Lasso, Ridge, ElasticNet or LinearRegression, you do not need this bias column, as these models can add an intercept on their own.


In [17]:
# Lasso on polynomial feature expansions of increasing degree.
# Everything created for the polynomial models carries a `poly_` prefix so that the
# objects of the plain Lasso cell (`ls`, `ls_hy`, `ls_para`, `ls_y_pred`, `ls_*` scores)
# are not overwritten; the comparison table and the coefficient cell can therefore be
# re-run after this cell and still refer to the original Lasso model.
degree = [2,3,4]

# Search space for the grid search; it does not depend on the degree, so build it once.
# NOTE(review): the recorded runs always picked alpha=0.01, the smallest value in this
# grid - consider adding smaller alphas.
poly_ls_para = {'alpha' : [0.01, 0.1, 0.2, 0.5, 1, 2, 3, 4, 5, 6, 7, 9, 10],
        'selection' : ['cyclic', 'random']}

poly_records = [] # one [degree, train R^2, test R^2, test MSE] row per degree
for i in degree:
  # NOTE(review): include_bias is left at its default (True), which adds a constant
  # column even though Lasso fits its own intercept - confirm whether that is intended.
  poly_mod = PolynomialFeatures(degree=i)
  poly_x_train = pd.DataFrame(poly_mod.fit_transform(x_train), columns = poly_mod.get_feature_names_out(input_features = x_train.columns))
  poly_x_test = pd.DataFrame(poly_mod.transform(x_test),  columns = poly_mod.get_feature_names_out(input_features = x_test.columns))

  print((poly_x_train.shape)) # to check the number of rows and columns in the polynomial data set

  # Using GridSearchCV to find the best parameters for the model
  poly_ls_hy = GridSearchCV(estimator = Lasso(random_state = 50), param_grid= poly_ls_para, cv = 5, n_jobs = -1)
  poly_ls_hy.fit(poly_x_train, y_train)

  print("The best parameters for Lasso regression on this data is", poly_ls_hy.best_params_)
  print("The score for the best parameters is ", round(poly_ls_hy.best_score_*100, 3))

  # GridSearchCV (refit=True by default) has already refit the best configuration on the
  # full polynomial training set, so reuse it instead of training the same model again.
  poly_ls = poly_ls_hy.best_estimator_
  poly_ls_y_pred = poly_ls.predict(poly_x_test)

  poly_ls_train_score = poly_ls.score(poly_x_train, y_train)
  poly_ls_test_score = r2_score(y_test, poly_ls_y_pred)
  poly_ls_mse = round(mean_squared_error(y_test, poly_ls_y_pred), 3)

  poly_records.append([i, poly_ls_train_score, poly_ls_test_score, poly_ls_mse])

# Build the summary frame once from the collected rows; this keeps Degree as an integer
# column instead of coercing it to float through row-by-row .loc appends.
result_df = pd.DataFrame(poly_records, columns = ["Degree", 'Train Score', 'Test Score', "MSE"])
print(result_df)


(381, 105)
The best parameters for Lasso regression on this data is {'alpha': 0.01, 'selection': 'random'}
The score for the best parameters is  42.936
(381, 560)
The best parameters for Lasso regression on this data is {'alpha': 0.01, 'selection': 'random'}
The score for the best parameters is  42.936
(381, 2380)
The best parameters for Lasso regression on this data is {'alpha': 0.01, 'selection': 'random'}
The score for the best parameters is  42.933
   Degree  Train Score  Test Score    MSE
0     2.0     0.467216    0.473932  0.014
1     3.0     0.467237    0.473945  0.014
2     4.0     0.467206    0.473922  0.014
