In [1]:
# pandas provides the DataFrame used to hold the tabular housing data.
import pandas as pd

# Location of the Boston housing CSV.
# NOTE: this is an absolute path on the original author's machine; point it at your own
# copy of boston.csv before running the notebook anywhere else.
BOSTON_CSV_PATH = r"C:\Users\MSI-NB\Desktop\Machine_Learning_Projects\Boston_Housing_Analysis\boston.csv"

# Load the CSV into a DataFrame.
boston = pd.read_csv(BOSTON_CSV_PATH)

In [2]:
# Print a schema summary of the DataFrame: row count, column names, non-null counts, dtypes and memory usage.
# Used here to check for missing values and to see which columns are numeric.
boston.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 16 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   TOWN     506 non-null    object 
 1   TRACT    506 non-null    int64  
 2   LON      506 non-null    float64
 3   LAT      506 non-null    float64
 4   MEDV     506 non-null    float64
 5   CRIM     506 non-null    float64
 6   ZN       506 non-null    float64
 7   INDUS    506 non-null    float64
 8   CHAS     506 non-null    int64  
 9   NOX      506 non-null    float64
 10  RM       506 non-null    float64
 11  AGE      506 non-null    float64
 12  DIS      506 non-null    float64
 13  RAD      506 non-null    int64  
 14  TAX      506 non-null    int64  
 15  PTRATIO  506 non-null    float64
dtypes: float64(11), int64(4), object(1)
memory usage: 63.4+ KB


In [3]:
# mglearn is a helper package that provides datasets and visualization tools.
# NOTE: `import mglearn.datasets` already binds the top-level `mglearn` name, so the first import is redundant but harmless.
import mglearn
import mglearn.datasets

# Load the "extended" Boston housing data as a feature matrix `x` and a target vector `y`.
# It has many more feature columns than the CSV loaded above (see the shape printed in the next cell) —
# presumably the original features plus derived interaction terms; confirm against the mglearn source.
# NOTE(review): every model below is trained on this data, not on the `boston` DataFrame read from the CSV.
x, y = mglearn.datasets.load_extended_boston()

In [4]:
# Sanity-check the dimensions of the feature matrix as (n_samples, n_features).
print(x.shape)

(506, 104)


In [5]:
# train_test_split shuffles the rows and partitions them into a training part and a held-out test part.
from sklearn.model_selection import train_test_split

# No test_size/train_size is passed, so scikit-learn's default applies (25% of the rows go to the test set).
# random_state=0 fixes the shuffle so the split, and every score computed from it, is reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)

**RIDGE REGRESSION**

In [6]:
# Ridge regression: linear least squares with an L2 penalty that shrinks the coefficients towards zero.
from sklearn.linear_model import Ridge

# Baseline model. alpha=1.0 is Ridge's default penalty strength; it is spelled out here so it is easy
# to compare with the alpha=10 and alpha=0.1 variants fitted further down.
ridge = Ridge(alpha=1.0).fit(x_train, y_train)

In [7]:
# For regressors, .score() returns the coefficient of determination R^2 (1.0 is a perfect fit), not a classification accuracy.
# Compare the two numbers: a training R^2 well above the test R^2 points to overfitting.
print(ridge.score(x_train, y_train))  # R^2 on the data the model was fitted on
print(ridge.score(x_test, y_test))  # R^2 on held-out data; the gap to the line above measures overfitting

0.885796658517094
0.752768348174475


In [8]:
# Stronger regularization: alpha=10 penalises large coefficients more heavily than the default alpha=1.0.
# This gives a simpler model that is less prone to overfitting, but it underfits if the penalty is too strong.
ridge10 = Ridge(alpha=10).fit(x_train, y_train)

# R^2 on training and test data for the alpha=10 model.
# In the recorded run both scores are lower than with alpha=1.0, i.e. this penalty is too strong for this data.
print(ridge10.score(x_train, y_train))  # training R^2
print(ridge10.score(x_test, y_test))  # test R^2

0.7882787115369614
0.6359411489177309


In [9]:
# Weaker regularization: alpha=0.1 lets the coefficients grow larger, so the model can follow the training data more closely.
# The risk is the opposite of alpha=10: with too little penalty the model starts to overfit.
ridge01 = Ridge(alpha=0.1).fit(x_train, y_train)

# R^2 on training and test data for the alpha=0.1 model.
# In the recorded run both scores are higher than with alpha=1.0, making this the best of the three Ridge settings tried.
print(ridge01.score(x_train, y_train))  # training R^2
print(ridge01.score(x_test, y_test))  # test R^2

0.9282273685001988
0.7722067936479631


**LASSO REGRESSION**

In [10]:
# **LASSO REGRESSION**: Using L1 regularization for linear regression, which can drive some coefficients to zero.
from sklearn.linear_model import Lasso

In [11]:
# Baseline Lasso model. alpha=1.0 is Lasso's default penalty strength, written out explicitly for comparison
# with the smaller alphas tried below. Unlike Ridge's L2 penalty, the L1 penalty can set coefficients exactly to zero.
lasso = Lasso(alpha=1.0).fit(x_train, y_train)

In [12]:
# .score() returns R^2 for regressors. In the recorded run both values are low (about 0.29 and 0.21),
# which indicates underfitting (the default alpha=1.0 is too strong a penalty here), not overfitting.
print(lasso.score(x_train, y_train))  # training R^2
print(lasso.score(x_test, y_test))  # test R^2

0.29323768991114607
0.20937503255272294


In [13]:
import numpy as np

# Number of features the Lasso model actually uses: a coefficient of exactly zero
# means the corresponding feature has no effect on the prediction.
used_features = lasso.coef_ != 0
np.sum(used_features)

np.int64(4)

In [14]:
# Weaker regularization: alpha=0.01 relaxes the L1 penalty, allowing a more complex model.
# max_iter is raised well above scikit-learn's default of 1000, presumably so the solver converges at this small alpha.
lasso001 = Lasso(alpha=0.01, max_iter=100000).fit(x_train, y_train)

# R^2 on training and test data for the alpha=0.01 model.
# In the recorded run both are far higher than with alpha=1.0, so relaxing the penalty fixed the underfitting.
print(lasso001.score(x_train, y_train))  # training R^2
print(lasso001.score(x_test, y_test))  # test R^2

0.8962226511086497
0.7656571174549982


In [15]:
# Even weaker regularization: alpha=0.0001 removes almost all of the L1 penalty.
# The model can then fit the training data very closely, which risks overfitting.
lasso00001 = Lasso(alpha=0.0001, max_iter=100000).fit(x_train, y_train)

# R^2 on training and test data for the alpha=0.0001 model.
# In the recorded run the training R^2 rises while the test R^2 drops compared with alpha=0.01: the signature of overfitting.
print(lasso00001.score(x_train, y_train))  # training R^2
print(lasso00001.score(x_test, y_test))  # test R^2

0.9507158754515463
0.6437467421272821
