In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Load dataset (update path if needed)
data = pd.read_csv("House Price Prediction Dataset.csv")

print(data.shape)
data.head()

(2000, 10)


Unnamed: 0,Id,Area,Bedrooms,Bathrooms,Floors,YearBuilt,Location,Condition,Garage,Price
0,1,1360,5,4,3,1970,Downtown,Excellent,No,149919
1,2,4272,5,4,3,1958,Downtown,Excellent,No,424998
2,3,3592,2,2,3,1938,Downtown,Good,No,266746
3,4,966,4,2,2,1902,Suburban,Fair,Yes,244020
4,5,4926,1,4,2,1975,Downtown,Fair,Yes,636056


In [4]:
# Drop columns with more than 40% missing values
threshold = 0.4
data = data.loc[:, data.isnull().mean() < threshold]

# Separate target
X = data.drop("Price", axis=1)
y = data["Price"]

# Fill missing values
X = X.fillna(X.median(numeric_only=True))
X = X.fillna("None")

# One-hot encoding
X = pd.get_dummies(X, drop_first=True)

print(X.shape)

(2000, 13)


In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [6]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [7]:
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)

y_pred_lr = lr.predict(X_test_scaled)

print("Multiple Linear Regression")
print("MSE:", mean_squared_error(y_test, y_pred_lr))
print("R2 Score:", r2_score(y_test, y_pred_lr))

Multiple Linear Regression
MSE: 78279764120.86243
R2 Score: -0.006181784611834162


In [8]:
ridge = Ridge(alpha=1.0)
ridge.fit(X_train_scaled, y_train)

y_pred_ridge = ridge.predict(X_test_scaled)

print("\nRidge Regression")
print("MSE:", mean_squared_error(y_test, y_pred_ridge))
print("R2 Score:", r2_score(y_test, y_pred_ridge))


Ridge Regression
MSE: 78279030968.43915
R2 Score: -0.006172360916937736


In [9]:
lasso = Lasso(alpha=0.001, max_iter=10000)
lasso.fit(X_train_scaled, y_train)

y_pred_lasso = lasso.predict(X_test_scaled)

print("\nLasso Regression")
print("MSE:", mean_squared_error(y_test, y_pred_lasso))
print("R2 Score:", r2_score(y_test, y_pred_lasso))


Lasso Regression
MSE: 78279763978.74977
R2 Score: -0.006181782785166012


In [10]:
lasso_coefficients = pd.Series(
    lasso.coef_, index=X.columns
)

selected_features = lasso_coefficients[lasso_coefficients != 0]

print("Total features:", len(X.columns))
print("Selected features by Lasso:", len(selected_features))

Total features: 13
Selected features by Lasso: 13


In [11]:
from sklearn.linear_model import RidgeCV, LassoCV

ridge_cv = RidgeCV(alphas=[0.1, 1, 10, 50])
ridge_cv.fit(X_train_scaled, y_train)

lasso_cv = LassoCV(alphas=[0.0005, 0.001, 0.01], max_iter=10000)
lasso_cv.fit(X_train_scaled, y_train)

print("Best Ridge alpha:", ridge_cv.alpha_)
print("Best Lasso alpha:", lasso_cv.alpha_)

Best Ridge alpha: 50.0
Best Lasso alpha: 0.01
