Data Science Project

BCS-7A

Predicting House Prices in the USA Based on Real Estate Data

GROUP 3

Faraz Majid 20L-1162

Aemon Fatima 20L-1057

Ahmad Abdullah Dhami 20L-1226

In [16]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler 
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import joblib

In [4]:
# Load the preprocessed dataset from the project's data directory
data_path = './data/preprocessed.csv'
df = pd.read_csv(data_path)

In [5]:
# Append a constant column of ones named 'bias' (it is selected as a feature in the next cell).
# NOTE(review): this column looks redundant -- StandardScaler centres a constant column to
# zeros and Ridge fits its own intercept by default -- confirm before removing, because the
# feature selection below refers to 'bias' by name.
df = df.assign(bias=1)

In [6]:
# Separate the model inputs (X) from the prediction target (y).
# The order of this list fixes the column order of the feature matrix built from X.
feature_columns = [
    'bias',
    'bedrooms',
    'bathrooms',
    'sqft_living',
    'sqft_lot',
    'floors',
    'waterfront',
    'view',
    'condition',
    'sqft_above',
    'sqft_basement',
    'yr_built',
    'yr_renovated',
]
target_column = 'price'

X = df[feature_columns]
y = df[target_column]

In [7]:
# Turn the pandas feature frame and target series into plain NumPy arrays
X_np, y_np = X.to_numpy(), y.to_numpy()

In [8]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible
test_fraction = 0.2
split_seed = 3372

X_train, X_test, y_train, y_test = train_test_split(
    X_np,
    y_np,
    test_size=test_fraction,
    random_state=split_seed,
)

In [9]:
# Fit the scaler on the training rows only, then apply that same fitted transform
# to both splits so the test rows never contribute to the scaling statistics.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [10]:
# Base Ridge estimator handed to the grid search below. alpha=1.0 is the library
# default, written out here; the search overrides it with each candidate value.
ridge = Ridge(alpha=1.0)

In [11]:
# Candidate regularization strengths: 13 log-spaced values from 1e-6 up to 1e6 (one per decade)
alpha_candidates = np.logspace(start=-6, stop=6, num=13)
param_grid = {'alpha': alpha_candidates}

In [12]:
# 5-fold cross-validated search over the alpha grid, ranking candidates by R^2.
# NOTE(review): the features were standardized on the whole training set before this
# search, so every validation fold contributed to the scaling statistics; wrapping the
# scaler and Ridge in a Pipeline would avoid that -- confirm whether it matters here.
grid_search = GridSearchCV(
    estimator=ridge,
    param_grid=param_grid,
    scoring='r2',
    cv=5,
)
grid_search.fit(X_train_scaled, y_train)

In [13]:
# Pull the winning regularization strength out of the search results
best_params = grid_search.best_params_
best_alpha = best_params['alpha']

In [14]:
# Reuse the estimator that GridSearchCV already refit on the full (scaled) training set
# with the best alpha (refit=True is the default), instead of training an identical
# Ridge a second time. This also guarantees the model is the one the search selected.
best_model = grid_search.best_estimator_

In [15]:
# Score the tuned model on the held-out test split
y_pred = best_model.predict(X_test_scaled)

# Evaluation metrics for the best model; RMSE is the square root of MSE
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Assemble the report first, then emit it in one go (one line per metric)
report_lines = [
    "Best Ridge Regression Metrics:",
    f'R2 Score: {r2}',
    f'Mean Absolute Error: {mae}',
    f'Mean Squared Error: {mse}',
    f'Root Mean Squared Error: {rmse}',
    f'Best Alpha (Regularization Strength): {best_alpha}',
]
print("\n".join(report_lines))

Best Ridge Regression Metrics:
R2 Score: 0.7832052677738679
Mean Absolute Error: 126099.32389108541
Mean Squared Error: 31561219380.04553
Root Mean Squared Error: 177654.77584361623
Best Alpha (Regularization Strength): 10.0


In [17]:
# Persist the fitted scaler next to the model: the model was trained on standardized
# features, so anything that reloads it in a fresh session needs this same scaler to
# transform new inputs before calling predict.
joblib.dump(scaler, 'ridge_regression_scaler.joblib')

# Saving the model to a file (kept as the last expression so the cell still
# displays the model's path)
joblib.dump(best_model, 'ridge_regression_model.joblib')

['ridge_regression_model.joblib']

In [18]:
# Restore the persisted Ridge model from disk.
# joblib files are pickle-based, so only load files that come from a trusted source.
model_path = 'ridge_regression_model.joblib'
loaded_model = joblib.load(model_path)

In [19]:
# Example: price one hand-written house with the reloaded model.
# The values are listed in the same order as the training feature columns.
example_features = {
    'bias': 1,
    'bedrooms': 3,
    'bathrooms': 2,
    'sqft_living': 2000,
    'sqft_lot': 5000,
    'floors': 1,
    'waterfront': 0,
    'view': 1,
    'condition': 3,
    'sqft_above': 1500,
    'sqft_basement': 500,
    'yr_built': 1990,
    'yr_renovated': 2010,
}

# A single row shaped (1, n_features), as the scaler and the model expect 2-D input
new_data_point = np.array([list(example_features.values())])

# Apply the scaler that was fit on the training data before predicting
new_data_point_scaled = scaler.transform(new_data_point)
prediction = loaded_model.predict(new_data_point_scaled)
print(f'Predicted Price: {prediction[0]}')

Predicted Price: 584143.0450036429
