In [1]:
# Import the Utility Functions
import pandas as pd
import numpy as np
import mls_utils as utils

df = pd.read_csv("Clean_Data/mls_sfr_cleaned.csv")
df.head()

Unnamed: 0,MLS #,City,Zip,Bedrooms,Total Baths,SqFt,Acres,Year Built,List Date,Closing Date,List Price,Sold Price,Days on Market,Over Asking,Lat,Lon,Cluster,Price per SqFt
0,2372641,Raleigh,27607,5,8,9376,2,2022,2021-03-16,2022-02-23,1850000,324999,344,-1525001,35.8014,-78.6877,2,34.662863
1,2410217,Whitsett,27377,5,7,6983,8,1883,2021-09-27,2023-01-09,1300000,3240000,469,1940000,36.033,-79.5972,1,463.983961
2,2375877,Wake Forest,27587,5,5,7200,7,2005,2021-04-05,2021-05-27,1800000,2900000,52,1100000,35.9815,-78.5392,2,402.777778
3,2378933,Raleigh,27615,5,7,6289,3,2022,2021-04-20,2022-09-28,2550000,3577591,526,1027591,35.8887,-78.6393,2,568.864843
4,2495996,Apex,27502,4,6,7266,5,2023,2023-02-21,2023-11-21,1755900,2598548,273,842648,35.7225,-78.8408,4,357.631159


In [2]:
#Select features and target variables
features = ['Total Baths', 'Bedrooms', 'Year Built', 'Acres', 'Zip', 'Lat', 'Lon', 'Days on Market', 'Over Asking', 'SqFt', 'Cluster'] # Dropping List Price as it is a direct predictor of Sold Price
target = 'Sold Price'

In [3]:
# Split the data
X_train, X_test, y_train, y_test = utils.split_data(df, features, target)

# Scale the data
X_train_scaled, X_test_scaled = utils.scale_data(X_train, X_test)

In [4]:
# Feature selection
selected_features = utils.select_features(X_train_scaled, y_train)
X_train_selected = X_train_scaled[:, selected_features]
X_test_selected = X_test_scaled[:, selected_features]

In [5]:
# Ridge Regression
ridge_model = utils.tune_hyperparameters(X_train_selected, y_train, 'ridge')
ridge_train_mse, ridge_test_mse, ridge_train_r2, ridge_test_r2 = utils.evaluate_model(ridge_model, X_train_selected, X_test_selected, y_train, y_test)

print(f'Ridge Regression - Train MSE: {ridge_train_mse}')
print(f'Ridge Regression - Test MSE: {ridge_test_mse}')
print(f'Ridge Regression - Train R2: {ridge_train_r2}')
print(f'Ridge Regression - Test R2: {ridge_test_r2}')

Ridge Regression - Train MSE: 33506226646.672127, Test MSE: 31129120337.533405
Ridge Regression - Train R2: 0.6188316163995848, Test R2: 0.6335148152426853


In [6]:
# Lasso Regression
lasso_model = utils.tune_hyperparameters(X_train_selected, y_train, 'lasso')
lasso_train_mse, lasso_test_mse, lasso_train_r2, lasso_test_r2 = utils.evaluate_model(lasso_model, X_train_selected, X_test_selected, y_train, y_test)

print(f'Lasso Regression - Train MSE: {lasso_train_mse}')
print(f'Lasso Regression - Test MSE: {lasso_test_mse}')
print(f'Lasso Regression - Train R2: {lasso_train_r2}')
print(f'Lasso Regression - Test R2: {lasso_test_r2}')


Lasso Regression - Train MSE: 33506226247.64209, Test MSE: 31128971052.934685
Lasso Regression - Train R2: 0.618831620938969, Test R2: 0.633516572780102


In [7]:
# Decision Tree Regressor
dt_model = utils.tune_hyperparameters(X_train_selected, y_train, 'decision_tree')
dt_train_mse, dt_test_mse, dt_train_r2, dt_test_r2 = utils.evaluate_model(dt_model, X_train_selected, X_test_selected, y_train, y_test)

print(f'Decision Tree Regressor - Train MSE: {dt_train_mse}')
print(f'Decision Tree Regressor - Test MSE: {dt_test_mse}')
print(f'Decision Tree Regressor - Train R2: {dt_train_r2}')
print(f'Decision Tree Regressor - Test R2: {dt_test_r2}')

Decision Tree Regressor - Train MSE: 11624255842.348106, Test MSE: 19991821990.906593
Decision Tree Regressor - Train R2: 0.8677619280526898, Test R2: 0.7646349624875619


In [8]:
# Random Forest Regressor
rf_model = utils.tune_hyperparameters(X_train_selected, y_train, 'random_forest')
rf_train_mse, rf_test_mse, rf_train_r2, rf_test_r2 = utils.evaluate_model(rf_model, X_train_selected, X_test_selected, y_train, y_test)

print(f'Random Forest Regressor - Train MSE: {rf_train_mse}, Test MSE: {rf_test_mse}')
print(f'Random Forest Regressor - Train R2: {rf_train_r2}, Test R2: {rf_test_r2}')

In [None]:
# Support Vector Regressor
svr_model = utils.tune_hyperparameters(X_train_selected, y_train, 'svm')
svr_train_mse, svr_test_mse, svr_train_r2, svr_test_r2 = utils.evaluate_model(svr_model, X_train_selected, X_test_selected, y_train, y_test)

print(f'Support Vector Regressor - Train MSE: {svr_train_mse}, Test MSE: {svr_test_mse}')
print(f'Support Vector Regressor - Train R2: {svr_train_r2}, Test R2: {svr_test_r2}')



In [None]:
# K-Nearest Neighbors Regressor
knn_model = utils.tune_hyperparameters(X_train_selected, y_train, 'knn')
knn_train_mse, knn_test_mse, knn_train_r2, knn_test_r2 = utils.evaluate_model(knn_model, X_train_selected, X_test_selected, y_train, y_test)

print(f'K-Nearest Neighbors Regressor - Train MSE: {knn_train_mse}, Test MSE: {knn_test_mse}')
print(f'K-Nearest Neighbors Regressor - Train R2: {knn_train_r2}, Test R2: {knn_test_r2}')
