In [2]:
# Import the Utility Functions
import pandas as pd
import numpy as np
import mls_utils as utils

df = pd.read_csv("Clean_Data/mls_th_cleaned.csv")
df.head()



Unnamed: 0,MLS #,City,Zip,Bedrooms,Total Baths,SqFt,Acres,Year Built,List Date,Closing Date,List Price,Sold Price,Days on Market,Over Asking,Lat,Lon,Cluster,Price per SqFt
0,2509707,Cary,27511,4,4,6115,1,1985,2023-05-08,2023-06-12,1049900,1275000,35,225100,35.7641,-78.7786,2,208.503679
1,2444544,Chapel Hill,27517,4,5,4049,1,2004,2022-04-25,2022-06-07,1090000,1300000,43,210000,35.9182,-79.0035,3,321.06693
2,2320632,Durham,27707,3,5,2763,1,2020,2020-05-22,2021-08-05,575000,729840,440,154840,35.9631,-78.9315,3,264.147666
3,2428221,Cary,27511,3,4,3477,1,2004,2022-01-24,2022-02-09,949900,1100000,16,150100,35.7641,-78.7786,2,316.364682
4,10018970,Chapel Hill,27517,4,5,4049,1,2004,2024-04-06,2024-04-17,1350000,1500000,11,150000,35.9182,-79.0035,3,370.461842


In [3]:
#Select features and target variables
features = ['Total Baths', 'Bedrooms', 'Year Built', 'Acres', 'Zip', 'Lat', 'Lon', 'Days on Market', 'Over Asking', 'SqFt', 'Cluster'] # Dropping List Price as it is a direct predictor of Sold Price
target = 'Sold Price'

In [4]:
# Split the data
X_train, X_test, y_train, y_test = utils.split_data(df, features, target)

# Scale the data
X_train_scaled, X_test_scaled = utils.scale_data(X_train, X_test)

In [5]:
# Feature selection
selected_features = utils.select_features(X_train_scaled, y_train)
X_train_selected = X_train_scaled[:, selected_features]
X_test_selected = X_test_scaled[:, selected_features]

In [6]:
# Ridge Regression
ridge_model = utils.tune_hyperparameters(X_train_selected, y_train, 'ridge')
ridge_train_mse, ridge_test_mse, ridge_train_r2, ridge_test_r2 = utils.evaluate_model(ridge_model, X_train_selected, X_test_selected, y_train, y_test)

print(f'Ridge Regression - Train MSE: {ridge_train_mse}')
print(f'Ridge Regression - Test MSE: {ridge_test_mse}')
print(f'Ridge Regression - Train R2: {ridge_train_r2}')
print(f'Ridge Regression - Test R2: {ridge_test_r2}')

Ridge Regression - Train MSE: 6042253787.309913
Ridge Regression - Test MSE: 7040002937.247601
Ridge Regression - Train R2: 0.555535786576741
Ridge Regression - Test R2: 0.561488204594756


In [7]:
# Lasso Regression
lasso_model = utils.tune_hyperparameters(X_train_selected, y_train, 'lasso')
lasso_train_mse, lasso_test_mse, lasso_train_r2, lasso_test_r2 = utils.evaluate_model(lasso_model, X_train_selected, X_test_selected, y_train, y_test)

print(f'Lasso Regression - Train MSE: {lasso_train_mse}')
print(f'Lasso Regression - Test MSE: {lasso_test_mse}')
print(f'Lasso Regression - Train R2: {lasso_train_r2}')
print(f'Lasso Regression - Test R2: {lasso_test_r2}')


Lasso Regression - Train MSE: 6042247121.307258
Lasso Regression - Test MSE: 7038769257.121755
Lasso Regression - Train R2: 0.5555362769235107
Lasso Regression - Test R2: 0.5615650487795899


In [8]:
# Decision Tree Regressor
dt_model = utils.tune_hyperparameters(X_train_selected, y_train, 'decision_tree')
dt_train_mse, dt_test_mse, dt_train_r2, dt_test_r2 = utils.evaluate_model(dt_model, X_train_selected, X_test_selected, y_train, y_test)

print(f'Decision Tree Regressor - Train MSE: {dt_train_mse}')
print(f'Decision Tree Regressor - Test MSE: {dt_test_mse}')
print(f'Decision Tree Regressor - Train R2: {dt_train_r2}')
print(f'Decision Tree Regressor - Test R2: {dt_test_r2}')

Decision Tree Regressor - Train MSE: 2121750318.043451
Decision Tree Regressor - Test MSE: 5248793314.674984
Decision Tree Regressor - Train R2: 0.8439254424945983
Decision Tree Regressor - Test R2: 0.673060110820204


In [9]:
# Random Forest Regressor
rf_model = utils.tune_hyperparameters(X_train_selected, y_train, 'random_forest')
rf_train_mse, rf_test_mse, rf_train_r2, rf_test_r2 = utils.evaluate_model(rf_model, X_train_selected, X_test_selected, y_train, y_test)

print(f'Random Forest Regressor - Train MSE: {rf_train_mse}, Test MSE: {rf_test_mse}')
print(f'Random Forest Regressor - Train R2: {rf_train_r2}, Test R2: {rf_test_r2}')

Random Forest Regressor - Train MSE: 514583274.4974304, Test MSE: 3196191776.1215253
Random Forest Regressor - Train R2: 0.9621475928699625, Test R2: 0.8009137486589615


In [10]:
# Support Vector Regressor
svr_model = utils.tune_hyperparameters(X_train_selected, y_train, 'svm')
svr_train_mse, svr_test_mse, svr_train_r2, svr_test_r2 = utils.evaluate_model(svr_model, X_train_selected, X_test_selected, y_train, y_test)

print(f'Support Vector Regressor - Train MSE: {svr_train_mse}, Test MSE: {svr_test_mse}')
print(f'Support Vector Regressor - Train R2: {svr_train_r2}, Test R2: {svr_test_r2}')



Support Vector Regressor - Train MSE: 13017762266.093433, Test MSE: 15531431174.066692
Support Vector Regressor - Train R2: 0.04242197203269593, Test R2: 0.03256918639643114


In [11]:
# K-Nearest Neighbors Regressor
knn_model = utils.tune_hyperparameters(X_train_selected, y_train, 'knn')
knn_train_mse, knn_test_mse, knn_train_r2, knn_test_r2 = utils.evaluate_model(knn_model, X_train_selected, X_test_selected, y_train, y_test)

print(f'K-Nearest Neighbors Regressor - Train MSE: {knn_train_mse}, Test MSE: {knn_test_mse}')
print(f'K-Nearest Neighbors Regressor - Train R2: {knn_train_r2}, Test R2: {knn_test_r2}')


K-Nearest Neighbors Regressor - Train MSE: 2571343265.619352, Test MSE: 4547255128.135616
K-Nearest Neighbors Regressor - Train R2: 0.8108536810560684, Test R2: 0.7167579291971081
