In [61]:
# Import the Utility Functions
import pandas as pd
from sklearn.metrics import classification_report
import mls_utils as utils

df = pd.read_csv("Clean_Data/mls_th_cleaned.csv")
df.head()

Unnamed: 0,MLS #,City,Zip,Bedrooms,Total Baths,SqFt,Acres,Year Built,List Date,Closing Date,List Price,Sold Price,Days on Market,Over Asking,Lat,Lon,Cluster,Price per SqFt
0,2509707,Cary,27511,4,4,6115,1,1985,2023-05-08,2023-06-12,1049900,1275000,35,225100,35.7641,-78.7786,2,208.503679
1,2444544,Chapel Hill,27517,4,5,4049,1,2004,2022-04-25,2022-06-07,1090000,1300000,43,210000,35.9182,-79.0035,1,321.06693
2,2320632,Durham,27707,3,5,2763,1,2020,2020-05-22,2021-08-05,575000,729840,440,154840,35.9631,-78.9315,1,264.147666
3,2428221,Cary,27511,3,4,3477,1,2004,2022-01-24,2022-02-09,949900,1100000,16,150100,35.7641,-78.7786,2,316.364682
4,10018970,Chapel Hill,27517,4,5,4049,1,2004,2024-04-06,2024-04-17,1350000,1500000,11,150000,35.9182,-79.0035,1,370.461842


In [62]:
#Select features and target variables
features = ['Total Baths', 'Bedrooms', 'Year Built', 'Acres', 'Zip', 'Lat', 'Lon', 'Days on Market', 'Over Asking', 'Price per SqFt', 'Cluster'] # Dropping List Price as it is a direct predictor of Sold Price
target = 'Sold Price'


In [63]:
#Call the split_data function to split the data into training and testing sets
X_train, X_test, y_train, y_test = utils.split_data(df, features, target)
X_train_scaled, X_test_scaled = utils.scale_data(X_train, X_test)

In [64]:
#Build a function to call the Regression Tree Functions within the utils file
#dt_regressor = utils.train_decision_tree_model(X_train, y_train)
dt_regressor = utils.tune_hyperparameters(X_train_scaled, y_train, 'decision_tree')
dt_train_mse, dt_test_mse, dt_train_r2, dt_test_r2 = utils.evaluate_model(dt_regressor, X_train_scaled, X_test_scaled, y_train, y_test)
dt_bias, dt_variance = utils.calculate_bias_variance(y_test, dt_regressor.predict(X_test_scaled))


print(f'Decision Tree - Train MSE: {dt_train_mse}')
print(f'Decision Tree - Test MSE: {dt_test_mse}')
print(f'Decision Tree - Train R2: {dt_train_r2}')
print(f'Decision Tree - Test R2: {dt_test_r2}')
print(f'Decision Tree - Bias: {dt_bias}')
print(f'Decision Tree - Variance: {dt_variance}')

#utils.visualize_tree(dt_regressor, features)

Decision Tree - Train MSE: 2021013340.7949984
Decision Tree - Test MSE: 4075341634.242097
Decision Tree - Train R2: 0.8546527375190924
Decision Tree - Test R2: 0.7423963212640695
Decision Tree - Bias: 15823066177.01539
Decision Tree - Variance: 11965375301.14319


In [65]:
# Linear Regression Model
lr_regressor = utils.train_linear_regression_model(X_train_scaled, y_train, alpha=1.0)
lr_train_mse, lr_test_mse, lr_train_r2, lr_test_r2 = utils.evaluate_model(lr_regressor, X_train_scaled, X_test_scaled, y_train, y_test)
lr_bias, lr_variance = utils.calculate_bias_variance(y_test, lr_regressor.predict(X_test_scaled))

    
print(f'Linear Regression - Train MSE: {lr_train_mse}')
print(f'Linear Regression - Test MSE: {lr_test_mse}')
print(f'Linear Regression - Train R2: {lr_train_r2}')
print(f'Linear Regression - Test R2: {lr_test_r2}')
print(f'Linear Regression - Bias: {lr_bias}')
print(f'Linear Regression - Variance: {lr_variance}')

Linear Regression - Train MSE: 9912924241.863848
Linear Regression - Test MSE: 11565869569.524702
Linear Regression - Train R2: 0.2870821915659557
Linear Regression - Test R2: 0.26891759850125896
Linear Regression - Bias: 15825969413.58412
Linear Regression - Variance: 4024486291.223355


In [66]:
# Random Forest Model
rf_regressor = utils.train_random_forest_model(X_train_scaled, y_train, max_depth=50)
rf_train_mse, rf_test_mse, rf_train_r2, rf_test_r2 = utils.evaluate_model(rf_regressor, X_train_scaled, X_test_scaled, y_train, y_test)
rf_bias, rf_variance = utils.calculate_bias_variance(y_test, rf_regressor.predict(X_test_scaled))

    
print(f'Random Forest - Train MSE: {rf_train_mse}')
print(f'Random Forest - Test MSE: {rf_test_mse}')
print(f'Random Forest - Train R2: {rf_train_r2}')
print(f'Random Forest - Test R2: {rf_test_r2}')
print(f'Random Forest - Bias: {rf_bias}')
print(f'Random Forest - Variance: {rf_variance}')

Random Forest - Train MSE: 322314604.5808526
Random Forest - Test MSE: 2763142605.9254904
Random Forest - Train R2: 0.9768197742747137
Random Forest - Test R2: 0.8253408513834285
Random Forest - Bias: 15821189518.13378
Random Forest - Variance: 11754319461.66262


In [67]:
#SVM Model
#svm_regressor = utils.train_svm_model(X_train, y_train)
###svm_regressor = utils.tune_hyperparameters(X_train, y_train, 'svm')
#svm_train_mse, svm_test_mse, svm_train_r2, svm_test_r2 = utils.evaluate_model(svm_regressor, X_train, X_test, y_train, y_test)
    
#print(f'SVM - Train MSE: {svm_train_mse}')
#print(f'SVM - Test MSE: {svm_test_mse}')
#print(f'SVM - Train R2: {svm_train_r2}')
#print(f'SVM - Test R2: {svm_test_r2}')

In [68]:
# KNN Model
knn_regressor = utils.tune_hyperparameters(X_train_scaled, y_train, 'knn')
knn_train_mse, knn_test_mse, knn_train_r2, knn_test_r2 = utils.evaluate_model(knn_regressor, X_train_scaled, X_test_scaled, y_train, y_test)
knn_bias, knn_variance = utils.calculate_bias_variance(y_test, knn_regressor.predict(X_test_scaled))
    
print(f'KNN - Train MSE: {knn_train_mse}')
print(f'KNN - Test MSE: {knn_test_mse}')
print(f'KNN - Train R2: {knn_train_r2}')
print(f'KNN - Test R2: {knn_test_r2}')
print(f'KNN - Bias: {knn_bias}')
print(f'KNN - Variance: {knn_variance}')


KNN - Train MSE: 4247182065.0806713
KNN - Test MSE: 6957817444.351467
KNN - Train R2: 0.6945511076267057
KNN - Test R2: 0.5601940817481187
KNN - Bias: 15845108375.606684
KNN - Variance: 8286470753.934632
