In [5]:
# Import the Utility Functions
import pandas as pd
from sklearn.metrics import classification_report
import mls_utils as utils

df = pd.read_csv("Clean_Data/mls_sfr_cleaned_nostring.csv")
df.head()

Unnamed: 0,Zip,Bedrooms,Total Baths,SqFt,Acres,Year Built,List Price,Sold Price,Days on Market,Over Asking,Lat,Lon,Cluster,Price per SqFt
0,27607,5,8,9376,2,2022,1850000,324999,344,-1525001,35.8014,-78.6877,2,34.662863
1,27377,5,7,6983,8,1883,1300000,3240000,469,1940000,36.033,-79.5972,1,463.983961
2,27587,5,5,7200,7,2005,1800000,2900000,52,1100000,35.9815,-78.5392,3,402.777778
3,27615,5,7,6289,3,2022,2550000,3577591,526,1027591,35.8887,-78.6393,3,568.864843
4,27502,4,6,7266,5,2023,1755900,2598548,273,842648,35.7225,-78.8408,2,357.631159


In [6]:
#Select features and target variables
features = ['Total Baths', 'Bedrooms', 'Year Built', 'Acres', 'Zip', 'Lat', 'Lon', 'Days on Market', 'Over Asking', 'SqFt', 'Cluster'] # Dropping List Price as it is a direct predictor of Sold Price
target = 'Sold Price'


In [7]:
#Call the split_data function to split the data into training and testing sets
X_train, X_test, y_train, y_test = utils.split_data(df, features, target)
X_train_scaled, X_test_scaled = utils.scale_data(X_train, X_test)

In [8]:
# Handle Outliers
df = utils.remove_outliers(df, features)

In [9]:
#Build a function to call the Regression Tree Functions within the utils file
#dt_regressor = utils.train_decision_tree_model(X_train, y_train)
dt_regressor = utils.tune_hyperparameters(X_train_scaled, y_train, 'decision_tree')
dt_train_mse, dt_test_mse, dt_train_r2, dt_test_r2 = utils.evaluate_model(dt_regressor, X_train_scaled, X_test_scaled, y_train, y_test)
dt_bias, dt_variance = utils.calculate_bias_variance(y_test, dt_regressor.predict(X_test_scaled))


print(f'Decision Tree - Train MSE: {dt_train_mse}')
print(f'Decision Tree - Test MSE: {dt_test_mse}')
print(f'Decision Tree - Train R2: {dt_train_r2}')
print(f'Decision Tree - Test R2: {dt_test_r2}')
print(f'Decision Tree - Bias: {dt_bias}')
print(f'Decision Tree - Variance: {dt_variance}')

#utils.visualize_tree(dt_regressor, features)

Decision Tree - Train MSE: 15678549978.731716
Decision Tree - Test MSE: 23444220504.047333
Decision Tree - Train R2: 0.8216400904938947
Decision Tree - Test R2: 0.7239896473220477
Decision Tree - Bias: 84939645076.91824
Decision Tree - Variance: 72206944941.64336


In [10]:
# Linear Regression Model
lr_regressor = utils.train_linear_regression_model(X_train_scaled, y_train, alpha=1.0)
lr_train_mse, lr_test_mse, lr_train_r2, lr_test_r2 = utils.evaluate_model(lr_regressor, X_train_scaled, X_test_scaled, y_train, y_test)
lr_bias, lr_variance = utils.calculate_bias_variance(y_test, lr_regressor.predict(X_test_scaled))

    
print(f'Linear Regression - Train MSE: {lr_train_mse}')
print(f'Linear Regression - Test MSE: {lr_test_mse}')
print(f'Linear Regression - Train R2: {lr_train_r2}')
print(f'Linear Regression - Test R2: {lr_test_r2}')
print(f'Linear Regression - Bias: {lr_bias}')
print(f'Linear Regression - Variance: {lr_variance}')

Linear Regression - Train MSE: 33351288304.73143
Linear Regression - Test MSE: 31004771073.486294
Linear Regression - Train R2: 0.6205942021415736
Linear Regression - Test R2: 0.6349787873213886
Linear Regression - Bias: 84940080033.5441
Linear Regression - Variance: 53593882002.307556


In [11]:
# Random Forest Model
rf_regressor = utils.train_random_forest_model(X_train_scaled, y_train, max_depth=50)
rf_train_mse, rf_test_mse, rf_train_r2, rf_test_r2 = utils.evaluate_model(rf_regressor, X_train_scaled, X_test_scaled, y_train, y_test)
rf_bias, rf_variance = utils.calculate_bias_variance(y_test, rf_regressor.predict(X_test_scaled))

    
print(f'Random Forest - Train MSE: {rf_train_mse}')
print(f'Random Forest - Test MSE: {rf_test_mse}')
print(f'Random Forest - Train R2: {rf_train_r2}')
print(f'Random Forest - Test R2: {rf_test_r2}')
print(f'Random Forest - Bias: {rf_bias}')
print(f'Random Forest - Variance: {rf_variance}')

Random Forest - Train MSE: 2240733869.0305576
Random Forest - Test MSE: 13923665924.038322
Random Forest - Train R2: 0.97450930789839
Random Forest - Test R2: 0.83607576367914
Random Forest - Bias: 84939760369.1727
Random Forest - Variance: 70799006249.00632


In [12]:
#SVM Model
#svm_regressor = utils.train_svm_model(X_train, y_train)
###svm_regressor = utils.tune_hyperparameters(X_train, y_train, 'svm')
#svm_train_mse, svm_test_mse, svm_train_r2, svm_test_r2 = utils.evaluate_model(svm_regressor, X_train, X_test, y_train, y_test)
    
#print(f'SVM - Train MSE: {svm_train_mse}')
#print(f'SVM - Test MSE: {svm_test_mse}')
#print(f'SVM - Train R2: {svm_train_r2}')
#print(f'SVM - Test R2: {svm_test_r2}')

In [13]:
# KNN Model
knn_regressor = utils.tune_hyperparameters(X_train_scaled, y_train, 'knn')
knn_train_mse, knn_test_mse, knn_train_r2, knn_test_r2 = utils.evaluate_model(knn_regressor, X_train_scaled, X_test_scaled, y_train, y_test)
knn_bias, knn_variance = utils.calculate_bias_variance(y_test, knn_regressor.predict(X_test_scaled))
    
print(f'KNN - Train MSE: {knn_train_mse}')
print(f'KNN - Test MSE: {knn_test_mse}')
print(f'KNN - Train R2: {knn_train_r2}')
print(f'KNN - Test R2: {knn_test_r2}')
print(f'KNN - Bias: {knn_bias}')
print(f'KNN - Variance: {knn_variance}')
