In [39]:
# Import the Utility Functions
import pandas as pd
from sklearn.metrics import classification_report
import mls_utils as utils

df = pd.read_csv("Clean_Data/mls_th_cleaned_nostring.csv")
df.head()

Unnamed: 0,Zip,Bedrooms,Total Baths,SqFt,Acres,Year Built,List Price,Sold Price,Days on Market,Over Asking,Lat,Lon,Cluster,Price per SqFt
0,27511,4,4,6115,1,1985,1049900,1275000,35,225100,35.7641,-78.7786,4,208.503679
1,27517,4,5,4049,1,2004,1090000,1300000,43,210000,35.9182,-79.0035,1,321.06693
2,27707,3,5,2763,1,2020,575000,729840,440,154840,35.9631,-78.9315,1,264.147666
3,27511,3,4,3477,1,2004,949900,1100000,16,150100,35.7641,-78.7786,4,316.364682
4,27517,4,5,4049,1,2004,1350000,1500000,11,150000,35.9182,-79.0035,1,370.461842


In [40]:
#Select features and target variables
features = ['Total Baths', 'Bedrooms', 'Year Built', 'Acres', 'Zip', 'Lat', 'Lon', 'Days on Market', 'Over Asking', 'SqFt', 'Price per SqFt', 'Cluster'] # Dropping List Price as it is a direct predictor of Sold Price
target = 'Sold Price'


In [41]:
#Call the split_data function to split the data into training and testing sets
X_train, X_test, y_train, y_test = utils.split_data(df, features, target)
X_train_scaled, X_test_scaled = utils.scale_data(X_train, X_test)

In [42]:
# Handle Outliers
df = utils.remove_outliers(df, features)

In [27]:
#Build a function to call the Regression Tree Functions within the utils file
#dt_regressor = utils.train_decision_tree_model(X_train, y_train)
dt_regressor = utils.tune_hyperparameters(X_train_scaled, y_train, 'decision_tree')
dt_train_mse, dt_test_mse, dt_train_r2, dt_test_r2 = utils.evaluate_model(dt_regressor, X_train_scaled, X_test_scaled, y_train, y_test)
dt_bias, dt_variance = utils.calculate_bias_variance(y_test, dt_regressor.predict(X_test_scaled))


print(f'Decision Tree - Train MSE: {dt_train_mse}')
print(f'Decision Tree - Test MSE: {dt_test_mse}')
print(f'Decision Tree - Train R2: {dt_train_r2}')
print(f'Decision Tree - Test R2: {dt_test_r2}')
print(f'Decision Tree - Bias: {dt_bias}')
print(f'Decision Tree - Variance: {dt_variance}')

#utils.visualize_tree(dt_regressor, features)

Decision Tree - Train MSE: 228570.88831546207
Decision Tree - Test MSE: 195015147.31539994
Decision Tree - Train R2: 0.9999842105774791
Decision Tree - Test R2: 0.9870070523672787
Decision Tree - Bias: 15009317451.387707
Decision Tree - Variance: 14745311073.118114


In [28]:
# Linear Regression Model
lr_regressor = utils.train_linear_regression_model(X_train_scaled, y_train, alpha=1.0)
lr_train_mse, lr_test_mse, lr_train_r2, lr_test_r2 = utils.evaluate_model(lr_regressor, X_train_scaled, X_test_scaled, y_train, y_test)
lr_bias, lr_variance = utils.calculate_bias_variance(y_test, lr_regressor.predict(X_test_scaled))

    
print(f'Linear Regression - Train MSE: {lr_train_mse}')
print(f'Linear Regression - Test MSE: {lr_test_mse}')
print(f'Linear Regression - Train R2: {lr_train_r2}')
print(f'Linear Regression - Test R2: {lr_test_r2}')
print(f'Linear Regression - Bias: {lr_bias}')
print(f'Linear Regression - Variance: {lr_variance}')

Linear Regression - Train MSE: 6296861210.540522
Linear Regression - Test MSE: 7387203279.233024
Linear Regression - Train R2: 0.565019837208625
Linear Regression - Test R2: 0.5078251782969974
Linear Regression - Bias: 15009677017.28667
Linear Regression - Variance: 8447130830.247521


In [29]:
# Random Forest Model
rf_regressor = utils.train_random_forest_model(X_train_scaled, y_train, max_depth=50)
rf_train_mse, rf_test_mse, rf_train_r2, rf_test_r2 = utils.evaluate_model(rf_regressor, X_train_scaled, X_test_scaled, y_train, y_test)
rf_bias, rf_variance = utils.calculate_bias_variance(y_test, rf_regressor.predict(X_test_scaled))

    
print(f'Random Forest - Train MSE: {rf_train_mse}')
print(f'Random Forest - Test MSE: {rf_test_mse}')
print(f'Random Forest - Train R2: {rf_train_r2}')
print(f'Random Forest - Test R2: {rf_test_r2}')
print(f'Random Forest - Bias: {rf_bias}')
print(f'Random Forest - Variance: {rf_variance}')

Random Forest - Train MSE: 13279505.479563836
Random Forest - Test MSE: 97249057.38211256
Random Forest - Train R2: 0.9990826665441473
Random Forest - Test R2: 0.9935207499146016
Random Forest - Bias: 15009322051.483961
Random Forest - Variance: 14457466298.857298


In [30]:
#SVM Model
#svm_regressor = utils.train_svm_model(X_train, y_train)
###svm_regressor = utils.tune_hyperparameters(X_train, y_train, 'svm')
#svm_train_mse, svm_test_mse, svm_train_r2, svm_test_r2 = utils.evaluate_model(svm_regressor, X_train, X_test, y_train, y_test)
    
#print(f'SVM - Train MSE: {svm_train_mse}')
#print(f'SVM - Test MSE: {svm_test_mse}')
#print(f'SVM - Train R2: {svm_train_r2}')
#print(f'SVM - Test R2: {svm_test_r2}')

In [31]:
# KNN Model
knn_regressor = utils.tune_hyperparameters(X_train_scaled, y_train, 'knn')
knn_train_mse, knn_test_mse, knn_train_r2, knn_test_r2 = utils.evaluate_model(knn_regressor, X_train_scaled, X_test_scaled, y_train, y_test)
knn_bias, knn_variance = utils.calculate_bias_variance(y_test, knn_regressor.predict(X_test_scaled))
    
print(f'KNN - Train MSE: {knn_train_mse}')
print(f'KNN - Test MSE: {knn_test_mse}')
print(f'KNN - Train R2: {knn_train_r2}')
print(f'KNN - Test R2: {knn_test_r2}')
print(f'KNN - Bias: {knn_bias}')
print(f'KNN - Variance: {knn_variance}')


KNN - Train MSE: 2953041859.952322
KNN - Test MSE: 5036650525.976702
KNN - Train R2: 0.7960071556251527
KNN - Test R2: 0.664431520170609
KNN - Bias: 15018371335.85957
KNN - Variance: 9696013652.141308
