In [1]:
import pandas as pd
import preprocessor_improved as prep_improved
import preprocessor_baseline as prep_baseline
from random_forest_regressor import random_forest_regressor
from linear_regression import linear_regression
from gbm import xg_boost
from sklearn.model_selection import train_test_split
import warnings
import numpy as np
warnings.filterwarnings('ignore')



# Read the raw listings for each of the three cities.
singapore_df = pd.read_csv('datasets/singapore_listings.csv')
ny_df = pd.read_csv('datasets/newyorkcity_listings.csv')
madrid_df = pd.read_csv('datasets/madrid_listings.csv')


# Summary statistics of each dataset via DataFrame.describe(); only the
# `price` column of each summary is printed, to compare price distributions.
singapore_description = singapore_df.describe()
ny_description = ny_df.describe()
madrid_description = madrid_df.describe()

for _label, _summary in (
    ("Singapore", singapore_description),
    ("NY", ny_description),
    ("Madrid", madrid_description),
):
    print(f"{_label}:\n", _summary['price'], "\n")



  from pandas.core import (


Singapore:
 count     7907.000000
mean       169.332996
std        340.187599
min          0.000000
25%         65.000000
50%        124.000000
75%        199.000000
max      10000.000000
Name: price, dtype: float64 

NY:
 count    48895.000000
mean       152.720687
std        240.154170
min          0.000000
25%         69.000000
50%        106.000000
75%        175.000000
max      10000.000000
Name: price, dtype: float64 

Madrid:
 count    19618.000000
mean       129.271740
std        484.143545
min          0.000000
25%         35.000000
50%         58.000000
75%        100.000000
max       9999.000000
Name: price, dtype: float64 



### Baseline Preprocessing

In [2]:


# Baseline run: preprocess each city's listings, hold out 30% of the rows for
# testing, then fit and score three regressors per city.
# NOTE(review): the `*_preproccessor` / `*_rsme` spellings are kept unchanged
# because other cells may reference these notebook-level names.

# Preprocess the data
singapore_preproccessor, singapore_X, singapore_y = prep_baseline.preprocess_data(singapore_df)
ny_preproccessor, ny_X, ny_y = prep_baseline.preprocess_data(ny_df)
madrid_preproccessor, madrid_X, madrid_y = prep_baseline.preprocess_data(madrid_df)


# Split the data into training and testing sets (fixed seed so the split is
# the same on every run)
singapore_X_train, singapore_X_test, singapore_y_train, singapore_y_test = train_test_split(singapore_X, singapore_y, test_size=0.3, random_state=42)
ny_X_train, ny_X_test, ny_y_train, ny_y_test = train_test_split(ny_X, ny_y, test_size=0.3, random_state=42)
madrid_X_train, madrid_X_test, madrid_y_train, madrid_y_test = train_test_split(madrid_X, madrid_y, test_size=0.3, random_state=42)


# Regression models for each city.
# Every city goes through the same fit/score/print sequence, so it is written
# once and driven by a table of (city, printed heading, helper arguments).
# Per-city results are kept in `baseline_metrics` so they are not lost when the
# rf_/lr_/xg_ variables are overwritten by the next city.
# NOTE(review): the third value returned by each helper is printed as
# "Variance", but the recorded runs show negative values, which a variance
# cannot take; presumably it is an R^2-style score -- TODO confirm in the
# helper modules before relabelling.
baseline_metrics = {}
for _city, _heading, _fit_args in (
    ("Singapore", "Singapore",
     (singapore_X_train, singapore_X_test, singapore_y_train, singapore_y_test, singapore_preproccessor)),
    ("New York", "\nNew York",
     (ny_X_train, ny_X_test, ny_y_train, ny_y_test, ny_preproccessor)),
    ("Madrid", "\nMadrid",
     (madrid_X_train, madrid_X_test, madrid_y_train, madrid_y_test, madrid_preproccessor)),
):
    rf_rsme, rf_mae, rf_variance = random_forest_regressor(*_fit_args)
    lr_rsme, lr_mae, lr_variance = linear_regression(*_fit_args)
    xg_rsme, xg_mae, xg_variance = xg_boost(*_fit_args)

    print(_heading)
    print(f'Random Forest Regressor RMSE: {rf_rsme}, MAE: {rf_mae}', f'Variance: {rf_variance}')
    print(f'Linear Regression RMSE: {lr_rsme}, MAE: {lr_mae}', f'Variance: {lr_variance}')
    print(f'XGBoost RMSE: {xg_rsme}, MAE: {xg_mae}', f'Variance: {xg_variance}')

    # (rmse, mae, third score) per model, keyed by city.
    baseline_metrics[_city] = {
        'Random Forest Regressor': (rf_rsme, rf_mae, rf_variance),
        'Linear Regression': (lr_rsme, lr_mae, lr_variance),
        'XGBoost': (xg_rsme, xg_mae, xg_variance),
    }





Singapore
Random Forest Regressor RMSE: 314.6523661198244, MAE: 79.01992836072482 Variance: -0.5381259410149648
Linear Regression RMSE: 248.57254193395678, MAE: 84.68295482084284 Variance: 0.03840506709317548
XGBoost RMSE: 297.173075536394, MAE: 82.90453602071558 Variance: -0.3734088639942439

New York
Random Forest Regressor RMSE: 194.6486545627655, MAE: 65.22322607766945 Variance: 0.06331150711888933
Linear Regression RMSE: 185.19971902117396, MAE: 69.23408641346924 Variance: 0.15110002459484662
XGBoost RMSE: 201.18260169021912, MAE: 66.17254536945876 Variance: -0.001461049180776941

Madrid
Random Forest Regressor RMSE: 532.889682371564, MAE: 117.15156095577885 Variance: -0.06102870688916795
Linear Regression RMSE: 514.3537262176827, MAE: 124.29116131753781 Variance: 0.011576844518915497
XGBoost RMSE: 531.4788951226592, MAE: 130.83596830765202 Variance: -0.05541062194781521


### Improved Preprocessing (With Outlier Management)

In [3]:
# Improved-preprocessing run: same pipeline as the baseline cell, but the
# features/targets come from `prep_improved.preprocess_data`.
# NOTE(review): this cell rebinds the same notebook-level names
# (`singapore_X`, `*_X_train`, ...) that the baseline cell assigned, so after
# it runs those names refer to the improved-preprocessing data.

# Preprocess the data
singapore_preproccessor, singapore_X, singapore_y = prep_improved.preprocess_data(singapore_df)
ny_preproccessor, ny_X, ny_y = prep_improved.preprocess_data(ny_df)
madrid_preproccessor, madrid_X, madrid_y = prep_improved.preprocess_data(madrid_df)


# Split the data into training and testing sets (fixed seed so the split is
# the same on every run)
singapore_X_train, singapore_X_test, singapore_y_train, singapore_y_test = train_test_split(singapore_X, singapore_y, test_size=0.3, random_state=42)
ny_X_train, ny_X_test, ny_y_train, ny_y_test = train_test_split(ny_X, ny_y, test_size=0.3, random_state=42)
madrid_X_train, madrid_X_test, madrid_y_train, madrid_y_test = train_test_split(madrid_X, madrid_y, test_size=0.3, random_state=42)

# Regression models for each city.
# The fit/score/print sequence is identical for every city, so it is written
# once and driven by a table of (city, printed heading, helper arguments).
# Per-city results are kept in `improved_metrics` so they survive the
# rf_/lr_/xg_ variables being overwritten by the next city.
# NOTE(review): the third value returned by each helper is printed as
# "Variance", but the recorded runs show negative values, which a variance
# cannot take; presumably it is an R^2-style score -- TODO confirm in the
# helper modules before relabelling.
improved_metrics = {}
for _city, _heading, _fit_args in (
    ("Singapore", "Singapore",
     (singapore_X_train, singapore_X_test, singapore_y_train, singapore_y_test, singapore_preproccessor)),
    ("New York", "\nNew York",
     (ny_X_train, ny_X_test, ny_y_train, ny_y_test, ny_preproccessor)),
    ("Madrid", "\nMadrid",
     (madrid_X_train, madrid_X_test, madrid_y_train, madrid_y_test, madrid_preproccessor)),
):
    rf_rsme, rf_mae, rf_variance = random_forest_regressor(*_fit_args)
    lr_rsme, lr_mae, lr_variance = linear_regression(*_fit_args)
    xg_rsme, xg_mae, xg_variance = xg_boost(*_fit_args)

    print(_heading)
    print(f'Random Forest Regressor RMSE: {rf_rsme}, MAE: {rf_mae}', f'Variance: {rf_variance}')
    print(f'Linear Regression RMSE: {lr_rsme}, MAE: {lr_mae}', f'Variance: {lr_variance}')
    print(f'XGBoost RMSE: {xg_rsme}, MAE: {xg_mae}', f'Variance: {xg_variance}')

    # (rmse, mae, third score) per model, keyed by city.
    improved_metrics[_city] = {
        'Random Forest Regressor': (rf_rsme, rf_mae, rf_variance),
        'Linear Regression': (lr_rsme, lr_mae, lr_variance),
        'XGBoost': (xg_rsme, xg_mae, xg_variance),
    }



Singapore
Random Forest Regressor RMSE: 299.33771836158274, MAE: 74.1906998173901 Variance: -0.39389816836943137
Linear Regression RMSE: 247.3652627527955, MAE: 83.57859503346596 Variance: 0.04754202363858451
XGBoost RMSE: 270.5252413532664, MAE: 79.25427337291619 Variance: -0.13901608081378125

New York
Random Forest Regressor RMSE: 201.96148471085604, MAE: 67.36795714318177 Variance: -0.007887339839541374
Linear Regression RMSE: 184.56610326779307, MAE: 68.96434298328653 Variance: 0.15696930071175952
XGBoost RMSE: 210.45952217202074, MAE: 69.0994750487186 Variance: -0.09537339835808467

Madrid
Random Forest Regressor RMSE: 527.5800912613797, MAE: 118.01075490132628 Variance: -0.03998388999588043
Linear Regression RMSE: 513.2398585620584, MAE: 123.98181595790206 Variance: 0.0158699023808333
XGBoost RMSE: 545.3569300419205, MAE: 130.8661936843452 Variance: -0.11130465071545825


### Improved Preprocessing (with Hyperparameter Tuning, Grid Search, and K-Fold Cross-Validation)