In [1]:
import pandas as pd
import preprocessor_improved as prep_improved
import preprocessor_baseline as prep_baseline
from random_forest_regressor import random_forest_regressor
from linear_regression import linear_regression
from gbm import xg_boost
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')



# Read the listings CSV for each city into its own DataFrame.
singapore_df, ny_df, madrid_df = map(
    pd.read_csv,
    (
        'datasets/singapore_listings.csv',
        'datasets/newyorkcity_listings.csv',
        'datasets/madrid_listings.csv',
    ),
)


# Summary statistics for each dataset. Despite the *_head names, these hold
# describe() output rather than the first rows of each frame.
singapore_head, ny_head, madrid_head = (
    listings.describe() for listings in (singapore_df, ny_df, madrid_df)
)


# Nothing is displayed by default; uncomment the tuple below to show the summaries.
# (madrid_head, ny_head, singapore_head)


  from pandas.core import (


### Baseline Preprocessing

In [2]:


# Baseline pipeline: run the baseline preprocessing on every city's listings.
# Each call is unpacked into a preprocessor object, a feature set and a target.
singapore_preproccessor, singapore_X, singapore_y = prep_baseline.preprocess_data(singapore_df)
ny_preproccessor, ny_X, ny_y = prep_baseline.preprocess_data(ny_df)
madrid_preproccessor, madrid_X, madrid_y = prep_baseline.preprocess_data(madrid_df)


# Hold out 30% of each city's rows for testing; the fixed random_state keeps
# the split repeatable between runs.
(singapore_X_train, singapore_X_test,
 singapore_y_train, singapore_y_test) = train_test_split(
    singapore_X, singapore_y, test_size=0.3, random_state=42
)
(ny_X_train, ny_X_test,
 ny_y_train, ny_y_test) = train_test_split(
    ny_X, ny_y, test_size=0.3, random_state=42
)
(madrid_X_train, madrid_X_test,
 madrid_y_train, madrid_y_test) = train_test_split(
    madrid_X, madrid_y, test_size=0.3, random_state=42
)


# One entry per city: the heading to print and the positional arguments that
# all three model helpers receive (train/test features, train/test target,
# preprocessor).
city_runs = [
    ("Singapore",
     (singapore_X_train, singapore_X_test, singapore_y_train, singapore_y_test, singapore_preproccessor)),
    ("\nNew York",
     (ny_X_train, ny_X_test, ny_y_train, ny_y_test, ny_preproccessor)),
    ("\nMadrid",
     (madrid_X_train, madrid_X_test, madrid_y_train, madrid_y_test, madrid_preproccessor)),
]

# Fit and score the three regressors for each city, then report the metrics.
# NOTE(review): the third returned value is printed as "Variance", but the
# recorded runs contain negative numbers, so it is presumably a score such as
# R^2 or explained variance -- confirm in the model helpers.
for city_heading, model_args in city_runs:
    rf_rsme, rf_mae, rf_variance = random_forest_regressor(*model_args)
    lr_rsme, lr_mae, lr_variance = linear_regression(*model_args)
    xg_rsme, xg_mae, xg_variance = xg_boost(*model_args)

    print(city_heading)
    print(f'Random Forest Regressor RMSE: {rf_rsme}, MAE: {rf_mae}', f'Variance: {rf_variance}')
    print(f'Linear Regression RMSE: {lr_rsme}, MAE: {lr_mae}', f'Variance: {lr_variance}')
    print(f'XGBoost RMSE: {xg_rsme}, MAE: {xg_mae}', f'Variance: {xg_variance}')





Singapore
Random Forest Regressor RMSE: 313.9698391615322, MAE: 78.53337126000842 Variance: -0.5317961171861718
Linear Regression RMSE: 248.57254193395678, MAE: 84.68295482084284 Variance: 0.03840506709317548
XGBoost RMSE: 294.2075270093768, MAE: 81.59877991204115 Variance: -0.3466429080533262

New York
Random Forest Regressor RMSE: 194.61102785186864, MAE: 65.12300364552392 Variance: 0.06355688871719711
Linear Regression RMSE: 185.19971902117396, MAE: 69.23408641346924 Variance: 0.15110002459484662
XGBoost RMSE: 183.91261298277476, MAE: 63.44118624787055 Variance: 0.16286835153146362

Madrid
Random Forest Regressor RMSE: 529.1718889008359, MAE: 117.1320151893921 Variance: -0.04626431613831117
Linear Regression RMSE: 514.3537262176827, MAE: 124.29116131753781 Variance: 0.011576844518915497
XGBoost RMSE: 498.12511745130035, MAE: 117.16333132624061 Variance: 0.07286834454470603


### Improved Preprocessing

In [4]:
# Improved pipeline: run the improved preprocessing on every city's listings.
# Each call is unpacked into a preprocessor object, a feature set and a target.
singapore_preproccessor, singapore_X, singapore_y = prep_improved.preprocess_data(singapore_df)
ny_preproccessor, ny_X, ny_y = prep_improved.preprocess_data(ny_df)
madrid_preproccessor, madrid_X, madrid_y = prep_improved.preprocess_data(madrid_df)


# Hold out 30% of each city's rows for testing; the fixed random_state keeps
# the split repeatable between runs.
(singapore_X_train, singapore_X_test,
 singapore_y_train, singapore_y_test) = train_test_split(
    singapore_X, singapore_y, test_size=0.3, random_state=42
)
(ny_X_train, ny_X_test,
 ny_y_train, ny_y_test) = train_test_split(
    ny_X, ny_y, test_size=0.3, random_state=42
)
(madrid_X_train, madrid_X_test,
 madrid_y_train, madrid_y_test) = train_test_split(
    madrid_X, madrid_y, test_size=0.3, random_state=42
)


# One entry per city: the heading to print and the positional arguments that
# all three model helpers receive (train/test features, train/test target,
# preprocessor).
city_runs = [
    ("Singapore",
     (singapore_X_train, singapore_X_test, singapore_y_train, singapore_y_test, singapore_preproccessor)),
    ("\nNew York",
     (ny_X_train, ny_X_test, ny_y_train, ny_y_test, ny_preproccessor)),
    ("\nMadrid",
     (madrid_X_train, madrid_X_test, madrid_y_train, madrid_y_test, madrid_preproccessor)),
]

# Fit and score the three regressors for each city, then report the metrics.
# NOTE(review): the third returned value is printed as "Variance", but the
# recorded runs contain negative numbers, so it is presumably a score such as
# R^2 or explained variance -- confirm in the model helpers.
for city_heading, model_args in city_runs:
    rf_rsme, rf_mae, rf_variance = random_forest_regressor(*model_args)
    lr_rsme, lr_mae, lr_variance = linear_regression(*model_args)
    xg_rsme, xg_mae, xg_variance = xg_boost(*model_args)

    print(city_heading)
    print(f'Random Forest Regressor RMSE: {rf_rsme}, MAE: {rf_mae}', f'Variance: {rf_variance}')
    print(f'Linear Regression RMSE: {lr_rsme}, MAE: {lr_mae}', f'Variance: {lr_variance}')
    print(f'XGBoost RMSE: {xg_rsme}, MAE: {xg_mae}', f'Variance: {xg_variance}')



Singapore
Random Forest Regressor RMSE: 290.3231933227175, MAE: 76.34654018421527 Variance: -0.31074963789823107
Linear Regression RMSE: 247.06627993944798, MAE: 83.49166294828916 Variance: 0.04977632710320068
XGBoost RMSE: 285.46373587770444, MAE: 78.14160435449911 Variance: -0.2690004257263958

New York
Random Forest Regressor RMSE: 201.5083218842167, MAE: 68.25472697525394 Variance: -0.0030258465699448678
Linear Regression RMSE: 184.3295670236853, MAE: 69.54015974940067 Variance: 0.15912815494487753
XGBoost RMSE: 194.45140049805553, MAE: 65.98373213445932 Variance: 0.0646021572787846

Madrid
Random Forest Regressor RMSE: 520.2290494699406, MAE: 122.67437259652982 Variance: -0.011089495416965578
Linear Regression RMSE: 512.0300958136236, MAE: 123.09053128055223 Variance: 0.020548623396496213
XGBoost RMSE: 513.4171862609893, MAE: 118.64110432098823 Variance: 0.01508913647371568


### Improved Preprocessing with Hyperparameter Tuning (Grid Search, K-Fold)