In [79]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score , mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import RandomizedSearchCV


import warnings
import numpy as np
warnings.filterwarnings('ignore')

# Load the datasets
singapore_df = pd.read_csv('datasets/singapore_listings.csv')
ny_df = pd.read_csv('datasets/newyorkcity_listings.csv')
madrid_df = pd.read_csv('datasets/madrid_listings.csv')

### Exploratory Data Analysis (EDA) - (EDA)

In [76]:
# Overview

# Singapore
summary = {
    # "Info": singapore_df.info(),
    # "Head": singapore_df.head(),
    "Describe": singapore_df.describe(),
    # "Missing": singapore_df.isnull().sum(),
}
print("Singapore", summary)


Singapore {'Describe':                  id       host_id     latitude    longitude         price  \
count  7.907000e+03  7.907000e+03  7907.000000  7907.000000   7907.000000   
mean   2.338862e+07  9.114481e+07     1.314192   103.848787    169.332996   
std    1.016416e+07  8.190910e+07     0.030577     0.043675    340.187599   
min    4.909100e+04  2.366600e+04     1.243870   103.646560      0.000000   
25%    1.582180e+07  2.305808e+07     1.295795   103.835825     65.000000   
50%    2.470627e+07  6.344891e+07     1.311030   103.849410    124.000000   
75%    3.234850e+07  1.553811e+08     1.322110   103.872535    199.000000   
max    3.811276e+07  2.885676e+08     1.454590   103.973420  10000.000000   

       minimum_nights  number_of_reviews  reviews_per_month  \
count     7907.000000        7907.000000        5149.000000   
mean        17.510054          12.807386           1.043669   
std         42.094616          29.707746           1.285851   
min          1.000000          

In [77]:
#New York

summary = {
    "Info": ny_df.info(),
    "Head": ny_df.head(),
    "Describe": ny_df.describe(),
    "Missing": ny_df.isnull().sum(),
}

print("New York", summary)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              48895 non-null  int64  
 1   name                            48879 non-null  object 
 2   host_id                         48895 non-null  int64  
 3   host_name                       48874 non-null  object 
 4   neighbourhood_group             48895 non-null  object 
 5   neighbourhood                   48895 non-null  object 
 6   latitude                        48895 non-null  float64
 7   longitude                       48895 non-null  float64
 8   room_type                       48895 non-null  object 
 9   price                           48895 non-null  int64  
 10  minimum_nights                  48895 non-null  int64  
 11  number_of_reviews               48895 non-null  int64  
 12  last_review                     

In [78]:
# Madrid

summary = {
    "Info": madrid_df.info(),
    "Head": madrid_df.head(),
    "Describe": madrid_df.describe(),
    "Missing": madrid_df.isnull().sum(),
}

print("Madrid", summary)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19618 entries, 0 to 19617
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              19618 non-null  int64  
 1   name                            19615 non-null  object 
 2   host_id                         19618 non-null  int64  
 3   host_name                       19091 non-null  object 
 4   neighbourhood_group             19618 non-null  object 
 5   neighbourhood                   19618 non-null  object 
 6   latitude                        19618 non-null  float64
 7   longitude                       19618 non-null  float64
 8   room_type                       19618 non-null  object 
 9   price                           19618 non-null  int64  
 10  minimum_nights                  19618 non-null  int64  
 11  number_of_reviews               19618 non-null  int64  
 12  last_review                     

In [80]:
# Calculate metrics
def calculate_metrics(y_test, y_pred):   
    rmse = mean_squared_error(y_test, y_pred, squared=False)    
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    return rmse, mae, r2


# Preprocessor
def get_preprocessor():    
    # Defining preprocessing for numeric columns
    numeric_features = ['latitude', 'longitude', 'minimum_nights', 'number_of_reviews', 'reviews_per_month', 'calculated_host_listings_count', 'availability_365']
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    
    # Defining preprocessing for categorical columns
    categorical_features = ['neighbourhood_group', 'neighbourhood', 'room_type']
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])
    
    # Combining preprocessing steps
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)        
        ],
        remainder='drop'
    )
    
    return preprocessor       


In [73]:
# Models

# Random Forest
def random_forest_model(preprocessor):
    rf_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', RandomForestRegressor(random_state=42))])
    return rf_pipeline

# Linear Regression
def linear_regression_model(preprocessor):
    lr_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', LinearRegression())])
    return lr_pipeline

# XGBoost
def xgboost_model(preprocessor):
    xgb_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', XGBRegressor())])
    return xgb_pipeline

# Model Prediction
def get_prediction(X_train, y_train, X_test, model):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return y_pred

preprocessor = get_preprocessor()
rf_model = random_forest_model(preprocessor)
lr_model = linear_regression_model(preprocessor)
xg_model = xgboost_model(preprocessor)


### Baseline Preprocessing

In [74]:
def get_X_y(df):   
    X = df.drop('price', axis=1)
    y = df['price']
    return X, y

singapore_X, singapore_y = get_X_y(singapore_df)
ny_X, ny_y = get_X_y(ny_df) 
madrid_X, madrid_y = get_X_y(madrid_df)

# Split the data into training and testing sets
singapore_X_train, singapore_X_test, singapore_y_train, singapore_y_test = train_test_split(singapore_X, singapore_y, test_size=0.3, random_state=42)
ny_X_train, ny_X_test, ny_y_train, ny_y_test = train_test_split(ny_X, ny_y, test_size=0.3, random_state=42)
madrid_X_train, madrid_X_test, madrid_y_train, madrid_y_test = train_test_split(madrid_X, madrid_y, test_size=0.3, random_state=42)

In [75]:
# Metrics per city

# Singapore
singapore_rf_rsme, singapore_rf_mae, singapore_rf_r2 = calculate_metrics(singapore_y_test, get_prediction(singapore_X_train, singapore_y_train, singapore_X_test, rf_model))
singapore_lr_rsme, singapore_lr_mae, singapore_lr_r2 = calculate_metrics(singapore_y_test, get_prediction(singapore_X_train, singapore_y_train, singapore_X_test, lr_model))
singapore_xg_rsme, singapore_xg_mae, singapore_xg_r2 = calculate_metrics(singapore_y_test, get_prediction(singapore_X_train, singapore_y_train, singapore_X_test, xg_model))

print("Singapore:")
print(f"Random Forest Regressor: RSME: {singapore_rf_rsme}, MAE: {singapore_rf_mae}, R2: {singapore_rf_r2}")
print(f"Linear Regression: RSME: {singapore_lr_rsme}, MAE: {singapore_lr_mae}, R2: {singapore_lr_r2}")
print(f"XGBoost: RSME: {singapore_xg_rsme}, MAE: {singapore_xg_mae}, R2: {singapore_xg_r2}")


# NY
ny_rf_rsme, ny_rf_mae, ny_rf_r2 = calculate_metrics(ny_y_test, get_prediction(ny_X_train, ny_y_train, ny_X_test, rf_model))
ny_lr_rsme, ny_lr_mae, ny_lr_r2 = calculate_metrics(ny_y_test, get_prediction(ny_X_train, ny_y_train, ny_X_test, lr_model))
ny_xg_rsme, ny_xg_mae, ny_xg_r2 = calculate_metrics(ny_y_test, get_prediction(ny_X_train, ny_y_train, ny_X_test, xg_model))

print("\nNY:")
print(f"Random Forest Regressor: RSME: {ny_rf_rsme}, MAE: {ny_rf_mae}, R2: {ny_rf_r2}")
print(f"Linear Regression: RSME: {ny_lr_rsme}, MAE: {ny_lr_mae}, R2: {ny_lr_r2}")
print(f"XGBoost: RSME: {ny_xg_rsme}, MAE: {ny_xg_mae}, R2: {ny_xg_r2}")


# Madrid
madrid_rf_rsme, madrid_rf_mae, madrid_rf_r2 = calculate_metrics(madrid_y_test, get_prediction(madrid_X_train, madrid_y_train, madrid_X_test, rf_model))
madrid_lr_rsme, madrid_lr_mae, madrid_lr_r2 = calculate_metrics(madrid_y_test, get_prediction(madrid_X_train, madrid_y_train, madrid_X_test, lr_model))
madrid_xg_rsme, madrid_xg_mae, madrid_xg_r2 = calculate_metrics(madrid_y_test, get_prediction(madrid_X_train, madrid_y_train, madrid_X_test, xg_model))

print("\nMadrid:")
print(f"Random Forest Regressor: RSME: {madrid_rf_rsme}, MAE: {madrid_rf_mae}, R2: {madrid_rf_r2}")
print(f"Linear Regression: RSME: {madrid_lr_rsme}, MAE: {madrid_lr_mae}, R2: {madrid_lr_r2}")
print(f"XGBoost: RSME: {madrid_xg_rsme}, MAE: {madrid_xg_mae}, R2: {madrid_xg_r2}")

Random Forest Regressor: RSME: 313.9698391615322, MAE: 78.53337126000842, R2: -0.5356305509962982
Linear Regression: RSME: 248.57254193395678, MAE: 84.68295482084284, R2: 0.03746353888838383
XGBoost: RSME: 297.173075536394, MAE: 82.90453602071558, R2: -0.3757192066231898

NY:
Random Forest Regressor: RSME: 194.61102785186864, MAE: 65.12300364552392, R2: 0.062243702185231564
Linear Regression: RSME: 185.19971902117396, MAE: 69.23408641346924, R2: 0.1507496323467501
XGBoost: RSME: 201.18260169021912, MAE: 66.17254536945876, R2: -0.0021573982702969285

Madrid:
Random Forest Regressor: RSME: 529.1718889008359, MAE: 117.1320151893921, R2: -0.046322072658402824
Linear Regression: RSME: 514.3537262176827, MAE: 124.29116131753781, R2: 0.011456835285585054
XGBoost: RSME: 531.4788951226592, MAE: 130.83596830765202, R2: -0.05546516360657705


### Improved Preprocessing V1

Strategy Taken: 
 - Log Transformation on the 'price' column done to manage outliers and reduce skewness.

In [26]:
def get_X_y_improved(df):
     # Log transformation to manage outliers in the 'price' column
     log_transformer = FunctionTransformer(np.log1p, validate=True)
     df['log_price'] = log_transformer.transform(df[['price']])
     X = df.drop(['price', 'log_price'], axis=1)
     y_log = df['log_price'] 
     return X, y_log

# Undo the log transformation
def undo_log_transform(y_log):
    return np.expm1(y_log)

singapore_X_improved, singapore_y_improved = get_X_y_improved(singapore_df)
ny_X_improved, ny_y_improved = get_X_y_improved(ny_df)
madrid_X_improved, madrid_y_improved = get_X_y_improved(madrid_df)

# Split the data into training and testing sets
singapore_X_train_imp, singapore_X_test_imp, singapore_y_train_imp, singapore_y_test_imp = train_test_split(singapore_X_improved, singapore_y_improved, test_size=0.3, random_state=42)
ny_X_train_imp, ny_X_test_imp, ny_y_train_imp, ny_y_test_imp = train_test_split(ny_X_improved, ny_y_improved, test_size=0.3, random_state=42)
madrid_X_train_imp, madrid_X_test_imp, madrid_y_train_imp, madrid_y_test_imp = train_test_split(madrid_X_improved, madrid_y_improved, test_size=0.3, random_state=42)

In [19]:
singapore_y_pred_rf = undo_log_transform(get_prediction(singapore_X_train_imp, singapore_y_train_imp.ravel(), singapore_X_test_imp, rf_model))
singapore_y_pred_lr = undo_log_transform(get_prediction(singapore_X_train_imp, singapore_y_train_imp.ravel(), singapore_X_test_imp, lr_model))
singapore_y_pred_xg = undo_log_transform(get_prediction(singapore_X_train_imp, singapore_y_train_imp.ravel(), singapore_X_test_imp, xg_model))

ny_y_pred_rf = undo_log_transform(get_prediction(ny_X_train_imp, ny_y_train_imp.ravel(), ny_X_test_imp, rf_model))
ny_y_pred_lr = undo_log_transform(get_prediction(ny_X_train_imp, ny_y_train_imp.ravel(), ny_X_test_imp, lr_model))
ny_y_pred_xg = undo_log_transform(get_prediction(ny_X_train_imp, ny_y_train_imp.ravel(), ny_X_test_imp, xg_model))

madrid_y_pred_rf = undo_log_transform(get_prediction(madrid_X_train_imp, madrid_y_train_imp.ravel(), madrid_X_test_imp, rf_model))
madrid_y_pred_lr = undo_log_transform(get_prediction(madrid_X_train_imp, madrid_y_train_imp.ravel(), madrid_X_test_imp, lr_model))
madrid_y_pred_xg = undo_log_transform(get_prediction(madrid_X_train_imp, madrid_y_train_imp.ravel(), madrid_X_test_imp, xg_model))

singapore_y_test_imp = undo_log_transform(singapore_y_test_imp)
ny_y_test_imp = undo_log_transform(ny_y_test_imp)
madrid_y_test_imp = undo_log_transform(madrid_y_test_imp)


#Regression models for each city
# Singapore
rf_rsme, rf_mae, rf_r2 = calculate_metrics(singapore_y_test_imp, singapore_y_pred_rf)
lr_rsme, lr_mae, lr_r2 = calculate_metrics(singapore_y_test_imp, singapore_y_pred_lr)
xg_rsme, xg_mae, xg_r2 = calculate_metrics(singapore_y_test_imp, singapore_y_pred_xg)


print("Singapore")
print(f'Random Forest Regressor RMSE: {rf_rsme}, MAE: {rf_mae}', f'R2: {rf_r2}')
print(f'Linear Regression RMSE: {lr_rsme}, MAE: {lr_mae}', f'R2: {lr_r2}')
print(f'XGBoost RMSE: {xg_rsme}, MAE: {xg_mae}', f'R2: {xg_r2}')


# New York
rf_rsme, rf_mae, rf_r2 = calculate_metrics(ny_y_test_imp, ny_y_pred_rf)
lr_rsme, lr_mae, lr_r2 = calculate_metrics(ny_y_test_imp, ny_y_pred_lr)
xg_rsme, xg_mae, xg_r2 = calculate_metrics(ny_y_test_imp, ny_y_pred_xg)

print("\nNew York")
print(f'Random Forest Regressor RMSE: {rf_rsme}, MAE: {rf_mae}', f'R2: {rf_r2}')
print(f'Linear Regression RMSE: {lr_rsme}, MAE: {lr_mae}', f'R2: {lr_r2}')
print(f'XGBoost RMSE: {xg_rsme}, MAE: {xg_mae}', f'R2: {xg_r2}')


# Madrid
rf_rsme, rf_mae, rf_r2 = calculate_metrics(madrid_y_test_imp, madrid_y_pred_rf)
lr_rsme, lr_mae, lr_r2 = calculate_metrics(madrid_y_test_imp, madrid_y_pred_lr)
xg_rsme, xg_mae, xg_r2 = calculate_metrics(madrid_y_test_imp, madrid_y_pred_xg)

print("\nMadrid")
print(f'Random Forest Regressor RMSE: {rf_rsme}, MAE: {rf_mae}', f'R2: {rf_r2}')
print(f'Linear Regression RMSE: {lr_rsme}, MAE: {lr_mae}', f'R2: {lr_r2}')
print(f'XGBoost RMSE: {xg_rsme}, MAE: {xg_mae}', f'R2: {xg_r2}')

Singapore
Random Forest Regressor RMSE: 238.78687292784335, MAE: 56.76911854447486 R2: 0.11175702969010748
Linear Regression RMSE: 244.8365474435859, MAE: 69.4583453101915 R2: 0.06617955943481235
XGBoost RMSE: 240.90086851270405, MAE: 60.39337390653704 R2: 0.09596006749824659

New York
Random Forest Regressor RMSE: 173.77488779615888, MAE: 54.51045660581714 R2: 0.2522969879142771
Linear Regression RMSE: 185.9654241013907, MAE: 57.969214702661375 R2: 0.14371269297724432
XGBoost RMSE: 174.82172369482706, MAE: 53.843374900367984 R2: 0.24326139065095775

Madrid
Random Forest Regressor RMSE: 509.2021015508586, MAE: 84.08128461523799 R2: 0.03115961973111181
Linear Regression RMSE: 518.8935854896596, MAE: 90.53526544548583 R2: -0.006070606420164548
XGBoost RMSE: 510.32689436437005, MAE: 85.16492330837607 R2: 0.026874687364152106


### Improved Preprocessing V2
Strategy Taken:
 - Drop rows above the upper bound outlier or below the lower bound outlier
 - Done since the amount of outliers for each dataset was below 10%

In [51]:
# Function to count outliers and calculate percentages
def outliers_stats(df):
    # Calculate Q1 and Q3
    Q1 = df['price'].quantile(0.25)
    Q3 = df['price'].quantile(0.75)
    # Calculate IQR
    IQR = Q3 - Q1
    # Define lower and upper bounds for outliers
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Count rows with prices above upper bound
    above_upper_limit = df[df['price'] > upper_bound].shape[0]
    # Count rows with prices below lower bound
    below_lower_limit = df[df['price'] < lower_bound].shape[0]

    # Calculate percentages based on the original DataFrame's row count
    total_rows = df.shape[0]
    percentage_above = (above_upper_limit / total_rows) * 100
    percentage_below = (below_lower_limit / total_rows) * 100

    return {
        "Above Upper Limit Count": above_upper_limit,
        "Below Lower Limit Count": below_lower_limit,
        "Total Rows": total_rows,
        "Percentage Above Upper Limit": f"{percentage_above:.2f}%",
        "Percentage Below Lower Limit": f"{percentage_below:.2f}%"
    }

print("Singapore",outliers_stats(singapore_df))
print("NY",outliers_stats(ny_df))
print("Madrid",outliers_stats(madrid_df))


Singapore {'Above Upper Limit Count': 324, 'Below Lower Limit Count': 0, 'Total Rows': 7907, 'Percentage Above Upper Limit': '4.10%', 'Percentage Below Lower Limit': '0.00%'}
NY {'Above Upper Limit Count': 2972, 'Below Lower Limit Count': 0, 'Total Rows': 48895, 'Percentage Above Upper Limit': '6.08%', 'Percentage Below Lower Limit': '0.00%'}
Madrid {'Above Upper Limit Count': 1905, 'Below Lower Limit Count': 0, 'Total Rows': 19618, 'Percentage Above Upper Limit': '9.71%', 'Percentage Below Lower Limit': '0.00%'}


In [52]:
def get_oultiers(df):
    Q1 = df['price'].quantile(0.25)
    Q3 = df['price'].quantile(0.75)
    IQR = Q3 - Q1
    
    # Calculate the lower and upper bounds to identify outliers
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Filter the outliers
    return df[(df['price'] <= lower_bound) | (df['price'] >= upper_bound)]

def get_X_y_improved_v2(df):
    # Remove outliers in the 'price' column
    df.drop(get_oultiers(df).index, inplace=True)
    X = df.drop('price', axis=1)
    y = df['price']
    return X, y

singapore_X_improvedv2, singapore_y_improvedv2 = get_X_y(singapore_df)
ny_X_improvedv2, ny_y_improvedv2 = get_X_y(ny_df) 
madrid_X_improvedv2, madrid_y_improvedv2 = get_X_y(madrid_df)

# Split the data into training and testing sets
singapore_X_train_impv2, singapore_X_test_impv2, singapore_y_train_impv2, singapore_y_test_impv2 = train_test_split(singapore_X_improvedv2, singapore_y_improvedv2, test_size=0.3, random_state=42)
ny_X_train_impv2, ny_X_test_impv2, ny_y_train_impv2, ny_y_test_impv2 = train_test_split(ny_X_improvedv2, ny_y_improvedv2, test_size=0.3, random_state=42)
madrid_X_train_impv2, madrid_X_test_impv2, madrid_y_train_impv2, madrid_y_test_impv2 = train_test_split(madrid_X_improvedv2, madrid_y_improvedv2, test_size=0.3, random_state=42)

In [53]:
singapore_lr_rsme, singapore_lr_mae, singapore_lr_r2 = calculate_metrics(singapore_y_test_impv2, get_prediction(singapore_X_train_impv2, singapore_y_train_impv2, singapore_X_test_impv2, lr_model))
ny_lr_rsme, ny_lr_mae, ny_lr_r2 = calculate_metrics(ny_y_test_impv2, get_prediction(ny_X_train_impv2, ny_y_train_impv2, ny_X_test_impv2, lr_model))
madrid_lr_rsme, madrid_lr_mae, madrid_lr_r2 = calculate_metrics(madrid_y_test_impv2, get_prediction(madrid_X_train_impv2, madrid_y_train_impv2, madrid_X_test_impv2, lr_model))

# Singapore
print("Singapore:")
print(f"Random Forest Regressor: RSME: {singapore_rf_rsme}, MAE: {singapore_rf_mae}, R2: {singapore_rf_r2}")
print(f"Linear Regression: RSME: {singapore_lr_rsme}, MAE: {singapore_lr_mae}, R2: {singapore_lr_r2}")
print(f"XGBoost: RSME: {singapore_xg_rsme}, MAE: {singapore_xg_mae}, R2: {singapore_xg_r2}")


# NY
ny_rf_rsme, ny_rf_mae, ny_rf_r2 = calculate_metrics(ny_y_test_impv2, get_prediction(ny_X_train_impv2, ny_y_train_impv2, ny_X_test_impv2, rf_model))
ny_lr_rsme, ny_lr_mae, ny_lr_r2 = calculate_metrics(ny_y_test_impv2, get_prediction(ny_X_train_impv2, ny_y_train_impv2, ny_X_test_impv2, lr_model))
ny_xg_rsme, ny_xg_mae, ny_xg_r2 = calculate_metrics(ny_y_test_impv2, get_prediction(ny_X_train_impv2, ny_y_train_impv2, ny_X_test_impv2, xg_model))

print("\nNY:")
print(f"Random Forest Regressor: RSME: {ny_rf_rsme}, MAE: {ny_rf_mae}, R2: {ny_rf_r2}")
print(f"Linear Regression: RSME: {ny_lr_rsme}, MAE: {ny_lr_mae}, R2: {ny_lr_r2}")
print(f"XGBoost: RSME: {ny_xg_rsme}, MAE: {ny_xg_mae}, R2: {ny_xg_r2}")


# Madrid
madrid_rf_rsme, madrid_rf_mae, madrid_rf_r2 = calculate_metrics(madrid_y_test_impv2, get_prediction(madrid_X_train_impv2, madrid_y_train_impv2, madrid_X_test_impv2, rf_model))
madrid_lr_rsme, madrid_lr_mae, madrid_lr_r2 = calculate_metrics(madrid_y_test_impv2, get_prediction(madrid_X_train_impv2, madrid_y_train_impv2, madrid_X_test_impv2, lr_model)) 
madrid_xg_rsme, madrid_xg_mae, madrid_xg_r2 = calculate_metrics(madrid_y_test_impv2, get_prediction(madrid_X_train_impv2, madrid_y_train_impv2, madrid_X_test_impv2, xg_model))

print("\nMadrid:")
print(f"Random Forest Regressor: RSME: {madrid_rf_rsme}, MAE: {madrid_rf_mae}, R2: {madrid_rf_r2}")
print(f"Linear Regression: RSME: {madrid_lr_rsme}, MAE: {madrid_lr_mae}, R2: {madrid_lr_r2}")
print(f"XGBoost: RSME: {madrid_xg_rsme}, MAE: {madrid_xg_mae}, R2: {madrid_xg_r2}")

Random Forest Regressor: RSME: 48.681480306187915, MAE: 34.19176651982379, R2: 0.6814551647540397
Linear Regression: RSME: 62.80099645608832, MAE: 47.889643175887244, R2: 0.4698776850085957
XGBoost: RSME: 50.549207479409915, MAE: 35.967816227858286, R2: 0.6565435169510926

NY:
Random Forest Regressor: RSME: 45.12288192640786, MAE: 32.26574912891986, R2: 0.5640894870390847
Linear Regression: RSME: 47.39287216941735, MAE: 34.689270990939676, R2: 0.5191277203575221
XGBoost: RSME: 44.60755355080319, MAE: 31.924895409369164, R2: 0.573989312007531

Madrid:
Random Forest Regressor: RSME: 29.221823065166777, MAE: 20.605178584870153, R2: 0.4540244030874403
Linear Regression: RSME: 31.83106885283368, MAE: 23.176858894473735, R2: 0.3521699902955431
XGBoost: RSME: 29.47115207234667, MAE: 20.98490497399238, R2: 0.4446678141466156


### HyperParameter Tuning using Randomized Search CV

In [34]:
# Define hyperparameters for each model
rf_params = {
    'model__n_estimators': [int(x) for x in np.linspace(100, 1000, num=20)],
    'model__max_features': ['log2', 'sqrt'],
    'model__max_depth': [int(x) for x in np.linspace(10, 150, num=11)],
    'model__min_samples_split': [2, 5, 10, 20],
    'model__min_samples_leaf': [1, 2, 4, 10],
    'model__bootstrap': [True, False],
}

lr_params = {
    'model__fit_intercept': [True, False],
}

xgb_params = {
    'model__n_estimators': [100, 200],
    'model__learning_rate': [0.01, 0.1, 0.2, 0.3],
    'model__max_depth': [3, 4, 5],
    'model__min_child_weight': [1, 2, 3],
    'model__gamma': [0, 0.1, 0.2],
    'model__subsample': [0.8, 0.9, 1.0],
    'model__colsample_bytree': [0.8, 0.9, 1.0],

    # 'learning_rate': [0.05, 0.1, 0.15, 0.2],
    # 'n_estimators': [100, 200, 300, 400, 500],
    # 'max_depth': [3, 4, 5, 6],
    # 'subsample': [0.7, 0.8, 0.9, 1.0],
    # 'colsample_bytree': [0.5, 0.6, 0.7, 0.8],
    # 'reg_alpha': [0, 0.1, 0.2, 0.3, 0.5],
    # 'reg_lambda': [0, 0.1, 0.2, 0.3, 0.5],
    # 'gamma': [0, 1, 2],
}

# Perform hyperparameter search with the subsets
def get_hyperparameter_search(model, params):
    rnd_search_cv = RandomizedSearchCV(
        model, params, cv=3, n_iter=10, random_state=42, n_jobs=1, scoring='neg_mean_squared_error'
    )
    # rnd_search_cv.fit(X_train, y_train)
    return rnd_search_cv

rf_search_model = get_hyperparameter_search(rf_model, rf_params)
lr_search_model = get_hyperparameter_search(lr_model, lr_params)
xgb_search_model = get_hyperparameter_search(xg_model, xgb_params)

def get_prediction_cv(X_train, y_train, X_test, model):
    model.fit(X_train, y_train)    
    best_model = model.best_estimator_
    y_pred = best_model.predict(X_test)
    return y_pred


##### Baseline Preprocessing

In [None]:
# Split the data into training and testing sets (Reseting the data)
singapore_X_train, singapore_X_test, singapore_y_train, singapore_y_test = train_test_split(singapore_X, singapore_y, test_size=0.3, random_state=42)
ny_X_train, ny_X_test, ny_y_train, ny_y_test = train_test_split(ny_X, ny_y, test_size=0.3, random_state=42)
madrid_X_train, madrid_X_test, madrid_y_train, madrid_y_test = train_test_split(madrid_X, madrid_y, test_size=0.3, random_state=42)

In [33]:
# Calculate metrics for each city
# Singapore
singapore_rf_rsme, singapore_rf_mae, singapore_rf_r2 = calculate_metrics(singapore_y_test, get_prediction_cv(singapore_X_train, singapore_y_train, singapore_X_test, rf_search_model))
singapore_lr_rsme, singapore_lr_mae, singapore_lr_r2 = calculate_metrics(singapore_y_test, get_prediction_cv(singapore_X_train, singapore_y_train, singapore_X_test, lr_search_model))
singapore_xgb_rsme, singapore_xgb_mae, singapore_xgb_r2 = calculate_metrics(singapore_y_test, get_prediction_cv(singapore_X_train, singapore_y_train, singapore_X_test, xgb_search_model))

print("Singapore")
print(f'Random Forest Regressor RMSE: {singapore_rf_rsme}, MAE: {singapore_rf_mae}', f'R2: {singapore_rf_r2}')
print(f'Linear Regression RMSE: {singapore_lr_rsme}, MAE: {singapore_lr_mae}', f'R2: {singapore_lr_r2}')
print(f'XGBoost RMSE: {singapore_xgb_rsme}, MAE: {singapore_xgb_mae}', f'R2: {singapore_xgb_r2}')


# New York
ny_rf_rsme, ny_rf_mae, ny_rf_r2 = calculate_metrics(ny_y_test, get_prediction_cv(ny_X_train, ny_y_train, ny_X_test, rf_search_model))
ny_lr_rsme, ny_lr_mae, ny_lr_r2 = calculate_metrics(ny_y_test, get_prediction_cv(ny_X_train, ny_y_train, ny_X_test, lr_search_model))
ny_xgb_rsme, ny_xgb_mae, ny_xgb_r2 = calculate_metrics(ny_y_test, get_prediction_cv(ny_X_train, ny_y_train, ny_X_test, xgb_search_model))

print("\nNew York")
print(f'Random Forest Regressor RMSE: {ny_rf_rsme}, MAE: {ny_rf_mae}', f'R2: {ny_rf_r2}')
print(f'Linear Regression RMSE: {ny_lr_rsme}, MAE: {ny_lr_mae}', f'R2: {ny_lr_r2}')
print(f'XGBoost RMSE: {ny_xgb_rsme}, MAE: {ny_xgb_mae}', f'R2: {ny_xgb_r2}')


# Madrid
madrid_rf_rsme, madrid_rf_mae, madrid_rf_r2 = calculate_metrics(madrid_y_test, get_prediction_cv(madrid_X_train, madrid_y_train, madrid_X_test, rf_search_model))
madrid_lr_rsme, madrid_lr_mae, madrid_lr_r2 = calculate_metrics(madrid_y_test, get_prediction_cv(madrid_X_train, madrid_y_train, madrid_X_test, lr_search_model))
madrid_xgb_rsme, madrid_xgb_mae, madrid_xgb_r2 = calculate_metrics(madrid_y_test, get_prediction_cv(madrid_X_train, madrid_y_train, madrid_X_test, xgb_search_model))

print("\nMadrid")
print(f'Random Forest Regressor RMSE: {madrid_rf_rsme}, MAE: {madrid_rf_mae}', f'R2: {madrid_rf_r2}')
print(f'Linear Regression RMSE: {madrid_lr_rsme}, MAE: {madrid_lr_mae}', f'R2: {madrid_lr_r2}')
print(f'XGBoost RMSE: {madrid_xgb_rsme}, MAE: {madrid_xgb_mae}', f'R2: {madrid_xgb_r2}')

Singapore
Random Forest Regressor RMSE: 246.08965824767526, MAE: 68.17996125888013 R2: 0.05659622658255847
Linear Regression RMSE: 248.57254193395678, MAE: 84.68295482084284 R2: 0.03746353888838383
XGBoost RMSE: 257.23205647667146, MAE: 91.20315994153383 R2: -0.03076831979388106

New York
Random Forest Regressor RMSE: 176.57299307201666, MAE: 61.25313073093433 R2: 0.2280242569672275
Linear Regression RMSE: 185.19971902117396, MAE: 69.23408641346924 R2: 0.1507496323467501
XGBoost RMSE: 181.8530788922694, MAE: 65.38270111260819 R2: 0.18116497054138825

Madrid
Random Forest Regressor RMSE: 500.5011902922965, MAE: 108.90991909973759 R2: 0.06398655721753788
Linear Regression RMSE: 514.3537262176827, MAE: 124.29116131753781 R2: 0.011456835285585054
XGBoost RMSE: 506.9700500778107, MAE: 113.60439450506689 R2: 0.03963469084645865


##### Improved Preprocessing V1

In [36]:
# Split the data into training and testing sets (Reseting the data)
singapore_X_train_imp, singapore_X_test_imp, singapore_y_train_imp, singapore_y_test_imp = train_test_split(singapore_X_improved, singapore_y_improved, test_size=0.3, random_state=42)
ny_X_train_imp, ny_X_test_imp, ny_y_train_imp, ny_y_test_imp = train_test_split(ny_X_improved, ny_y_improved, test_size=0.3, random_state=42)
madrid_X_train_imp, madrid_X_test_imp, madrid_y_train_imp, madrid_y_test_imp = train_test_split(madrid_X_improved, madrid_y_improved, test_size=0.3, random_state=42)

In [38]:
singapore_rf_y_pred = undo_log_transform(get_prediction_cv(singapore_X_train_imp, singapore_y_train_imp.ravel(), singapore_X_test_imp, rf_search_model))
singapore_lr_y_pred = undo_log_transform(get_prediction_cv(singapore_X_train_imp, singapore_y_train_imp.ravel(), singapore_X_test_imp, lr_search_model))
singapore_xg_y_pred = undo_log_transform(get_prediction_cv(singapore_X_train_imp, singapore_y_train_imp.ravel(), singapore_X_test_imp, xgb_search_model))

ny_rf_y_pred = undo_log_transform(get_prediction_cv(ny_X_train_imp, ny_y_train_imp.ravel(), ny_X_test_imp, rf_search_model))
ny_lr_y_pred = undo_log_transform(get_prediction_cv(ny_X_train_imp, ny_y_train_imp.ravel(), ny_X_test_imp, lr_search_model))
ny_xg_y_pred = undo_log_transform(get_prediction_cv(ny_X_train_imp, ny_y_train_imp.ravel(), ny_X_test_imp, xgb_search_model))

madrid_rf_y_pred = undo_log_transform(get_prediction_cv(madrid_X_train_imp, madrid_y_train_imp.ravel(), madrid_X_test_imp, rf_search_model))
madrid_lr_y_pred = undo_log_transform(get_prediction_cv(madrid_X_train_imp, madrid_y_train_imp.ravel(), madrid_X_test_imp, lr_search_model))
madrid_xg_y_pred = undo_log_transform(get_prediction_cv(madrid_X_train_imp, madrid_y_train_imp.ravel(), madrid_X_test_imp, xgb_search_model))

singapore_y_test_imp = undo_log_transform(singapore_y_test_imp)
ny_y_test_imp = undo_log_transform(ny_y_test_imp)
madrid_y_test_imp = undo_log_transform(madrid_y_test_imp)


singapore_rf_rsme, singapore_rf_mae, singapore_rf_r2 = calculate_metrics(singapore_y_test_imp, singapore_rf_y_pred)
singapore_lr_rsme, singapore_lr_mae, singapore_lr_r2 = calculate_metrics(singapore_y_test_imp, singapore_lr_y_pred)
singapore_xg_rsme, singapore_xg_mae, singapore_xg_r2 = calculate_metrics(singapore_y_test_imp, singapore_xg_y_pred)

ny_rf_rsme, ny_rf_mae, ny_rf_r2 = calculate_metrics(ny_y_test_imp, ny_rf_y_pred)
ny_lr_rsme, ny_lr_mae, ny_lr_r2 = calculate_metrics(ny_y_test_imp, ny_lr_y_pred)
ny_xg_rsme, ny_xg_mae, ny_xg_r2 = calculate_metrics(ny_y_test_imp, ny_xg_y_pred)

madrid_rf_rsme, madrid_rf_mae, madrid_rf_r2 = calculate_metrics(madrid_y_test_imp, madrid_rf_y_pred)
madrid_lr_rsme, madrid_lr_mae, madrid_lr_r2 = calculate_metrics(madrid_y_test_imp, madrid_lr_y_pred)
madrid_xg_rsme, madrid_xg_mae, madrid_xg_r2 = calculate_metrics(madrid_y_test_imp, madrid_xg_y_pred)

print("Singapore")
print(f'Random Forest Regressor RMSE: {singapore_rf_rsme}, MAE: {singapore_rf_mae}', f'R2: {singapore_rf_r2}')
print(f'Linear Regression RMSE: {singapore_lr_rsme}, MAE: {singapore_lr_mae}', f'R2: {singapore_lr_r2}')
print(f'XGBoost RMSE: {singapore_xg_rsme}, MAE: {singapore_xg_mae}', f'R2: {singapore_xg_r2}')

print("\nNew York")
print(f'Random Forest Regressor RMSE: {ny_rf_rsme}, MAE: {ny_rf_mae}', f'R2: {ny_rf_r2}')
print(f'Linear Regression RMSE: {ny_lr_rsme}, MAE: {ny_lr_mae}', f'R2: {ny_lr_r2}')
print(f'XGBoost RMSE: {ny_xg_rsme}, MAE: {ny_xg_mae}', f'R2: {ny_xg_r2}')

print("\nMadrid")
print(f'Random Forest Regressor RMSE: {madrid_rf_rsme}, MAE: {madrid_rf_mae}', f'R2: {madrid_rf_r2}')
print(f'Linear Regression RMSE: {madrid_lr_rsme}, MAE: {madrid_lr_mae}', f'R2: {madrid_lr_r2}')
print(f'XGBoost RMSE: {madrid_xg_rsme}, MAE: {madrid_xg_mae}', f'R2: {madrid_xg_r2}')

Singapore
Random Forest Regressor RMSE: 235.83956087368728, MAE: 55.180898776263554 R2: 0.13354861998563805
Linear Regression RMSE: 244.8365474435859, MAE: 69.4583453101915 R2: 0.06617955943481235
XGBoost RMSE: 237.4875220706376, MAE: 59.46888211021632 R2: 0.12139741858989728

New York
Random Forest Regressor RMSE: 180.39324999141974, MAE: 53.43768235877225 R2: 0.19425863015995704
Linear Regression RMSE: 185.96697071688146, MAE: 57.97005323605857 R2: 0.1436984499777504
XGBoost RMSE: 177.04481894793017, MAE: 53.98257291974429 R2: 0.22389310677835794

Madrid
Random Forest Regressor RMSE: 513.0329965908878, MAE: 84.38199018083628 R2: 0.016526972649531624
Linear Regression RMSE: 518.8935854896596, MAE: 90.53526544548583 R2: -0.006070606420164548
XGBoost RMSE: 510.29547380259703, MAE: 84.42998834519902 R2: 0.026994513315155344


##### Improved Preprocessing V2

In [81]:
# Split the data into training and testing sets
singapore_X_train_impv2, singapore_X_test_impv2, singapore_y_train_impv2, singapore_y_test_impv2 = train_test_split(singapore_X_improvedv2, singapore_y_improvedv2, test_size=0.3, random_state=42)
ny_X_train_impv2, ny_X_test_impv2, ny_y_train_impv2, ny_y_test_impv2 = train_test_split(ny_X_improvedv2, ny_y_improvedv2, test_size=0.3, random_state=42)
madrid_X_train_impv2, madrid_X_test_impv2, madrid_y_train_impv2, madrid_y_test_impv2 = train_test_split(madrid_X_improvedv2, madrid_y_improvedv2, test_size=0.3, random_state=42)

In [82]:
# Calculate metrics for each city
# Singapore
singapore_rf_rsme, singapore_rf_mae, singapore_rf_r2 = calculate_metrics(singapore_y_test_impv2, get_prediction_cv(singapore_X_train_impv2, singapore_y_train_impv2, singapore_X_test_impv2, rf_search_model))
singapore_lr_rsme, singapore_lr_mae, singapore_lr_r2 = calculate_metrics(singapore_y_test_impv2, get_prediction_cv(singapore_X_train_impv2, singapore_y_train_impv2, singapore_X_test_impv2, lr_search_model))
singapore_xgb_rsme, singapore_xgb_mae, singapore_xgb_r2 = calculate_metrics(singapore_y_test_impv2, get_prediction_cv(singapore_X_train_impv2, singapore_y_train_impv2, singapore_X_test_impv2, xgb_search_model))

print("Singapore")
print(f'Random Forest Regressor RMSE: {singapore_rf_rsme}, MAE: {singapore_rf_mae}', f'R2: {singapore_rf_r2}')
print(f'Linear Regression RMSE: {singapore_lr_rsme}, MAE: {singapore_lr_mae}', f'R2: {singapore_lr_r2}')
print(f'XGBoost RMSE: {singapore_xgb_rsme}, MAE: {singapore_xgb_mae}', f'R2: {singapore_xgb_r2}')


# New York
ny_rf_rsme, ny_rf_mae, ny_rf_r2 = calculate_metrics(ny_y_test_impv2, get_prediction_cv(ny_X_train_impv2, ny_y_train_impv2, ny_X_test_impv2, rf_search_model))
ny_lr_rsme, ny_lr_mae, ny_lr_r2 = calculate_metrics(ny_y_test_impv2, get_prediction_cv(ny_X_train_impv2, ny_y_train_impv2, ny_X_test_impv2, lr_search_model))
ny_xgb_rsme, ny_xgb_mae, ny_xgb_r2 = calculate_metrics(ny_y_test_impv2, get_prediction_cv(ny_X_train_impv2, ny_y_train_impv2, ny_X_test_impv2, xgb_search_model))

print("\nNew York")
print(f'Random Forest Regressor RMSE: {ny_rf_rsme}, MAE: {ny_rf_mae}', f'R2: {ny_rf_r2}')
print(f'Linear Regression RMSE: {ny_lr_rsme}, MAE: {ny_lr_mae}', f'R2: {ny_lr_r2}')
print(f'XGBoost RMSE: {ny_xgb_rsme}, MAE: {ny_xgb_mae}', f'R2: {ny_xgb_r2}')


# Madrid
madrid_rf_rsme, madrid_rf_mae, madrid_rf_r2 = calculate_metrics(madrid_y_test_impv2, get_prediction_cv(madrid_X_train_impv2, madrid_y_train_impv2, madrid_X_test_impv2, rf_search_model))
madrid_lr_rsme, madrid_lr_mae, madrid_lr_r2 = calculate_metrics(madrid_y_test_impv2, get_prediction_cv(madrid_X_train_impv2, madrid_y_train_impv2, madrid_X_test_impv2, lr_search_model))
madrid_xgb_rsme, madrid_xgb_mae, madrid_xgb_r2 = calculate_metrics(madrid_y_test_impv2, get_prediction_cv(madrid_X_train_impv2, madrid_y_train_impv2, madrid_X_test_impv2, xgb_search_model))

print("\nMadrid")
print(f'Random Forest Regressor RMSE: {madrid_rf_rsme}, MAE: {madrid_rf_mae}', f'R2: {madrid_rf_r2}')
print(f'Linear Regression RMSE: {madrid_lr_rsme}, MAE: {madrid_lr_mae}', f'R2: {madrid_lr_r2}')
print(f'XGBoost RMSE: {madrid_xgb_rsme}, MAE: {madrid_xgb_mae}', f'R2: {madrid_xgb_r2}')

Singapore
Random Forest Regressor RMSE: 49.0424508145053, MAE: 34.998454101096975 R2: 0.6767136657203048
Linear Regression RMSE: 62.79982824928116, MAE: 47.887331569708984 R2: 0.46989740720321016
XGBoost RMSE: 51.63484981411951, MAE: 36.935771177019845 R2: 0.6416323054794681

New York
Random Forest Regressor RMSE: 44.58907289095184, MAE: 32.03731061326221 R2: 0.574342226576025
Linear Regression RMSE: 47.3927223591988, MAE: 34.689129095299336 R2: 0.5191307604546569
XGBoost RMSE: 44.551839117174765, MAE: 32.01169000027764 R2: 0.5750528144839941

Madrid
Random Forest Regressor RMSE: 28.98099170728022, MAE: 20.64606061359219 R2: 0.4629866239177062
Linear Regression RMSE: 31.830699322029886, MAE: 23.17683368846895 R2: 0.352185031685183
XGBoost RMSE: 29.58741988150203, MAE: 21.304621242606466 R2: 0.4402774448889549
