In [2]:
import pandas as pd

# Read the three city listing files, one DataFrame per city
madrid_df, ny_df, singapore_df = map(pd.read_csv, (
    'datasets/madrid_listings.csv',
    'datasets/newyorkcity_listings.csv',
    'datasets/singapore_listings.csv',
))

# Peek at the top rows of every frame to get a feel for the columns
madrid_head, ny_head, singapore_head = (
    listings.head() for listings in (madrid_df, ny_df, singapore_df)
)

(madrid_head, ny_head, singapore_head)

(      id                                      name    host_id host_name  \
 0   6369  Rooftop terrace room ,  ensuite bathroom      13660     Simon   
 1  21853                      Bright and airy room      83531     Abdel   
 2  23001         Apartmento Arganzuela- Madrid Rio      82175     Jesus   
 3  24805                    Gran Via Studio Madrid  346366726         A   
 4  26825        Single Room whith private Bathroom     114340  Agustina   
 
   neighbourhood_group   neighbourhood  latitude  longitude        room_type  \
 0           Chamartín  Hispanoamérica  40.45724   -3.67688     Private room   
 1              Latina        Cármenes  40.40381   -3.74130     Private room   
 2          Arganzuela         Legazpi  40.38840   -3.69511  Entire home/apt   
 3              Centro     Universidad  40.42183   -3.70529  Entire home/apt   
 4          Arganzuela         Legazpi  40.38975   -3.69018     Private room   
 
    price  minimum_nights  number_of_reviews last_review  re

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Function to preprocess data, train and evaluate the model
def preprocess_train_evaluate(df):
    """Prepare a listings DataFrame for price modelling.

    Despite its name, this function does not train or evaluate anything: it
    builds an unfitted ColumnTransformer and splits the frame into predictor
    columns and the 'price' target.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'neighbourhood', 'latitude', 'longitude',
        'room_type', 'price', 'minimum_nights' and 'number_of_reviews'.
        Any other columns are ignored.

    Returns
    -------
    tuple
        (preprocessor, X, y) where `preprocessor` is the unfitted
        ColumnTransformer, `X` holds the selected columns without 'price',
        and `y` is the 'price' column. Rows with a missing value in any of
        the selected columns are excluded from both `X` and `y`.
    """
    numeric_features = ['latitude', 'longitude', 'minimum_nights', 'number_of_reviews']
    categorical_features = ['neighbourhood', 'room_type']

    # Restrict to the modelling columns and keep only fully populated rows
    modelling_columns = ['neighbourhood', 'latitude', 'longitude', 'room_type',
                         'price', 'minimum_nights', 'number_of_reviews']
    complete_rows = df[modelling_columns].dropna()

    # Numeric columns: median imputation followed by standardisation
    median_then_scale = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ])

    # Categorical columns: fill gaps with the literal 'missing', then one-hot
    # encode; categories unseen during fit are ignored at transform time
    fill_then_onehot = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ])

    # Route each column group through its own sub-pipeline
    column_routes = [
        ('num', median_then_scale, numeric_features),
        ('cat', fill_then_onehot, categorical_features),
    ]
    preprocessor = ColumnTransformer(transformers=column_routes)

    predictors = complete_rows.drop(columns='price')
    target = complete_rows['price']
    return preprocessor, predictors, target
    
# Build the (unfitted) preprocessor plus the feature matrix and price target
# for the Madrid listings
preprocessor, X, y = preprocess_train_evaluate(madrid_df)

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Creating and training the Random Forest model; preprocessing is part of the
# pipeline so it is fitted on the training rows only
rf_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))])
rf_pipeline.fit(X_train, y_train)

# Evaluating the model on the held-out rows
y_pred = rf_pipeline.predict(X_test)
# RMSE is taken as sqrt(MSE) instead of passing `squared=False`: that keyword
# was deprecated in scikit-learn 1.4 and removed in 1.6, where it raises a
# TypeError. The square-root form works on every version.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
mae = mean_absolute_error(y_test, y_pred)

(rmse, mae)

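# NOTE(review): the commented-out lines below appear to be leftovers from a
# version in which preprocess_train_evaluate trained the model and returned
# the metrics. As currently defined it returns (preprocessor, X, y), so
# re-enabling the `results[...]` lines as-is would store those tuples rather
# than (rmse, mae) -- confirm the intended design before uncommenting.
# return rmse, mae

# Running the process for each dataset
# results = {}
# results['Madrid'] = preprocess_train_evaluate(madrid_df)
# results['New York City'] = preprocess_train_evaluate(ny_df)
# results['Singapore'] = preprocess_train_evaluate(singapore_df)

# results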


(557.4462802392061, 125.42886124238862)