In [43]:
import pandas as pd

# Render every column when a DataFrame is displayed (no column truncation).
# Only the column limit is changed; the row limit keeps its pandas default.
pd.options.display.max_columns = None

# Load the two prepared listing tables: the full one, and the variant whose
# file name indicates the amenity features were left out.
listings, listings_without_amenities = (
    pd.read_csv(csv_name)
    for csv_name in ('final_listings.csv', 'final_listings_without_amenities.csv')
)

In [44]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


# Feature matrices: every column except the target, one per feature set
X = listings.drop(columns=["price"])
X_without_amenities = listings_without_amenities.drop(columns=["price"])

# Target. This single `y` is paired with BOTH feature matrices, so the two
# tables must describe the same listings row for row.
y = listings["price"]

# Guard against silently pairing features with the wrong targets: if the two
# CSVs differ in length, the positional indexing used during cross-validation
# would either fail late or quietly evaluate against mismatched prices.
# NOTE(review): this checks row count only; identical row ORDER is assumed.
assert len(X_without_amenities) == len(y), (
    "final_listings_without_amenities.csv and final_listings.csv have "
    "different row counts; `y` cannot be shared between the two feature sets"
)


# Set up 10-fold cross-validation (shuffled with a fixed seed so that every
# model is evaluated on the same folds)
kf = KFold(n_splits=10, shuffle=True, random_state=42)


def perform_cross_validation(X, y, model, kf):
    """Cross-validate ``model`` and return its fold-averaged regression metrics.

    For every ``(train_index, test_index)`` pair produced by ``kf.split(X)``
    the model is fitted on the training rows and scored on the held-out rows.

    Parameters
    ----------
    X : pandas.DataFrame or pandas.Series
        Features, indexed positionally through ``.iloc``. A 1-D input (a
        single-feature Series) is promoted to a one-column DataFrame, because
        scikit-learn estimators reject 1-D feature input.
    y : pandas.Series
        Target values, one per row of ``X``; indexed positionally.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. The instance passed
        in is the one that gets fitted (once per fold).
    kf : splitter
        Object whose ``split(X)`` yields pairs of positional index arrays.

    Returns
    -------
    tuple
        ``(mean MSE, mean MAE, mean RMSE, mean R²)`` over the folds. RMSE is
        computed per fold and then averaged, so it is generally not equal to
        the square root of the returned mean MSE.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` do not have the same number of rows.
    """
    # A Series has ndim == 1; estimators need a 2-D (rows x features) input.
    if getattr(X, "ndim", 2) == 1:
        X = X.to_frame()

    # Folds are derived from X alone, so a longer/shorter y would otherwise be
    # sliced with indices that do not describe it.
    if len(X) != len(y):
        raise ValueError(
            f"X and y must have the same number of rows (got {len(X)} and {len(y)})"
        )

    fold_metrics = {"mse": [], "mae": [], "rmse": [], "r2": []}

    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Train on this fold's training rows, predict the held-out rows
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Score the fold
        mse = mean_squared_error(y_test, y_pred)
        fold_metrics["mse"].append(mse)
        fold_metrics["mae"].append(mean_absolute_error(y_test, y_pred))
        fold_metrics["rmse"].append(np.sqrt(mse))
        fold_metrics["r2"].append(r2_score(y_test, y_pred))

    # Average each metric across folds, in the order callers unpack them
    return tuple(np.mean(fold_metrics[key]) for key in ("mse", "mae", "rmse", "r2"))

<h3>Linear Regression</h3>

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares baseline with scikit-learn's default settings
lin_reg = LinearRegression()

# Cross-validate the same estimator on each feature set and print the
# fold-averaged metrics under the matching heading.
for heading, features in (
    ("With amenities:", X),
    ("Without amenities:", X_without_amenities),
):
    mse, mae, rmse, r2 = perform_cross_validation(features, y, lin_reg, kf)
    print(heading)
    print("Average MSE:", mse)
    print("Average MAE:", mae)
    print("Average RMSE:", rmse)
    print("Average R²:", r2)

<h3>K-Nearest Neighbors</h3>

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# k-nearest-neighbours regressor with scikit-learn's default settings
knn = KNeighborsRegressor()

# Feature sets to evaluate, keyed by the heading printed above their metrics
knn_feature_sets = {
    "With amenities:": X,
    "Without amenities:": X_without_amenities,
}

for heading, features in knn_feature_sets.items():
    mse, mae, rmse, r2 = perform_cross_validation(features, y, knn, kf)
    print(heading)
    print("Average MSE:", mse)
    print("Average MAE:", mae)
    print("Average RMSE:", rmse)
    print("Average R²:", r2)

<h3>Decision Tree Regression</h3>

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Single decision tree; seeded so repeated runs build the same tree
tree_model = DecisionTreeRegressor(random_state=42)

# Evaluate on the full feature set first, then on the set without amenities
headings = ("With amenities:", "Without amenities:")
feature_sets = (X, X_without_amenities)

for heading, features in zip(headings, feature_sets):
    mse, mae, rmse, r2 = perform_cross_validation(features, y, tree_model, kf)
    print(heading)
    print("Average MSE:", mse)
    print("Average MAE:", mae)
    print("Average RMSE:", rmse)
    print("Average R²:", r2)

<h3>Random Forest</h3>

In [47]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with default hyper-parameters. The seed is fixed (matching the
# KFold splitter and the decision tree elsewhere in this notebook) because the
# forest is randomised: without it, the reported metrics change on every run.
random_forest_model = RandomForestRegressor(random_state=42)

# Cross-validate on each feature set and print the fold-averaged metrics
for heading, features in (
    ("With amenities:", X),
    ("Without amenities:", X_without_amenities),
):
    mse, mae, rmse, r2 = perform_cross_validation(features, y, random_forest_model, kf)
    print(heading)
    print("Average MSE:", mse)
    print("Average MAE:", mae)
    print("Average RMSE:", rmse)
    print("Average R²:", r2)

With amenities:
Average MSE: 1138988792.4171472
Average MAE: 444.3240188754503
Average RMSE: 18174.248909317932
Average R²: 0.9417262797429787
Without amenities:
Average MSE: 843865556.1907896
Average MAE: 398.6161445386202
Average RMSE: 16086.200866245705
Average R²: 0.9392264790272611


<h3>XGBoost</h3>

In [46]:
import xgboost as xgb

# Gradient-boosted trees for regression (squared-error objective)
xgb_params = dict(
    objective='reg:squarederror',
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
)
xg_reg = xgb.XGBRegressor(**xgb_params)

# Cross-validate on each feature set and print the fold-averaged metrics
for heading, features in [
    ("With amenities:", X),
    ("Without amenities:", X_without_amenities),
]:
    mse, mae, rmse, r2 = perform_cross_validation(features, y, xg_reg, kf)
    print(heading)
    print("Average MSE:", mse)
    print("Average MAE:", mae)
    print("Average RMSE:", rmse)
    print("Average R²:", r2)

With amenities:
Average MSE: 493530565.78459656
Average MAE: 163.69514943400245
Average RMSE: 7097.717452770211
Average R²: 0.8987949509186001
Without amenities:
Average MSE: 493530565.7176037
Average MAE: 163.69632969940045
Average RMSE: 7097.716929151854
Average R²: 0.8997578276035945


<h3>Sentiment only</h3>

In [None]:
# Select the sentiment column with double brackets so the result is a
# one-column DataFrame (2-D). It is used as the feature matrix for the
# sentiment-only models, and scikit-learn estimators raise a ValueError when
# given a 1-D Series as X.
sentiment = listings[['sentiment']]

sentiment

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Set up 10-fold cross-validation (same configuration as the earlier cells)
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Feature matrix holding only the sentiment column. Double brackets keep it
# 2-D (one-column DataFrame): scikit-learn estimators raise a ValueError when
# fitted on a 1-D Series.
sentiment_features = listings[["sentiment"]]

linear_model = LinearRegression()
knn_model = KNeighborsRegressor()
# Seeded for consistency with the decision tree used on the full feature sets
tree_model = DecisionTreeRegressor(random_state=42)

# Cross-validate each estimator on the sentiment feature alone and print the
# fold-averaged metrics under the estimator's heading.
for heading, estimator in (
    ("Linear Regression:", linear_model),
    ("KNN:", knn_model),
    ("Decision Tree:", tree_model),
):
    mse, mae, rmse, r2 = perform_cross_validation(sentiment_features, y, estimator, kf)
    print(heading)
    print("Average MSE:", mse)
    print("Average MAE:", mae)
    print("Average RMSE:", rmse)
    print("Average R²:", r2)