In [46]:
import pandas as pd

# Lift pandas' column-display cap so wide frames are rendered with every column.
pd.options.display.max_columns = None

# Read the two listing tables from CSV files in the working directory.
# (The second file is presumably the same data minus amenity columns — verify.)
listings = pd.read_csv("final_listings.csv")
listings_without_amenities = pd.read_csv("final_listings_without_amenities.csv")

In [47]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


# Separate features and target. Both feature tables drop the "price" column;
# the target itself is taken from the full listings table only.
X = listings.drop(columns=["price"])
X_without_amenities = listings_without_amenities.drop(columns=["price"])

y = listings["price"]

# y is reused as the target for X_without_amenities, and cross-validation pairs
# rows purely by position, so the two tables must at least have the same length.
# Without this check a shorter table would be silently matched to wrong prices.
assert len(X_without_amenities) == len(y), (
    "final_listings_without_amenities.csv has "
    f"{len(X_without_amenities)} rows but the target has {len(y)}"
)
# NOTE(review): equal length does not prove identical row order — confirm both
# CSV files were exported from the same frame in the same order.


# Set up 10-fold cross-validation; shuffling is seeded so folds are repeatable
kf = KFold(n_splits=10, shuffle=True, random_state=42)


def perform_cross_validation(X, y, model, kf):
    """Score ``model`` on every split produced by ``kf`` and average the metrics.

    Parameters
    ----------
    X : pandas DataFrame or Series
        Feature data, sliced positionally with ``.iloc``. One-dimensional
        input is reshaped into a single-column 2D array before fitting.
    y : pandas Series
        Target values, sliced positionally with ``.iloc``.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. ``fit`` is called on
        this same object for every fold, so after the call it is left fitted
        on the last training split.
    kf : splitter
        Object whose ``split(X)`` yields ``(train_index, test_index)`` pairs.

    Returns
    -------
    tuple
        ``(mean MSE, mean MAE, mean RMSE, mean R²)`` taken over all folds.
    """
    def as_2d(part):
        # A single selected column arrives as 1D; turn it into one 2D column.
        values = part.values
        return values.reshape(-1, 1) if part.ndim == 1 else values

    fold_metrics = {"mse": [], "mae": [], "rmse": [], "r2": []}

    for fit_rows, eval_rows in kf.split(X):
        features_fit = as_2d(X.iloc[fit_rows])
        features_eval = as_2d(X.iloc[eval_rows])
        target_fit = y.iloc[fit_rows]
        target_eval = y.iloc[eval_rows]

        # Fit on the training part of the fold, predict the held-out part.
        model.fit(features_fit, target_fit)
        predicted = model.predict(features_eval)

        # RMSE is derived from this fold's MSE, so compute MSE once.
        fold_mse = mean_squared_error(target_eval, predicted)
        fold_metrics["mse"].append(fold_mse)
        fold_metrics["mae"].append(mean_absolute_error(target_eval, predicted))
        fold_metrics["rmse"].append(np.sqrt(fold_mse))
        fold_metrics["r2"].append(r2_score(target_eval, predicted))

    # Average each metric across folds, in the order callers unpack them.
    return tuple(np.mean(fold_metrics[key]) for key in ("mse", "mae", "rmse", "r2"))

<h3>Linear Regression</h3>

In [None]:
from sklearn.linear_model import LinearRegression

# Linear regression with default settings, cross-validated on both feature sets.
lin_reg = LinearRegression()

for heading, feature_set in (
    ("With amenities:", X),
    ("Without amenities:", X_without_amenities),
):
    # Same target, model and splitter for both runs; only the features differ.
    mse, mae, rmse, r2 = perform_cross_validation(feature_set, y, lin_reg, kf)
    print(heading)
    print("Average MSE:", mse)
    print("Average MAE:", mae)
    print("Average RMSE:", rmse)
    print("Average R²:", r2)

In [None]:
from sklearn.linear_model import Lasso

# Lasso regression with default settings, cross-validated on both feature sets.
lasso_model = Lasso()

for heading, feature_set in (
    ("With amenities:", X),
    ("Without amenities:", X_without_amenities),
):
    # Same target, model and splitter for both runs; only the features differ.
    mse, mae, rmse, r2 = perform_cross_validation(feature_set, y, lasso_model, kf)
    print(heading)
    print("Average MSE:", mse)
    print("Average MAE:", mae)
    print("Average RMSE:", rmse)
    print("Average R²:", r2)

In [None]:
from sklearn.linear_model import Ridge

# Ridge regression with regularisation strength alpha=0.1, cross-validated on
# both feature sets.
ridge_model = Ridge(alpha=0.1)

for heading, feature_set in (
    ("With amenities:", X),
    ("Without amenities:", X_without_amenities),
):
    # Same target, model and splitter for both runs; only the features differ.
    mse, mae, rmse, r2 = perform_cross_validation(feature_set, y, ridge_model, kf)
    print(heading)
    print("Average MSE:", mse)
    print("Average MAE:", mae)
    print("Average RMSE:", rmse)
    print("Average R²:", r2)

<h3>K-Nearest Neighbors</h3>

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# k-nearest-neighbours regressor with default settings, cross-validated on both
# feature sets.
knn = KNeighborsRegressor()

for heading, feature_set in (
    ("With amenities:", X),
    ("Without amenities:", X_without_amenities),
):
    # Same target, model and splitter for both runs; only the features differ.
    mse, mae, rmse, r2 = perform_cross_validation(feature_set, y, knn, kf)
    print(heading)
    print("Average MSE:", mse)
    print("Average MAE:", mae)
    print("Average RMSE:", rmse)
    print("Average R²:", r2)

<h3>Support Vector Regression</h3>

In [None]:
from sklearn.svm import SVR

# Support vector regressor with default settings (the default kernel is RBF),
# cross-validated on both feature sets.
svr_model = SVR()

for heading, feature_set in (
    ("With amenities:", X),
    ("Without amenities:", X_without_amenities),
):
    # Same target, model and splitter for both runs; only the features differ.
    mse, mae, rmse, r2 = perform_cross_validation(feature_set, y, svr_model, kf)
    print(heading)
    print("Average MSE:", mse)
    print("Average MAE:", mae)
    print("Average RMSE:", rmse)
    print("Average R²:", r2)

<h3>Decision Tree Regression</h3>

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Decision tree regressor, seeded so repeated runs build the same trees;
# cross-validated on both feature sets.
tree_model = DecisionTreeRegressor(random_state=42)

for heading, feature_set in (
    ("With amenities:", X),
    ("Without amenities:", X_without_amenities),
):
    # Same target, model and splitter for both runs; only the features differ.
    mse, mae, rmse, r2 = perform_cross_validation(feature_set, y, tree_model, kf)
    print(heading)
    print("Average MSE:", mse)
    print("Average MAE:", mae)
    print("Average RMSE:", rmse)
    print("Average R²:", r2)

<h3>Random Forest</h3>

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Define the Random Forest regressor. The forest is built with randomness
# (bootstrap samples, feature sub-sampling), so it is seeded with the same
# value as the splitter and the decision tree to make the metrics repeatable.
random_forest_model = RandomForestRegressor(random_state=42)

# Perform cross-validation on X with amenities
mse, mae, rmse, r2 = perform_cross_validation(X, y, random_forest_model, kf)
print("With amenities:")
print("Average MSE:", mse)
print("Average MAE:", mae)
print("Average RMSE:", rmse)
print("Average R²:", r2)

# Perform cross-validation on X without amenities
mse, mae, rmse, r2 = perform_cross_validation(X_without_amenities, y, random_forest_model, kf)
print("Without amenities:")
print("Average MSE:", mse)
print("Average MAE:", mae)
print("Average RMSE:", rmse)
print("Average R²:", r2)

<h3>Sentiment only</h3>

In [48]:
# Keep only the sentiment column (a 1D Series) as the sole predictor.
sentiment = listings["sentiment"]

# Bare last expression: the notebook renders the Series as the cell output.
sentiment

0        0.161335
1       -1.162224
2       -0.318872
3        0.253010
4       -0.337820
           ...   
18920   -1.324777
18921   -1.593892
18922   -1.594129
18923   -1.171138
18924   -1.231639
Name: sentiment, Length: 18925, dtype: float64

In [49]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Set up 10-fold cross-validation (same configuration as the splitter defined
# earlier in the notebook, repeated so this cell states its own setup)
kf = KFold(n_splits=10, shuffle=True, random_state=42)

linear_model = LinearRegression()
knn_model = KNeighborsRegressor()
# This rebinds tree_model; seed it like the tree in the Decision Tree section so
# the name does not silently switch to an unseeded estimator.
tree_model = DecisionTreeRegressor(random_state=42)

# Evaluate each model with sentiment as the only feature. sentiment is a 1D
# Series; perform_cross_validation reshapes it into a single 2D column.
# NOTE(review): the output saved with this cell shows strongly negative average
# R² for all three models, i.e. worse than predicting the mean price — check
# the price column for extreme outliers before drawing conclusions.
for heading, single_feature_model in (
    ("Linear Regression:", linear_model),
    ("KNN:", knn_model),
    ("Decision Tree:", tree_model),
):
    mse, mae, rmse, r2 = perform_cross_validation(sentiment, y, single_feature_model, kf)
    print(heading)
    print("Average MSE:", mse)
    print("Average MAE:", mae)
    print("Average RMSE:", rmse)
    print("Average R²:", r2)

Linear Regression:
Average MSE: 7743460828407.654
Average MAE: 32963.98953672395
Average RMSE: 1071717.7779387769
Average R²: -22642713.19998171
KNN:
Average MSE: 81537387997.47916
Average MAE: 12873.860657537463
Average RMSE: 273398.04763589014
Average R²: -7736186722.226771
Decision Tree:
Average MSE: 125976224900.007
Average MAE: 11839.220767547127
Average RMSE: 328191.5372687145
Average R²: -37589421308.28575
