In [13]:
import pandas as pd
import plotly.express as px
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import plotly.graph_objects as go

# Load the dataset

In [14]:

# Location of the CSV file, given relative to the current working directory.
file_path = './coffee shop.csv'
# Read the entire file into one DataFrame.
data = pd.read_csv(filepath_or_buffer=file_path)

# Preprocess the data

In [15]:
# The 'Service Rating' column is the value the model will predict ...
target = data['Service Rating']
# ... and every other column of the frame is kept as a model input.
features = data.drop(labels=['Service Rating'], axis='columns')

# One-hot encode categorical variables

In [16]:
# Replace categorical columns with indicator columns; drop_first=True leaves out
# the first level of each encoded column.
features_encoded = pd.get_dummies(data=features, drop_first=True)

In [17]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features_encoded,
    target,
    test_size=0.2,
    random_state=42,
)

In [18]:
# Create a Random Forest regressor with a fixed seed, then fit it on the
# training split only.
random_forest_model = RandomForestRegressor(random_state=42)
random_forest_model.fit(X=X_train, y=y_train)

In [19]:
# Predict the target for the held-out rows with the fitted forest.
rf_predictions = random_forest_model.predict(X=X_test)

# Score those predictions against the true test targets:
# coefficient of determination (R^2) and mean squared error.
rf_r2 = r2_score(y_true=y_test, y_pred=rf_predictions)
rf_mse = mean_squared_error(y_true=y_test, y_pred=rf_predictions)

In [20]:
# 5-fold cross-validation of the Random Forest over the full encoded dataset.
# The scorer is negated MSE, so the returned values are <= 0.
cv_scores = cross_val_score(
    estimator=random_forest_model,
    X=features_encoded,
    y=target,
    scoring='neg_mean_squared_error',
    cv=5,
)

In [21]:
# The scores were produced with 'neg_mean_squared_error'; flip the sign to get
# the ordinary (positive) MSE of each fold.
cv_mse_scores = -1 * cv_scores

# Summarise the per-fold MSE: spread across folds and average.
cv_mse_std = cv_mse_scores.std()
cv_mse_mean = cv_mse_scores.mean()

In [22]:
# Candidate hyper-parameter values for the Random Forest.
# 3 * 4 * 3 * 3 = 108 combinations in total.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive search over the grid: 5-fold CV per combination, scored by negated
# MSE, using all available cores (n_jobs=-1) and verbose progress logging.
grid_search = GridSearchCV(
    estimator=random_forest_model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=2,
)

In [23]:
# Run the grid search.
# NOTE(review): the search is fitted on the full encoded dataset, not only the
# training split, so X_test / y_test are not unseen data for the tuned model.
grid_search.fit(features_encoded, target)

# Best hyper-parameter combination and its cross-validated MSE
# (best_score_ is negated MSE because of the scorer, hence the sign flip).
best_params = grid_search.best_params_
best_score = -grid_search.best_score_

# Actual vs. predicted scatter plot. The points are the test-set predictions of
# the baseline `random_forest_model` (rf_predictions), not of the tuned model.
fig = go.Figure()
fig.add_trace(go.Scatter(x=y_test, y=rf_predictions, mode='markers', name='Predictions'))

# Dashed red diagonal marking where predicted == actual.
fig.add_trace(go.Scatter(x=[y_test.min(), y_test.max()], y=[y_test.min(), y_test.max()], mode='lines', name='Perfect Fit', line=dict(color='red', dash='dash')))

# Give the figure a title and axis labels so it can be read on its own.
# update_layout returns the figure, so keeping it as the cell's last expression
# still renders the plot. Inline rendering needs the `nbformat` package
# (>= 4.2.0) installed in the kernel environment.
fig.update_layout(
    title='Random Forest: Actual vs Predicted Service Rating',
    xaxis_title='Actual Service Rating',
    yaxis_title='Predicted Service Rating',
)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [25]:
import joblib

# Persist the fitted baseline forest to a file in the working directory.
joblib.dump(value=random_forest_model, filename='random_forest_model.joblib')

# Read the model back from that file. joblib files are pickle-based, so only
# load files that come from a trusted source.
loaded_model = joblib.load(filename='random_forest_model.joblib')