In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, Lasso 
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
import ast
import re
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as mcolors
from adjustText import adjust_text
from sklearn.dummy import DummyRegressor


In [None]:
# Read the listings table used by the cells below (review-score and sentiment
# columns) and let pandas choose nullable extension dtypes for every column.
LISTINGS_CSV = './Data/cleaned_with_sentiment_scores_gt20_fuzzy.csv'
df = pd.read_csv(LISTINGS_CSV).convert_dtypes()


In [None]:
# Review-score and sentiment columns inspected by the zero filter below.
columns_to_check = [
    'review_scores_rating',
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
    'median_sentiment_clean',
    'average_sentiment_clean',
    'median_sentiment_accurate',
    'average_sentiment_accurate',
    'median_sentiment_checkin',
    'average_sentiment_checkin',
    'median_sentiment_communication',
    'average_sentiment_communication',
    'median_sentiment_location',
    'average_sentiment_location',
    'median_sentiment_value',
    'average_sentiment_value',
    'median_sentiment_overall',
    'average_sentiment_overall',
]

# Keep a row only when none of the checked columns is exactly 0.
# NOTE(review): presumably 0 is a placeholder for "no score" - confirm.
has_no_zero = df[columns_to_check].ne(0).all(axis=1)
df = df[has_no_zero]
# df = df.dropna()

In [None]:
# Boolean Series (indexed by column name) marking which fields of the first
# row are missing.
na_columns = df.iloc[0].isna()

# Names of the columns that are missing in that first row.
[column for column, is_missing in na_columns.items() if is_missing]

In [None]:
# df = df[df['review_scores_rating'] > 4]

In [None]:
# plt.subplot(1, 1,1)
# plt.scatter(df[f'median_sentiment_overall'], df['review_scores_rating'], alpha=0.5)
# plt.title('Mean Overall Sentiment vs Overall Rating (>4)')
# plt.xlabel('Mean Sentiment')
# plt.ylabel('Overall Rating')

In [None]:
%%capture
# Grid-search several regressors that predict each review score from the
# average and median sentiment of the matching category, repeat this for
# several review-count thresholds, and save the held-out metrics to CSV.
models = {
    # random_state only affects the stochastic 'sag'/'saga' solvers; it is set
    # so that repeated runs pick the same Ridge configuration.
    "Ridge": Ridge(random_state=42),
    "RandomForest": RandomForestRegressor(random_state=42),
    "GradientBoosting": GradientBoostingRegressor(random_state=42),
    "DummyRegressor": DummyRegressor()
}

# (sentiment column suffix, review-score target column)
categories = [
    ('overall', 'review_scores_rating'),
    ('accurate', 'review_scores_accuracy'),
    ('clean', 'review_scores_cleanliness'),
    ('checkin', 'review_scores_checkin'),
    ('communication', 'review_scores_communication'),
    ('location', 'review_scores_location'),
    ('value', 'review_scores_value')
]

# Negated MSE, so that GridSearchCV (which maximises the score) minimises MSE.
scoring = make_scorer(mean_squared_error, greater_is_better=False)

# Define hyperparameters
param_grids = {
    "Ridge": {
        'alpha': [0.0001, 0.001, 0.01, 0.1, 1],
        # 'lbfgs' is deliberately absent: scikit-learn only accepts it together
        # with positive=True, so every fit using it would fail and be scored NaN.
        'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
    },
    "RandomForest": {
        'n_estimators': [100, 200, 300],
        'max_features': ['sqrt', 'log2'],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    "GradientBoosting": {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    "DummyRegressor": {
        'strategy': ['mean', 'median'],
    }
}

# Perform grid search with cross-validation and evaluate each model
results = []
for min_reviews in [3, 10, 20]:
    # Strict comparison: listings need MORE than `min_reviews` reviews.
    df_test = df[df['review_count'] > min_reviews]
    for category in categories:
        sentiment_key, target_col = category
        feature_cols = [f'average_sentiment_{sentiment_key}', f'median_sentiment_{sentiment_key}']

        # Drop rows with a missing feature or target for this category only;
        # most of the estimators reject missing values, and a frame-wide
        # dropna would also discard rows that are complete for this category.
        modelling_rows = df_test[feature_cols + [target_col]].dropna()
        X = modelling_rows[feature_cols]
        y = modelling_rows[target_col]

        # Split data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        for name, model in models.items():
            print(f"Training {name}...")
            grid_search = GridSearchCV(estimator=model, param_grid=param_grids[name],
                                       scoring=scoring, cv=5, n_jobs=-1, verbose=0)
            grid_search.fit(X_train, y_train)

            # Score the refitted best configuration on the held-out 20 %.
            y_pred = grid_search.best_estimator_.predict(X_test)

            results.append({
                'Model': name,
                'Best Params': grid_search.best_params_,
                'MSE': mean_squared_error(y_test, y_pred),
                'MAE': mean_absolute_error(y_test, y_pred),
                'R^2': r2_score(y_test, y_pred),
                'Category': category,
                'min_reviews': min_reviews
            })

# Convert results to DataFrame and save them
# NOTE(review): this writes '..._cv5.csv', while the loading cell further down
# reads '..._all_categories.csv' and '..._dummy.csv' - confirm which files are current.
results_df = pd.DataFrame(results)
results_df.to_csv('multi_model_evaluation_all_categories_cv5.csv')


In [4]:
# Load the two saved evaluation tables and stack them under a fresh 0..n-1 index.
# NOTE(review): this rebinds `df`, which held the listings table until here.
df2 = pd.read_csv('./multi_model_evaluation_all_categories_dummy.csv')
df = pd.concat(
    [pd.read_csv('./multi_model_evaluation_all_categories.csv'), df2],
    ignore_index=True,
)

# Reduce each Category value to its first token after stripping punctuation,
# e.g. "('overall', 'review_scores_rating')" becomes "overall".
rx = re.compile(r"[^A-Za-z0-9\s]")
df['Category'] = df['Category'].map(lambda label: rx.sub('', label).split()[0])

# Discard the index column that to_csv wrote alongside the data.
df = df.drop(columns=['Unnamed: 0'])

In [None]:
# df = df[df['Model'] == 'DummyRegressor']

In [None]:
# Show the 'median_sentiment' column of the current frame.
# NOTE(review): nothing visible in this notebook creates a column with exactly
# this name (only 'median_sentiment_<category>' columns are referenced), so this
# lookup presumably raises KeyError - confirm, or remove the cell.
df.loc[:, 'median_sentiment']

In [None]:
# Distinct category labels of the results table, in order of first appearance.
# They are computed here because the name is otherwise first assigned in a
# later cell, which made this cell fail on a fresh kernel.
category_labels = df['Category'].unique().tolist()
category_labels

In [None]:
# Grouped bar chart: for every category, the highest R^2 found in the results
# table, with one bar per review-count threshold, annotated with the model name.
category_labels = df['Category'].unique().tolist()
review_thresholds = [3, 10, 20]
bar_width = 0.2  # Adjusted width for bars

fig, ax = plt.subplots(figsize=(14, 8))

# Plotting
for i, min_reviews in enumerate(review_thresholds):
    r2_values = []
    best_models = []
    for category in category_labels:
        cat_data = df[(df['Category'] == category) & (df['min_reviews'] == min_reviews)]
        if cat_data['R^2'].notna().any():
            best_row = cat_data.loc[cat_data['R^2'].idxmax()]
            r2_values.append(best_row['R^2'])
            best_models.append(best_row['Model'])
        else:
            # No usable result for this combination: leave a gap rather than
            # crash in idxmax; the annotation loop skips the None entry.
            r2_values.append(float('nan'))
            best_models.append(None)

    # Shift each threshold's bars by one bar width so they sit side by side
    bar_positions = [j + i * bar_width for j in range(len(category_labels))]

    bars = ax.bar(bar_positions, r2_values, width=bar_width,
                  label=f'min_reviews = {min_reviews}', alpha=.7)

    # Annotate the bars with model names and R^2 values
    for bar, model, r2_value in zip(bars, best_models, r2_values):
        if model is not None:
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width() / 2, bar.get_y() + height / 3,
                    f'{model} ({r2_value:.4f})', ha='center', fontsize=8, rotation=90)

# Put each x-tick at the centre of its group of bars: bars are centred at
# j, j + w, j + 2w, ..., so the group centre is j + w * (n_groups - 1) / 2.
group_offset = bar_width * (len(review_thresholds) - 1) / 2
center_positions = [j + group_offset for j in range(len(category_labels))]
ax.set_xticks(center_positions)
ax.set_xticklabels(category_labels)

ax.grid(True, which='major', linestyle='--', linewidth=0.5, color='gray', alpha=0.7)
ax.set_xlabel('Category', fontsize=13)
ax.set_ylabel('R^2', fontsize=13)
ax.legend(title='Min Reviews')
# NOTE(review): the title says "dummy model", but the bars show whichever model
# has the highest R^2 in `df`; it is only accurate when `df` has been filtered
# to DummyRegressor rows - confirm.
ax.set_title('R^2 by Category and Min Reviews Group for dummy model')

plt.show()

In [None]:
# Grouped bar chart: for every category, the lowest MSE found in the results
# table, with one bar per review-count threshold, annotated with the model name.
# NOTE(review): the original cell had an empty `for category` body (a syntax
# error). Selecting the lowest-MSE row is an assumption based on the variable
# names used in the cell - confirm this is the intended metric.
category_labels = df['Category'].unique().tolist()
review_thresholds = [3, 10, 20]
bar_width = 0.2  # Adjusted width for bars

fig, ax = plt.subplots(figsize=(14, 8))

# Plotting
for i, min_reviews in enumerate(review_thresholds):
    mse_values = []
    best_models = []
    for category in category_labels:
        cat_data = df[(df['Category'] == category) & (df['min_reviews'] == min_reviews)]
        if cat_data['MSE'].notna().any():
            best_row = cat_data.loc[cat_data['MSE'].idxmin()]
            mse_values.append(best_row['MSE'])
            best_models.append(best_row['Model'])
        else:
            # No usable result for this combination: leave a gap; the
            # annotation loop skips the None entry.
            mse_values.append(float('nan'))
            best_models.append(None)

    # Shift each threshold's bars by one bar width so they sit side by side
    bar_positions = [j + i * bar_width for j in range(len(category_labels))]

    bars = ax.bar(bar_positions, mse_values, width=bar_width,
                  label=f'min_reviews = {min_reviews}', alpha=.7)

    # Annotate the bars with model names and MSE values
    for bar, model, mse_value in zip(bars, best_models, mse_values):
        if model is not None:
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width() / 2, bar.get_y() + height / 3,
                    f'{model} ({mse_value:.4f})', ha='center', fontsize=8, rotation=90)

# Put each x-tick at the centre of its group of bars.
group_offset = bar_width * (len(review_thresholds) - 1) / 2
ax.set_xticks([j + group_offset for j in range(len(category_labels))])
ax.set_xticklabels(category_labels)

ax.grid(True, which='major', linestyle='--', linewidth=0.5, color='gray', alpha=0.7)
ax.set_xlabel('Category', fontsize=13)
ax.set_ylabel('MSE', fontsize=13)
ax.legend(title='Min Reviews')
ax.set_title('Lowest MSE by Category and Min Reviews Group')

plt.show()

In [None]:
# Overall review rating and overall median sentiment, each with its own
# missing values dropped independently (the two Series may differ in length).
# NOTE(review): `df` is rebound to the model-results table in the cell that
# concatenates the evaluation CSVs; these two columns are only referenced on
# the listings table, so on a top-to-bottom run this presumably raises
# KeyError - confirm which frame is intended.
accuracy, dummy = (
    df[column].dropna()
    for column in ('review_scores_rating', 'median_sentiment_overall')
)

In [None]:
# Number of non-missing sentiment values kept in `dummy`.
dummy.shape[0]

In [None]:
# Scale the overall median sentiment by 5.
# NOTE(review): presumably this maps sentiment onto the 5-point rating scale - confirm.
# It is recomputed from `df` with a vectorised multiply, so re-running this
# cell does not multiply an already-scaled Series by 5 a second time.
dummy = df['median_sentiment_overall'].dropna() * 5

In [None]:
# Mean squared difference between the ratings and the scaled sentiment; the
# subtraction aligns the two Series on their index labels.
squared_error = (accuracy - dummy) ** 2
np.mean(squared_error)