## Package & Data Imports

In [None]:
# Mount Google Drive at /content/drive so the data and helper files stored there
# can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import math

import os
# Switch the working directory to the project folder on Drive *before* importing
# tobit. NOTE(review): presumably tobit.py lives in this folder and is only
# importable after the chdir - confirm.
os.chdir('/content/drive/MyDrive/Code + Data')
import tobit
from tobit import TobitModel

from statsmodels.regression import quantile_regression
import statsmodels.api as sm
from statsmodels.formula.api import ols

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics import ndcg_score

In [None]:
# Load the Yelp review dataset from Drive and print its (rows, columns) shape.
yelp_data = pd.read_parquet('/content/drive/MyDrive/Code + Data/yelp_data.parquet')
print(yelp_data.shape)

(1870042, 13)


In [None]:
# Downsample reviews
# Balance the two classes: reviews with helpful == 0 (majority) are randomly
# downsampled so that they match the number of reviews with helpful > 0 (minority).
# NOTE: rows whose `helpful` is neither == 0 nor > 0 (e.g. NaN or negative values)
# belong to neither class and are therefore dropped from the balanced dataset.
def _print_class_counts(label, frame):
    """Print the total row count and the helpful == 0 / helpful > 0 counts of `frame`."""
    print(f"{label} dataset size: {len(frame)}")
    print("0 helpful reviews", int((frame['helpful'] == 0).sum()))
    print(f"helpful > 0 reviews: {int((frame['helpful'] > 0).sum())}")


_print_class_counts("Original", yelp_data)

majority_class = yelp_data[yelp_data['helpful'] == 0]
minority_class = yelp_data[yelp_data['helpful'] > 0]

minority_count = len(minority_class)
# Keep 1x as many majority-class reviews as minority-class reviews. The cap at
# len(majority_class) keeps .sample() from raising if the classes are already
# balanced or the "majority" class is in fact the smaller one.
target_majority_size = min(minority_count, len(majority_class))

# Downsample the majority class (fixed seed for reproducibility)
downsampled_majority = majority_class.sample(n=target_majority_size, random_state=42)
balanced_data = pd.concat([downsampled_majority, minority_class])

# Shuffle so the two classes are interleaved, then rebuild a clean 0..n-1 index
balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)
# From here on `yelp_data` refers to the balanced (downsampled) dataset.
yelp_data = balanced_data

_print_class_counts("Downsampled", yelp_data)
minority_count = len(yelp_data[yelp_data['helpful'] > 0])

Original dataset size: 1870042
0 helpful reviews 1147389
helpful > 0 reviews: 722652
Original dataset size: 1445304
0 helpful reviews 722652
helpful > 0 reviews: 722652


In [None]:
# Show every column name of the dataset, one per line.
cols = list(yelp_data.columns)
for column_name in cols:
  print(column_name)

review_id
user_id
business_id
stars_reviewer
useful
text
name
postal_code
stars_business
categories
total_reviews_for_business
helpful
num_sentences


In [None]:
# Convert helpfulness to a 0-100 percent scale, as opposed to 0-1.
# Vectorised multiply instead of a per-row .apply(lambda ...).
# NOTE: this overwrites `helpful` in place, so re-running this cell without
# reloading the data multiplies the column by 100 again.
yelp_data['helpful'] = yelp_data['helpful'] * 100

In [None]:
# Summary statistics of the rescaled `helpful` target (count, mean, std, quartiles, max).
yelp_data['helpful'].describe()

Unnamed: 0,helpful
count,1445304.0
mean,0.190393
std,0.4748418
min,0.0
25%,0.0
50%,0.007516536
75%,0.228833
max,38.08463


# Review's Star Rating Model 1 (All Reviewers)


In [None]:
# 60/20/20 Train/Val/Test
Y = yelp_data['helpful'].copy()

X = yelp_data[['stars_reviewer']].copy()
# Censoring indicator passed to the Tobit fit: -1 marks a left-censored row
# (helpful == 0), 0 marks an uncensored row. Built with a vectorised comparison
# rather than a per-row .apply(lambda ...).
X['cens'] = Y.eq(0).astype('int64').mul(-1)


# First hold out 20% as the test set, then take 25% of the remaining 80%
# (= 20% of the total) as the validation set, leaving 60% for training.
X_train_val, X_test, Y_train_val, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
X_train, X_val, Y_train, Y_val = train_test_split(X_train_val, Y_train_val, test_size=0.25, random_state=42)

In [None]:
# TRAIN Review Stars Rating Model 1 - Predict Usefulness Based on Review's Star Rating
# Fits a Tobit regression of `helpful` on `stars_reviewer`; the `cens` column marks
# rows with helpful == 0 as left-censored (-1) and all other rows as uncensored (0).
model1 = TobitModel()
results = model1.fit(X_train[['stars_reviewer']], Y_train, cens=X_train['cens'], verbose=True, tol=1e-5)

# Print estimated coefficients
print("Estimated coefficients:", model1.coef_)
print("Intercept:", model1.intercept_)
print("Sigma (standard deviation):", model1.sigma_)

# Sanity check on the fitted parameters.
# NOTE(review): in the recorded run the printed intercept is identical to the first
# coefficient, while the optimizer's `x` vector has a different, otherwise unused
# leading entry. That suggests TobitModel stores the wrong element of `x` as
# intercept_ - confirm in tobit.py, since predict() would then be biased as well.
if np.isclose(model1.intercept_, np.ravel(model1.coef_)[0]):
    print("WARNING: model1.intercept_ equals model1.coef_[0]; the intercept may be "
          "mis-assigned inside the tobit module - compare with the first entry of "
          "the optimizer's x vector before trusting predictions.")

Optimization terminated successfully.
         Current function value: 10473.519073
         Iterations: 9
         Function evaluations: 12
         Gradient evaluations: 12
  message: Optimization terminated successfully.
  success: True
   status: 0
      fun: 10473.51907253898
        x: [ 9.242e-01  1.353e-01  1.657e+00]
      nit: 9
      jac: [ 1.261e-07  5.255e-07  8.576e-08]
 hess_inv: [[ 1.091e-02 -2.572e-03 -6.733e-06]
            [-2.572e-03  6.353e-04  1.458e-07]
            [-6.733e-06  1.458e-07  2.566e-04]]
     nfev: 12
     njev: 12
Estimated coefficients: [0.13526586]
Intercept: 0.135265864636048
Sigma (standard deviation): 1.6573901194402858


In [None]:
# VAL Review Stars Rating Model 1 - Predict Usefulness Based on Review's Star Rating
predictions = model1.predict(X_val[['stars_reviewer']])

# Root Mean Squared Error on the validation split
mse = mean_squared_error(Y_val, predictions)
rmse = np.sqrt(mse)
print(f'Root Mean Squared Error: {rmse}')

# Mean Absolute Error on the validation split
abs_err = np.abs(Y_val - predictions)
mae = np.mean(abs_err)
print(f'Mean Absolute Error: {mae}')

# NDCG: the whole validation split is scored as one ranking, so both inputs are
# wrapped into arrays with a single leading row.
true_helpful = np.asarray([Y_val])
pred_helpful = np.asarray([predictions])

# Compute NDCG@k for every cutoff first, then bind the per-cutoff variable names.
ndcg_cutoffs = (5, 25, 50, 100, 1000)
ndcg_at = {cutoff: ndcg_score(true_helpful, pred_helpful, k=cutoff) for cutoff in ndcg_cutoffs}
(model_1_ndcg_5,
 model_1_ndcg_25,
 model_1_ndcg_50,
 model_1_ndcg_100,
 model_1_ndcg_1000) = (ndcg_at[cutoff] for cutoff in ndcg_cutoffs)
model_1_ndcg_all = ndcg_score(true_helpful, pred_helpful)

# Print nDCG scores for each k value
for cutoff in ndcg_cutoffs:
    print(f'NDCG Score for k={cutoff}: {ndcg_at[cutoff]}')
print(f'NDCG Score for all: {model_1_ndcg_all}')

Root Mean Squared Error: 0.6719733389212516
Mean Absolute Error: 0.4770690594433869
NDCG Score for k=5: 0.00879457001035935
NDCG Score for k=25: 0.011326926828843478
NDCG Score for k=50: 0.013357539711401236
NDCG Score for k=100: 0.0162622248624018
NDCG Score for k=1000: 0.03916675700112661
NDCG Score for all: 0.7847562502106377


# Review's Star Rating Model 2 (Top Reviewers Only)


In [None]:
# For the top reviewers, product rating is a significant predictor of review helpfulness.
# Based on A. H. Huang, K. Chen, D. C. Yen, and T. P. Tran, “A study of factors that contribute to online review helpfulness,” Computers in Human Behavior, vol. 48, pp. 17–27, Jul. 2015, doi: https://doi.org/10.1016/j.chb.2015.01.010.
# The supported hypothesis determines who is a top reviewer via reviewer_cum_helpful:
# the ratio of total yes votes to total number of votes. Here it is computed as the
# mean of `helpful` over each reviewer's reviews.
#
# NOTE(review): reviewer_cum_helpful is averaged over ALL of a reviewer's rows in
# yelp_data, before the train/val/test split in the next cell. When it is used as a
# feature, each row's feature value therefore includes that row's own target and the
# targets of validation/test rows (target leakage). Consider computing it from the
# training rows only.

# Get top 100 reviewers based on review count (counted on the current, downsampled yelp_data)
top_100_active = yelp_data.groupby('user_id').size().nlargest(100).reset_index(name='review_count')

# Determine reviewer_cum_helpful for each top 100 reviewer
avg_helpful_scores = yelp_data[yelp_data['user_id'].isin(top_100_active['user_id'])].groupby('user_id')
reviewer_cum_helpful = avg_helpful_scores['helpful'].mean()

# Grab the 50 reviewers with the highest reviewer_cum_helpful as the top reviewers
# (nlargest already returns them in descending order) and keep only their reviews
top_50_reviewers = reviewer_cum_helpful.nlargest(50).reset_index(name='reviewer_cum_helpful')
top_reviewer_reviews = yelp_data[yelp_data['user_id'].isin(top_50_reviewers['user_id'])]

# Attach each reviewer's reviewer_cum_helpful to every one of their reviews
top_reviewer_reviews = top_reviewer_reviews.merge(top_50_reviewers, on='user_id')

In [None]:
# 60/20/20 Train/Val/Test
Y = top_reviewer_reviews['helpful'].copy()

X = top_reviewer_reviews[['stars_reviewer', 'reviewer_cum_helpful']].copy()
# Censoring indicator passed to the Tobit fit: -1 marks a left-censored row
# (helpful == 0), 0 marks an uncensored row. Built with a vectorised comparison
# rather than a per-row .apply(lambda ...).
X['cens'] = Y.eq(0).astype('int64').mul(-1)


# First hold out 20% as the test set, then take 25% of the remaining 80%
# (= 20% of the total) as the validation set, leaving 60% for training.
X_train_val, X_test, Y_train_val, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
X_train, X_val, Y_train, Y_val = train_test_split(X_train_val, Y_train_val, test_size=0.25, random_state=42)

In [None]:
# TRAIN Review Stars Rating Model 2 - Predict Usefulness Based on Review's Star Rating for Top Reviewers
# Fits a Tobit regression of `helpful` on `stars_reviewer` and `reviewer_cum_helpful`;
# the `cens` column marks rows with helpful == 0 as left-censored (-1), others as 0.
# Note that tol=1e-3 is used here, whereas model 1 is fitted with tol=1e-5.
model2 = TobitModel()
results = model2.fit(X_train[['stars_reviewer', 'reviewer_cum_helpful']], Y_train, cens=X_train['cens'], verbose=True, tol=1e-3)

# Print estimated coefficients
print("Estimated coefficients:", model2.coef_)
print("Intercept:", model2.intercept_)
print("Sigma (standard deviation):", model2.sigma_)

# Sanity check on the fitted parameters.
# NOTE(review): in the recorded run the printed intercept is identical to the first
# coefficient, while the optimizer's `x` vector has a different, otherwise unused
# leading entry. That suggests TobitModel stores the wrong element of `x` as
# intercept_ - confirm in tobit.py, since predict() would then be biased as well.
if np.isclose(model2.intercept_, np.ravel(model2.coef_)[0]):
    print("WARNING: model2.intercept_ equals model2.coef_[0]; the intercept may be "
          "mis-assigned inside the tobit module - compare with the first entry of "
          "the optimizer's x vector before trusting predictions.")

Optimization terminated successfully.
         Current function value: 9596.072257
         Iterations: 8
         Function evaluations: 11
         Gradient evaluations: 11
  message: Optimization terminated successfully.
  success: True
   status: 0
      fun: 9596.072257219299
        x: [-8.510e-02  2.980e-03  1.047e+00  1.410e+00]
      nit: 8
      jac: [ 4.392e-05  1.828e-04  6.303e-05 -2.505e-05]
 hess_inv: [[ 8.378e-03 -1.795e-03 -5.097e-04 -5.246e-06]
            [-1.795e-03  4.674e-04 -6.582e-05  4.850e-07]
            [-5.097e-04 -6.582e-05  5.250e-04  2.038e-06]
            [-5.246e-06  4.850e-07  2.038e-06  1.851e-04]]
     nfev: 11
     njev: 11
Estimated coefficients: [0.00297986 1.04669508]
Intercept: 0.0029798606703386244
Sigma (standard deviation): 1.4097892124021723


In [None]:
# VAL Review Stars Rating Model 2 - Predict Usefulness Based on Review's Star Rating for Top Reviewers
predictions = model2.predict(X_val[['stars_reviewer', 'reviewer_cum_helpful']])

# Root Mean Squared Error on the validation split
mse = mean_squared_error(Y_val, predictions)
rmse = np.sqrt(mse)
print(f'Root Mean Squared Error: {rmse}')

# Mean Absolute Error on the validation split
abs_err = np.abs(Y_val - predictions)
mae = np.mean(abs_err)
print(f'Mean Absolute Error: {mae}')

# NDCG: the whole validation split is scored as one ranking, so both inputs are
# wrapped into arrays with a single leading row.
true_helpful = np.asarray([Y_val])
pred_helpful = np.asarray([predictions])

# Compute NDCG@k for every cutoff first, then bind the per-cutoff variable names.
ndcg_cutoffs = (5, 25, 50, 100, 1000)
ndcg_at = {cutoff: ndcg_score(true_helpful, pred_helpful, k=cutoff) for cutoff in ndcg_cutoffs}
(model_2_ndcg_5,
 model_2_ndcg_25,
 model_2_ndcg_50,
 model_2_ndcg_100,
 model_2_ndcg_1000) = (ndcg_at[cutoff] for cutoff in ndcg_cutoffs)
model_2_ndcg_all = ndcg_score(true_helpful, pred_helpful)

# Print nDCG scores for each k value
for cutoff in ndcg_cutoffs:
    print(f'NDCG Score for k={cutoff}: {ndcg_at[cutoff]}')
print(f'NDCG Score for all: {model_2_ndcg_all}')

Root Mean Squared Error: 1.4530196648073952
Mean Absolute Error: 0.9069543668945138
NDCG Score for k=5: 0.126133902136557
NDCG Score for k=25: 0.3141936651776436
NDCG Score for k=50: 0.4455529807009656
NDCG Score for k=100: 0.4928606611753458
NDCG Score for k=1000: 0.7260476600397412
NDCG Score for all: 0.8267604846857396


# Summary of Stars Models

1. Tried just stars (model 1) with Tobit regression to predict helpfulness.

 Validation Set Results:

 - Root Mean Squared Error: 0.6719733389212516
 - Mean Absolute Error: 0.4770690594433869
 - NDCG Score for k=5: 0.00879457001035935
 - NDCG Score for k=25: 0.011326926828843478
 - NDCG Score for k=50: 0.013357539711401236
 - NDCG Score for k=100: 0.0162622248624018
 - NDCG Score for k=1000: 0.03916675700112661
 - NDCG Score for all: 0.7847562502106377
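2. Updated model 1 to model 2 by restricting the data to the top 50 reviewers, chosen among the 100 most active reviewers by their average helpfulness across all their reviews, and by adding that average (`reviewer_cum_helpful`) as a second feature alongside the star rating.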

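3. Trained model 2. Its validation set contains only reviews by the top reviewers, so the metrics below are not directly comparable with model 1's.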

  Validation Set Results:

 - Root Mean Squared Error: 1.4530196648073952
 - Mean Absolute Error: 0.9069543668945138
 - NDCG Score for k=5: 0.126133902136557
 - NDCG Score for k=25: 0.3141936651776436
 - NDCG Score for k=50: 0.4455529807009656
 - NDCG Score for k=100: 0.4928606611753458
 - NDCG Score for k=1000: 0.7260476600397412
 - NDCG Score for all: 0.8267604846857396