## Package & Data Imports

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import os
os.chdir('/content/drive/MyDrive/Code + Data')
import tobit
from tobit import TobitModel

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics import ndcg_score
from sklearn.preprocessing import StandardScaler



In [None]:
# Read in the processed data
train = pd.read_parquet('/content/drive/MyDrive/Code + Data/bn_train_data_final_user_cum_helpful.parquet')
val = pd.read_parquet('/content/drive/MyDrive/Code + Data/bn_val_data_final_user_cum_helpful.parquet')
test = pd.read_parquet('/content/drive/MyDrive/Code + Data/bn_test_data_final_user_cum_helpful.parquet')

# Business Nature Model 1 (Tobit Regression w/ Nature + Review Stars, Majority Nature Class)


Hypothesis: For experience-type nature, reviews with extreme ratings are less helpful than reviews with moderate ratings.


In [None]:
# TRAIN - Focus on Nature + Review Extremity (Stars) First: Helpfulness % = β1Rating + β2Rating2 + β3 Business Nature + β4 Rating × Business Nature + β5 Rating^2 × Business type + ε
bus_nature_df = pd.DataFrame()

# Create interaction terms
bus_nature_df['stars_reviewer'] = train['stars_reviewer'] - train['stars_reviewer'].mean() # Center stars_reviewer
bus_nature_df['stars_reviewer2'] = bus_nature_df['stars_reviewer'] ** 2 - (bus_nature_df['stars_reviewer'] ** 2).mean() # Center
bus_nature_df['nature'] = train['nature']
bus_nature_df['stars_reviewer_nature'] = bus_nature_df['stars_reviewer'] * bus_nature_df['nature'] - (bus_nature_df['stars_reviewer'] * bus_nature_df['nature']).mean() # Center
bus_nature_df['stars_reviewer2_nature'] = bus_nature_df['stars_reviewer2'] * bus_nature_df['nature'] - (bus_nature_df['stars_reviewer2'] * bus_nature_df['nature']).mean() # Center

# X = bus_nature_df[['stars_reviewer', 'stars_reviewer2', 'nature']]
X = bus_nature_df[['stars_reviewer', 'stars_reviewer2', 'nature', 'stars_reviewer_nature', 'stars_reviewer2_nature']]
Y_train = train['helpful'].copy()

# Scale X Features to help with numerical stability
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train = pd.DataFrame(X_scaled, columns=X.columns, index = X.index)


# Create + fit tobit regression model
bus_nature_df['cens'] = train['helpful'].apply(lambda x: -1 if x==0 else 0)
model1 = TobitModel()
results = model1.fit(X_train, Y_train, cens=bus_nature_df['cens'], verbose=True, tol=1e-2)

# Print estimated coefficients
print("Estimated coefficients:", model1.coef_)
print("Intercept:", model1.intercept_)
print("Sigma (standard deviation):", model1.sigma_)

Optimization terminated successfully.
         Current function value: 762761.513419
         Iterations: 20
         Function evaluations: 35
         Gradient evaluations: 35
  message: Optimization terminated successfully.
  success: True
   status: 0
      fun: 762761.5134192283
        x: [-1.422e-01 -8.348e-02 -1.386e-02 -1.582e-02 -6.795e-03
            -8.837e-03  7.602e-01]
      nit: 20
      jac: [ 1.821e-03 -2.288e-03 -6.974e-04  3.271e-03  5.167e-05
            -4.437e-03  1.462e-03]
 hess_inv: [[ 1.064e-06 -3.251e-07 ... -2.132e-07 -3.914e-07]
            [-3.251e-07  1.024e-04 ... -7.537e-05 -3.423e-07]
            ...
            [-2.132e-07 -7.537e-05 ...  7.797e-05 -8.598e-09]
            [-3.914e-07 -3.423e-07 ... -8.598e-09  7.984e-07]]
     nfev: 35
     njev: 35
Estimated coefficients: [-0.08347831 -0.01386088 -0.01581892 -0.00679544 -0.00883721]
Intercept: -0.08347831253437357
Sigma (standard deviation): 0.7601836834087684


In [None]:
# VAL - Focus on Nature + Review Extremity (Stars) First: Helpfulness % = β1Rating + β2Rating2 + β3 Business Nature + β4 Rating × Business Nature + β5 Rating^2 × Business Nature + ε
bus_nature_df = pd.DataFrame()

# Create interaction terms
bus_nature_df['stars_reviewer'] = val['stars_reviewer'] - val['stars_reviewer'].mean() # Center stars_reviewer
bus_nature_df['stars_reviewer2'] = bus_nature_df['stars_reviewer'] ** 2 - (bus_nature_df['stars_reviewer'] ** 2).mean() # Center
bus_nature_df['nature'] = val['nature']
bus_nature_df['stars_reviewer_nature'] = bus_nature_df['stars_reviewer'] * bus_nature_df['nature'] - (bus_nature_df['stars_reviewer'] * bus_nature_df['nature']).mean() # Center
bus_nature_df['stars_reviewer2_nature'] = bus_nature_df['stars_reviewer2'] * bus_nature_df['nature'] - (bus_nature_df['stars_reviewer2'] * bus_nature_df['nature']).mean() # Center

# X = bus_nature_df[['stars_reviewer', 'stars_reviewer2', 'nature']]
X_val = bus_nature_df[['stars_reviewer', 'stars_reviewer2', 'nature', 'stars_reviewer_nature', 'stars_reviewer2_nature']]
Y_val = val['helpful'].copy()

# Scale X Features to help with numerical stability
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_val)
X_val = pd.DataFrame(X_scaled, columns=X_val.columns, index = X_val.index)

# Predict
predictions = model1.predict(X_val)

# Root Mean Squared Error
mse = mean_squared_error(Y_val, predictions)
rmse = np.sqrt(mse)
print(f'Root Mean Squared Error: {rmse}')

# Mean Absolute Error
abs_err = np.abs(Y_val - predictions)
mae = np.mean(abs_err)
print(f'Mean Absolute Error: {mae}')

# NDCG
true_helpful = np.asarray([Y_val])
pred_helpful = np.asarray([predictions])

model_1_ndcg_5 = ndcg_score(true_helpful, pred_helpful, k=5)
model_1_ndcg_25 = ndcg_score(true_helpful, pred_helpful, k=25)
model_1_ndcg_50 = ndcg_score(true_helpful, pred_helpful, k=50)
model_1_ndcg_100 = ndcg_score(true_helpful, pred_helpful, k=100)
model_1_ndcg_1000 = ndcg_score(true_helpful, pred_helpful, k=1000)
model_1_ndcg_all = ndcg_score(true_helpful, pred_helpful)

# Print nDCG scores for each k value
print(f'NDCG Score for k=5: {model_1_ndcg_5}')
print(f'NDCG Score for k=25: {model_1_ndcg_25}')
print(f'NDCG Score for k=50: {model_1_ndcg_50}')
print(f'NDCG Score for k=100: {model_1_ndcg_100}')
print(f'NDCG Score for k=1000: {model_1_ndcg_1000}')
print(f'NDCG Score for all: {model_1_ndcg_all}')

Root Mean Squared Error: 0.5472726188327716
Mean Absolute Error: 0.28086447005869786
NDCG Score for k=5: 0.008835828879690501
NDCG Score for k=25: 0.011380066003743932
NDCG Score for k=50: 0.013420205309024453
NDCG Score for k=100: 0.016338517507731964
NDCG Score for k=1000: 0.03935050402995587
NDCG Score for all: 0.7845123995756825


# Business Nature Model 2 (Tobit Regression w/ Nature + Review Stars + Word Count + Total Votes, Majority Nature Class)


In [None]:
# TRAIN Previous Research Model (Tobit Regression): Helpfulness % = β1Rating + β2Rating2 + β3 Business Nature + β4 Word Count + β5 Total Votes + β6 Rating × Business Nature + β7 Rating^2 × Business Nature + β8 Word Count × Business Nature + ε
bus_nature_df = pd.DataFrame()

# Create interaction terms
bus_nature_df['stars_reviewer'] = train['stars_reviewer'] - train['stars_reviewer'].mean() # Center stars_reviewer
bus_nature_df['stars_reviewer2'] = bus_nature_df['stars_reviewer'] ** 2 - (bus_nature_df['stars_reviewer'] ** 2).mean() # Center
bus_nature_df['nature'] = train['nature']
bus_nature_df['word_count'] = train['num_words'] - train['num_words'].mean() # Center
bus_nature_df['tot_votes'] = train['useful'] - train['useful'].mean() # Center
bus_nature_df['stars_reviewer_nature'] = bus_nature_df['stars_reviewer'] * bus_nature_df['nature'] - (bus_nature_df['stars_reviewer'] * bus_nature_df['nature']).mean() # Center
bus_nature_df['stars_reviewer2_nature'] = bus_nature_df['stars_reviewer2'] * bus_nature_df['nature'] - (bus_nature_df['stars_reviewer2'] * bus_nature_df['nature']).mean() # Center
bus_nature_df['tot_votes_nature'] = bus_nature_df['tot_votes'] * bus_nature_df['nature'] - (bus_nature_df['tot_votes'] * bus_nature_df['nature']).mean() # Center

X = bus_nature_df[['stars_reviewer', 'stars_reviewer2', 'nature', 'word_count', 'tot_votes', 'stars_reviewer_nature', 'stars_reviewer2_nature', 'tot_votes_nature']]
Y_train = train['helpful'].copy()

# Scale X Features to help with numerical stability
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train = pd.DataFrame(X_scaled, columns=X.columns, index = X.index)


# Create + fit tobit regression model
bus_nature_df['cens'] = train['helpful'].apply(lambda x: -1 if x==0 else 0)
model2 = TobitModel()
results = model2.fit(X_train, Y_train, cens=bus_nature_df['cens'], verbose=True, tol=1e-2)

# Print estimated coefficients
print("Estimated coefficients:", model2.coef_)
print("Intercept:", model2.intercept_)
print("Sigma (standard deviation):", model2.sigma_)

Optimization terminated successfully.
         Current function value: 420295.901917
         Iterations: 21
         Function evaluations: 34
         Gradient evaluations: 34
  message: Optimization terminated successfully.
  success: True
   status: 0
      fun: 420295.9019168681
        x: [-1.948e-02 -9.480e-03 -1.361e-02  7.776e-04  5.399e-02
             2.519e-01 -1.948e-02  6.109e-03  2.171e-01  3.998e-01]
      nit: 21
      jac: [ 1.848e-03 -1.092e-03  1.713e-03  4.371e-04 -1.078e-03
             1.576e-03 -1.113e-03  1.795e-03  1.428e-03  3.418e-04]
 hess_inv: [[ 3.232e-07 -3.512e-08 ... -1.549e-08 -1.062e-07]
            [-3.512e-08  1.763e-05 ... -2.513e-07 -3.916e-09]
            ...
            [-1.549e-08 -2.513e-07 ...  2.718e-06  2.293e-08]
            [-1.062e-07 -3.916e-09 ...  2.293e-08  1.939e-07]]
     nfev: 34
     njev: 34
Estimated coefficients: [-0.00948029 -0.01361236  0.00077762  0.05399246  0.25187897 -0.01948386
  0.00610922  0.21709646]
Intercept: -0.00

In [None]:
# VAL Previous Research Model (Tobit Regression): Helpfulness % = β1Rating + β2Rating2 + β3 Business Nature + β4 Word Count + β5 Total Votes + β6 Rating × Business Nature + β7 Rating^2 × Business Nature + β8 Word Count × Business Nature + ε
bus_nature_df = pd.DataFrame()

# Create interaction terms
bus_nature_df['stars_reviewer'] = val['stars_reviewer'] - val['stars_reviewer'].mean() # Center stars_reviewer
bus_nature_df['stars_reviewer2'] = bus_nature_df['stars_reviewer'] ** 2 - (bus_nature_df['stars_reviewer'] ** 2).mean() # Center
bus_nature_df['nature'] = val['nature']
bus_nature_df['word_count'] = val['num_words'] - val['num_words'].mean() # Center
bus_nature_df['tot_votes'] = val['useful'] - val['useful'].mean() # Center
bus_nature_df['stars_reviewer_nature'] = bus_nature_df['stars_reviewer'] * bus_nature_df['nature'] - (bus_nature_df['stars_reviewer'] * bus_nature_df['nature']).mean() # Center
bus_nature_df['stars_reviewer2_nature'] = bus_nature_df['stars_reviewer2'] * bus_nature_df['nature'] - (bus_nature_df['stars_reviewer2'] * bus_nature_df['nature']).mean() # Center
bus_nature_df['tot_votes_nature'] = bus_nature_df['tot_votes'] * bus_nature_df['nature'] - (bus_nature_df['tot_votes'] * bus_nature_df['nature']).mean() # Center

X = bus_nature_df[['stars_reviewer', 'stars_reviewer2', 'nature', 'word_count', 'tot_votes', 'stars_reviewer_nature', 'stars_reviewer2_nature', 'tot_votes_nature']]
Y_val = val['helpful'].copy()

# Scale X Features to help with numerical stability
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_val = pd.DataFrame(X_scaled, columns=X.columns, index = X.index)

predictions = model2.predict(X_val)

# Root Mean Squared Error
mse = mean_squared_error(Y_val, predictions)
rmse = np.sqrt(mse)
print(f'Root Mean Squared Error: {rmse}')

# Mean Absolute Error
abs_err = np.abs(Y_val - predictions)
mae = np.mean(abs_err)
print(f'Mean Absolute Error: {mae}')

# NDCG
true_helpful = np.asarray([Y_val])
pred_helpful = np.asarray([predictions])

model_2_ndcg_5 = ndcg_score(true_helpful, pred_helpful, k=5)
model_2_ndcg_25 = ndcg_score(true_helpful, pred_helpful, k=25)
model_2_ndcg_50 = ndcg_score(true_helpful, pred_helpful, k=50)
model_2_ndcg_100 = ndcg_score(true_helpful, pred_helpful, k=100)
model_2_ndcg_1000 = ndcg_score(true_helpful, pred_helpful, k=1000)
model_2_ndcg_all = ndcg_score(true_helpful, pred_helpful)

# Print nDCG scores for each k value
print(f'NDCG Score for k=5: {model_2_ndcg_5}')
print(f'NDCG Score for k=25: {model_2_ndcg_25}')
print(f'NDCG Score for k=50: {model_2_ndcg_50}')
print(f'NDCG Score for k=100: {model_2_ndcg_100}')
print(f'NDCG Score for k=1000: {model_2_ndcg_1000}')
print(f'NDCG Score for all: {model_2_ndcg_all}')

Root Mean Squared Error: 0.3514392901789795
Mean Absolute Error: 0.24441682575072612
NDCG Score for k=5: 0.8365084025722667
NDCG Score for k=25: 0.7889819593586276
NDCG Score for k=50: 0.7440122753694345
NDCG Score for k=100: 0.7107093769030592
NDCG Score for k=1000: 0.7905152099190159
NDCG Score for all: 0.9630823356314763


# Business Nature Model 3 (Tobit Regression w/ Nature + Review Stars + Word Count + Total Votes, Majority Nature Class + Nature Weights based on Extractive Summary of Review (NTLK Embeddings))


In [None]:
# TRAIN New Model (Tobit Regression):
# Helpfulness % = β1Rating + β2Rating2 + β3 Business Nature + β4 Word Count + β5 Total Votes + β6 Rating × Business Nature +
# β7 Rating^2 × Business Nature + β8 Word Count × Business Nature + β9 Experience Similarity + β10 Rating × Experience Similarity + β11 Rating^2 × Experience Similarity +
# β12 Search Similarity + β13 Word Count x Search Similarity  + ε
bus_nature_df = pd.DataFrame()

# Create interaction terms + Center
bus_nature_df['stars_reviewer'] = train['stars_reviewer'] - train['stars_reviewer'].mean() # Center stars_reviewer
bus_nature_df['stars_reviewer2'] = bus_nature_df['stars_reviewer'] ** 2 - (bus_nature_df['stars_reviewer'] ** 2).mean()
bus_nature_df['nature'] = train['nature']
bus_nature_df['word_count'] = train['num_words'] - train['num_words'].mean() # Center
bus_nature_df['tot_votes'] = train['useful'] - train['useful'].mean() # Center
bus_nature_df['stars_reviewer_nature'] = bus_nature_df['stars_reviewer'] * bus_nature_df['nature'] - (bus_nature_df['stars_reviewer'] * bus_nature_df['nature']).mean() # Center
bus_nature_df['stars_reviewer2_nature'] = bus_nature_df['stars_reviewer2'] * bus_nature_df['nature'] - (bus_nature_df['stars_reviewer2'] * bus_nature_df['nature']).mean() # Center
bus_nature_df['tot_votes_nature'] = bus_nature_df['tot_votes'] * bus_nature_df['nature'] - (bus_nature_df['tot_votes'] * bus_nature_df['nature']).mean() # Center
bus_nature_df['experience_similarity'] = train['experience_similarity'] - train['experience_similarity'].mean() # Center
bus_nature_df['stars_reviewer_experience_similarity'] = bus_nature_df['stars_reviewer'] * bus_nature_df['experience_similarity'] - (bus_nature_df['stars_reviewer'] * bus_nature_df['experience_similarity'] ).mean() # Center
bus_nature_df['stars_reviewer2_experience_similarity'] = bus_nature_df['stars_reviewer2'] * bus_nature_df['experience_similarity'] - (bus_nature_df['stars_reviewer2'] * bus_nature_df['experience_similarity'] ).mean() # Center
bus_nature_df['search_similarity'] = train['search_similarity'] - train['search_similarity'].mean() # Center
bus_nature_df['word_count_search_similarity'] = bus_nature_df['word_count'] * bus_nature_df['search_similarity'] - (bus_nature_df['word_count'] * bus_nature_df['search_similarity']).mean() # Center
bus_nature_df['word_count_experience_similarity'] = bus_nature_df['word_count'] * bus_nature_df['experience_similarity'] - (bus_nature_df['word_count'] * bus_nature_df['experience_similarity']).mean() # Center

X = bus_nature_df[['stars_reviewer', 'stars_reviewer2', 'nature', 'word_count', 'tot_votes',
                   'stars_reviewer_nature', 'stars_reviewer2_nature', 'tot_votes_nature', 'experience_similarity',
                   'stars_reviewer_experience_similarity', 'stars_reviewer2_experience_similarity',
                   'search_similarity', 'word_count_search_similarity', 'word_count_experience_similarity']]
Y_train = train['helpful'].copy()

# Scale X Features to help with numerical stability
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train = pd.DataFrame(X_scaled, columns=X.columns, index = X.index)


# Create + fit tobit regression model
bus_nature_df['cens'] = train['helpful'].apply(lambda x: -1 if x==0 else 0)
model3 = TobitModel()
results = model3.fit(X_train, Y_train, cens=bus_nature_df['cens'], verbose=True, tol=1e-2)

# Print estimated coefficients
print("Estimated coefficients:", model3.coef_)
print("Intercept:", model3.intercept_)
print("Sigma (standard deviation):", model3.sigma_)

Optimization terminated successfully.
         Current function value: 419926.219441
         Iterations: 29
         Function evaluations: 46
         Gradient evaluations: 46
  message: Optimization terminated successfully.
  success: True
   status: 0
      fun: 419926.21944146464
        x: [-1.947e-02 -1.077e-02 ...  8.453e-03  3.996e-01]
      nit: 29
      jac: [-1.160e-03 -7.335e-06 ...  1.209e-04 -4.181e-04]
 hess_inv: [[ 3.662e-07 -5.655e-08 ... -1.009e-08 -1.045e-07]
            [-5.655e-08  1.762e-05 ... -2.927e-08 -4.968e-09]
            ...
            [-1.009e-08 -2.927e-08 ...  3.459e-07  7.352e-10]
            [-1.045e-07 -4.968e-09 ...  7.352e-10  2.009e-07]]
     nfev: 46
     njev: 46
Estimated coefficients: [-1.07684636e-02 -1.34584014e-02 -6.23663992e-04  5.54613487e-02
  2.53327581e-01 -1.84242633e-02  6.33343634e-03  2.15223690e-01
  1.03355047e-02  2.06934658e-04  1.78848306e-04 -1.94441908e-04
 -3.29168201e-03  8.45274750e-03]
Intercept: -0.01076846359678909
S

In [None]:
# VAL New Model (Tobit Regression):
# Helpfulness % = β1Rating + β2Rating2 + β3 Business Nature + β4 Word Count + β5 Total Votes + β6 Rating × Business Nature +
# β7 Rating^2 × Business Nature + β8 Word Count × Business Nature + β9 Search Similarity + β10 Rating × Search Similarity + β11 Rating^2 × Search Similarity +
# β12 Experience Similarity + β13 Experience Similarity x Word Count + ε
bus_nature_df = pd.DataFrame()

# Create interaction terms + Center
bus_nature_df['stars_reviewer'] = val['stars_reviewer'] - val['stars_reviewer'].mean() # Center stars_reviewer
bus_nature_df['stars_reviewer2'] = bus_nature_df['stars_reviewer'] ** 2 - (bus_nature_df['stars_reviewer'] ** 2).mean()
bus_nature_df['nature'] = val['nature']
bus_nature_df['word_count'] = val['num_words'] - val['num_words'].mean() # Center
bus_nature_df['tot_votes'] = val['useful'] - val['useful'].mean() # Center
bus_nature_df['stars_reviewer_nature'] = bus_nature_df['stars_reviewer'] * bus_nature_df['nature'] - (bus_nature_df['stars_reviewer'] * bus_nature_df['nature']).mean() # Center
bus_nature_df['stars_reviewer2_nature'] = bus_nature_df['stars_reviewer2'] * bus_nature_df['nature'] - (bus_nature_df['stars_reviewer2'] * bus_nature_df['nature']).mean() # Center
bus_nature_df['tot_votes_nature'] = bus_nature_df['tot_votes'] * bus_nature_df['nature'] - (bus_nature_df['tot_votes'] * bus_nature_df['nature']).mean() # Center
bus_nature_df['experience_similarity'] = val['experience_similarity'] - val['experience_similarity'].mean() # Center
bus_nature_df['stars_reviewer_experience_similarity'] = bus_nature_df['stars_reviewer'] * bus_nature_df['experience_similarity'] - (bus_nature_df['stars_reviewer'] * bus_nature_df['experience_similarity'] ).mean() # Center
bus_nature_df['stars_reviewer2_experience_similarity'] = bus_nature_df['stars_reviewer2'] * bus_nature_df['experience_similarity'] - (bus_nature_df['stars_reviewer2'] * bus_nature_df['experience_similarity'] ).mean() # Center
bus_nature_df['search_similarity'] = val['search_similarity'] - val['search_similarity'].mean() # Center
bus_nature_df['word_count_search_similarity'] = bus_nature_df['word_count'] * bus_nature_df['search_similarity'] - (bus_nature_df['word_count'] * bus_nature_df['search_similarity']).mean() # Center
bus_nature_df['word_count_experience_similarity'] = bus_nature_df['word_count'] * bus_nature_df['experience_similarity'] - (bus_nature_df['word_count'] * bus_nature_df['experience_similarity']).mean() # Center

X = bus_nature_df[['stars_reviewer', 'stars_reviewer2', 'nature', 'word_count', 'tot_votes',
                   'stars_reviewer_nature', 'stars_reviewer2_nature', 'tot_votes_nature', 'experience_similarity',
                   'stars_reviewer_experience_similarity', 'stars_reviewer2_experience_similarity',
                   'search_similarity', 'word_count_search_similarity', 'word_count_experience_similarity']]
Y_val = val['helpful'].copy()

# Scale X Features to help with numerical stability
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_val = pd.DataFrame(X_scaled, columns=X.columns, index = X.index)

predictions = model3.predict(X_val)

# Root Mean Squared Error
mse = mean_squared_error(Y_val, predictions)
rmse = np.sqrt(mse)
print(f'Root Mean Squared Error: {rmse}')

# Mean Absolute Error
abs_err = np.abs(Y_val - predictions)
mae = np.mean(abs_err)
print(f'Mean Absolute Error: {mae}')

# NDCG
true_helpful = np.asarray([Y_val])
pred_helpful = np.asarray([predictions])

model_3_ndcg_5 = ndcg_score(true_helpful, pred_helpful, k=5)
model_3_ndcg_25 = ndcg_score(true_helpful, pred_helpful, k=25)
model_3_ndcg_50 = ndcg_score(true_helpful, pred_helpful, k=50)
model_3_ndcg_100 = ndcg_score(true_helpful, pred_helpful, k=100)
model_3_ndcg_1000 = ndcg_score(true_helpful, pred_helpful, k=1000)
model_3_ndcg_all = ndcg_score(true_helpful, pred_helpful)

# Print nDCG scores for each k value
print(f'NDCG Score for k=5: {model_3_ndcg_5}')
print(f'NDCG Score for k=25: {model_3_ndcg_25}')
print(f'NDCG Score for k=50: {model_3_ndcg_50}')
print(f'NDCG Score for k=100: {model_3_ndcg_100}')
print(f'NDCG Score for k=1000: {model_3_ndcg_1000}')
print(f'NDCG Score for all: {model_3_ndcg_all}')

Root Mean Squared Error: 0.3521639588525739
Mean Absolute Error: 0.24556029119742065
NDCG Score for k=5: 0.8365084025722667
NDCG Score for k=25: 0.7889819593586276
NDCG Score for k=50: 0.7441018249843373
NDCG Score for k=100: 0.7108162974692939
NDCG Score for k=1000: 0.7906573864873877
NDCG Score for all: 0.9631022803315374


# Business Nature Model 4 (Tobit Regression w/ Nature + Review Stars + Word Count + Total Votes + Reviewer Cum. Helpfulness, Majority Nature Class + Nature Weights based on Extractive Summary of Review (NTLK Embeddings))


In [None]:
# TRAIN New Model with reviewer_cum_helpful (Tobit Regression):
# Helpfulness % = β1Rating + β2Rating2 + β3 Business Nature + β4 Word Count + β5 Total Votes + β6 Rating × Business Nature +
# β7 Rating^2 × Business Nature + β8 Word Count × Business Nature + β9 Experience Similarity + β10 Rating × Experience Similarity + β11 Rating^2 × Experience Similarity +
# β12 Search Similarity + β13 Word Count x Search Similarity + β14 Reviewer Cum Helpful + ε
bus_nature_df = pd.DataFrame()

# Create interaction terms + Center
bus_nature_df['stars_reviewer'] = train['stars_reviewer'] - train['stars_reviewer'].mean() # Center stars_reviewer
bus_nature_df['stars_reviewer2'] = bus_nature_df['stars_reviewer'] ** 2 - (bus_nature_df['stars_reviewer'] ** 2).mean()
bus_nature_df['nature'] = train['nature']
bus_nature_df['word_count'] = train['num_words'] - train['num_words'].mean() # Center
bus_nature_df['tot_votes'] = train['useful'] - train['useful'].mean() # Center
bus_nature_df['stars_reviewer_nature'] = bus_nature_df['stars_reviewer'] * bus_nature_df['nature'] - (bus_nature_df['stars_reviewer'] * bus_nature_df['nature']).mean() # Center
bus_nature_df['stars_reviewer2_nature'] = bus_nature_df['stars_reviewer2'] * bus_nature_df['nature'] - (bus_nature_df['stars_reviewer2'] * bus_nature_df['nature']).mean() # Center
bus_nature_df['tot_votes_nature'] = bus_nature_df['tot_votes'] * bus_nature_df['nature'] - (bus_nature_df['tot_votes'] * bus_nature_df['nature']).mean() # Center
bus_nature_df['experience_similarity'] = train['experience_similarity'] - train['experience_similarity'].mean() # Center
bus_nature_df['stars_reviewer_experience_similarity'] = bus_nature_df['stars_reviewer'] * bus_nature_df['experience_similarity'] - (bus_nature_df['stars_reviewer'] * bus_nature_df['experience_similarity'] ).mean() # Center
bus_nature_df['stars_reviewer2_experience_similarity'] = bus_nature_df['stars_reviewer2'] * bus_nature_df['experience_similarity'] - (bus_nature_df['stars_reviewer2'] * bus_nature_df['experience_similarity'] ).mean() # Center
bus_nature_df['search_similarity'] = train['search_similarity'] - train['search_similarity'].mean() # Center
bus_nature_df['word_count_search_similarity'] = bus_nature_df['word_count'] * bus_nature_df['search_similarity'] - (bus_nature_df['word_count'] * bus_nature_df['search_similarity']).mean() # Center
bus_nature_df['word_count_experience_similarity'] = bus_nature_df['word_count'] * bus_nature_df['experience_similarity'] - (bus_nature_df['word_count'] * bus_nature_df['experience_similarity']).mean() # Center
bus_nature_df['reviewer_cum_helpful'] = train['reviewer_cum_helpful'] - train['reviewer_cum_helpful'].mean() # Center

X = bus_nature_df[['stars_reviewer', 'stars_reviewer2', 'nature', 'word_count', 'tot_votes',
                   'stars_reviewer_nature', 'stars_reviewer2_nature', 'tot_votes_nature', 'experience_similarity',
                   'stars_reviewer_experience_similarity', 'stars_reviewer2_experience_similarity',
                   'search_similarity', 'word_count_search_similarity', 'word_count_experience_similarity', 'reviewer_cum_helpful']]
Y_train = train['helpful'].copy()

# Scale X Features to help with numerical stability
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train = pd.DataFrame(X_scaled, columns=X.columns, index = X.index)


# Create + fit tobit regression model
bus_nature_df['cens'] = train['helpful'].apply(lambda x: -1 if x==0 else 0)
model4 = TobitModel()
results = model4.fit(X_train, Y_train, cens=bus_nature_df['cens'], verbose=True, tol=1e-2)

# Print estimated coefficients
print("Estimated coefficients:", model4.coef_)
print("Intercept:", model4.intercept_)
print("Sigma (standard deviation):", model4.sigma_)

Optimization terminated successfully.
         Current function value: 382023.188232
         Iterations: 31
         Function evaluations: 47
         Gradient evaluations: 47
  message: Optimization terminated successfully.
  success: True
   status: 0
      fun: 382023.18823195633
        x: [-9.502e-03 -4.261e-03 ...  1.691e-01  3.719e-01]
      nit: 31
      jac: [ 9.391e-04 -1.210e-03 ...  1.370e-03  4.745e-04]
 hess_inv: [[ 3.098e-07 -1.938e-08 ... -1.140e-08 -1.052e-07]
            [-1.938e-08  1.604e-05 ... -4.612e-08 -1.590e-08]
            ...
            [-1.140e-08 -4.612e-08 ...  5.587e-07  1.351e-08]
            [-1.052e-07 -1.590e-08 ...  1.351e-08  1.750e-07]]
     nfev: 47
     njev: 47
Estimated coefficients: [-0.00426127  0.00605166 -0.00125167  0.0395064   0.16773353 -0.01460555
  0.00048588  0.17772824  0.00843208  0.00039455  0.00046968 -0.00035106
 -0.00254738  0.00647079  0.16906978]
Intercept: -0.0042612702310075605
Sigma (standard deviation): 0.37190562433312

In [None]:
# VAL New Model with reviewer_cum_helpful (Tobit Regression):
# Helpfulness % = β1Rating + β2Rating2 + β3 Business Nature + β4 Word Count + β5 Total Votes + β6 Rating × Business Nature +
# β7 Rating^2 × Business Nature + β8 Word Count × Business Nature + β9 Search Similarity + β10 Rating × Search Similarity + β11 Rating^2 × Search Similarity +
# β12 Experience Similarity + β13 Experience Similarity x Word Count + β14 Reviewer Cum Helpful + ε
bus_nature_df = pd.DataFrame()

# Create interaction terms + Center
bus_nature_df['stars_reviewer'] = val['stars_reviewer'] - val['stars_reviewer'].mean() # Center stars_reviewer
bus_nature_df['stars_reviewer2'] = bus_nature_df['stars_reviewer'] ** 2 - (bus_nature_df['stars_reviewer'] ** 2).mean()
bus_nature_df['nature'] = val['nature']
bus_nature_df['word_count'] = val['num_words'] - val['num_words'].mean() # Center
bus_nature_df['tot_votes'] = val['useful'] - val['useful'].mean() # Center
bus_nature_df['stars_reviewer_nature'] = bus_nature_df['stars_reviewer'] * bus_nature_df['nature'] - (bus_nature_df['stars_reviewer'] * bus_nature_df['nature']).mean() # Center
bus_nature_df['stars_reviewer2_nature'] = bus_nature_df['stars_reviewer2'] * bus_nature_df['nature'] - (bus_nature_df['stars_reviewer2'] * bus_nature_df['nature']).mean() # Center
bus_nature_df['tot_votes_nature'] = bus_nature_df['tot_votes'] * bus_nature_df['nature'] - (bus_nature_df['tot_votes'] * bus_nature_df['nature']).mean() # Center
bus_nature_df['experience_similarity'] = val['experience_similarity'] - val['experience_similarity'].mean() # Center
bus_nature_df['stars_reviewer_experience_similarity'] = bus_nature_df['stars_reviewer'] * bus_nature_df['experience_similarity'] - (bus_nature_df['stars_reviewer'] * bus_nature_df['experience_similarity'] ).mean() # Center
bus_nature_df['stars_reviewer2_experience_similarity'] = bus_nature_df['stars_reviewer2'] * bus_nature_df['experience_similarity'] - (bus_nature_df['stars_reviewer2'] * bus_nature_df['experience_similarity'] ).mean() # Center
bus_nature_df['search_similarity'] = val['search_similarity'] - val['search_similarity'].mean() # Center
bus_nature_df['word_count_search_similarity'] = bus_nature_df['word_count'] * bus_nature_df['search_similarity'] - (bus_nature_df['word_count'] * bus_nature_df['search_similarity']).mean() # Center
bus_nature_df['word_count_experience_similarity'] = bus_nature_df['word_count'] * bus_nature_df['experience_similarity'] - (bus_nature_df['word_count'] * bus_nature_df['experience_similarity']).mean() # Center
bus_nature_df['reviewer_cum_helpful'] = train['reviewer_cum_helpful'] - train['reviewer_cum_helpful'].mean() # Center

X = bus_nature_df[['stars_reviewer', 'stars_reviewer2', 'nature', 'word_count', 'tot_votes',
                   'stars_reviewer_nature', 'stars_reviewer2_nature', 'tot_votes_nature', 'experience_similarity',
                   'stars_reviewer_experience_similarity', 'stars_reviewer2_experience_similarity',
                   'search_similarity', 'word_count_search_similarity', 'word_count_experience_similarity', 'reviewer_cum_helpful']]

# X = bus_nature_df[['stars_reviewer', 'stars_reviewer2', 'nature', 'word_count', 'tot_votes',
#                    'stars_reviewer_nature', 'stars_reviewer2_nature', 'tot_votes_nature', 'experience_similarity',
#                    'stars_reviewer_experience_similarity', 'stars_reviewer2_experience_similarity',
#                    'search_similarity', 'word_count_search_similarity', 'word_count_experience_similarity']]
Y_val = val['helpful'].copy()

# Scale X Features to help with numerical stability
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_val = pd.DataFrame(X_scaled, columns=X.columns, index = X.index)

pred = model4.predict(X_val)

# Root Mean Squared Error
mse = mean_squared_error(Y_val, pred)
rmse = np.sqrt(mse)
print(f'Root Mean Squared Error: {rmse}')

# Mean Absolute Error
abs_err = np.abs(Y_val - pred)
mae = np.mean(abs_err)
print(f'Mean Absolute Error: {mae}')

# NDCG
true_helpful = np.asarray([Y_val])
pred_helpful = np.asarray([pred])

model_4_ndcg_5 = ndcg_score(true_helpful, pred_helpful, k=5)
model_4_ndcg_25 = ndcg_score(true_helpful, pred_helpful, k=25)
model_4_ndcg_50 = ndcg_score(true_helpful, pred_helpful, k=50)
model_4_ndcg_100 = ndcg_score(true_helpful, pred_helpful, k=100)
model_4_ndcg_1000 = ndcg_score(true_helpful, pred_helpful, k=1000)
model_4_ndcg_all = ndcg_score(true_helpful, pred_helpful)

# Print nDCG scores for each k value
print(f'NDCG Score for k=5: {model_4_ndcg_5}')
print(f'NDCG Score for k=25: {model_4_ndcg_25}')
print(f'NDCG Score for k=50: {model_4_ndcg_50}')
print(f'NDCG Score for k=100: {model_4_ndcg_100}')
print(f'NDCG Score for k=1000: {model_4_ndcg_1000}')
print(f'NDCG Score for all: {model_4_ndcg_all}')

Root Mean Squared Error: 0.37637321938822416
Mean Absolute Error: 0.24898061412299852
NDCG Score for k=5: 0.8365084025722667
NDCG Score for k=25: 0.7919706041603981
NDCG Score for k=50: 0.723105860420288
NDCG Score for k=100: 0.6940531710092211
NDCG Score for k=1000: 0.7252158883716079
NDCG Score for all: 0.944766037466722


# Business Nature Model 5 (Tobit Regression w/ Nature + Review Stars + Word Count + Total Votes + Reviewer Cum. Helpfulness, Majority Nature Class + Nature Weights based on Extractive Summary of Review (BERT embeddings))


In [None]:
train = pd.read_parquet('/content/drive/MyDrive/Code + Data/bn_train_data_final_bert.parquet', columns=['review_id', 'stars_reviewer', 'nature', 'num_words', 'useful', 'bert_experience_similarity', 'bert_search_similarity', 'reviewer_cum_helpful', 'helpful'])
val = pd.read_parquet('/content/drive/MyDrive/Code + Data/bn_val_data_final_bert.parquet', columns=['review_id', 'stars_reviewer', 'nature', 'num_words', 'useful', 'bert_experience_similarity', 'bert_search_similarity', 'reviewer_cum_helpful', 'helpful'])
test = pd.read_parquet('/content/drive/MyDrive/Code + Data/bn_test_data_final_bert.parquet', columns=['review_id', 'stars_reviewer', 'nature', 'num_words', 'useful', 'bert_experience_similarity', 'bert_search_similarity', 'reviewer_cum_helpful', 'helpful'])

In [None]:
# TRAIN New Model with reviewer_cum_helpful (Tobit Regression):
# Helpfulness % = β1Rating + β2Rating2 + β3 Business Nature + β4 Word Count + β5 Total Votes + β6 Rating × Business Nature +
# β7 Rating^2 × Business Nature + β8 Word Count × Business Nature + β9 Experience Similarity + β10 Rating × Experience Similarity + β11 Rating^2 × Experience Similarity +
# β12 Search Similarity + β13 Word Count x Search Similarity + β14 Reviewer Cum Helpful + ε
bus_nature_df = pd.DataFrame()

# Create interaction terms + Center
bus_nature_df['stars_reviewer'] = train['stars_reviewer'] - train['stars_reviewer'].mean() # Center stars_reviewer
bus_nature_df['stars_reviewer2'] = bus_nature_df['stars_reviewer'] ** 2 - (bus_nature_df['stars_reviewer'] ** 2).mean()
bus_nature_df['nature'] = train['nature']
bus_nature_df['word_count'] = train['num_words'] - train['num_words'].mean() # Center
bus_nature_df['tot_votes'] = train['useful'] - train['useful'].mean() # Center
bus_nature_df['stars_reviewer_nature'] = bus_nature_df['stars_reviewer'] * bus_nature_df['nature'] - (bus_nature_df['stars_reviewer'] * bus_nature_df['nature']).mean() # Center
bus_nature_df['stars_reviewer2_nature'] = bus_nature_df['stars_reviewer2'] * bus_nature_df['nature'] - (bus_nature_df['stars_reviewer2'] * bus_nature_df['nature']).mean() # Center
bus_nature_df['tot_votes_nature'] = bus_nature_df['tot_votes'] * bus_nature_df['nature'] - (bus_nature_df['tot_votes'] * bus_nature_df['nature']).mean() # Center
bus_nature_df['bert_experience_similarity'] = train['bert_experience_similarity'] - train['bert_experience_similarity'].mean() # Center
bus_nature_df['stars_reviewer_bert_experience_similarity'] = bus_nature_df['stars_reviewer'] * bus_nature_df['bert_experience_similarity'] - (bus_nature_df['stars_reviewer'] * bus_nature_df['bert_experience_similarity'] ).mean() # Center
bus_nature_df['stars_reviewer2_bert_experience_similarity'] = bus_nature_df['stars_reviewer2'] * bus_nature_df['bert_experience_similarity'] - (bus_nature_df['stars_reviewer2'] * bus_nature_df['bert_experience_similarity'] ).mean() # Center
bus_nature_df['bert_search_similarity'] = train['bert_search_similarity'] - train['bert_search_similarity'].mean() # Center
bus_nature_df['word_count_bert_search_similarity'] = bus_nature_df['word_count'] * bus_nature_df['bert_search_similarity'] - (bus_nature_df['word_count'] * bus_nature_df['bert_search_similarity']).mean() # Center
bus_nature_df['word_count_bert_experience_similarity'] = bus_nature_df['word_count'] * bus_nature_df['bert_experience_similarity'] - (bus_nature_df['word_count'] * bus_nature_df['bert_experience_similarity']).mean() # Center
bus_nature_df['reviewer_cum_helpful'] = train['reviewer_cum_helpful'] - train['reviewer_cum_helpful'].mean() # Center

X = bus_nature_df[['stars_reviewer', 'stars_reviewer2', 'nature', 'word_count', 'tot_votes',
                   'stars_reviewer_nature', 'stars_reviewer2_nature', 'tot_votes_nature', 'bert_experience_similarity',
                   'stars_reviewer_bert_experience_similarity', 'stars_reviewer2_bert_experience_similarity',
                   'bert_search_similarity', 'word_count_bert_search_similarity', 'word_count_bert_experience_similarity', 'reviewer_cum_helpful']]
Y_train = train['helpful'].copy()

# Scale X Features to help with numerical stability
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train = pd.DataFrame(X_scaled, columns=X.columns, index = X.index)


# Create + fit tobit regression model
bus_nature_df['cens'] = train['helpful'].apply(lambda x: -1 if x==0 else 0)
model5 = TobitModel()
results = model5.fit(X_train, Y_train, cens=bus_nature_df['cens'], verbose=True, tol=1e-4)

# Print estimated coefficients
print("Estimated coefficients:", model5.coef_)
print("Intercept:", model5.intercept_)
print("Sigma (standard deviation):", model5.sigma_)

Optimization terminated successfully.
         Current function value: 380642.899500
         Iterations: 30
         Function evaluations: 46
         Gradient evaluations: 46
  message: Optimization terminated successfully.
  success: True
   status: 0
      fun: 380642.8995000776
        x: [-1.051e-02 -2.426e-03 ...  1.684e-01  3.718e-01]
      nit: 30
      jac: [-1.338e-06 -4.689e-06 ... -9.155e-06  6.873e-06]
 hess_inv: [[ 3.000e-07 -5.380e-08 ... -7.124e-09 -9.154e-08]
            [-5.380e-08  1.645e-05 ... -6.839e-08 -2.827e-08]
            ...
            [-7.124e-09 -6.839e-08 ...  5.530e-07  1.532e-08]
            [-9.154e-08 -2.827e-08 ...  1.532e-08  1.670e-07]]
     nfev: 46
     njev: 46
Estimated coefficients: [-0.00242579  0.00340079 -0.00401012  0.04708106  0.16585489 -0.0136683
  0.00456875  0.17992534  0.01700744 -0.00211266 -0.00399082 -0.00400449
 -0.00692555 -0.02529484  0.16840437]
Intercept: -0.002425789989561379
Sigma (standard deviation): 0.3718372463887286


In [None]:
# VAL New Model with reviewer_cum_helpful (Tobit Regression):
# Helpfulness % = β1Rating + β2Rating2 + β3 Business Nature + β4 Word Count + β5 Total Votes + β6 Rating × Business Nature +
# β7 Rating^2 × Business Nature + β8 Word Count × Business Nature + β9 Search Similarity + β10 Rating × Search Similarity + β11 Rating^2 × Search Similarity +
# β12 Experience Similarity + β13 Experience Similarity x Word Count + β14 Reviewer Cum Helpful + ε
bus_nature_df = pd.DataFrame()

# Create interaction terms + Center
bus_nature_df['stars_reviewer'] = val['stars_reviewer'] - val['stars_reviewer'].mean() # Center stars_reviewer
bus_nature_df['stars_reviewer2'] = bus_nature_df['stars_reviewer'] ** 2 - (bus_nature_df['stars_reviewer'] ** 2).mean()
bus_nature_df['nature'] = val['nature']
bus_nature_df['word_count'] = val['num_words'] - val['num_words'].mean() # Center
bus_nature_df['tot_votes'] = val['useful'] - val['useful'].mean() # Center
bus_nature_df['stars_reviewer_nature'] = bus_nature_df['stars_reviewer'] * bus_nature_df['nature'] - (bus_nature_df['stars_reviewer'] * bus_nature_df['nature']).mean() # Center
bus_nature_df['stars_reviewer2_nature'] = bus_nature_df['stars_reviewer2'] * bus_nature_df['nature'] - (bus_nature_df['stars_reviewer2'] * bus_nature_df['nature']).mean() # Center
bus_nature_df['tot_votes_nature'] = bus_nature_df['tot_votes'] * bus_nature_df['nature'] - (bus_nature_df['tot_votes'] * bus_nature_df['nature']).mean() # Center
bus_nature_df['bert_experience_similarity'] = val['bert_experience_similarity'] - val['bert_experience_similarity'].mean() # Center
bus_nature_df['stars_reviewer_bert_experience_similarity'] = bus_nature_df['stars_reviewer'] * bus_nature_df['bert_experience_similarity'] - (bus_nature_df['stars_reviewer'] * bus_nature_df['bert_experience_similarity'] ).mean() # Center
bus_nature_df['stars_reviewer2_bert_experience_similarity'] = bus_nature_df['stars_reviewer2'] * bus_nature_df['bert_experience_similarity'] - (bus_nature_df['stars_reviewer2'] * bus_nature_df['bert_experience_similarity'] ).mean() # Center
bus_nature_df['bert_search_similarity'] = val['bert_search_similarity'] - val['bert_search_similarity'].mean() # Center
bus_nature_df['word_count_bert_search_similarity'] = bus_nature_df['word_count'] * bus_nature_df['bert_search_similarity'] - (bus_nature_df['word_count'] * bus_nature_df['bert_search_similarity']).mean() # Center
bus_nature_df['word_count_bert_experience_similarity'] = bus_nature_df['word_count'] * bus_nature_df['bert_experience_similarity'] - (bus_nature_df['word_count'] * bus_nature_df['bert_experience_similarity']).mean() # Center
bus_nature_df['reviewer_cum_helpful'] = val['reviewer_cum_helpful'] - val['reviewer_cum_helpful'].mean() # Center

X = bus_nature_df[['stars_reviewer', 'stars_reviewer2', 'nature', 'word_count', 'tot_votes',
                   'stars_reviewer_nature', 'stars_reviewer2_nature', 'tot_votes_nature', 'bert_experience_similarity',
                   'stars_reviewer_bert_experience_similarity', 'stars_reviewer2_bert_experience_similarity',
                   'bert_search_similarity', 'word_count_bert_search_similarity', 'word_count_bert_experience_similarity', 'reviewer_cum_helpful']]
Y_val = val['helpful'].copy()

# Scale X Features to help with numerical stability
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_val = pd.DataFrame(X_scaled, columns=X.columns, index = X.index)

predictions_final = model5.predict(X_val)

# Root Mean Squared Error
mse = mean_squared_error(Y_val, predictions_final)
rmse = np.sqrt(mse)
print(f'Root Mean Squared Error: {rmse}')

# Mean Absolute Error
abs_err = np.abs(Y_val - predictions_final)
mae = np.mean(abs_err)
print(f'Mean Absolute Error: {mae}')

# NDCG
true_helpful = np.asarray([Y_val])
pred_helpful = np.asarray([predictions_final])

model_5_ndcg_5 = ndcg_score(true_helpful, pred_helpful, k=5)
model_5_ndcg_25 = ndcg_score(true_helpful, pred_helpful, k=25)
model_5_ndcg_50 = ndcg_score(true_helpful, pred_helpful, k=50)
model_5_ndcg_100 = ndcg_score(true_helpful, pred_helpful, k=100)
model_5_ndcg_1000 = ndcg_score(true_helpful, pred_helpful, k=1000)
model_5_ndcg_all = ndcg_score(true_helpful, pred_helpful)

# Print nDCG scores for each k value
print(f'NDCG Score for k=5: {model_5_ndcg_5}')
print(f'NDCG Score for k=25: {model_5_ndcg_25}')
print(f'NDCG Score for k=50: {model_5_ndcg_50}')
print(f'NDCG Score for k=100: {model_5_ndcg_100}')
print(f'NDCG Score for k=1000: {model_5_ndcg_1000}')
print(f'NDCG Score for all: {model_5_ndcg_all}')

Root Mean Squared Error: 0.3372890311560322
Mean Absolute Error: 0.2385652342541225
NDCG Score for k=5: 0.8436593709380522
NDCG Score for k=25: 0.8105832473470997
NDCG Score for k=50: 0.744044301718769
NDCG Score for k=100: 0.7429902426396873
NDCG Score for k=1000: 0.8075936777497842
NDCG Score for all: 0.9672940015869753


In [None]:
val['helpful_predictions_final'] = predictions_final # For stacked model

In [None]:
val.to_parquet('/content/drive/MyDrive/Code + Data/bn_val_final_bert_predictions.parquet') # For stacked model


In [None]:
# TEST New Model with reviewer_cum_helpful (Tobit Regression):
# Helpfulness % = β1Rating + β2Rating2 + β3 Business Nature + β4 Word Count + β5 Total Votes + β6 Rating × Business Nature +
# β7 Rating^2 × Business Nature + β8 Word Count × Business Nature + β9 Search Similarity + β10 Rating × Search Similarity + β11 Rating^2 × Search Similarity +
# β12 Experience Similarity + β13 Experience Similarity x Word Count + β14 Reviewer Cum Helpful + ε
bus_nature_df = pd.DataFrame()

# Create interaction terms + Center
bus_nature_df['stars_reviewer'] = test['stars_reviewer'] - test['stars_reviewer'].mean() # Center stars_reviewer
bus_nature_df['stars_reviewer2'] = bus_nature_df['stars_reviewer'] ** 2 - (bus_nature_df['stars_reviewer'] ** 2).mean()
bus_nature_df['nature'] = test['nature']
bus_nature_df['word_count'] = test['num_words'] - test['num_words'].mean() # Center
bus_nature_df['tot_votes'] = test['useful'] - test['useful'].mean() # Center
bus_nature_df['stars_reviewer_nature'] = bus_nature_df['stars_reviewer'] * bus_nature_df['nature'] - (bus_nature_df['stars_reviewer'] * bus_nature_df['nature']).mean() # Center
bus_nature_df['stars_reviewer2_nature'] = bus_nature_df['stars_reviewer2'] * bus_nature_df['nature'] - (bus_nature_df['stars_reviewer2'] * bus_nature_df['nature']).mean() # Center
bus_nature_df['tot_votes_nature'] = bus_nature_df['tot_votes'] * bus_nature_df['nature'] - (bus_nature_df['tot_votes'] * bus_nature_df['nature']).mean() # Center
bus_nature_df['bert_experience_similarity'] = test['bert_experience_similarity'] - test['bert_experience_similarity'].mean() # Center
bus_nature_df['stars_reviewer_bert_experience_similarity'] = bus_nature_df['stars_reviewer'] * bus_nature_df['bert_experience_similarity'] - (bus_nature_df['stars_reviewer'] * bus_nature_df['bert_experience_similarity'] ).mean() # Center
bus_nature_df['stars_reviewer2_bert_experience_similarity'] = bus_nature_df['stars_reviewer2'] * bus_nature_df['bert_experience_similarity'] - (bus_nature_df['stars_reviewer2'] * bus_nature_df['bert_experience_similarity'] ).mean() # Center
bus_nature_df['bert_search_similarity'] = test['bert_search_similarity'] - test['bert_search_similarity'].mean() # Center
bus_nature_df['word_count_bert_search_similarity'] = bus_nature_df['word_count'] * bus_nature_df['bert_search_similarity'] - (bus_nature_df['word_count'] * bus_nature_df['bert_search_similarity']).mean() # Center
bus_nature_df['word_count_bert_experience_similarity'] = bus_nature_df['word_count'] * bus_nature_df['bert_experience_similarity'] - (bus_nature_df['word_count'] * bus_nature_df['bert_experience_similarity']).mean() # Center
bus_nature_df['reviewer_cum_helpful'] = test['reviewer_cum_helpful'] - test['reviewer_cum_helpful'].mean() # Center

X = bus_nature_df[['stars_reviewer', 'stars_reviewer2', 'nature', 'word_count', 'tot_votes',
                   'stars_reviewer_nature', 'stars_reviewer2_nature', 'tot_votes_nature', 'bert_experience_similarity',
                   'stars_reviewer_bert_experience_similarity', 'stars_reviewer2_bert_experience_similarity',
                   'bert_search_similarity', 'word_count_bert_search_similarity', 'word_count_bert_experience_similarity', 'reviewer_cum_helpful']]
Y_test = test['helpful'].copy()

# Scale X Features to help with numerical stability
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test = pd.DataFrame(X_scaled, columns=X.columns, index = X.index)

predictions_final_test = model5.predict(X_test)

# Root Mean Squared Error
mse = mean_squared_error(Y_test, predictions_final)
rmse = np.sqrt(mse)
print(f'Root Mean Squared Error: {rmse}')

# Mean Absolute Error
abs_err = np.abs(Y_test - predictions_final)
mae = np.mean(abs_err)
print(f'Mean Absolute Error: {mae}')

# NDCG
true_helpful = np.asarray([Y_test])
pred_helpful = np.asarray([predictions_final])

model_5_ndcg_5 = ndcg_score(true_helpful, pred_helpful, k=5)
model_5_ndcg_25 = ndcg_score(true_helpful, pred_helpful, k=25)
model_5_ndcg_50 = ndcg_score(true_helpful, pred_helpful, k=50)
model_5_ndcg_100 = ndcg_score(true_helpful, pred_helpful, k=100)
model_5_ndcg_1000 = ndcg_score(true_helpful, pred_helpful, k=1000)
model_5_ndcg_all = ndcg_score(true_helpful, pred_helpful)

# Print nDCG scores for each k value
print(f'NDCG Score for k=5: {model_5_ndcg_5}')
print(f'NDCG Score for k=25: {model_5_ndcg_25}')
print(f'NDCG Score for k=50: {model_5_ndcg_50}')
print(f'NDCG Score for k=100: {model_5_ndcg_100}')
print(f'NDCG Score for k=1000: {model_5_ndcg_1000}')
print(f'NDCG Score for all: {model_5_ndcg_all}')

Root Mean Squared Error: 0.3372890311560322
Mean Absolute Error: 0.2385652342541225
NDCG Score for k=5: 0.8436593709380522
NDCG Score for k=25: 0.8105832473470997
NDCG Score for k=50: 0.744044301718769
NDCG Score for k=100: 0.7429902426396873
NDCG Score for k=1000: 0.8075936777497842
NDCG Score for all: 0.9672940015869753


In [None]:
test['helpful_predictions_final'] = predictions_final_test # For stacked model
test.to_parquet('/content/drive/MyDrive/Code + Data/bn_test_final_bert_predictions.parquet') # For stacked model
