In [1]:
# Mount Google Drive inside the Colab VM at /content/drive; the data files
# read by later cells live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install pyspellchecker
!pip install py-readability-metrics
!pip install textstat
!pip install pyarrow
!pip install transformers
!pip install tqdm
!pip install datasets
!pip install tensorflow
!pip install torch

Collecting pyspellchecker
  Downloading pyspellchecker-0.8.1-py3-none-any.whl.metadata (9.4 kB)
Downloading pyspellchecker-0.8.1-py3-none-any.whl (6.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyspellchecker
Successfully installed pyspellchecker-0.8.1
Collecting py-readability-metrics
  Downloading py_readability_metrics-1.4.5-py3-none-any.whl.metadata (8.8 kB)
Downloading py_readability_metrics-1.4.5-py3-none-any.whl (26 kB)
Installing collected packages: py-readability-metrics
Successfully installed py-readability-metrics-1.4.5
Collecting textstat
  Downloading textstat-0.7.4-py3-none-any.whl.metadata (14 kB)
Collecting pyphen (from textstat)
  Downloading pyphen-0.17.0-py3-none-any.whl.metadata (3.2 kB)
Downloading textstat-0.7.4-py3-none-any.whl (105 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.1/105.1 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf

import textstat
from sklearn.feature_extraction.text import CountVectorizer
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from spellchecker import SpellChecker
from readability import Readability

from transformers import pipeline
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments
from transformers import TFAutoModelForSequenceClassification

from datasets import Dataset

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, ndcg_score

import os
os.chdir('/content/drive/MyDrive/Code + Data')
import tobit
from tobit import TobitModel

import gc
from tqdm import tqdm


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# (1) Baseline Model: Review Length

In [5]:
## read yelp_dataset_for_model.csv
chunk_size = 100000

# Accumulate the chunks in a list and concatenate once at the end.
# Calling pd.concat inside the loop copies every previously read row again on
# each iteration, so the total work grows quadratically with the file size.
yelp_chunks = []

# Read CSV in chunks
with pd.read_csv('/content/drive/MyDrive/Code + Data/yelp_dataset_for_model.csv', chunksize=chunk_size) as reader:
    for i, chunk in enumerate(reader):
        yelp_chunks.append(chunk)
        # Drop the loop name only; the list keeps the chunk alive.
        del chunk

        if (i + 1) % 5 == 0:
            print(f'Progress: {(i + 1) * chunk_size} rows processed')

# Single concatenation producing the full frame with a fresh 0..n-1 index.
yelp_data_full = pd.concat(yelp_chunks, ignore_index=True)

# Release the per-chunk frames now that their rows live in yelp_data_full.
del yelp_chunks
gc.collect()

Progress: 500000 rows processed


  for i, chunk in enumerate(reader):
  for i, chunk in enumerate(reader):


Progress: 1000000 rows processed
Progress: 1500000 rows processed


  for i, chunk in enumerate(reader):


In [6]:
# Continue under the shorter working name and remove the loading-stage name
# so only one reference to the frame remains in the namespace.
yelp_data = yelp_data_full
del yelp_data_full

# Row count first, then the column index, each on its own line.
print(len(yelp_data), yelp_data.columns, sep='\n')

1872289
Index(['review_id', 'user_id', 'business_id', 'stars_reviewer', 'useful',
       'text', 'name', 'postal_code', 'stars_business', 'categories',
       'total_reviews_for_business', 'helpful', 'num_sentences'],
      dtype='object')


Delete Non-English Reviews

In [7]:
# Load the review_ids that were flagged as non-English.
non_english_ids = pd.read_csv("/content/drive/MyDrive/Code + Data/non_english_ids.csv")

# Row count going in
print(f'There are {len(yelp_data)} rows before dropping')

# True for every review whose id is NOT in the non-English list
mask = ~yelp_data['review_id'].isin(non_english_ids['review_id'])
yelp_data = yelp_data[mask]

# Row count coming out
print(f'There are {len(yelp_data)} rows after dropping')

There are 1872289 rows before dropping
There are 1870042 rows after dropping


In [8]:
def word_count(line):
  """Return the number of whitespace-separated tokens in `line`.

  Runs of whitespace count as a single separator, and leading/trailing
  whitespace is ignored, so an empty or all-blank string yields 0.
  """
  tokens = line.split()
  return len(tokens)

In [9]:
## number of words
# Apply word_count to every review text and store the result as a new column.
yelp_data['num_words'] = yelp_data['text'].apply(word_count)

# Spot-check the value computed for the first row.
print(yelp_data.iloc[0][['num_words']])

num_words    152
Name: 0, dtype: object


### Baseline Model: Linear Regression Predicting Helpfulness from `num_words`

In [10]:
### Baseline Model- predict helpfulness based on num_words

# One-feature design matrix (word count) and the target column.
# (`useful` is an alternative target column present in the data.)
X = yelp_data[['num_words']]
Y = yelp_data['helpful']

# 80/20 split with a fixed seed so the partition is repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# Ordinary least squares fit on the training rows.
baseline_model = LinearRegression()
baseline_model.fit(X_train, Y_train)

# Score the held-out rows.
baseline_y_pred = baseline_model.predict(X_test)
baseline_mse = mean_squared_error(Y_test, baseline_y_pred)

print("\n".join([
    f'MSE: {baseline_mse}',
    f'Coefficient (for num_words) {baseline_model.coef_[0]}',
    f'Intercept: {baseline_model.intercept_}',
]))


MSE: 1.7162108848641004e-05
Coefficient (for num_words) 1.1207402393122102e-05
Intercept: 0.0003017793755250274


In [11]:
# Summary statistics of the per-review word count.
word_counts = yelp_data['num_words']

print("Minimum word count:", word_counts.min())
print("Maximum word count:", word_counts.max())
print("Mean word count:", round(word_counts.mean(), 2))
print("Median word count:", word_counts.median())

Minimum word count: 1
Maximum word count: 1032
Mean word count: 104.22
Median word count: 74.0


# Tobit for Baseline

In [13]:
# Fit a Tobit (censored) regression of `helpful` on the centered word count
# and report error metrics on the held-out split.

# Take an explicit copy: adding a column to the bare `yelp_data[['num_words']]`
# selection raises pandas' SettingWithCopyWarning.
X = yelp_data[['num_words']].copy()
Y = yelp_data['helpful']

# Censoring indicator aligned with Y: -1 = left-censored, 0 = uncensored,
# 1 = right-censored.
# NOTE(review): the right-censoring bound of 10 is assumed to match the scale
# of `helpful` — confirm against how that column was constructed.
cens = pd.Series(0, index=Y.index)
cens[Y <= 0] = -1  # Left-censored
cens[Y >= 10] = 1  # Right-censored

# Center features (the mean is taken over all rows, before the split)
X['num_words_centered'] = X['num_words'] - X['num_words'].mean()

# Train-test split; same test_size and seed as the linear baseline, and the
# censoring indicator is split together with X and Y so rows stay aligned.
X_train, X_test, Y_train, Y_test, cens_train, cens_test = train_test_split(
    X[['num_words_centered']], Y, cens, test_size=0.2, random_state=42
)

# NOTE(review): in the recorded run the optimizer reported `success: False`
# ("precision loss"); consider rescaling the inputs and re-checking the fit.
tobit_model = TobitModel(fit_intercept=True)
tobit_model.fit(X_train, Y_train, cens_train, verbose=True)

# Predict on test data
tobit_y_pred = tobit_model.predict(X_test)

# Calculate performance metrics
tobit_mse = mean_squared_error(Y_test, tobit_y_pred)
tobit_rmse = np.sqrt(tobit_mse)
tobit_mae = mean_absolute_error(Y_test, tobit_y_pred)

print(f'Tobit MSE: {tobit_mse}')
print(f'Tobit RMSE: {tobit_rmse}')
print(f'Tobit MAE: {tobit_mae}')
print(f'Tobit Coefficient (for num_words_centered): {tobit_model.coef_[0]}')
# NOTE(review): the recorded output shows this value identical to the
# coefficient above — verify how `intercept_` is set in the local tobit module.
print(f'Tobit Intercept: {tobit_model.intercept_}')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['num_words_centered'] = X['num_words'] - X['num_words'].mean()


         Current function value: -1550548.597557
         Iterations: 21
         Function evaluations: 26
         Gradient evaluations: 26
  message: Desired error not necessarily achieved due to precision loss.
  success: False
   status: 2
      fun: -1550548.5975574781
        x: [-3.302e-03  2.679e-05  7.579e-03]
      nit: 21
      jac: [-7.830e-04 -3.609e-01  6.467e-03]
 hess_inv: [[ 8.009e-11 -1.396e-13 -3.916e-11]
            [-1.396e-13  5.118e-15  1.167e-13]
            [-3.916e-11  1.167e-13  5.668e-11]]
     nfev: 26
     njev: 26
Tobit MSE: 2.1570952985241883e-05
Tobit RMSE: 0.004644454002920245
Tobit MAE: 0.002507376175627481
Tobit Coefficient (for num_words_centered): 2.679071142156193e-05
Tobit Intercept: 2.679071142156193e-05


  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)


# NDCG for Baseline Linear Regression

In [None]:
# Wrap both vectors in an outer list so they become 2D (one row each):
# the whole test set is scored as a single ranking.
true_helpful_baseline = np.asarray([Y_test])  # ground truth
pred_helpful_baseline = np.asarray([baseline_y_pred])  # model scores

# NDCG truncated at each cutoff k, in the order the names are listed.
(
    baseline_ndcg_5,
    baseline_ndcg_25,
    baseline_ndcg_50,
    baseline_ndcg_100,
    baseline_ndcg_1000,
) = (
    ndcg_score(true_helpful_baseline, pred_helpful_baseline, k=cutoff)
    for cutoff in (5, 25, 50, 100, 1000)
)
# No cutoff: NDCG over the full ranking.
baseline_ndcg_all = ndcg_score(true_helpful_baseline, pred_helpful_baseline)

# Report the scores, one per line.
print("\n".join([
    "NDCG Scores for Baseline Model (Linear Regression):",
    f"NDCG@ 5: {baseline_ndcg_5}",
    f"NDCG@ 25: {baseline_ndcg_25}",
    f"NDCG@ 50: {baseline_ndcg_50}",
    f"NDCG@ 100: {baseline_ndcg_100}",
    f"NDCG@ 1000: {baseline_ndcg_1000}",
    f"Overall NDCG: {baseline_ndcg_all}",
]))

NDCG Scores for Baseline Model (Linear Regression):
NDCG@ 5: 0.00915340980076519
NDCG@ 25: 0.008654044902126736
NDCG@ 50: 0.017524051077344165
NDCG@ 100: 0.028503686439487724
NDCG@ 1000: 0.09019583058810085
Overall NDCG: 0.8138128068007071


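# NDCG for Baseline Tobit Regression

Note: the scores below are identical to those of the linear-regression baseline. Both models use word count as their only feature with a positive coefficient and are evaluated on the same test split, so they rank the reviews in exactly the same order, and NDCG depends only on that order.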

In [14]:
# Wrap both vectors in an outer list so they become 2D (one row each):
# the whole test set is scored as a single ranking.
true_helpful_tobit = np.asarray([Y_test])  # ground truth
pred_helpful_tobit = np.asarray([tobit_y_pred])  # model scores

# NDCG truncated at each cutoff k, in the order the names are listed.
(
    tobit_ndcg_5,
    tobit_ndcg_25,
    tobit_ndcg_50,
    tobit_ndcg_100,
    tobit_ndcg_1000,
) = (
    ndcg_score(true_helpful_tobit, pred_helpful_tobit, k=cutoff)
    for cutoff in (5, 25, 50, 100, 1000)
)
# No cutoff: NDCG over the full ranking.
tobit_ndcg_all = ndcg_score(true_helpful_tobit, pred_helpful_tobit)

# Report the scores, one per line.
print("\n".join([
    "NDCG Scores for Tobit Regression Model:",
    f"NDCG@ 5: {tobit_ndcg_5}",
    f"NDCG@ 25: {tobit_ndcg_25}",
    f"NDCG@ 50: {tobit_ndcg_50}",
    f"NDCG@ 100: {tobit_ndcg_100}",
    f"NDCG@ 1000: {tobit_ndcg_1000}",
    f"Overall NDCG: {tobit_ndcg_all}",
]))

NDCG Scores for Tobit Regression Model:
NDCG@ 5: 0.00915340980076519
NDCG@ 25: 0.008654044902126736
NDCG@ 50: 0.017524051077344165
NDCG@ 100: 0.028503686439487724
NDCG@ 1000: 0.09019583058810085
Overall NDCG: 0.8138128068007071
