In [1]:
# Mount Google Drive at /content/drive so that later cells can read and write
# files under that path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install pyspellchecker
!pip install py-readability-metrics
!pip install textstat
!pip install pyarrow
!pip install transformers
!pip install tqdm
!pip install datasets
!pip install tensorflow
!pip install torch



In [3]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
import torch

import textstat
from sklearn.feature_extraction.text import CountVectorizer
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from spellchecker import SpellChecker
from readability import Readability

from transformers import pipeline
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments
from transformers import TFAutoModelForSequenceClassification

from datasets import Dataset

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error

import gc
from tqdm import tqdm


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
## read yelp_dataset_for_model_final.csv
chunk_size = 100000

# Collect the chunks in a list and concatenate once after the loop.
# Concatenating onto the growing frame inside the loop would re-copy every
# previously loaded row for each new chunk (quadratic work and memory churn).
chunks = []

# Read CSV in chunks
with pd.read_csv('/content/drive/MyDrive/Code + Data/yelp_dataset_for_model_final.csv', chunksize=chunk_size) as reader:
    for i, chunk in enumerate(reader):
        chunks.append(chunk)
        del chunk  # the list now holds the only reference we need

        if (i + 1) % 5 == 0:
            print(f'Progress: {(i + 1) * chunk_size} rows processed')

# pd.concat raises on an empty list, so fall back to an empty frame when the
# reader produced no chunks.
yelp_data_full = pd.concat(chunks, ignore_index=True) if chunks else pd.DataFrame()

# Release the per-chunk frames now that the combined frame exists.
del chunks
gc.collect()

Progress: 500000 rows processed


  for i, chunk in enumerate(reader):
  for i, chunk in enumerate(reader):


Progress: 1000000 rows processed
Progress: 1500000 rows processed


  for i, chunk in enumerate(reader):


In [5]:
# Hand the loaded frame over to its working name and remove the old name,
# so that `yelp_data` is the only reference from here on; then report the
# number of rows.
yelp_data = yelp_data_full
del yelp_data_full
print(len(yelp_data))

1872289


In [6]:
# Discard the `review_type` column that came with the CSV; it is recomputed
# later in this notebook by the zero-shot classification loop.
# errors='ignore' keeps the cell idempotent: re-running it after the column is
# already gone no longer raises KeyError.
yelp_data = yelp_data.drop(columns='review_type', errors='ignore')

In [7]:
# Show the column names that remain after `review_type` was dropped in the
# previous cell.
print(yelp_data.columns)

Index(['review_id', 'user_id', 'business_id', 'stars_reviewer', 'useful',
       'text', 'name', 'postal_code', 'stars_business', 'categories',
       'total_reviews_for_business', 'helpful', 'num_sentences',
       'num_characters', 'num_words'],
      dtype='object')


In [8]:
# Report how many missing values each column contains.
null_counts = yelp_data.isna().sum()
print(null_counts)

review_id                     0
user_id                       0
business_id                   0
stars_reviewer                0
useful                        0
text                          0
name                          0
postal_code                   0
stars_business                0
categories                    0
total_reviews_for_business    0
helpful                       0
num_sentences                 0
num_characters                0
num_words                     0
dtype: int64


In [9]:
print(yelp_data.columns)
# Force `useful` to a numeric dtype. errors='coerce' turns any value that
# cannot be parsed as a number into NaN instead of raising.
# NOTE(review): the null check in the previous cell runs before this coercion,
# so NaNs introduced here would go unnoticed — confirm none are created.
yelp_data['useful'] = pd.to_numeric(yelp_data['useful'], errors='coerce')

Index(['review_id', 'user_id', 'business_id', 'stars_reviewer', 'useful',
       'text', 'name', 'postal_code', 'stars_business', 'categories',
       'total_reviews_for_business', 'helpful', 'num_sentences',
       'num_characters', 'num_words'],
      dtype='object')


In [10]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Smoke test of the zero-shot approach on a single review: the review is the
# NLI premise, each candidate label is turned into a hypothesis sentence, and
# the label whose hypothesis gets the highest entailment logit wins.

# Use the GPU when one is available; fall back to CPU instead of crashing.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-mnli")
model = AutoModelForSequenceClassification.from_pretrained("facebook/bart-large-mnli").to(device)
model.eval()  # Set model to evaluation mode

# Look up which output column is the "entailment" class from the model config
# rather than hard-coding it; fall back to the previously hard-coded index 2
# if the config does not name an entailment label.
entailment_id = next(
    (idx for name, idx in model.config.label2id.items() if name.lower().startswith("entail")),
    2,
)

# Define the labels and the corresponding hypotheses
labels = ["regular", "comparative", "suggestive"]

# Example review
review_test = "This is the phone you should buy."

# Generate hypotheses based on labels
hypotheses = [f"This review is {label}." for label in labels]

# Tokenize the premise (review) and each hypothesis
premise = review_test  # The input review is the premise
tokenized_inputs = [
    tokenizer(
        premise,
        hypothesis,
        padding=True,
        truncation=True,
        max_length=256,
        return_tensors="pt"
    ).to(device)
    for hypothesis in hypotheses
]

# Perform inference for each hypothesis (no gradients needed)
with torch.no_grad():
    logits = [model(**inputs).logits for inputs in tokenized_inputs]

# One entailment logit per hypothesis, stacked into a 1-D tensor of len(labels)
entailment_scores = torch.stack([logit[:, entailment_id] for logit in logits]).squeeze()

# Find the label with the highest entailment score
predicted_index = torch.argmax(entailment_scores).item()
predicted_label = labels[predicted_index]

print(f"Predicted label: {predicted_label}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Predicted label: suggestive


In [11]:

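# NOTE: superseded draft, kept for reference only. It feeds just the review text
# to the model (no hypothesis) and indexes `labels` with the argmax of the raw
# output logits.
# # classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=0)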

# tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-mnli")
# model = AutoModelForSequenceClassification.from_pretrained("facebook/bart-large-mnli").to("cuda")
# model.eval()  # Set model to evaluation mode

# ## pros of facebook/bart:
# # fine-tuned on natural language and determining relationships between text and categories.
# ## does not require labeled data
# ### exposed via Hugging Face pipeline

# labels = ["regular", "comparative", "suggestive"]

# # review_test = "This phone is better than the last model I had."
# review_test = "This is the phone you should buy."
# inputs = tokenizer(
#     review_test,
#     padding=True,
#     truncation=True,
#     max_length=256,
#     return_tensors="pt"
# ).to("cuda")

# # Run inference
# with torch.no_grad():
#     logits = model(**inputs).logits

# # Get the predicted label index
# predicted_index = torch.argmax(logits, axis=1).item()

# # Map the index to the label name
# predicted_label = labels[predicted_index]

# print(predicted_label)

Categorize the reviews using zero-shot classification

In [11]:
# BATCH_SIZE = 256

# MAX_LENGTH = 256  # Truncate to 256 tokens

# # Prepare a list to store the predicted labels
# predicted_labels = []

# # Process the dataset in batches
# for i in tqdm(range(0, len(yelp_data), BATCH_SIZE), desc="Categorizing Reviews"):
#     # Get a batch of reviews
#     batch_reviews = yelp_data['text'][i:i + BATCH_SIZE].tolist()

#     # Tokenize the batch
#     inputs = tokenizer(
#         batch_reviews,
#         padding=True,
#         truncation=True,
#         max_length=MAX_LENGTH,
#         return_tensors="pt"
#     ).to("cuda")

#     # Perform inference without gradients
#     with torch.no_grad():
#         logits = model(**inputs).logits

#     # Get the predicted label indices
#     batch_predicted_indices = torch.argmax(logits, axis=1).cpu().numpy()

#     # Map indices to label names
#     batch_predicted_labels = [labels[idx] for idx in batch_predicted_indices]

#     # Append the results to the list
#     predicted_labels.extend(batch_predicted_labels)

# # Add the predicted labels to the DataFrame
# yelp_data['review_type'] = predicted_labels



# Zero-shot classification of every review: each review (premise) is paired
# with one hypothesis per label, and the label whose hypothesis receives the
# highest entailment logit becomes the review's `review_type`.
labels = ["regular", "comparative", "suggestive"]

# Parameters
BATCH_SIZE = 312  # Reviews per batch; each review expands to len(labels) sequences. Adjust to available memory.
MAX_LENGTH = 100  # Truncate each premise/hypothesis pair to 100 tokens

# Run on whatever device the model already lives on instead of hard-coding "cuda".
device = model.device

# Look up which output column is the "entailment" class from the model config
# rather than hard-coding it; fall back to the previously hard-coded index 2
# if the config does not name an entailment label.
entailment_id = next(
    (idx for name, idx in model.config.label2id.items() if name.lower().startswith("entail")),
    2,
)

# The hypotheses are identical for every review, so build them once.
label_hypotheses = [f"This review is {label}." for label in labels]

# Prepare a list to store the predicted labels
predicted_labels = []

# Process the dataset in batches
for i in tqdm(range(0, len(yelp_data), BATCH_SIZE), desc="Categorizing Reviews"):
    # Get a batch of reviews (explicit positional slice)
    batch_reviews = yelp_data['text'].iloc[i:i + BATCH_SIZE].tolist()

    # Repeat each review once per label, and repeat the hypothesis list once per
    # review, so that position k pairs review k // len(labels) with label
    # k % len(labels).
    premises = [review for review in batch_reviews for _ in labels]
    hypotheses = label_hypotheses * len(batch_reviews)

    # Tokenize the batch of premise-hypothesis pairs
    inputs = tokenizer(
        premises,
        hypotheses,
        padding=True,
        truncation=True,
        max_length=MAX_LENGTH,
        return_tensors="pt"
    ).to(device)

    # Perform inference without gradients
    with torch.no_grad():
        logits = model(**inputs).logits

    # Keep only the entailment logit and regroup it as (reviews, labels);
    # reshape (not view) because the column slice is not contiguous.
    entailment_scores = logits[:, entailment_id].reshape(len(batch_reviews), len(labels))

    # Get the predicted label index for each review
    batch_predicted_indices = torch.argmax(entailment_scores, dim=1).cpu().numpy()

    # Map indices to label names
    batch_predicted_labels = [labels[idx] for idx in batch_predicted_indices]

    # Append the results to the list
    predicted_labels.extend(batch_predicted_labels)

# Add the predicted labels to the DataFrame
yelp_data['review_type'] = predicted_labels

Categorizing Reviews: 100%|██████████| 6001/6001 [6:32:53<00:00,  3.93s/it]


In [12]:
# Quick look at the labelled frame: its dimensions, the first rows, and the
# column names (which now include `review_type`).
for summary in (yelp_data.shape, yelp_data.head(), yelp_data.columns):
    print(summary)

(1872289, 16)
                review_id                 user_id             business_id  \
0  6AxgBCNX_PNTOxmbRSwcKQ  r3zeYsv1XFBRA4dJpL78cw  gmjsEdUsKpj9Xxu6pdjH0g   
1  pUycOfUwM8vqX7KjRRhUEA  59MxRhNVhU9MYndMkz0wtw  gebiRewfieSdtt17PTW6Zg   
2  l3Wk_mvAog6XANIuGQ9C7Q  ZbqSHbgCjzVAqaa7NKWn5A  EQ-TZ2eeD_E0BHuvoaeG5Q   
3  XW_LfMv0fV21l9c6xQd_lw  9OAtfnWag-ajVxRbUTGIyg  lj-E32x9_FA7GmUrBGBEWg   
4  8JFGBuHMoiNDyfcxuWNtrA  smOvOajNG0lS4Pq7d8g4JQ  RZtGWDLCAtuipwaZ-UfjmQ   

   stars_reviewer  useful                                               text  \
0               5       0  Loved this tour! I grabbed a groupon and the p...   
1               3       0  Had a party of 6 here for hibachi. Our waitres...   
2               4       0  Locals recommended Milktooth, and it's an amaz...   
3               4       0  Love going here for happy hour or dinner!  Gre...   
4               4       0  Good food--loved the gnocchi with marinara\nth...   

                              name postal_

In [13]:
# Save the frame, including the predicted `review_type` column, to Drive as a
# CSV file without the row index.
no_shot_output_path = "/content/drive/MyDrive/Code + Data/yelp_dataset_for_model_no_shot_final.csv"
yelp_data.to_csv(no_shot_output_path, index=False)