In [None]:
# Mount Google Drive inside the Colab VM so later cells can read/write files under it.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install pyspellchecker
!pip install py-readability-metrics
!pip install textstat
!pip install pyarrow
!pip install transformers
!pip install tqdm
!pip install datasets
!pip install tensorflow
!pip install torch



In [None]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
import torch

import textstat
from sklearn.feature_extraction.text import CountVectorizer
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from spellchecker import SpellChecker
from readability import Readability

from transformers import pipeline
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments
from transformers import TFAutoModelForSequenceClassification

from datasets import Dataset

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error

import gc
from tqdm import tqdm


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
## read yelp_dataset_for_model_final.csv
# The file is streamed in fixed-size chunks. Chunks are collected in a list and
# concatenated ONCE at the end: calling pd.concat inside the loop re-copies the
# ever-growing frame on every iteration, which is quadratic in the number of rows.
chunk_size = 100000

chunks = []

# Read CSV in chunks
with pd.read_csv('/content/drive/MyDrive/Code + Data/yelp_dataset_for_model_final.csv', chunksize=chunk_size) as reader:
    for i, chunk in enumerate(reader):
        chunks.append(chunk)

        # Report progress every 5 chunks.
        if (i + 1) % 5 == 0:
            print(f'Progress: {(i + 1) * chunk_size} rows processed')

# pd.concat raises on an empty list, so fall back to an empty frame when no chunk was read.
yelp_data_full = pd.concat(chunks, ignore_index=True) if chunks else pd.DataFrame()

# Release the per-chunk frames now that the combined frame exists.
del chunks
gc.collect()

Progress: 500000 rows processed


  for i, chunk in enumerate(reader):
  for i, chunk in enumerate(reader):


Progress: 1000000 rows processed
Progress: 1500000 rows processed


  for i, chunk in enumerate(reader):


In [None]:
# Report how many rows were loaded, then continue under the shorter working name.
print(yelp_data_full.shape[0])
yelp_data = yelp_data_full
del yelp_data_full  # only the old name is removed; `yelp_data` still refers to the same frame

1872289


In [None]:
# Count missing values per column.
missing_per_column = yelp_data.isna().sum()
print(missing_per_column)

review_id                     0
user_id                       0
business_id                   0
stars_reviewer                0
useful                        0
text                          0
name                          0
postal_code                   0
stars_business                0
categories                    0
total_reviews_for_business    0
helpful                       0
num_sentences                 0
num_characters                0
num_words                     0
review_type                   0
dtype: int64


In [None]:
# Show which columns are available.
print(yelp_data.columns)

# Force `useful` to a numeric dtype; entries that cannot be parsed become NaN.
useful_as_number = pd.to_numeric(yelp_data['useful'], errors='coerce')
yelp_data['useful'] = useful_as_number

Index(['review_id', 'user_id', 'business_id', 'stars_reviewer', 'useful',
       'text', 'name', 'postal_code', 'stars_business', 'categories',
       'total_reviews_for_business', 'helpful', 'num_sentences',
       'num_characters', 'num_words', 'review_type'],
      dtype='object')


In [None]:
# Drop every row that is missing any of the fields the later cells rely on,
# then re-print the per-column null counts to confirm.
required_columns = ['text', 'useful', 'postal_code']
yelp_data = yelp_data.dropna(subset=required_columns)
print(yelp_data.isnull().sum())

review_id                     0
user_id                       0
business_id                   0
stars_reviewer                0
useful                        0
text                          0
name                          0
postal_code                   0
stars_business                0
categories                    0
total_reviews_for_business    0
helpful                       0
num_sentences                 0
num_characters                0
num_words                     0
review_type                   0
dtype: int64


In [None]:
# Length (in characters) of every review text.
review_texts = yelp_data['text']
yelp_data['num_characters'] = review_texts.map(len)

In [None]:
def word_count(line):
  """Return how many whitespace-separated tokens `line` contains."""
  tokens = line.split()
  return len(tokens)

In [None]:
## number of words
# `word_count` is passed directly; wrapping it in a lambda adds nothing.
yelp_data['num_words'] = yelp_data['text'].apply(word_count)

# Spot-check both length features on the first row.
first_review = yelp_data.iloc[0]
print(first_review[['num_characters', 'num_words']])

num_characters    804
num_words         152
Name: 0, dtype: object


In [None]:
# Zero-shot baseline: use Facebook's Large BART model fine-tuned on MNLI.
# Run on the first GPU when one is available; otherwise fall back to CPU (-1)
# instead of failing on a hard-coded `device=0`.
zero_shot_device = 0 if torch.cuda.is_available() else -1
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=zero_shot_device)

labels = ["regular", "comparative", "suggestive"]

# Single smoke-test sentence; an alternative comparative example is kept for reference.
# review_test = "This phone is better than the last model I had."
review_test = "This is the phone you should buy."
result = classifier(review_test, candidate_labels=labels)

print(result)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


{'sequence': 'This is the phone you should buy.', 'labels': ['suggestive', 'comparative', 'regular'], 'scores': [0.7375675439834595, 0.23703476786613464, 0.025397656485438347]}


In [None]:
# Labeled review sample used for fine-tuning.
labeled_data = pd.read_csv("/content/drive/MyDrive/Code + Data/yelp_data_sample_to_label.csv")

# Review categories and their integer ids (id = position in `labels`).
labels = ["regular", "comparative", "suggestive"]
label_mapping = {label_name: label_id for label_id, label_name in enumerate(labels)}

# Create premise-hypothesis pairs
def create_premise_hypothesis_dataset(data, candidate_labels=None):
    """Expand labeled reviews into NLI-style (premise, hypothesis, label) rows.

    Every review produces one row per candidate label. The hypothesis is
    "This review is <label>." and the target is 2 when the hypothesis names the
    review's own ``review_type`` and 0 otherwise.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``text`` and ``review_type``.
    candidate_labels : sequence of str, optional
        Labels to build hypotheses for. Defaults to the notebook-level
        ``labels`` list, which keeps existing single-argument calls working.

    Returns
    -------
    pandas.DataFrame
        Columns ``premise``, ``hypothesis``, ``label``; ``len(data) *
        len(candidate_labels)`` rows, grouped by review in input order. The
        columns are present even when ``data`` is empty.
    """
    if candidate_labels is None:
        candidate_labels = labels  # notebook-level default

    # Index 2 is the logit column read as "entailment" at inference time in this notebook.
    # NOTE(review): 0 is presumably the model's "contradiction" class — confirm
    # against the checkpoint's `id2label` config.
    entailment_id, contradiction_id = 2, 0

    # Iterate the two needed columns directly; iterrows() builds a Series per row.
    records = [
        {
            "premise": text,
            "hypothesis": f"This review is {label}.",
            "label": entailment_id if review_type == label else contradiction_id,
        }
        for text, review_type in zip(data["text"], data["review_type"])
        for label in candidate_labels
    ]
    return pd.DataFrame(records, columns=["premise", "hypothesis", "label"])

# Expand the labeled sample into one (premise, hypothesis, label) row per review/label pair.
premise_hypothesis_df = create_premise_hypothesis_dataset(labeled_data)

# Wrap the frame as a Hugging Face Dataset so it can be mapped and split.
dataset = Dataset.from_pandas(premise_hypothesis_df)

# Tokenizer that matches the checkpoint being fine-tuned.
BASE_CHECKPOINT = "facebook/bart-large-mnli"
tokenizer = AutoTokenizer.from_pretrained(BASE_CHECKPOINT)

def tokenize_function(examples):
    """Tokenize a batch of premise/hypothesis pairs with the notebook-level `tokenizer`.

    `examples` is indexed by the keys "premise" and "hypothesis". Pairs are
    truncated to at most 512 tokens and padded to the longest pair in the batch.
    """
    premises = examples["premise"]
    hypotheses = examples["hypothesis"]
    encoded = tokenizer(
        premises,
        hypotheses,
        padding=True,
        truncation=True,
        max_length=512,
    )
    return encoded

# Tokenize all premise/hypothesis pairs in batches.
# (The former second `map`, which rewrote "label" with its own value, was a no-op and is removed.)
dataset = dataset.map(tokenize_function, batched=True)

# Split into training and validation sets.
# A fixed seed makes the 80/20 split — and therefore the reported validation loss —
# reproducible across kernel restarts.
SPLIT_SEED = 42
split_dataset = dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)
train_dataset = split_dataset["train"]
val_dataset = split_dataset["test"]


Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

In [None]:
# Start from the MNLI checkpoint; its 3-way head is fine-tuned further on the
# premise/hypothesis pairs built from the labeled reviews.
model = AutoModelForSequenceClassification.from_pretrained("facebook/bart-large-mnli", num_labels=3).to("cuda")

# Define training arguments
# NOTE(review): `evaluation_strategy` is kept because it matches the transformers
# version this notebook was run with; newer releases may expect a different name — confirm.
training_args = TrainingArguments(
    output_dir="/tmp/results",
    evaluation_strategy="epoch",
    logging_strategy="epoch",  # the run has fewer steps than the default logging interval, so training loss showed "No log"
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.1,
    save_strategy="no",  # weights are saved explicitly after training
    report_to="none",  # replaces the deprecated WANDB_DISABLED env var; no API key is required
)

# Define trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Fine-tune the model
trainer.train()


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss
1,No log,0.626948
2,No log,0.600739
3,No log,0.540086


TrainOutput(global_step=90, training_loss=0.5878372616238065, metrics={'train_runtime': 62.0188, 'train_samples_per_second': 11.609, 'train_steps_per_second': 1.451, 'total_flos': 782486005432320.0, 'train_loss': 0.5878372616238065, 'epoch': 3.0})

In [None]:
# Persist the fine-tuned weights and the tokenizer side by side so both can be
# reloaded from one directory.
FINE_TUNED_DIR = "/content/drive/MyDrive/Code + Data/fine_tuned_bart_mnli"
model.save_pretrained(FINE_TUNED_DIR)
tokenizer.save_pretrained(FINE_TUNED_DIR)

('/content/drive/MyDrive/Code + Data/fine_tuned_bart_mnli/tokenizer_config.json',
 '/content/drive/MyDrive/Code + Data/fine_tuned_bart_mnli/special_tokens_map.json',
 '/content/drive/MyDrive/Code + Data/fine_tuned_bart_mnli/vocab.json',
 '/content/drive/MyDrive/Code + Data/fine_tuned_bart_mnli/merges.txt',
 '/content/drive/MyDrive/Code + Data/fine_tuned_bart_mnli/added_tokens.json',
 '/content/drive/MyDrive/Code + Data/fine_tuned_bart_mnli/tokenizer.json')

In [None]:
# Report how many GPUs TensorFlow can see, then release PyTorch's cached GPU memory.
visible_gpus = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(visible_gpus))
torch.cuda.empty_cache()

Num GPUs Available:  1


Narrative: we first classified the reviews with a zero-shot approach, then implemented a few-shot approach using manually labeled data.

In [None]:
# Load the fine-tuned weights and matching tokenizer.
FINE_TUNED_MODEL_DIR = "/content/drive/MyDrive/Code + Data/fine_tuned_bart_mnli"
inference_device = "cuda" if torch.cuda.is_available() else "cpu"  # fall back instead of failing without a GPU
model = AutoModelForSequenceClassification.from_pretrained(FINE_TUNED_MODEL_DIR).to(inference_device)
model.eval()  # make inference mode explicit (disables dropout)
tokenizer = AutoTokenizer.from_pretrained(FINE_TUNED_MODEL_DIR)

# Define labels
labels = ["regular", "comparative", "suggestive"]
# One hypothesis per label; identical for every review, so build it once.
label_hypotheses = [f"This review is {label}." for label in labels]

batch_size = 256        # number of REVIEWS per forward pass (each expands to len(labels) pairs)
max_pair_tokens = 110   # each premise+hypothesis pair is truncated to this many tokens
entailment_index = 2    # logit column used as the entailment score
predicted_labels = []

# NOTE(review): the recorded run of this loop took ~7h41m and results are only
# persisted by a later cell; consider writing intermediate results to disk.
review_texts = yelp_data["text"]
for i in tqdm(range(0, len(review_texts), batch_size), desc="Processing Reviews"):
    # .iloc makes the positional slice explicit (the index is not reset after dropna).
    batch_reviews = review_texts.iloc[i:i + batch_size].tolist()

    # Pair every review with every hypothesis: r0/h0, r0/h1, r0/h2, r1/h0, ...
    premises = [review for review in batch_reviews for _ in labels]
    hypotheses = label_hypotheses * len(batch_reviews)

    # Tokenize premise-hypothesis pairs
    inputs = tokenizer(
        premises,
        hypotheses,
        padding=True,
        truncation=True,
        max_length=max_pair_tokens,
        return_tensors="pt",
    ).to(inference_device)

    # Inference without gradient tracking
    with torch.no_grad():
        logits = model(**inputs).logits
        # Reshape to (reviews in batch, labels) so each row holds one review's scores.
        entailment_scores = logits[:, entailment_index].view(len(batch_reviews), len(labels))

    # The label whose hypothesis has the highest entailment score wins.
    batch_predicted_indices = entailment_scores.argmax(dim=1).tolist()
    predicted_labels.extend(labels[idx] for idx in batch_predicted_indices)

# Add predicted labels to the DataFrame
yelp_data["review_type"] = predicted_labels

Processing Reviews: 100%|██████████| 7314/7314 [7:41:38<00:00,  3.79s/it]


In [None]:
# Sanity-check the labeled frame: dimensions, first rows, then column names.
for overview in (yelp_data.shape, yelp_data.head(), yelp_data.columns):
    print(overview)

(1872289, 16)
                review_id                 user_id             business_id  \
0  6AxgBCNX_PNTOxmbRSwcKQ  r3zeYsv1XFBRA4dJpL78cw  gmjsEdUsKpj9Xxu6pdjH0g   
1  pUycOfUwM8vqX7KjRRhUEA  59MxRhNVhU9MYndMkz0wtw  gebiRewfieSdtt17PTW6Zg   
2  l3Wk_mvAog6XANIuGQ9C7Q  ZbqSHbgCjzVAqaa7NKWn5A  EQ-TZ2eeD_E0BHuvoaeG5Q   
3  XW_LfMv0fV21l9c6xQd_lw  9OAtfnWag-ajVxRbUTGIyg  lj-E32x9_FA7GmUrBGBEWg   
4  8JFGBuHMoiNDyfcxuWNtrA  smOvOajNG0lS4Pq7d8g4JQ  RZtGWDLCAtuipwaZ-UfjmQ   

   stars_reviewer  useful                                               text  \
0               5       0  Loved this tour! I grabbed a groupon and the p...   
1               3       0  Had a party of 6 here for hibachi. Our waitres...   
2               4       0  Locals recommended Milktooth, and it's an amaz...   
3               4       0  Love going here for happy hour or dinner!  Gre...   
4               4       0  Good food--loved the gnocchi with marinara\nth...   

                              name postal_

In [None]:
# Write the whole frame (now carrying the predicted review_type) to Drive, without the index column.
few_shot_output_csv = "/content/drive/MyDrive/Code + Data/yelp_dataset_for_model_few_shot_final.csv"
yelp_data.to_csv(few_shot_output_csv, index=False)