# RoboReviews Project
#### The new product review aggregator

## Phase 1: Setting up the environment

In [1]:
import pandas as pd
import torch
import matplotlib.pyplot as plt
import re
from transformers import RobertaTokenizer
from transformers import pipeline
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import resample
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from datasets import Dataset



2024-10-16 15:53:49.480077: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-16 15:53:49.480176: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-16 15:53:49.481162: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-16 15:53:49.487624: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


#### Loading the Dataset and Explore

In [2]:
# Load the dataset
file_path = '/notebooks/Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products_May19.csv'  # Replace with the correct file path
df = pd.read_csv(file_path)

# Select the specified columns
columns_to_keep = ['name', 'brand', 'primaryCategories', 'reviews.text', 'reviews.rating']
df_selected = df[columns_to_keep]

# Show the first few rows to verify
df_selected.head()


Unnamed: 0,name,brand,primaryCategories,reviews.text,reviews.rating
0,AmazonBasics AAA Performance Alkaline Batterie...,Amazonbasics,Health & Beauty,I order 3 of them and one of the item is bad q...,3
1,AmazonBasics AAA Performance Alkaline Batterie...,Amazonbasics,Health & Beauty,Bulk is always the less expensive way to go fo...,4
2,AmazonBasics AAA Performance Alkaline Batterie...,Amazonbasics,Health & Beauty,Well they are not Duracell but for the price i...,5
3,AmazonBasics AAA Performance Alkaline Batterie...,Amazonbasics,Health & Beauty,Seem to work as well as name brand batteries a...,5
4,AmazonBasics AAA Performance Alkaline Batterie...,Amazonbasics,Health & Beauty,These batteries are very long lasting the pric...,5


Balance the data

In [3]:
from sklearn.utils import resample

# Separate the dataset by rating categories (assume 1-2 is negative, 3 is neutral, 4-5 is positive)
positive_reviews = df_selected[df_selected['reviews.rating'] >= 4]
neutral_reviews = df_selected[df_selected['reviews.rating'] == 3]
negative_reviews = df_selected[df_selected['reviews.rating'] <= 2]

# Find the class with the most samples
max_class_size = max(len(positive_reviews), len(neutral_reviews), len(negative_reviews))

# Oversample the minority classes to match the largest class
positive_upsampled = resample(positive_reviews, replace=True, n_samples=max_class_size, random_state=42)
neutral_upsampled = resample(neutral_reviews, replace=True, n_samples=max_class_size, random_state=42)
negative_upsampled = resample(negative_reviews, replace=True, n_samples=max_class_size, random_state=42)

# Combine the upsampled datasets
df_balanced = pd.concat([positive_upsampled, neutral_upsampled, negative_upsampled])

# Shuffle the dataset
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# After balancing the dataset
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Map the labels to a new range
df_balanced['reviews.rating'] = df_balanced['reviews.rating'].map({0: 0, 1: 1, 2: 2, 3: 0, 4: 1, 5: 2})



# Verify class balance
print(df_balanced['reviews.rating'].value_counts())


reviews.rating
2    29856
0    25545
1    21234
Name: count, dtype: int64


In [4]:
def map_to_sentiment(rating):
    if rating <= 2:
        return 0  # negative
    elif rating == 3:
        return 1  # neutral
    else:
        return 2  # positive

df_balanced['labels'] = df_balanced['reviews.rating'].apply(map_to_sentiment)


Split the data

In [5]:
from sklearn.model_selection import train_test_split

# Split the dataset before tokenization
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced['reviews.text'], df_balanced['reviews.rating'], test_size=0.2, random_state=42
)


Tokenization

In [6]:
from transformers import DistilBertTokenizer

# Load the DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenize training and testing sets with padding and truncation
train_encodings = tokenizer(X_train.tolist(), truncation=True, padding='max_length', max_length=512, return_tensors='pt')
test_encodings = tokenizer(X_test.tolist(), truncation=True, padding='max_length', max_length=512, return_tensors='pt')



In [10]:
# Print the shapes to verify consistency
print(f"Train encodings shape: {train_encodings['input_ids'].shape}")
print(f"Test encodings shape: {test_encodings['input_ids'].shape}")

# Check attention mask shapes
print(f"Train attention mask shape: {train_encodings['attention_mask'].shape}")
print(f"Test attention mask shape: {test_encodings['attention_mask'].shape}")

Train encodings shape: torch.Size([61308, 512])
Test encodings shape: torch.Size([15327, 512])
Train attention mask shape: torch.Size([61308, 512])
Test attention mask shape: torch.Size([15327, 512])


Create the Datasets with matching lenghts:

In [8]:
from datasets import Dataset

# Create HuggingFace Datasets for training and test data
train_dataset = Dataset.from_dict({
    'input_ids': train_encodings['input_ids'].tolist(),
    'attention_mask': train_encodings['attention_mask'].tolist(),
    'labels': y_train.tolist()
})

test_dataset = Dataset.from_dict({
    'input_ids': test_encodings['input_ids'].tolist(),
    'attention_mask': test_encodings['attention_mask'].tolist(),
    'labels': y_test.tolist()
})


In [9]:
# Verify the shape of the dataset to ensure proper tokenization
print(f"Train dataset length: {len(train_dataset)}")
print(f"Test dataset length: {len(test_dataset)}")


Train dataset length: 61308
Test dataset length: 15327


Integrating LoRA for fine-tuning

In [12]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from datasets import Dataset
import torch

# Assuming you have already tokenized the data correctly as seen in the previous images
# Tokenization (already done by you)
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')


In [13]:
from peft import LoraConfig, TaskType, get_peft_model

# Define LoRA configuration
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,  # Sequence classification task
    r=8,  # Low-rank adaptation dimension
    lora_alpha=32,  # LoRA scaling factor
    lora_dropout=0.1,  # Dropout in LoRA layers
    target_modules=["q_lin", "v_lin"]  # Target modules for LoRA in DistilBERT (MultiHeadAttention)
)

# Load pretrained model
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased', 
    num_labels=3  # Adjust based on your labels (e.g. positive, negative, neutral)
)

# Wrap the model with LoRA using PEFT
model = get_peft_model(model, lora_config)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
from transformers import Trainer, TrainingArguments

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,  # Adjust based on your need
    weight_decay=0.01,
    evaluation_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10,
    save_total_limit=1,
    no_cuda=False  # Assuming you have a GPU, if not set to True
)


In [15]:
# Create HuggingFace Datasets for training and testing data (already done by you)
train_dataset = Dataset.from_dict({
    'input_ids': train_encodings['input_ids'].tolist(),
    'attention_mask': train_encodings['attention_mask'].tolist(),
    'labels': y_train.tolist()
})

test_dataset = Dataset.from_dict({
    'input_ids': test_encodings['input_ids'].tolist(),
    'attention_mask': test_encodings['attention_mask'].tolist(),
    'labels': y_test.tolist()
})

# Define the Trainer instance
trainer = Trainer(
    model=model,  # PEFT-wrapped model
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# Fine-tune the model with LoRA
trainer.train()


[34m[1mwandb[0m: Currently logged in as: [33mfmroldanrivero[0m ([33mfredsmeds[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,0.7868,0.738162
2,0.8136,0.653341
3,0.7267,0.624068


TrainOutput(global_step=11496, training_loss=0.7425901326522615, metrics={'train_runtime': 1805.0683, 'train_samples_per_second': 101.893, 'train_steps_per_second': 6.369, 'total_flos': 2.4782679657455616e+16, 'train_loss': 0.7425901326522615, 'epoch': 3.0})

Error in callback <bound method _WandbInit._pause_backend of <wandb.sdk.wandb_init._WandbInit object at 0x7fc0c83720d0>> (for post_run_cell), with arguments args (<ExecutionResult object at 7fc0cae69b50, execution_count=15 error_before_exec=None error_in_exec=None info=<ExecutionInfo object at 7fc0af08c110, raw_cell="# Create HuggingFace Datasets for training and tes.." store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell:/c%3A/Users/fmrol/Documents/GitHub/RobotReviews/RR3.ipynb#X43sZmlsZQ%3D%3D> result=TrainOutput(global_step=11496, training_loss=0.7425901326522615, metrics={'train_runtime': 1805.0683, 'train_samples_per_second': 101.893, 'train_steps_per_second': 6.369, 'total_flos': 2.4782679657455616e+16, 'train_loss': 0.7425901326522615, 'epoch': 3.0})>,),kwargs {}:


TypeError: _WandbInit._pause_backend() takes 1 positional argument but 2 were given

Evlatuate the Model

# Evaluate the fine-tuned model
results = trainer.evaluate()

print(results)


Save the Model:


model.save_pretrained('./fine_tuned_model')


Data Preparation

The dataset is very imbalanced. This means that later we will have to use some methods to deal with that.
- Class Weights: Adjust the loss function to account for class imbalances during training.
- Oversampling/Undersampling: Oversample the minority classes (e.g., negative reviews) or undersample the majority class (positive reviews) before training.
- SMOTE: Synthetic Minority Oversampling Technique (SMOTE) could also be used to generate synthetic examples for the minority class.