# RoboReviews Project
#### The new product review aggregator

## Phase 1: Setting up the environment

In [6]:
import pandas as pd
import torch
import matplotlib.pyplot as plt
import re
from transformers import RobertaTokenizer



#### Loading the Dataset and Explore

In [None]:
# Load the dataset
dataset_path = '/notebooks/combined_amazon_reviews.csv'
df = pd.read_csv(dataset_path, low_memory=False)

# Define the columns to keep
columns_to_keep = ['name', 'brand', 'primaryCategories', 'reviews.rating', 'reviews.text']
df_cleaned = df[columns_to_keep]

# Handle missing values in 'reviews.text' and 'reviews.rating'
# We use .loc to avoid SettingWithCopyWarning
df_cleaned.loc[:, 'reviews.text'] = df_cleaned['reviews.text'].fillna('No review text provided.')
df_cleaned.loc[:, 'reviews.rating'] = df_cleaned['reviews.rating'].fillna(df_cleaned['reviews.rating'].median())

# Display the cleaned dataset
print(df_cleaned.head())

# Data Visualization
# Check and visualize the distribution of ratings
rating_counts = df_cleaned['reviews.rating'].value_counts().sort_index()
print(rating_counts)

# Plot the distribution of ratings
plt.figure(figsize=(8, 6))
rating_counts.plot(kind='bar', color='skyblue')
plt.title('Distribution of Ratings from 0 to 5')
plt.xlabel('Ratings')
plt.ylabel('Count')
plt.xticks(rotation=0)
plt.grid(True)  # Added gridlines for better readability
plt.show()



Preprocessing the text

In [None]:

def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    
    # Remove special characters, numbers, and extra spaces
    text = re.sub(r"[^a-z\s]", "", text)
    
    return text

# Apply the preprocessing function to the 'reviews.text' column
df_cleaned['reviews.text'] = df_cleaned['reviews.text'].apply(preprocess_text)

# Display the first few preprocessed reviews
print(df_cleaned['reviews.text'].head())


### Tokenizing and Predicting Sentiment

Load the model and Tokenizer

In [10]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load the sentiment analysis model and tokenizer
model_name = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)


Tokenize and Predict Sentiment

We’ll process the reviews in batches, tokenize them as a group, and then predict their sentiments. Instead of using apply, we can directly tokenize and predict for all the reviews together.



In [None]:
import torch

# Function to predict sentiment for a batch of reviews
def predict_sentiment_batch(reviews):
    # Tokenize the batch of review texts
    inputs = tokenizer(reviews, truncation=True, padding=True, return_tensors="pt", max_length=512)
    
    # Perform sentiment prediction
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Convert logits to probabilities
    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
    
    # Get the predicted sentiment labels (0 = Negative, 1 = Neutral, 2 = Positive)
    predicted_labels = torch.argmax(probabilities, dim=1).tolist()
    
    return predicted_labels

# Batch size to process reviews in chunks (choose an appropriate size)
batch_size = 32

# Apply sentiment prediction in batches
sentiment_predictions = []
for i in range(0, len(df_cleaned), batch_size):
    batch_reviews = df_cleaned['reviews.text'][i:i+batch_size].tolist()  # Get batch of reviews
    batch_predictions = predict_sentiment_batch(batch_reviews)  # Predict sentiment for the batch
    sentiment_predictions.extend(batch_predictions)  # Store the batch predictions

# Add predicted sentiments back to the dataframe
df_cleaned['sentiment'] = sentiment_predictions

# Display the first few rows with predicted sentiment labels
print(df_cleaned[['reviews.text', 'sentiment']].head())


Visualize Sentiment Predictions

In [None]:
import matplotlib.pyplot as plt

# Check the distribution of sentiment labels
sentiment_counts = df_cleaned['sentiment'].value_counts()

# Plot the distribution of sentiments
plt.figure(figsize=(6, 4))
sentiment_counts.plot(kind='bar', color='skyblue')
plt.title('Distribution of Sentiment Labels')
plt.xlabel('Sentiment (0 = Negative, 1 = Neutral, 2 = Positive)')
plt.ylabel('Count')
plt.xticks(rotation=0)
plt.grid(True)
plt.show()


Model Building

In [39]:
# Select relevant columns for classification (e.g., reviews.text and primaryCategories)
df_classification = df_cleaned.select(['reviews.text', 'primaryCategories'])

# Optional: If product categories have many subcategories, map them to broader groups
# For example, mapping different subcategories to broader product categories
category_mapping = {
    "Electronics": ["Phones", "Computers", "Tablets"],
    "Health & Beauty": ["Skincare", "Makeup"]
}

df_classification = df_classification.with_columns([
    pl.when(pl.col("primaryCategories").is_in(category_mapping["Electronics"])).then("Electronics")
      .when(pl.col("primaryCategories").is_in(category_mapping["Health & Beauty"])).then("Health & Beauty")
      .otherwise(pl.col("primaryCategories")).alias("primaryCategories")
])

# Check the classification dataset
print(df_classification.head())


Error in callback <bound method _WandbInit._resume_backend of <wandb.sdk.wandb_init._WandbInit object at 0x7fee3e0ba2d0>> (for pre_run_cell), with arguments args (<ExecutionInfo object at 7fee355d3790, raw_cell="# Select relevant columns for classification (e.g..." store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell:/c%3A/Users/fmrol/Documents/GitHub/RobotReviews/RR3.ipynb#Y234sZmlsZQ%3D%3D>,),kwargs {}:


TypeError: _WandbInit._resume_backend() takes 1 positional argument but 2 were given

AttributeError: 'DataFrame' object has no attribute 'select'

Error in callback <bound method _WandbInit._pause_backend of <wandb.sdk.wandb_init._WandbInit object at 0x7fee3e0ba2d0>> (for post_run_cell), with arguments args (<ExecutionResult object at 7fee358e1110, execution_count=39 error_before_exec=None error_in_exec='DataFrame' object has no attribute 'select' info=<ExecutionInfo object at 7fee355d3790, raw_cell="# Select relevant columns for classification (e.g..." store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell:/c%3A/Users/fmrol/Documents/GitHub/RobotReviews/RR3.ipynb#Y234sZmlsZQ%3D%3D> result=None>,),kwargs {}:


TypeError: _WandbInit._pause_backend() takes 1 positional argument but 2 were given

In [None]:
# Select relevant columns for classification (e.g., reviews.text and primaryCategories)
df_classification = df_cleaned.select(['reviews.text', 'primaryCategories'])

# Optional: If product categories have many subcategories, map them to broader groups
# For example, mapping different subcategories to broader product categories
category_mapping = {
    "Electronics": ["Phones", "Computers", "Tablets"],
    "Health & Beauty": ["Skincare", "Makeup"]
}

df_classification = df_classification.with_columns([
    pl.when(pl.col("primaryCategories").is_in(category_mapping["Electronics"])).then("Electronics")
      .when(pl.col("primaryCategories").is_in(category_mapping["Health & Beauty"])).then("Health & Beauty")
      .otherwise(pl.col("primaryCategories")).alias("primaryCategories")
])

# Check the classification dataset
print(df_classification.head())


Error in callback <bound method _WandbInit._resume_backend of <wandb.sdk.wandb_init._WandbInit object at 0x7fee3e0ba2d0>> (for pre_run_cell), with arguments args (<ExecutionInfo object at 7fee355d3790, raw_cell="# Select relevant columns for classification (e.g..." store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell:/c%3A/Users/fmrol/Documents/GitHub/RobotReviews/RR3.ipynb#Y234sZmlsZQ%3D%3D>,),kwargs {}:


TypeError: _WandbInit._resume_backend() takes 1 positional argument but 2 were given

AttributeError: 'DataFrame' object has no attribute 'select'

Error in callback <bound method _WandbInit._pause_backend of <wandb.sdk.wandb_init._WandbInit object at 0x7fee3e0ba2d0>> (for post_run_cell), with arguments args (<ExecutionResult object at 7fee358e1110, execution_count=39 error_before_exec=None error_in_exec='DataFrame' object has no attribute 'select' info=<ExecutionInfo object at 7fee355d3790, raw_cell="# Select relevant columns for classification (e.g..." store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell:/c%3A/Users/fmrol/Documents/GitHub/RobotReviews/RR3.ipynb#Y234sZmlsZQ%3D%3D> result=None>,),kwargs {}:


TypeError: _WandbInit._pause_backend() takes 1 positional argument but 2 were given

Fine Tuning the model

Data Preparation

In [None]:
from sklearn.model_selection import train_test_split

# Convert text and sentiment columns to lists for splitting
texts = df_cleaned['reviews.text'].to_list()
labels = df_cleaned['sentiment'].to_list()

# Split the data into training and test sets (80/20 split)
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)


# Save the fine-tuned model
model.save_pretrained('./fine-tuned-roberta-sentiment')
tokenizer.save_pretrained('./fine-tuned-roberta-sentiment')


The dataset is very imbalanced. This means that later we will have to use some methods to deal with that.
- Class Weights: Adjust the loss function to account for class imbalances during training.
- Oversampling/Undersampling: Oversample the minority classes (e.g., negative reviews) or undersample the majority class (positive reviews) before training.
- SMOTE: Synthetic Minority Oversampling Technique (SMOTE) could also be used to generate synthetic examples for the minority class.