# RoboReviews Project
#### The new product review aggregator

## Phase 1: Setting up the environment

#### Loading the Dataset and Explore

In [1]:
import pandas as pd

# Load dataset
dataset_path = r'/notebooks/combined_amazon_reviews.csv'
df = pd.read_csv(dataset_path)

# Print the first few rows of the DataFrame
print(df.columns)


Index(['id', 'dateAdded', 'dateUpdated', 'name', 'asins', 'brand',
       'categories', 'primaryCategories', 'imageURLs', 'keys', 'manufacturer',
       'manufacturerNumber', 'reviews.date', 'reviews.dateSeen',
       'reviews.didPurchase', 'reviews.doRecommend', 'reviews.id',
       'reviews.numHelpful', 'reviews.rating', 'reviews.sourceURLs',
       'reviews.text', 'reviews.title', 'reviews.username', 'sourceURLs',
       'reviews.dateAdded', 'reviews.userCity', 'reviews.userProvince'],
      dtype='object')


  df = pd.read_csv(dataset_path)


#### Removing unnecesary columns

In [2]:
# Define the columns to keep
columns_to_keep = ['name', 'brand', 'primaryCategories', 'reviews.rating', 'reviews.text']

# Drop the columns that are not in the 'columns_to_keep' list
df_cleaned = df[columns_to_keep]

# Verify the cleaned dataset
print("Cleaned Dataset Preview:")
print(df_cleaned.head())

# Check the remaining columns
print("Remaining Columns:")
print(df_cleaned.columns)


Cleaned Dataset Preview:
                                                name         brand  \
0  AmazonBasics AAA Performance Alkaline Batterie...  Amazonbasics   
1  AmazonBasics AAA Performance Alkaline Batterie...  Amazonbasics   
2  AmazonBasics AAA Performance Alkaline Batterie...  Amazonbasics   
3  AmazonBasics AAA Performance Alkaline Batterie...  Amazonbasics   
4  AmazonBasics AAA Performance Alkaline Batterie...  Amazonbasics   

  primaryCategories  reviews.rating  \
0   Health & Beauty             3.0   
1   Health & Beauty             4.0   
2   Health & Beauty             5.0   
3   Health & Beauty             5.0   
4   Health & Beauty             5.0   

                                        reviews.text  
0  I order 3 of them and one of the item is bad q...  
1  Bulk is always the less expensive way to go fo...  
2  Well they are not Duracell but for the price i...  
3  Seem to work as well as name brand batteries a...  
4  These batteries are very long lasting the

#### Find and Clean Missing Values


In [3]:
# Check for missing values in all columns
missing_values = df_cleaned.isnull().sum()

# Display only columns that have missing values
missing_values = missing_values[missing_values > 0]

print("Columns with missing values:")
print(missing_values)


Columns with missing values:
name                  6760
primaryCategories    34660
reviews.rating          33
reviews.text             1
dtype: int64


In [4]:
# Drop rows where 'reviews.text' or 'reviews.rating' are missing
df_cleaned = df_cleaned.dropna(subset=['reviews.text', 'reviews.rating'])

# Verify if missing values are removed
print("Missing values after cleaning:")
print(df_cleaned.isnull().sum())


Missing values after cleaning:
name                  6759
brand                    0
primaryCategories    34626
reviews.rating           0
reviews.text             0
dtype: int64


#### Text Preprocessing Steps:
We will do some minimum text preprocessing since later we will use the BERT tokenizer for BERT-base-uncased model

In [5]:
import re

# Minimal preprocessing function (to prepare text for BERT tokenizer)
def preprocess_text_for_bert(text):
    # Convert to lowercase (for 'uncased' models; skip this for 'cased' models)
    text = text.lower()
    
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

# Apply preprocessing function to 'reviews.text'
df_cleaned['cleaned_text'] = df_cleaned['reviews.text'].apply(preprocess_text_for_bert)

# Preview the cleaned text
print(df_cleaned[['reviews.text', 'cleaned_text']].head())


                                        reviews.text  \
0  I order 3 of them and one of the item is bad q...   
1  Bulk is always the less expensive way to go fo...   
2  Well they are not Duracell but for the price i...   
3  Seem to work as well as name brand batteries a...   
4  These batteries are very long lasting the pric...   

                                        cleaned_text  
0  i order 3 of them and one of the item is bad q...  
1  bulk is always the less expensive way to go fo...  
2  well they are not duracell but for the price i...  
3  seem to work as well as name brand batteries a...  
4  these batteries are very long lasting the pric...  


In [6]:
df_cleaned.head()

Unnamed: 0,name,brand,primaryCategories,reviews.rating,reviews.text,cleaned_text
0,AmazonBasics AAA Performance Alkaline Batterie...,Amazonbasics,Health & Beauty,3.0,I order 3 of them and one of the item is bad q...,i order 3 of them and one of the item is bad q...
1,AmazonBasics AAA Performance Alkaline Batterie...,Amazonbasics,Health & Beauty,4.0,Bulk is always the less expensive way to go fo...,bulk is always the less expensive way to go fo...
2,AmazonBasics AAA Performance Alkaline Batterie...,Amazonbasics,Health & Beauty,5.0,Well they are not Duracell but for the price i...,well they are not duracell but for the price i...
3,AmazonBasics AAA Performance Alkaline Batterie...,Amazonbasics,Health & Beauty,5.0,Seem to work as well as name brand batteries a...,seem to work as well as name brand batteries a...
4,AmazonBasics AAA Performance Alkaline Batterie...,Amazonbasics,Health & Beauty,5.0,These batteries are very long lasting the pric...,these batteries are very long lasting the pric...


BERT Tokenization for sentiment classification

In [7]:
from transformers import BertTokenizer

# Load the pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the 'cleaned_text' column using BERT tokenizer
def tokenize_text(text):
    return tokenizer(text, padding='max_length', truncation=True, max_length=128, return_tensors="pt")

# Tokenize a single example for testing
sample_text = df_cleaned['cleaned_text'][0]
tokenized_output = tokenize_text(sample_text)
print(tokenized_output)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

{'input_ids': tensor([[  101,  1045,  2344,  1017,  1997,  2068,  1998,  2028,  1997,  1996,
          8875,  2003,  2919,  3737,  1012,  2003,  4394, 10200,  3500,  2061,
          1045,  2031,  2000,  2404,  1037, 27019,  1997, 13061,  2000,  2191,
          1996,  6046,  2147,  1012,   102,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,  

Tokenizing entire dataset

In [8]:
# Tokenize the entire dataset (reviews.text)
# Store tokenized input IDs and attention masks
input_ids = []
attention_masks = []

for review in df_cleaned['cleaned_text']:
    encoded_review = tokenize_text(review)
    input_ids.append(encoded_review['input_ids'])
    attention_masks.append(encoded_review['attention_mask'])

# Convert lists to tensors for later use with BERT
print("Tokenization complete!")


Tokenization complete!


# Setting up BERT based model for sentiment classification
we will map both reviews.rating and reviews.text columns to obtain a more accurate sentiment, instead of only the numerical rating

Mapping reviews.rating to Sentiment Labels

In [9]:
# Function to derive sentiment from reviews.rating
def map_rating_sentiment(rating):
    if rating >= 4:
        return 2  # Positive
    elif rating == 3:
        return 1  # Neutral
    else:
        return 0  # Negative

# Create sentiment labels based on ratings
df_cleaned['rating_sentiment'] = df_cleaned['reviews.rating'].apply(map_rating_sentiment)

# Check the rating sentiment labels
print(df_cleaned[['reviews.rating', 'rating_sentiment']].head())




   reviews.rating  rating_sentiment
0             3.0                 1
1             4.0                 2
2             5.0                 2
3             5.0                 2
4             5.0                 2


Using a Pre-trained Sentiment Analysis Model for reviews.text (using distibert)

In [None]:
from transformers import pipeline

# Load the pre-trained sentiment analysis model from Hugging Face
sentiment_analyzer = pipeline("text-classification", model="distilbert/distilbert-base-uncased-finetuned-sst-2-english")

# Function to analyze review text sentiment using the pre-trained model
def analyze_review_sentiment(review_text):
    try:
        analysis = sentiment_analyzer(review_text)[0]
        if analysis['label'] == 'POSITIVE':
            return 2  # Positive
        elif analysis['label'] == 'NEGATIVE':
            return 0  # Negative
        else:
            return 1  # Neutral
    except:
        return 1  # If there's an error or too short text, mark as neutral

# Apply the sentiment analysis model to 'reviews.text'
df_cleaned['text_sentiment'] = df_cleaned['reviews.text'].apply(analyze_review_sentiment)

# Check the text sentiment labels
print(df_cleaned[['reviews.text', 'text_sentiment']].head())


2024-10-15 09:50:42.216597: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-15 09:50:42.216703: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-15 09:50:42.217704: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-15 09:50:42.224379: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1181 > 512). Running this sequence through the model will result in indexing errors


Combine reviews.rating and reviews.text Sentiments

In [None]:
# Combine rating and text sentiment to derive final sentiment
def combine_sentiments(rating_sentiment, text_sentiment):
    # If the rating is neutral, we rely on the text sentiment
    if rating_sentiment == 1:  # Neutral rating
        return text_sentiment
    else:
        # Otherwise, prioritize the rating sentiment
        return rating_sentiment

# Apply the function to combine both sentiments
df_cleaned['final_sentiment'] = df_cleaned.apply(lambda row: combine_sentiments(row['rating_sentiment'], row['text_sentiment']), axis=1)

# Check the final sentiment labels
print(df_cleaned[['reviews.rating', 'reviews.text', 'rating_sentiment', 'text_sentiment', 'final_sentiment']].head())


Train-Test Split

In [None]:
from sklearn.model_selection import train_test_split
import torch

# Convert input_ids and attention_masks to tensors
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(df_cleaned['sentiment'].values)

# Split the data into training and validation sets
train_inputs, val_inputs, train_labels, val_labels = train_test_split(input_ids, labels, test_size=0.2, random_state=42)
train_masks, val_masks = train_test_split(attention_masks, test_size=0.2, random_state=42)

# Move data to the GPU (if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_inputs, train_masks, train_labels = train_inputs.to(device), train_masks.to(device), train_labels.to(device)
val_inputs, val_masks, val_labels = val_inputs.to(device), val_masks.to(device), val_labels.to(device)


BERT Model Setup

Training Loop

Model Evaluation