# Robo Reviews Project
## The new product review aggregator

Download Libraries


In [1]:
# Libraries
import pandas as pd  # pandas and numpy for data manipulation
import numpy as np
import sklearn as skl # scikit-learn for machine learning models (e.g., clustering, classification).
import transformers # transformers and torch for sentiment analysis and text generation.
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import re
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk  # nltk for text preprocessing (tokenization, stop-word removal, etc.)
# Download necessary nltk datasets
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')  # For WordNet lemmatizer
from transformers import AutoTokenizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


## Summary of the Workflow:
1. Preprocessing: Clean, tokenize, and normalize the text data.
2. Sentiment Classification (Positive, Neutral, Negative): Create model to classify customer reviews based on their sentiment.
3. Meta-Category Classification: Group reviews into 4-6 meta-categories.
4. Blog-style Article Generation: Generate articles based on the product categories, including the top 3 products, complaints, and worst product.

## Step 1 Preprocessing: Load and Explore the Dataset

We will focus on these relevant columns:

reviews.text: The review content (text) for sentiment analysis.
reviews.rating: The numerical rating that can guide sentiment classification.
categories: Product categories, which we can use to group reviews into meta-categories.
reviews.date, reviews.numHelpful, and reviews.username: Can be used later for insights or blog generation.

Preprocessing Steps
Clean the review text: Remove any unwanted characters and normalize the text.
Handle missing values: Remove rows where reviews.text or reviews.rating is missing.
Sentiment labeling: Use reviews.rating to label reviews as positive, neutral, or negative.
Tokenization: Convert the cleaned text into numerical format using TF-IDF or embeddings.



In [2]:
# Load the dataset
file_path = 'combined_amazon_reviews.csv'
df = pd.read_csv(file_path)

# Explore the dataset
df.head()  # See the first few rows



  df = pd.read_csv(file_path)


Unnamed: 0,id,dateAdded,dateUpdated,name,asins,brand,categories,primaryCategories,imageURLs,keys,...,reviews.numHelpful,reviews.rating,reviews.sourceURLs,reviews.text,reviews.title,reviews.username,sourceURLs,reviews.dateAdded,reviews.userCity,reviews.userProvince
0,AVpgNzjwLJeJML43Kpxn,2015-10-30T08:59:32Z,2019-04-25T09:08:16Z,AmazonBasics AAA Performance Alkaline Batterie...,"B00QWO9P0O,B00LH3DMUO",Amazonbasics,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,https://images-na.ssl-images-amazon.com/images...,"amazonbasics/hl002619,amazonbasicsaaaperforman...",...,,3.0,https://www.amazon.com/product-reviews/B00QWO9...,I order 3 of them and one of the item is bad q...,... 3 of them and one of the item is bad quali...,Byger yang,"https://www.barcodable.com/upc/841710106442,ht...",,,
1,AVpgNzjwLJeJML43Kpxn,2015-10-30T08:59:32Z,2019-04-25T09:08:16Z,AmazonBasics AAA Performance Alkaline Batterie...,"B00QWO9P0O,B00LH3DMUO",Amazonbasics,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,https://images-na.ssl-images-amazon.com/images...,"amazonbasics/hl002619,amazonbasicsaaaperforman...",...,,4.0,https://www.amazon.com/product-reviews/B00QWO9...,Bulk is always the less expensive way to go fo...,... always the less expensive way to go for pr...,ByMG,"https://www.barcodable.com/upc/841710106442,ht...",,,
2,AVpgNzjwLJeJML43Kpxn,2015-10-30T08:59:32Z,2019-04-25T09:08:16Z,AmazonBasics AAA Performance Alkaline Batterie...,"B00QWO9P0O,B00LH3DMUO",Amazonbasics,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,https://images-na.ssl-images-amazon.com/images...,"amazonbasics/hl002619,amazonbasicsaaaperforman...",...,,5.0,https://www.amazon.com/product-reviews/B00QWO9...,Well they are not Duracell but for the price i...,... are not Duracell but for the price i am ha...,BySharon Lambert,"https://www.barcodable.com/upc/841710106442,ht...",,,
3,AVpgNzjwLJeJML43Kpxn,2015-10-30T08:59:32Z,2019-04-25T09:08:16Z,AmazonBasics AAA Performance Alkaline Batterie...,"B00QWO9P0O,B00LH3DMUO",Amazonbasics,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,https://images-na.ssl-images-amazon.com/images...,"amazonbasics/hl002619,amazonbasicsaaaperforman...",...,,5.0,https://www.amazon.com/product-reviews/B00QWO9...,Seem to work as well as name brand batteries a...,... as well as name brand batteries at a much ...,Bymark sexson,"https://www.barcodable.com/upc/841710106442,ht...",,,
4,AVpgNzjwLJeJML43Kpxn,2015-10-30T08:59:32Z,2019-04-25T09:08:16Z,AmazonBasics AAA Performance Alkaline Batterie...,"B00QWO9P0O,B00LH3DMUO",Amazonbasics,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,https://images-na.ssl-images-amazon.com/images...,"amazonbasics/hl002619,amazonbasicsaaaperforman...",...,,5.0,https://www.amazon.com/product-reviews/B00QWO9...,These batteries are very long lasting the pric...,... batteries are very long lasting the price ...,Bylinda,"https://www.barcodable.com/upc/841710106442,ht...",,,


In [3]:
print(df.columns)  # Check the available columns

Index(['id', 'dateAdded', 'dateUpdated', 'name', 'asins', 'brand',
       'categories', 'primaryCategories', 'imageURLs', 'keys', 'manufacturer',
       'manufacturerNumber', 'reviews.date', 'reviews.dateSeen',
       'reviews.didPurchase', 'reviews.doRecommend', 'reviews.id',
       'reviews.numHelpful', 'reviews.rating', 'reviews.sourceURLs',
       'reviews.text', 'reviews.title', 'reviews.username', 'sourceURLs',
       'reviews.dateAdded', 'reviews.userCity', 'reviews.userProvince'],
      dtype='object')


## Data Cleaning

In [4]:
# Drop rows with missing reviews or ratings
df = df.dropna(subset=['reviews.text', 'reviews.rating'])

# Initialize stopwords 
stop_words = set(stopwords.words('english'))

# Function to clean and preprocess text
def clean_text(text):
    # Remove non-alphanumeric characters
    text = re.sub(r'[^\w\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()  # This ensures extra spaces are removed
    # Tokenize and remove stopwords
    text = word_tokenize(text)
    text = [word for word in text if word not in stop_words]  # Remove stopwords
    return ' '.join(text)

# Apply cleaning function to reviews
df['cleaned_text'] = df['reviews.text'].apply(clean_text)

# Check cleaned data
print(df[['reviews.text', 'cleaned_text']].head())

                                        reviews.text  \
0  I order 3 of them and one of the item is bad q...   
1  Bulk is always the less expensive way to go fo...   
2  Well they are not Duracell but for the price i...   
3  Seem to work as well as name brand batteries a...   
4  These batteries are very long lasting the pric...   

                                        cleaned_text  
0  order 3 one item bad quality missing backup sp...  
1    bulk always less expensive way go products like  
2                          well duracell price happy  
3  seem work well name brand batteries much bette...  
4                 batteries long lasting price great  


## Step 2: Sentiment Classification (Positive, Neutral, Negative)
We'll classify sentiment using the rating (1-5 scale):

Positive: Ratings 4-5,
Neutral: Rating 3,
Negative: Ratings 1-2



In [5]:
# Label sentiment based on the rating
def label_sentiment(rating):
    if rating >= 4:
        return 'positive'
    elif rating == 3:
        return 'neutral'
    else:
        return 'negative'

# Apply sentiment labeling
df['sentiment'] = df['reviews.rating'].apply(label_sentiment)

# Check distribution of sentiment
print(df['sentiment'].value_counts())


sentiment
positive    62546
neutral      2902
negative     2510
Name: count, dtype: int64


Tokenization

In [6]:
# Tokenization
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Tokenize the dataset
def tokenize_function(text):
    return tokenizer(text, padding="max_length", truncation=True, return_tensors="pt")

# Apply tokenization to the 'cleaned_text' column
df['tokenized'] = df['cleaned_text'].apply(lambda x: tokenize_function(x)['input_ids'].squeeze().tolist())

# Convert 'tokenized' column to a list if it's a DatasetDict or tensor object
df['tokenized'] = df['tokenized'].apply(lambda x: x.tolist() if isinstance(x, torch.Tensor) else x)

# Check tokenized data
print(df[['cleaned_text', 'tokenized']].head())


# Split data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df['tokenized'], df['sentiment'], test_size=0.2, random_state=42)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

                                        cleaned_text  \
0  order 3 one item bad quality missing backup sp...   
1    bulk always less expensive way go products like   
2                          well duracell price happy   
3  seem work well name brand batteries much bette...   
4                 batteries long lasting price great   

                                           tokenized  
0  [101, 2344, 1017, 2028, 8875, 2919, 3737, 4394...  
1  [101, 9625, 2467, 2625, 6450, 2126, 2175, 3688...  
2  [101, 2092, 4241, 22903, 3363, 3976, 3407, 102...  
3  [101, 4025, 2147, 2092, 2171, 4435, 10274, 217...  
4  [101, 10274, 2146, 9879, 3976, 2307, 102, 0, 0...  


Integrating BERT for sentiment analysis 

In [7]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

# Define the folder path where the model is saved
model_folder = "./Robo-Reviews-Project/r"

# Load the tokenizer from the saved model folder
tokenizer = AutoTokenizer.from_pretrained(model_folder)

# Load the model from the saved model folder
model = AutoModelForSequenceClassification.from_pretrained(model_folder)

# Ensure the model is set to evaluation mode (important for inference)
model.eval()

# Define the device (GPU if available, otherwise CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)  # Move the model to the appropriate device


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [8]:
# Example function to predict sentiment using the loaded model
def predict_sentiment(text):
    # Tokenize the input text and move to the appropriate device
    inputs = tokenizer(text, return_tensors="pt", padding="max_length", truncation=True).to(device)

    # Perform inference (no gradient calculation needed for evaluation)
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Get the predicted label (index of the max logit)
    predictions = torch.argmax(outputs.logits, dim=-1).item()

    # Map the label index to sentiment
    label_map = {0: 'positive', 1: 'neutral', 2: 'negative'}
    return label_map[predictions]

# Example usage
sample_text = "This is a great product!"
predicted_sentiment = predict_sentiment(sample_text)
print(f"Predicted sentiment: {predicted_sentiment}")


Predicted sentiment: positive


In [9]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import Dataset
import torch

# Load the tokenizer and model from your saved folder (adjust the path if necessary)
model_folder = "./Robo-Reviews-Project/r"
tokenizer = AutoTokenizer.from_pretrained(model_folder)
model = AutoModelForSequenceClassification.from_pretrained(model_folder)

# Tokenize the text data
def tokenize_function(examples):
    return tokenizer(examples["cleaned_text"], padding="max_length", truncation=True)

# Convert the pandas dataframe to a Huggingface Dataset
df['label'] = df['sentiment'].map({'positive': 0, 'neutral': 1, 'negative': 2})  
dataset = Dataset.from_pandas(df[['cleaned_text', 'label']])

# Tokenize the dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Split the dataset into training and test datasets
train_test_split = tokenized_dataset.train_test_split(test_size=0.2)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

# Set the format for PyTorch
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# You can now use Trainer, TrainingArguments, and other functionalities as needed


2024-10-17 18:23:03.243859: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-17 18:23:03.243931: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-17 18:23:03.245125: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-17 18:23:03.252188: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Map:   0%|          | 0/67958 [00:00<?, ? examples/s]

We will train BERT to predict sentiments based on the review text. Since BERT is pre-trained, we will fine-tune it on the review dataset for sentiment classification

In [10]:
# Saving the model in folder
output_dir = "./Robo-Reviews-Project/r"

In [11]:
import os
output_dir = './results'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

In [12]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import Trainer
import torch
import random
import numpy as np

# Set seed for reproducibility
seed = 42
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)  # If using a GPU
np.random.seed(seed)
random.seed(seed)

# Step 1: Define the metrics calculation function
# This function calculates accuracy, precision, recall, and F1-score.
def compute_metrics(p):
    predictions, labels = p
    preds = predictions.argmax(-1)  # Get the predicted labels
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',            # output directory
    evaluation_strategy="epoch",       # evaluate at the end of every epoch
    learning_rate=1e-5,                # learning rate
    per_device_train_batch_size=16,    # batch size for training
    per_device_eval_batch_size=16,     # batch size for evaluation
    num_train_epochs=2,                # Increase this to fine-tune longer
    weight_decay=0.02,                 # stronger regularization to prevent overfitting
    report_to=["none"],                # avoid reporting to any platform
    gradient_accumulation_steps=2     # simulate larger batch size if memory is limited
)

# Step 2: Define Trainer with the compute_metrics function
trainer = Trainer(
    model=model,                         
    args=training_args,                  
    train_dataset=train_dataset,         
    eval_dataset=test_dataset,           
    tokenizer=tokenizer,   
    compute_metrics=compute_metrics  # Added the metrics computation function
)

# Train the model
trainer.train()


# Evaluate the model and print metrics
metrics = trainer.evaluate()
print(metrics)
# After training, save the model and tokenizer
trainer.save_model(output_dir)  # Saves the model, tokenizer, and config to 'output_dir'
tokenizer.save_pretrained(output_dir)  # Save the tokenizer to the same directory



You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.0852,0.062728,0.986683,0.986397,0.986338,0.986683
2,0.0565,0.067497,0.986536,0.986329,0.986238,0.986536


{'eval_loss': 0.06749678403139114, 'eval_accuracy': 0.9865361977633902, 'eval_f1': 0.986328855985762, 'eval_precision': 0.986237538727686, 'eval_recall': 0.9865361977633902, 'eval_runtime': 122.9193, 'eval_samples_per_second': 110.577, 'eval_steps_per_second': 6.915, 'epoch': 2.0}


('./results/tokenizer_config.json',
 './results/special_tokens_map.json',
 './results/vocab.txt',
 './results/added_tokens.json',
 './results/tokenizer.json')

Evaluate the BERT Model:
Evaluating the model on the test dataset to ensure the model has learned the sentiment classification.

In [13]:
# Evaluate the model
metrics = trainer.evaluate()
print(f"Evaluation Results: Loss = {metrics['eval_loss']:.4f}, "
      f"Accuracy = {metrics['eval_accuracy']:.4f}, "
      f"F1-Score = {metrics['eval_f1']:.4f}, "
      f"Precision = {metrics['eval_precision']:.4f}, "
      f"Recall = {metrics['eval_recall']:.4f}")


Evaluation Results: Loss = 0.0675, Accuracy = 0.9865, F1-Score = 0.9863, Precision = 0.9862, Recall = 0.9865


Use BERT for Sentiment Prediction on Entire Dataset

In [14]:
# Step 4: Fix the `predict_sentiment` function to ensure inputs and model are on the same device (CUDA or CPU) 
def predict_sentiment(text):
    # Ensure everything is moved to the same device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)  # Move model to the appropriate device

    # Tokenize and prepare inputs, move inputs to the same device
    inputs = tokenizer(text, return_tensors="pt", padding="max_length", truncation=True).to(device)

    # Perform prediction
    outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=-1)

    # Map predictions to sentiment labels
    label_map = {0: 'positive', 1: 'neutral', 2: 'negative'}
    return label_map[predictions.item()]

#  Apply BERT sentiment prediction to the cleaned text (Updated Code)
df['bert_sentiment'] = df['cleaned_text'].apply(predict_sentiment)

# Check the distribution of predicted sentiments
print(df['bert_sentiment'].value_counts())


bert_sentiment
positive    62840
neutral      2675
negative     2443
Name: count, dtype: int64


In [15]:
# Print the first 20 reviews, their star ratings, and the DistilBERT predicted sentiment

# Define a function to apply the model and predict the sentiment
def predict_sentiment(text):
    # Tokenize the input text and move to the appropriate device
    inputs = tokenizer(text, return_tensors="pt", padding="max_length", truncation=True).to(device)

    # Perform inference (no gradient calculation needed for evaluation)
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Get the predicted label (index of the max logit)
    predictions = torch.argmax(outputs.logits, dim=-1).item()

    # Map the label index to sentiment
    label_map = {0: 'positive', 1: 'neutral', 2: 'negative'}
    return label_map[predictions]

# Compare the first 20 reviews, star ratings, and predicted sentiments
for idx, row in df.head(20).iterrows():
    review_text = row['reviews.text']
    star_rating = row['reviews.rating']
    predicted_sentiment = predict_sentiment(row['cleaned_text'])
    df['bert_sentiment']

    print(f"Review {idx+1}:")
    print(f"Text: {review_text}")
    print(f"Star Rating: {star_rating}")
    print(f"Predicted Sentiment: {predicted_sentiment}")
    print("-" * 50)


Review 1:
Text: I order 3 of them and one of the item is bad quality. Is missing backup spring so I have to put a pcs of aluminum to make the battery work.
Star Rating: 3.0
Predicted Sentiment: neutral
--------------------------------------------------
Review 2:
Text: Bulk is always the less expensive way to go for products like these
Star Rating: 4.0
Predicted Sentiment: positive
--------------------------------------------------
Review 3:
Text: Well they are not Duracell but for the price i am happy.
Star Rating: 5.0
Predicted Sentiment: positive
--------------------------------------------------
Review 4:
Text: Seem to work as well as name brand batteries at a much better price
Star Rating: 5.0
Predicted Sentiment: positive
--------------------------------------------------
Review 5:
Text: These batteries are very long lasting the price is great.
Star Rating: 5.0
Predicted Sentiment: positive
--------------------------------------------------
Review 6:
Text: Bought a lot of batterie


Comparison with Rating-Based Sentiment:
After predicting the sentiment using BERT, comparing results with the rating-based labels originally used in step 2

In [16]:
# Compare the BERT predicted sentiment with the rating-based sentiment
print(df[['sentiment', 'bert_sentiment']].head())


  sentiment bert_sentiment
0   neutral        neutral
1  positive       positive
2  positive       positive
3  positive       positive
4  positive       positive


In [17]:
import torch

torch.cuda.empty_cache()  # Clear the GPU cache

# Step 3: Meta-Category Classification
Step 3: Meta-Category Classification, focused on the XLNet model and including the evaluation with Silhouette Score and Davies-Bouldin Index.

In [None]:
# Import the models and tokenizers for XLNET, ELECTRA, and ALBERT
from transformers import XLNetTokenizer, XLNetForSequenceClassification

import torch
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


# First, split the data into training and testing sets
from sklearn.model_selection import train_test_split

# Split df into df_train and df_test (80% train, 20% test)
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

# Define the meta-categories based on broader product categories
categories_mapping = {
    "Ebook Readers": ["kindle", "ereader"],
    "Batteries": ["battery", "charge", "AAA", "AA", "alkaline"],
    "Accessories": ["keyboard", "mouse", "laptop stand", "case", "headphones" "adapter", "speakers", "charger", "cables", "remote controls", "docker" "TV fires sticks", "docker",],
    "Tablets": ["Ipad", "Kids Tablets", "Fire Tablets", "Amazon Tablets"],
    "Non-Electronics": ["nespresso", "pod", "pet carrier", "coffee"]
}

# Function to categorize products based on keywords in the 'categories' column
def assign_category(product_category):
    for meta_category, keywords in categories_mapping.items():
        if any(keyword.lower() in str(product_category).lower() for keyword in keywords):
            return meta_category
    return 'Other'

# Apply category assignment to both training and test sets
df_train['meta_category'] = df_train['categories'].apply(assign_category)
df_test['meta_category'] = df_test['categories'].apply(assign_category)

# Map the 'meta_category' column to numerical labels
meta_category_mapping = {category: i for i, category in enumerate(categories_mapping.keys())}
meta_category_mapping['Other'] = len(meta_category_mapping) 
df_train['meta_category_label'] = df_train['meta_category'].map(meta_category_mapping)
df_test['meta_category_label'] = df_test['meta_category'].map(meta_category_mapping)

# Check the resulting meta-category labels
print(df_train[['categories', 'meta_category', 'meta_category_label']].head())
print(df_test[['categories', 'meta_category', 'meta_category_label']].head())

# Check for NaN values in the labels
print(df_train[df_train['meta_category_label'].isna()][['categories', 'meta_category']])
print(df_test[df_test['meta_category_label'].isna()][['categories', 'meta_category']])

# Convert 'tokenized' column and 'meta_category_label' into PyTorch tensors for both sets
X_train_meta = torch.tensor(df_train['tokenized'].tolist())
X_test_meta = torch.tensor(df_test['tokenized'].tolist())
y_train_meta = torch.tensor(df_train['meta_category_label'].tolist(), dtype=torch.long)  # Ensure long dtype for class indices
y_test_meta = torch.tensor(df_test['meta_category_label'].tolist(), dtype=torch.long)    # Ensure long dtype for class indices

# Reshape target tensors
y_train_meta = y_train_meta.view(-1)
y_test_meta = y_test_meta.view(-1)

# Define the models for XLNet, ELECTRA, and ALBERT
models = {
    "xlnet": XLNetForSequenceClassification.from_pretrained("xlnet-base-cased", num_labels=len(meta_category_mapping))
    # "electra": ElectraForSequenceClassification.from_pretrained("google/electra-small-discriminator", num_labels=len(meta_category_mapping)),
    # "albert": AlbertForSequenceClassification.from_pretrained("albert-base-v2", num_labels=len(meta_category_mapping))
}

# Create Hugging Face datasets for training and testing
train_dataset = Dataset.from_dict({
    "input_ids": X_train_meta.tolist(),
    "labels": y_train_meta.tolist()
})
test_dataset = Dataset.from_dict({
    "input_ids": X_test_meta.tolist(),
    "labels": y_test_meta.tolist()
})

# Set format for PyTorch
train_dataset.set_format(type='torch', columns=['input_ids', 'labels'])
test_dataset.set_format(type='torch', columns=['input_ids', 'labels'])


# Training loop for each model
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,  # Reduce the batch size to prevent memory overflow
    per_device_eval_batch_size=8,   # Reduce evaluation batch size as well
    num_train_epochs=5,
    weight_decay=0.01,
    report_to=["none"],
    gradient_accumulation_steps=2,    # Accumulate gradients over 2 steps to simulate a larger batch size
    fp16=True                         # Enable FP16 mixed precision training
)

# Metrics calculation function (accuracy, precision, recall, F1)
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(p.label_ids, preds, average='weighted')
    accuracy = accuracy_score(p.label_ids, preds)
    return {'accuracy': accuracy, 'f1': f1, 'precision': precision, 'recall': recall}

# Train and evaluate each model
results = {}
for model_name, model in models.items():
    print(f"Training {model_name}...")
    
    # Setup Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics
    )
    
    # Start training
trainer.train()

# Evaluate the XLNet model
metrics = trainer.evaluate()

# Print the evaluation metrics
print(f"Evaluation Results: Loss = {metrics['eval_loss']:.4f}, "
      f"Accuracy = {metrics['eval_accuracy']:.4f}, "
      f"F1-Score = {metrics['eval_f1']:.4f}, "
      )
     
# Saving the model in folder
output_dir = "./Robo-Reviews-Project/r"

# After training, save the model and tokenizer
trainer.save_model(output_dir)  # Saves the model, tokenizer, and config to 'output_dir'
tokenizer.save_pretrained(output_dir)  # Save the tokenizer to the same directory





                                              categories  meta_category  \
18505  Fire Tablets,Computers/Tablets & Networking,Ta...        Tablets   
56501  Stereos,Remote Controls,Amazon Echo,Audio Dock...  Ebook Readers   
14165  Fire Tablets,Learning Toys,Toys,Tablets,Amazon...        Tablets   
53946  Walmart for Business,Office Electronics,Tablet...  Ebook Readers   
60462  Stereos,Remote Controls,Amazon Echo,Audio Dock...  Ebook Readers   

       meta_category_label  
18505                    3  
56501                    0  
14165                    3  
53946                    0  
60462                    0  
                                              categories  meta_category  \
62339  Featured Brands,Electronics,Amazon Devices,Hom...  Ebook Readers   
60428  Stereos,Remote Controls,Amazon Echo,Audio Dock...  Ebook Readers   
20873  Fire Tablets,Tablets,Amazon Tablets,Computers ...        Tablets   
5012   AA,AAA,Health,Electronics,Health & Household,C...      Batteries   


config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training xlnet...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6043,0.584056,0.819968,0.788599,0.76426,0.819968


  _warn_prf(average, modifier, msg_start, len(result))


# Step 4: Generate Blog-Style Articles

Summarization Pipeline: Uses the BART model to generate summaries for each meta-category.
Top Products and Worst Product Identification: Selects the top 3 and worst product based on ratings and review count.
Top Complaints Extraction: Extracts common complaints for the top 3 products from negative reviews.
Blog Post Generation: Combines the summary, top products, differences, complaints, and the worst product into a blog-style format.

In [60]:
# Blog Article Generation using Summarization Pipeline with BART
from transformers import BartTokenizer, BartForConditionalGeneration

# Load the BART model and tokenizer for summarization 
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')

# Summarize reviews for each meta-category using BART 
def summarize_reviews(reviews):
    review_text = " ".join(reviews)  # Concatenate reviews into a single string
    inputs = tokenizer([review_text], max_length=1024, return_tensors='pt', truncation=True)  # Tokenize and truncate if needed
    summary_ids = model.generate(inputs['input_ids'], max_length=150, min_length=50, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary_text = tokenizer.decode(summary_ids[0], skip_special_tokens=True)  # Decode generated summary
    return summary_text

# Group reviews by meta-category 
grouped_reviews = df.groupby('meta_category')['cleaned_text'].apply(list).to_dict()

# Summarizing each meta-category 
summaries = {category: summarize_reviews(reviews) for category, reviews in grouped_reviews.items()}

# Identifying Top 3 Products and Worst Product in Each Category 
def get_top_products(df_category):
    top_products = df_category.groupby('product_name').agg({
        'reviews.rating': 'mean', 'reviews.numHelpful': 'sum', 'product_name': 'count'
    }).sort_values(by=['reviews.rating', 'product_name'], ascending=[False, False])
    
    # Get the top 3 products
    top_3 = top_products.head(3).index.tolist()
    
    # Get the worst product (lowest-rated product)
    worst = top_products.tail(1).index[0]
    
    return top_3, worst

# Extracting top complaints for the top 3 products 
def extract_top_complaints(df_category, top_3):
    complaints = {}
    for product in top_3:
        product_reviews = df_category[df_category['product_name'] == product]
        negative_reviews = product_reviews[product_reviews['sentiment'] == 'negative']
        complaint_keywords = negative_reviews['cleaned_text'].str.cat(sep=' ').split()
        complaint_summary = " ".join(pd.Series(complaint_keywords).value_counts().head(3).index)
        complaints[product] = complaint_summary
    return complaints





vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

KeyError: 'meta_category'

# Step 5: Final Blog Output
Finally, generate blog-style content based on the summaries and insights.



In [None]:
# Blog Article Generation using Summarization Pipeline with BART
from transformers import BartTokenizer, BartForConditionalGeneration

# Load the BART model and tokenizer for summarization 
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')

# Summarize reviews for each meta-category using BART 
def summarize_reviews(reviews):
    review_text = " ".join(reviews)  # Concatenate reviews into a single string
    inputs = tokenizer([review_text], max_length=1024, return_tensors='pt', truncation=True)  # Tokenize and truncate if needed
    summary_ids = model.generate(inputs['input_ids'], max_length=150, min_length=50, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary_text = tokenizer.decode(summary_ids[0], skip_special_tokens=True)  # Decode generated summary
    return summary_text

# Generate blog post for each meta-category 
def generate_blog_post(category, summary, top_3, worst_product, complaints):
    blog_post = f"Category: {category}\n\nSummary: {summary}\n\n"
    blog_post += "Top 3 Products:\n"
    
    for product in top_3:
        blog_post += f"- {product}\n"
    
    blog_post += "\nKey differences:\n- Product 1 is better for feature X\n- Product 2 excels at feature Y\n- Product 3 is known for durability\n"
    blog_post += "\nTop complaints:\n"
    
    for product, complaint in complaints.items():
        blog_post += f"- {product}: {complaint}\n"
    
    blog_post += f"\nWorst product: {worst_product}\nReason: Poor performance, durability issues."
    return blog_post

# Create blog posts for each meta-category 
blog_posts = {}

for category, reviews in grouped_reviews.items():
    df_category = df[df['meta_category'] == category]
    
    # Summarizing using BART
    summary = summaries[category]  # BART is now used for generating summaries
    top_3, worst_product = get_top_products(df_category)
    complaints = extract_top_complaints(df_category, top_3)
    
    # Generate blog posts
    blog_posts[category] = generate_blog_post(category, summary, top_3, worst_product, complaints)

# Display blog posts (remains the same)
for category, blog in blog_posts.items():
    print(f"Blog Post for {category}:\n{blog}\n")

