1. Setup


In [None]:
from transformers import pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from collections import Counter
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import datasets
import numpy as np
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from transformers import GPT2LMHeadModel, GPT2Tokenizer


# Restrict the `datasets` library's logging to error-level messages.
datasets.logging.set_verbosity_error()

Load model


In [None]:
# The tokenizer and the classifier are loaded from the same checkpoint name.
SENTIMENT_CHECKPOINT = 'nlptown/bert-base-multilingual-uncased-sentiment'

tokenizer = AutoTokenizer.from_pretrained(SENTIMENT_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(SENTIMENT_CHECKPOINT)

Load data


In [None]:
# Read the raw review export into a DataFrame.
REVIEWS_CSV_PATH = '/notebooks/Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products_May19.csv'
df = pd.read_csv(REVIEWS_CSV_PATH)

Data cleaning


# Keep only rows with real review text: drop missing values first, then drop
# rows whose text is empty or whitespace-only once cast to str.
df = df.dropna(subset=['reviews.text']).reset_index(drop=True)
df['reviews.text'] = df['reviews.text'].astype(str)
has_text = df['reviews.text'].str.strip().astype(bool)
df = df[has_text].reset_index(drop=True)

# Discard these columns; errors='ignore' skips any that are not present.
columns_to_drop_df = ['dateAdded', 'dateUpdated', 'asins', 'reviews.date', 'manufacturerNumber']
df = df.drop(columns=columns_to_drop_df, errors='ignore')


In [None]:
# Review texts as a plain list, plus a one-column frame holding the same values.
reviews = df['reviews.text'].tolist()
df_reviews = pd.DataFrame({'review': reviews})

Sentiment


In [None]:
# Run on CUDA when torch reports it as available, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Move the model to that device and put it in evaluation mode.
model.to(device)
model.eval()


def sentiment_score(review):
    """Return the 1-5 score the sentiment model predicts for ``review``.

    Relies on the module-level ``tokenizer``, ``model`` and ``device``.
    The input is truncated to at most 512 tokens before it is scored.
    """
    input_ids = tokenizer.encode(
        review, return_tensors='pt', truncation=True, max_length=512)
    input_ids = input_ids.to(device)

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        logits = model(input_ids).logits

    # argmax gives the predicted class index (0-4); shift it onto the 1-5 scale.
    return int(torch.argmax(logits)) + 1


def map_score_to_classification(score):
    """Translate a numeric score into a sentiment label.

    Scores of 2 or below are 'Negative', exactly 3 is 'Neutral', and
    everything else is 'Positive'.
    """
    if score == 3:
        return 'Neutral'
    return 'Negative' if score <= 2 else 'Positive'


# Score every review with the sentiment model, then derive the coarse
# Negative / Neutral / Positive label from each score.
sentiment_scores = [sentiment_score(review) for review in reviews]
sentiment_classifications = [
    map_score_to_classification(score) for score in sentiment_scores
]

# Store the results on df: the aggregation step later groups on the
# 'sentiment_score' column, which nothing else creates. `reviews` was taken
# from df['reviews.text'] in row order, so positional assignment lines up
# (pandas raises ValueError if the lengths ever differ).
df['sentiment_score'] = sentiment_scores
df['sentiment_classification'] = sentiment_classifications

In [None]:
# Split the comma-separated 'categories' string into a list of trimmed,
# non-empty labels. A value that is not a string (e.g. NaN for a missing
# cell) yields an empty list instead of raising AttributeError on .split().
df['categories_list'] = df['categories'].apply(
    lambda x: [cat.strip() for cat in x.split(',') if cat.strip()]
    if isinstance(x, str) else [])

# Consolidate similar categories to eliminate redundancy.
# Keys are labels as they appear in the data; values are the labels that
# replace them. Labels without an entry are left unchanged (see the .get()
# fallback below).
# NOTE(review): the mapping is applied exactly once per label, so chained
# entries are not followed: 'All Tablets' / 'Amazon Tablets' end up as
# 'Tablets', while a label that is already 'Tablets' becomes
# 'Computers & Tablets'. Confirm whether that split is intended.
# NOTE(review): entries that map a label to itself (e.g. 'Amazon Tap',
# 'Cases & Bags') have no effect.
consolidation_mapping = {
    'All Bluetooth & Wireless Speakers': 'Bluetooth & Wireless Speakers',
    'All Streaming Media Players': 'Streaming Media Players',
    'All Tablets': 'Tablets',
    'Amazon Tablets': 'Tablets',
    'Amazon Book Reader': 'Book Readers',
    'Amazon Book Reader Accessory': 'Book Reader Accessories',
    'Amazon Device Accessories': 'Device Accessories',
    'Amazon Devices': 'Devices',
    'Amazon Devices & Accessories': 'Devices & Accessories',
    'Amazon Ereaders': 'Ereaders',
    'Amazon Tablet Accessory': 'Tablet Accessories',
    'Amazon Tap': 'Amazon Tap',
    'Audio Player Accessories': 'Audio Accessories',
    'Carrying Case Or Bag': 'Carrying Cases & Bags',
    'Cases & Bags': 'Cases & Bags',
    'Cases & Covers': 'Cases & Covers',
    'Carriers & Crates': 'Carriers & Crates',
    'Carriers & Totes': 'Carriers & Totes',
    'Cookware': 'Home & Kitchen',
    'Kitchen & Dining': 'Home & Kitchen',
    'Portable Audio & Headphones': 'Audio',
    'Tablets': 'Computers & Tablets',
    'iPads': 'Computers & Tablets',
    'E-readers': 'Computers & Tablets',
}

# Replace each label with its consolidated form, keeping unmapped labels as-is.
df['categories_list'] = df['categories_list'].apply(
    lambda cats: [consolidation_mapping.get(cat, cat) for cat in cats]
)

# Remove or assign to a meaningful category


def clean_category(cat):
    """Return ``cat`` unchanged, or ``None`` when it is on the discard list."""
    discarded = ('14701001', 'AA', 'AAA', 'Abis Electronics', 'Amazon', 'Amazon SMP')
    return None if cat in discarded else cat


# Run every label through clean_category() and keep the ones it does not
# reject (rejected labels come back as None).
df['categories_list'] = df['categories_list'].apply(
    lambda cats: [kept for kept in map(clean_category, cats) if kept is not None]
)

# Sorted list of every distinct label found across all rows' category lists.
unique_categories = sorted(
    {cat for cats in df['categories_list'] for cat in cats})


# Embed each category label with a sentence-transformer model and hold the
# result as a NumPy array.
embedding_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
category_embeddings = np.array(
    embedding_model.encode(unique_categories, show_progress_bar=True))

# Group the category embeddings into a fixed number of clusters; the seed is
# pinned via random_state.
num_clusters = 6
kmeans = KMeans(random_state=42, n_clusters=num_clusters)
cluster_labels = kmeans.fit_predict(category_embeddings)

# One row per category label together with the cluster it was assigned to.
category_cluster_df = pd.DataFrame(
    {'category': unique_categories, 'category_cluster': cluster_labels})

# Lookup table: category label -> cluster id.
category_to_cluster = dict(zip(
    category_cluster_df['category'],
    category_cluster_df['category_cluster'],
))

# Assign primary cluster to each review


def assign_primary_cluster(categories):
    """Return the cluster id that occurs most often among ``categories``.

    Labels with no entry in the module-level ``category_to_cluster`` lookup
    are skipped. When no label maps to a cluster the result is ``np.nan``.
    """
    clusters = []
    for cat in categories:
        cluster_id = category_to_cluster.get(cat)
        # Compare against None explicitly so that cluster id 0 is kept.
        if cluster_id is not None:
            clusters.append(cluster_id)

    if clusters:
        return Counter(clusters).most_common(1)[0][0]
    return np.nan


# Primary category cluster for each row (NaN when no label maps to a cluster).
df['cluster'] = df['categories_list'].map(assign_primary_cluster)


# One-hot encode the brand column into a dense array. scikit-learn 1.2
# renamed the dense-output flag from `sparse` to `sparse_output` and 1.4
# removed the old name, so try the new spelling first and fall back to the
# old one on releases that do not know it.
try:
    ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
brand_encoded = ohe.fit_transform(df[['brand']])

# Reuse df's index so the column-wise concat lines up row for row even if df
# does not carry a default 0..n-1 index at this point.
brand_encoded_df = pd.DataFrame(
    brand_encoded,
    columns=[f"brand_{cat}" for cat in ohe.categories_[0]],
    index=df.index,
)
df = pd.concat([df, brand_encoded_df], axis=1)

# Sentence embeddings for every row's product name, held as a NumPy array.
product_names = df['name'].tolist()
name_embeddings = np.array(
    embedding_model.encode(product_names, show_progress_bar=True))

# Reduce the name embeddings to 50 principal components.
pca_product = PCA(random_state=42, n_components=50)
name_pca = pca_product.fit_transform(name_embeddings)

# Feature blocks, one row per review: the category cluster id as a single
# column, the one-hot brand matrix, and the PCA-reduced name embeddings.
# NOTE(review): df['cluster'] can hold NaN (assign_primary_cluster returns
# np.nan when no label maps to a cluster) - confirm KMeans never sees NaN here.
category_cluster_feature = df['cluster'].values.reshape(-1, 1)
brand_features = brand_encoded
product_features = name_pca

# Stack the blocks side by side, then standardise every column.
combined_features = np.hstack(
    (category_cluster_feature, brand_features, product_features))
combined_scaler = StandardScaler()
combined_features_scaled = combined_scaler.fit_transform(combined_features)

# Final clustering over the standardised features; each row's label is
# written back to df.
final_k = 6
final_kmeans = KMeans(random_state=42, n_clusters=final_k)
final_cluster_labels = final_kmeans.fit_predict(combined_features_scaled)
df['final_cluster'] = final_cluster_labels

In [None]:
# Display titles for the clusters, keyed by cluster id 0-5 (list position = id).
cluster_title_mapping = dict(enumerate([
    'Power and Connectivity Accessories',
    'Amazon Devices and Media Equipment',
    'Computing and Mobile Devices',
    'Home and Office Products',
    'Carrying and Storage Accessories',
    'Audio Equipment and Accessories',
]))

Summarization with Bart-base


In [None]:
# Build the summarization pipeline on the first GPU when CUDA is available
# and on the CPU (device=-1) otherwise, so this cell also runs on machines
# without a GPU - the same check the sentiment model setup uses.
summarizer = pipeline("summarization", model="facebook/bart-base",
                      tokenizer="facebook/bart-base",
                      device=0 if torch.cuda.is_available() else -1)


def generate_summary(reviews_text):
    """Summarize ``reviews_text`` with the module-level ``summarizer`` pipeline.

    Args:
        reviews_text: The text to summarize, as a single string.

    Returns:
        ``reviews_text`` itself when it has fewer than 5 whitespace-separated
        words; otherwise the pipeline's ``summary_text`` for the input.
    """
    input_length = len(reviews_text.split())

    # Too short to summarize: hand the text back before doing any
    # tokenization work or touching the pipeline.
    if input_length < 5:
        return reviews_text

    # Length bounds passed to the pipeline: max_len is clamped to 5..25 and
    # stays below the word count where possible; min_len is half of max_len
    # but at least 3. Since max_len >= 5, min_len is always below max_len.
    max_len = max(5, min(25, input_length - 1))
    min_len = max(3, int(max_len * 0.5))

    # Cut the input to at most 1024 tokens, then decode back to text for the
    # pipeline. A local name is used so the module-level `tokenizer` is not
    # shadowed.
    bart_tokenizer = summarizer.tokenizer
    encoded = bart_tokenizer(reviews_text, truncation=True, max_length=1024)
    truncated_text = bart_tokenizer.decode(
        encoded['input_ids'], skip_special_tokens=True)

    summary = summarizer(
        truncated_text,
        max_length=max_len,
        min_length=min_len,
        do_sample=False
    )
    return summary[0]['summary_text']

In [None]:
def calculate_average_sentiment(df):
    """Aggregate sentiment per (final_cluster, name) pair.

    Returns a frame with one row per pair and two computed columns:
    ``avg_sentiment_score`` (mean of 'sentiment_score') and
    ``total_reviews`` (count of 'sentiment_score').
    """
    per_product = df.groupby(['final_cluster', 'name'])
    summary = per_product.agg(
        avg_sentiment_score=pd.NamedAgg(column='sentiment_score', aggfunc='mean'),
        total_reviews=pd.NamedAgg(column='sentiment_score', aggfunc='count'),
    )
    # Turn the group keys back into ordinary columns.
    return summary.reset_index()


# Per-(final_cluster, name) mean score and review count. The helper reads the
# 'final_cluster', 'name' and 'sentiment_score' columns, so df must carry all
# three at this point.
aggregated_metrics = calculate_average_sentiment(df)


def get_top_n_best_rated_and_most_reviewed(agg_df, n=3):
    """Pick the leading ``n`` products of every cluster under two rankings.

    Returns a tuple ``(best_rated, most_reviewed)``: the first frame ranks
    by 'avg_sentiment_score', the second by 'total_reviews', both highest
    first within each 'final_cluster'.
    """
    def leaders(metric):
        # Order clusters ascending and the metric descending, then keep the
        # first n rows of each cluster.
        ranked = agg_df.sort_values(
            ['final_cluster', metric], ascending=[True, False])
        return ranked.groupby('final_cluster').head(n)

    return leaders('avg_sentiment_score'), leaders('total_reviews')


# For every cluster: the three products with the highest average score and
# the three with the most reviews.
top_3_best_rated, top_3_most_reviewed = get_top_n_best_rated_and_most_reviewed(
    aggregated_metrics, n=3)

Prompt function


In [None]:
def create_gpt2_comparison_prompt(cluster, top_3, cluster_title_mapping):
    """Build the text prompt that asks for a comparison of a cluster's products.

    Args:
        cluster: Cluster id, looked up in ``cluster_title_mapping``.
        top_3: DataFrame with 'name', 'avg_sentiment_score' and
            'total_reviews' columns; each row becomes one bullet line.
        cluster_title_mapping: Mapping from cluster id to a display title.
            An id without an entry is rendered as ``"Cluster <id>"``.

    Returns:
        The complete prompt as a single string.
    """
    category_title = cluster_title_mapping.get(cluster, f"Cluster {cluster}")

    prompt = [
        f"Category: {category_title}\n\n",
        "## Top 3 Products Comparison\n",
    ]

    for _, row in top_3.iterrows():
        # Each replacement field stays on one line: a line break inside the
        # braces of a single-quoted f-string is a SyntaxError before
        # Python 3.12.
        prompt.append(
            f"- **{row['name']}**: Average rating "
            f"{row['avg_sentiment_score']:.2f} based on "
            f"{row['total_reviews']} reviews.\n")

    prompt.append("\n### Key Differences Between the Top 3 Products:\n")
    prompt.append(
        "Compare the products based on features, price, and customer reviews. Highlight the strengths and weaknesses of each product.\n")

    return "".join(prompt)


# Build the comparison prompt for one cluster from its best-rated products.
cluster = 4
in_cluster = top_3_best_rated['final_cluster'] == cluster
top_3 = top_3_best_rated[in_cluster]
prompt = create_gpt2_comparison_prompt(cluster, top_3, cluster_title_mapping)

GPT2


In [None]:

# Load GPT-2 and its tokenizer.
# NOTE(review): this rebinds the module-level `tokenizer` and `model` that
# sentiment_score() reads, so re-running the sentiment cell after this one
# would score reviews with GPT-2 objects - confirm the intended cell order.
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Use EOS as the pad token so that padding is defined for this tokenizer.
tokenizer.pad_token = tokenizer.eos_token
model.resize_token_embeddings(len(tokenizer))

# Tokenize through the tokenizer's __call__ so the attention mask comes back
# together with the input ids.
encoded_prompt = tokenizer(prompt, return_tensors="pt")
input_ids = encoded_prompt["input_ids"]

# Pass the attention mask and pad token id explicitly; because pad and EOS
# share one id, generate() cannot infer the mask from the ids on its own.
generated_ids = model.generate(
    input_ids,
    attention_mask=encoded_prompt["attention_mask"],
    pad_token_id=tokenizer.pad_token_id,
    max_length=1000,
    num_return_sequences=1,
    no_repeat_ngram_size=2,
    temperature=0.7,
    top_p=0.9,
    do_sample=True
)

# Decode the single returned sequence (prompt plus continuation) to text.
generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)