1. Setup


In [None]:
from transformers import pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from collections import Counter
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import datasets
import numpy as np
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import spacy
from collections import Counter

# Restrict the `datasets` library's log output to error-level messages.
datasets.logging.set_verbosity_error()

Load model


In [None]:
# The tokenizer and the classifier are loaded from the same checkpoint, so
# the name is spelled once and shared.
SENTIMENT_CHECKPOINT = 'nlptown/bert-base-multilingual-uncased-sentiment'

tokenizer = AutoTokenizer.from_pretrained(SENTIMENT_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(
    SENTIMENT_CHECKPOINT)

Load data


In [None]:
# Raw review export; the absolute path is specific to the notebook host.
REVIEWS_CSV_PATH = (
    '/notebooks/Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products_May19.csv'
)
df = pd.read_csv(REVIEWS_CSV_PATH)

Data cleaning


In [None]:
# Keep only rows whose 'reviews.text' is present and not blank.
df = df.dropna(subset=['reviews.text']).reset_index(drop=True)
df['reviews.text'] = df['reviews.text'].astype(str)
has_review_text = df['reviews.text'].str.strip().astype(bool)
df = df[has_review_text].reset_index(drop=True)

# Discard columns that the rest of the notebook never reads.
# errors='ignore' lets this run even when some of them are already absent.
columns_to_drop_df = [
    'dateAdded',
    'dateUpdated',
    'asins',
    'reviews.date',
    'manufacturerNumber',
]
df = df.drop(columns=columns_to_drop_df, errors='ignore')

In [None]:
# Plain Python list of review texts, plus a one-column frame holding the same.
reviews = df['reviews.text'].tolist()
df_reviews = pd.DataFrame({'review': reviews})

Sentiment


In [None]:
# Use the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# The classifier is only used for inference: move it to the chosen device
# and put it in evaluation mode.
model.to(device)
model.eval()


def sentiment_score(review):
    """Return the 1-5 rating the sentiment classifier predicts for *review*.

    The text is encoded with the module-level ``tokenizer`` (truncated to
    512 tokens), run through the module-level ``model`` on ``device`` with
    gradients disabled, and the index of the largest logit is shifted from
    the 0-4 range to 1-5.
    """
    input_ids = tokenizer.encode(
        review,
        return_tensors='pt',
        truncation=True,
        max_length=512,
    )
    input_ids = input_ids.to(device)

    with torch.no_grad():
        logits = model(input_ids).logits

    # Class indices run 0-4; the reported score runs 1-5.
    return int(torch.argmax(logits)) + 1


def map_score_to_classification(score):
    """Translate a numeric sentiment score into a coarse label.

    Scores of 2 or less are 'Negative', exactly 3 is 'Neutral', and anything
    else is 'Positive'.
    """
    if score == 3:
        return 'Neutral'
    return 'Negative' if score <= 2 else 'Positive'


# Score every review once, then derive the coarse label from each score.
sentiment_scores = [sentiment_score(text) for text in reviews]
sentiment_classifications = [
    map_score_to_classification(value) for value in sentiment_scores
]

df['sentiment_score'] = sentiment_scores
df['sentiment_classification'] = sentiment_classifications

In [None]:
# Split the comma-separated 'categories' string into a list of trimmed,
# non-empty names. A missing value (NaN is a float, not a str) becomes an
# empty list instead of raising AttributeError on `.split`.
df['categories_list'] = df['categories'].apply(
    lambda x: [cat.strip() for cat in x.split(',') if cat.strip()]
    if isinstance(x, str) else [])

# Consolidate similar categories to eliminate redundancy.
# Each key is a raw category label and each value is the label that replaces
# it; labels that are not keys are left unchanged.
# NOTE(review): the lookup below is single-pass -- a replaced label is not
# looked up again. For example 'All Tablets' becomes 'Tablets', even though
# 'Tablets' itself maps to 'Computers & Tablets'. Confirm this is intended.
consolidation_mapping = {
    'All Bluetooth & Wireless Speakers': 'Bluetooth & Wireless Speakers',
    'All Streaming Media Players': 'Streaming Media Players',
    'All Tablets': 'Tablets',
    'Amazon Tablets': 'Tablets',
    'Amazon Book Reader': 'Book Readers',
    'Amazon Book Reader Accessory': 'Book Reader Accessories',
    'Amazon Device Accessories': 'Device Accessories',
    'Amazon Devices': 'Devices',
    'Amazon Devices & Accessories': 'Devices & Accessories',
    'Amazon Ereaders': 'Ereaders',
    'Amazon Tablet Accessory': 'Tablet Accessories',
    'Amazon Tap': 'Amazon Tap',
    'Audio Player Accessories': 'Audio Accessories',
    'Carrying Case Or Bag': 'Carrying Cases & Bags',
    'Cases & Bags': 'Cases & Bags',
    'Cases & Covers': 'Cases & Covers',
    'Carriers & Crates': 'Carriers & Crates',
    'Carriers & Totes': 'Carriers & Totes',
    'Cookware': 'Home & Kitchen',
    'Kitchen & Dining': 'Home & Kitchen',
    'Portable Audio & Headphones': 'Audio',
    'Tablets': 'Computers & Tablets',
    'iPads': 'Computers & Tablets',
    'E-readers': 'Computers & Tablets',
}

# Replace every label in each row's list with its consolidated form,
# falling back to the label itself when it has no entry in the mapping.
df['categories_list'] = df['categories_list'].apply(
    lambda cats: [consolidation_mapping.get(cat, cat) for cat in cats]
)

# Remove or assign to a meaningful category


def clean_category(cat):
    """Return *cat* unchanged, or ``None`` when it is a label to discard.

    The discarded labels are a fixed list of values that do not describe a
    product category.
    """
    discarded_labels = (
        '14701001',
        'AA',
        'AAA',
        'Abis Electronics',
        'Amazon',
        'Amazon SMP',
    )
    return None if cat in discarded_labels else cat


# Drop the labels that clean_category rejects (it returns None for those).
df['categories_list'] = df['categories_list'].apply(
    lambda cats: [kept for kept in map(clean_category, cats)
                  if kept is not None]
)

# Every distinct category that survived, in sorted order.
unique_categories = sorted(
    {cat for cats in df['categories_list'] for cat in cats})

# Embed each category name with a sentence-transformer model.
embedding_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
category_embeddings = np.array(
    embedding_model.encode(unique_categories, show_progress_bar=True))

# Group the category embeddings into a fixed number of clusters.
num_clusters = 6
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
cluster_labels = kmeans.fit_predict(category_embeddings)

# Tabular view of which cluster each category landed in.
category_cluster_df = pd.DataFrame({
    'category': unique_categories,
    'category_cluster': cluster_labels,
})

# Lookup from category name to cluster id; .tolist() yields plain Python ints.
category_to_cluster = dict(zip(unique_categories, cluster_labels.tolist()))

# Assign primary cluster to each review


def assign_primary_cluster(categories):
    """Return the cluster id that occurs most often among *categories*.

    Each category is looked up in the module-level ``category_to_cluster``
    mapping; categories without an entry are ignored. When none of them has
    an entry (including an empty input) the result is ``np.nan``.
    """
    votes = Counter()
    for cat in categories:
        cluster = category_to_cluster.get(cat)
        if cluster is not None:
            votes[cluster] += 1

    if not votes:
        return np.nan

    winner, _count = votes.most_common(1)[0]
    return winner


# Majority category cluster per review (NaN when no category is known).
df['cluster'] = df['categories_list'].apply(assign_primary_cluster)


# One-hot encode the brand. scikit-learn renamed the `sparse` argument to
# `sparse_output` (1.2) and later removed the old spelling, so try the new
# name first and fall back for older installations.
try:
    ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
brand_encoded = ohe.fit_transform(df[['brand']])
# Reuse df's index so the column-wise concat below aligns row for row.
brand_encoded_df = pd.DataFrame(
    brand_encoded,
    columns=[f"brand_{cat}" for cat in ohe.categories_[0]],
    index=df.index)
df = pd.concat([df, brand_encoded_df], axis=1)

# Embed the product names and compress the embeddings to 50 components.
name_embeddings = embedding_model.encode(
    df['name'].tolist(), show_progress_bar=True)
name_embeddings = np.array(name_embeddings)

pca_product = PCA(n_components=50, random_state=42)
name_pca = pca_product.fit_transform(name_embeddings)

# Column vector of category-cluster ids, one row per review.
category_cluster_feature = df['cluster'].values.reshape(-1, 1)
brand_features = brand_encoded
product_features = name_pca

# Feature matrix: [category cluster | one-hot brand | name PCA components].
combined_features = np.hstack([
    category_cluster_feature,
    brand_features,
    product_features,
])

# Standardise the columns before clustering.
combined_scaler = StandardScaler()
combined_features_scaled = combined_scaler.fit_transform(combined_features)

final_k = 6

final_kmeans = KMeans(n_clusters=final_k, random_state=42)
final_cluster_labels = final_kmeans.fit_predict(combined_features_scaled)

df['final_cluster'] = final_cluster_labels

In [3]:
# Display titles for the final clusters, keyed by cluster id (0-5): the
# position of each title in the list is its cluster id.
cluster_title_mapping = dict(enumerate([
    'Power and Connectivity Accessories',
    'Amazon Devices and Media Equipment',
    'Computing and Mobile Devices',
    'Home and Office Products',
    'Carrying and Storage Accessories',
    'Audio Equipment and Accessories',
]))

Summarization with BART-base


In [None]:
# Summarization pipeline. Use the first GPU when torch can see one and fall
# back to the CPU (-1) otherwise, so the cell also runs on CPU-only hosts.
summarizer = pipeline(
    "summarization",
    model="facebook/bart-base",
    tokenizer="facebook/bart-base",
    device=0 if torch.cuda.is_available() else -1,
)


def generate_summary(reviews_text):
    """Return a short summary of *reviews_text*.

    Texts with fewer than five whitespace-separated words are returned
    unchanged. Longer texts are cut to at most 1024 tokenizer tokens and
    passed to the module-level ``summarizer`` pipeline with greedy decoding.

    Args:
        reviews_text: The review text to summarise.

    Returns:
        The generated summary, or the input itself when it is too short.
    """
    input_length = len(reviews_text.split())

    # Too short to summarise: return before any tokenizer or model work.
    if input_length < 5:
        return reviews_text

    # Upper bound follows the word count but stays within 5..25; the lower
    # bound is half of that (at least 3), which is always below max_len.
    max_len = max(5, min(25, input_length - 1))
    min_len = max(3, max_len // 2)

    # Cut over-long inputs to 1024 tokens, then turn them back into text.
    bart_tokenizer = summarizer.tokenizer
    encoded = bart_tokenizer(reviews_text, truncation=True, max_length=1024)
    truncated_text = bart_tokenizer.decode(
        encoded['input_ids'], skip_special_tokens=True)

    summary = summarizer(
        truncated_text,
        max_length=max_len,
        min_length=min_len,
        do_sample=False,
        # Guard in case re-tokenizing the decoded text yields extra tokens.
        truncation=True,
    )
    return summary[0]['summary_text']


# Summarise every review; a missing text yields an empty summary.
df['summary'] = [
    generate_summary(text) if pd.notnull(text) else ""
    for text in df['reviews.text']
]

Prompt function


Generate pros and cons


In [8]:
# Requires the 'en_core_web_sm' spaCy model to be installed (for example via
# `python -m spacy download en_core_web_sm`); without it spacy.load raises
# OSError [E050].
nlp = spacy.load("en_core_web_sm")


# Words that mark a noun chunk in a negative review as a complaint.
_CON_KEYWORDS = frozenset({'problem', 'issue', 'bad', 'poor'})


def extract_pros_cons_spacy(reviews, sentiments):
    """Collect the most frequent pros and cons from labelled reviews.

    Pros are lower-cased adjectives (dependency ``amod`` or ``acomp``) found
    in reviews labelled 'Positive'. Cons are lower-cased noun chunks from
    reviews labelled 'Negative' that either have a negation to the left of
    their root or contain one of the complaint keywords. Reviews with any
    other label are ignored.

    Args:
        reviews: Review texts.
        sentiments: Sentiment labels, paired with *reviews* by position.

    Returns:
        A ``(top_pros, top_cons)`` tuple holding up to ten entries each,
        most frequent first.
    """
    pros = []
    cons = []

    # Walk the two sequences in lockstep instead of indexing by position.
    for review, sentiment in zip(reviews, sentiments):
        if sentiment == 'Positive':
            pros.extend(
                token.text.lower()
                for token in nlp(review)
                if token.pos_ == 'ADJ' and token.dep_ in ('amod', 'acomp')
            )
        elif sentiment == 'Negative':
            for chunk in nlp(review).noun_chunks:
                is_negated = any(
                    token.dep_ == 'neg' for token in chunk.root.lefts)
                if is_negated or any(
                        token.text.lower() in _CON_KEYWORDS
                        for token in chunk):
                    cons.append(chunk.text.lower())

    top_pros = [word for word, _ in Counter(pros).most_common(10)]
    top_cons = [phrase for phrase, _ in Counter(cons).most_common(10)]

    return top_pros, top_cons


pros_cons_list = []
for cluster_id in df['final_cluster'].unique():
    # Filter the frame once per cluster and reuse the subset below.
    cluster_rows = df[df['final_cluster'] == cluster_id]

    # The product reported for the cluster is the one on its first row, in
    # whatever order df currently has.
    product_name = cluster_rows.iloc[0]['name']

    # Cluster-local names, so the notebook-level `reviews` list is not
    # overwritten.
    cluster_reviews = cluster_rows['reviews.text'].tolist()
    cluster_sentiments = cluster_rows['sentiment_classification'].tolist()

    # Extract pros and cons using spaCy
    pros, cons = extract_pros_cons_spacy(cluster_reviews, cluster_sentiments)

    # Store the results
    pros_cons_list.append({
        'cluster': cluster_id,
        'product_name': product_name,
        'pros': pros,
        'cons': cons
    })

# Display the pros and cons for each cluster's product
for entry in pros_cons_list:
    print(f"Cluster {entry['cluster']} - {entry['product_name']}")
    print("Pros: ", ', '.join(entry['pros']))
    print("Cons: ", ', '.join(entry['cons']))
    print()


# NOTE(review): the GPT-2 step reads 'product_pros_and_cons_spacy.csv';
# presumably that file is produced by enabling the two lines below -- confirm.
# pros_cons_df = pd.DataFrame(pros_cons_list)
# pros_cons_df.to_csv('product_pros_and_cons_spacy.csv', index=False)

OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

GPT2


In [None]:
# GPT-2 tokenizer and language model. These rebind the notebook-level
# `tokenizer` and `model` names. GPT-2 ships without a padding token, so the
# end-of-sequence token doubles as one.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

model = GPT2LMHeadModel.from_pretrained("gpt2")
model.eval()

# Reviews with summaries, clusters and sentiment, read back from disk.
df = pd.read_csv(
    '/notebooks/df_reviews_with_summaries_and_clusters_sentiment.csv')

# Load the CSV containing pros and cons, one record (dict) per row.
pros_cons_df = pd.read_csv('product_pros_and_cons_spacy.csv')
pros_cons_list = pros_cons_df.to_dict('records')

# Within each cluster the highest sentiment scores come first.
df = df.sort_values(
    by=['final_cluster', 'sentiment_score'],
    ascending=[True, False],
)


def generate_product_summary(product_name, product_category):
    """Generate an overview paragraph for a product with the GPT-2 model.

    A fixed prompt mentioning the product and its category is encoded with
    the module-level ``tokenizer`` and continued by the module-level
    ``model``.

    Args:
        product_name: Name of the product to describe.
        product_category: Human-readable category title.

    Returns:
        The decoded sequence returned by ``model.generate``.
    """
    prompt = (
        f"The {product_name} from the "
        f"{product_category} category is built to offer excellent performance. "
        "It stands out for its reliability, durability, and value for money. Here's a short overview of this product, outlining its key benefits."
    )

    inputs = tokenizer(prompt, return_tensors="pt", padding=True,
                       truncation=True)
    outputs = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=200,
        # temperature/top_p only take effect when sampling is switched on;
        # without do_sample=True generation is greedy and ignores them.
        do_sample=True,
        temperature=0.6,
        top_p=0.85,
        no_repeat_ngram_size=3,
        # early_stopping is a beam-search option and no beams are used here,
        # so it is not passed.
        pad_token_id=tokenizer.eos_token_id
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)


def generate_product_features(product_name):
    """Generate a key-features paragraph for a product with the GPT-2 model.

    A fixed prompt mentioning the product is encoded with the module-level
    ``tokenizer`` and continued by the module-level ``model``.

    Args:
        product_name: Name of the product to describe.

    Returns:
        The decoded sequence returned by ``model.generate``.
    """
    prompt = (
        f"The key features of the "
        f"{product_name} include its high performance, durable design, and superior functionality. "
        "This product is ideal for consumers who need reliable technology. It offers great value when compared to competitors in the same category."
    )

    # Encode through the tokenizer call so an attention mask is available;
    # the pad token equals the EOS token, so the mask cannot be inferred.
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=200,
        # temperature/top_p only take effect when sampling is switched on;
        # without do_sample=True generation is greedy and ignores them.
        do_sample=True,
        temperature=0.6,
        top_p=0.85,
        no_repeat_ngram_size=2,
        # early_stopping is a beam-search option and no beams are used here,
        # so it is not passed.
        pad_token_id=tokenizer.eos_token_id
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)


# Generate the blog post
def generate_blog_post(product_name, product_category, pros, cons):
    """Assemble a Markdown blog post for one product.

    The overview and key-features sections are produced by
    ``generate_product_summary`` and ``generate_product_features``; the pros
    and cons are rendered as bullet lists, followed by a one-sentence
    conclusion.

    Args:
        product_name: Name of the product.
        product_category: Human-readable category title.
        pros: List of positive terms (may hold fewer than two, or none).
        cons: List of negative phrases (may be empty).

    Returns:
        The blog post as a single Markdown string.
    """
    summary = generate_product_summary(product_name, product_category)
    features = generate_product_features(product_name)

    # Build the conclusion from whatever is available, so that a product
    # with fewer than two pros or without cons does not raise IndexError.
    highlights = ' and '.join(pros[:2])
    if highlights:
        conclusion = (
            f"In conclusion, the {product_name} offers excellent "
            f"{highlights}"
        )
    else:
        conclusion = (
            f"In conclusion, the {product_name} has no standout strengths "
            f"in the reviews"
        )
    if cons:
        conclusion += (
            f", but keep in mind that it has some drawbacks such as {cons[0]}."
        )
    else:
        conclusion += "."

    blog_post = (
        f"## Product Overview\n\n{summary}\n\n"
        f"## Key Features\n\n{features}\n\n"
        f"## Pros\n\n- " + '\n- '.join(pros) + "\n\n"
        f"## Cons\n\n- " + '\n- '.join(cons) + "\n\n"
        f"## Conclusion\n\n"
        + conclusion
    )

    return blog_post


# Generate blog posts for the top 1 product in each cluster
def _split_terms(value):
    """Turn a pros/cons cell into a clean list of strings.

    Accepts a real list, a comma-separated string, or the textual form of a
    Python list (what a list becomes when written to CSV). Anything else,
    such as the NaN read back from an empty cell, yields an empty list.
    """
    import ast

    if isinstance(value, (list, tuple)):
        return [str(term).strip() for term in value if str(term).strip()]
    if not isinstance(value, str):
        return []

    text = value.strip()
    if text.startswith('['):
        try:
            parsed = ast.literal_eval(text)
        except (ValueError, SyntaxError):
            parsed = None
        if isinstance(parsed, (list, tuple)):
            return [str(term).strip() for term in parsed if str(term).strip()]

    return [part.strip() for part in text.split(',') if part.strip()]


blog_posts = []
for cluster_id in df['final_cluster'].unique():
    # Select the top product from each cluster (sorted by sentiment score)
    top_product = df[df['final_cluster'] == cluster_id].iloc[0]

    product_name = top_product['name']
    product_category = cluster_title_mapping.get(
        cluster_id, "Miscellaneous Products")

    # Get the pros and cons for this product: match on the product name
    # first, then fall back to the entry recorded for the same cluster.
    entry = next(
        (item for item in pros_cons_list if item['product_name'] == product_name), None)
    if entry is None:
        entry = next(
            (item for item in pros_cons_list
             if item.get('cluster') == cluster_id), None)

    if entry is None:
        # Say so instead of dropping the cluster silently.
        print(f"No pros/cons entry found for cluster {cluster_id}; skipping.")
        continue

    pros = _split_terms(entry['pros'])
    cons = _split_terms(entry['cons'])

    # Generate the blog post
    blog_post = generate_blog_post(
        product_name, product_category, pros, cons)

    # Append to blog_posts list
    blog_posts.append({
        'cluster': cluster_id,
        'product_name': product_name,
        'blog_post': blog_post
    })

# Output the blog posts
for post in blog_posts:
    print(f"Cluster {post['cluster']} - {post['product_name']}")
    print(post['blog_post'])
    print()


blog_posts_df = pd.DataFrame(blog_posts)
blog_posts_df.to_csv('generated_blog_posts1.csv', index=False)


# blog_posts_df = pd.DataFrame(blog_posts)
# blog_posts_df.to_csv('../data/generated_blog_posts.csv', index=False)

  df = pd.read_csv(
