# Review Topic Modelling with LDA

In [11]:
import re

import matplotlib.pyplot as plt
import nltk
import pandas as pd
from datasets import load_dataset
from nltk.corpus import stopwords
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)


# Load Data

In [2]:
dataset_reviews = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_review_All_Beauty", trust_remote_code=True)
df_reviews = dataset_reviews["full"].to_pandas()

dataset_items = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_meta_All_Beauty", split="full", trust_remote_code=True)
df_items = dataset_items.to_pandas()

In [3]:
df = pd.merge(df_reviews, df_items, on='parent_asin', how='left', suffixes=('_review', '_item'))
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms')
df = df.sort_values('timestamp')
df.head()

Unnamed: 0,rating,title_review,text,images_review,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase,main_category,title_item,average_rating,rating_number,features,description,price,images_item,videos,store,categories,details,bought_together,subtitle,author
594949,5.0,"The best electric toothbrush ever, REALLY!",We have used Oral-B products for 15 years; thi...,[],B000050FDE,B000050FDE,AED2GFGIAJ22PHMZGSKH2CPUF75Q,2000-11-01 04:24:18,10,False,All Beauty,Oral-B Professional Care 1000 Power Toothbrush,3.8,251,[Removes up to 97% of plaque from hard-to-reac...,"[Product Description, The Oral-B Professional ...",,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",Oral-B,[],"{""Brand"": ""Oral-B"", ""Age Range (Description)"":...",,,
590647,2.0,Fine while it's working,"I paid the full... for mine, but had to return...",[],B000050FDE,B000050FDE,AH54X3UMWTAMUJU2CVWYWNZVETLA,2001-01-16 15:10:44,20,False,All Beauty,Oral-B Professional Care 1000 Power Toothbrush,3.8,251,[Removes up to 97% of plaque from hard-to-reac...,"[Product Description, The Oral-B Professional ...",,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",Oral-B,[],"{""Brand"": ""Oral-B"", ""Age Range (Description)"":...",,,
143646,5.0,Over [price] for a toothbrush?? It's worth eve...,"First of all, I am not a dental professional.....",[],B000050AUD,B000050AUD,AHQGZITP7IEYTISSUELRAKFRH3GA,2001-01-30 17:13:34,24,False,All Beauty,Philips Sonicare PL-4 (4700) Sonic Toothbrush,3.4,50,[],"[Product Description, You could have a better ...",,"{'hi_res': [None, None, None, None], 'large': ...","{'title': [], 'url': [], 'user_id': []}",PHILIPS,[],"{""Brand"": ""PHILIPS"", ""Age Range (Description)""...",,,
37437,5.0,Why did I wait so long?,"I admit it, I put off buying the Sonicare beca...",[],B000050AUD,B000050AUD,AEG7T4QNZ2EZEE4QVV6WZB2LXCOQ,2001-03-05 07:27:57,7,False,All Beauty,Philips Sonicare PL-4 (4700) Sonic Toothbrush,3.4,50,[],"[Product Description, You could have a better ...",,"{'hi_res': [None, None, None, None], 'large': ...","{'title': [], 'url': [], 'user_id': []}",PHILIPS,[],"{""Brand"": ""PHILIPS"", ""Age Range (Description)""...",,,
422529,5.0,I wouldn't be without it.......,I purchased this tooth brush about five years ...,[],B000050AUD,B000050AUD,AHC4HE4WV6CUPI4K74KQ7YCPRPWA,2001-03-29 23:38:47,4,False,All Beauty,Philips Sonicare PL-4 (4700) Sonic Toothbrush,3.4,50,[],"[Product Description, You could have a better ...",,"{'hi_res': [None, None, None, None], 'large': ...","{'title': [], 'url': [], 'user_id': []}",PHILIPS,[],"{""Brand"": ""PHILIPS"", ""Age Range (Description)""...",,,


# Pre Processing

In [8]:
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/dariabaikova/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/dariabaikova/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
def pre_process_reviews(df, title_col='title_review', text_col='text', output_col='full_review'):
    df[output_col] = df[title_col].fillna('') + ' ' + df[text_col].fillna('')

    df[output_col] = (
        df[output_col]
        .str.lower()  
        .str.replace(r'[^a-z\s]', '', regex=True)  # Remove special characters
    )

    # tokenize and remove stopwords
    stop_words = set(stopwords.words('english'))
    df[output_col] = df[output_col].str.split().apply(lambda tokens: ' '.join(word for word in tokens if word not in stop_words))

    return df

In [13]:
df = pre_process_reviews(df)

In [14]:
df.full_review.iloc[0]

'best electric toothbrush ever really used oralb products years new model even better stronger yet thinner generates different vibrations around toothbrush head varies according pressure also builtin timer enjoy'

# Reviews Topic Modelling

In [15]:
vectorizer = CountVectorizer(max_features=1000, stop_words='english')
document_term_matrix = vectorizer.fit_transform(df['full_review'])

In [16]:
document_term_matrix

<701528x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 7301171 stored elements in Compressed Sparse Row format>

In [17]:
lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda.fit(document_term_matrix)

In [18]:
def display_topics(model, feature_names, n_top_words):
    for idx, topic in enumerate(model.components_):
        print(f"Topic {idx + 1}:")
        print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
        print()

In [19]:
feature_names = vectorizer.get_feature_names_out()
display_topics(lda, feature_names, 10)

Topic 1:
product dont work money like lashes buy didnt use good

Topic 2:
skin like product use smell face good br great using

Topic 3:
great love product stars easy works use nail nails good

Topic 4:
hair brush use like soft shampoo long hold really great

Topic 5:
good quality nice perfect great like color price cute wig



In [20]:
topic_distribution = lda.transform(document_term_matrix)
df['dominant_topic'] = topic_distribution.argmax(axis=1)

In [21]:
df[['full_review', 'dominant_topic']].head()

Unnamed: 0,full_review,dominant_topic
594949,best electric toothbrush ever really used oral...,1
590647,fine working paid full mine return months late...,0
143646,price toothbrush worth every penny first denta...,3
37437,wait long admit put buying sonicare couldnt ju...,0
422529,wouldnt without purchased tooth brush five yea...,3


In [22]:
df[df.dominant_topic == 4].text.iloc[9]

'Not really an improvement over standard Mach 3 Cartridges, and certainly not worth the price jump. The shave quality still does not match that by an "old-fashioned" safety razor.'

# Reviews by User Topic Modelling

In [23]:
user_text = df.groupby('user_id')['full_review'].apply(lambda x: ' '.join(x)).reset_index()

In [24]:
user_text.head()

Unnamed: 0,user_id,full_review
0,AE222BBOVZIF42YOOPNBXL4UUMYA,five stars great productexcellent price good r...
1,AE222FP7YRNFCEQ2W3ZDIGMSYTLQ,nice consistency great smell videoidbaeecebadff
2,AE222X475JC6ONXMIKZDFGQ7IAUA,wow tastes good
3,AE222Y4WTST6BUZ4J5Y2H6QMBITQ,lensoclean unit cleaning unit good job cleanin...
4,AE2232TEZOEWQLAFEX2NA6VBGMYQ,sus colores son como en la foto


In [25]:
document_term_matrix_user = vectorizer.fit_transform(user_text['full_review'])

In [26]:
lda_user = LatentDirichletAllocation(n_components=5, random_state=42)
lda_user.fit(document_term_matrix_user)

* topic 1 seems to revolve around negative experiences
* topic 2 revolves around sincare
* topic 3 seems related to naisl and nail polish
* topic 4 revolves around hair and hair products
* topic 5 seems mostly positive, some words related to acessories

In [27]:
display_topics(lda_user, vectorizer.get_feature_names_out(), 20)


Topic 1:
product dont money work lashes like buy didnt good use time worth waste doesnt disappointed used star came got cheap

Topic 2:
skin like product use br face smell good using great really used scent smells ive love oil feel best im

Topic 3:
great love product stars easy works use good nail nails amazing polish recommend awesome price really color loves loved excellent

Topic 4:
hair brush use like soft shampoo really long hold dont great used doesnt love easy time good fine im ive

Topic 5:
good quality nice great perfect like color cute price wig pretty little small size really colors love brushes beautiful brush

