In [2]:
import pandas as pd
df = pd.read_csv('reviews.csv')

In [3]:
# check for duplicates in id column
df['id'].duplicated().sum()

13

In [4]:
# list the duplicate ids
df[df['id'].duplicated()]['id']

616    R2YFM3DW9ZPKAV
626    R2RXKVD3AWSP2L
630    R3CLJT1O2XDK28
633    R1SU9GHKMF2PXN
635    R1PXSZE8MH4HCZ
638    R2J2XFTAAEL50V
641    R1MXD3GZKHTAKE
642    R33O38DNX0P4DD
880     R4P0HPOPSRECI
881    R366Y3EP4BFRMK
882     RUPX3V2E7K6B8
883     R5DM8A2UO2Z9N
884    R3SGPS32AFOG6R
Name: id, dtype: object

In [5]:
# drop duplicates
df.drop_duplicates(subset='id', inplace=True)

In [6]:
# view id R2YFM3DW9ZPKAV to see if it was dropped
df[df['id'] == 'R2YFM3DW9ZPKAV']

Unnamed: 0,id,title,body,asin,body_html,link,rating,vine_program,verified_purchase,helpful_votes,...,date.raw,date.utc,profile.name,profile.link,profile.id,profile.image,images,videos,attributes,attributes_flat
66,R2YFM3DW9ZPKAV,Good activity book,Cute pictures and great quality book,593385691,<span>Cute pictures and great quality book</span>,https://www.amazon.ca/gp/customer-reviews/R2YF...,5,False,True,,...,"Reviewed in Canada on March 26, 2024",2024-03-26T00:00:00.000Z,Magskywei,https://www.amazon.ca/gp/profile/amzn1.account...,AEB3ND3DRYZCTFEJK6YAX6UTB5HA,,,,,


In [7]:
# check for null values in body column
df['body'].isnull().sum()

1

In [8]:
#check for null values in title column
df['title'].isnull().sum()

0

In [9]:
df['title_body'] = df['title'] + ' ' + df['body']

In [13]:
# string
docs = df['title_body'].astype(str)

Preprocessing with SPACY

In [14]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [15]:
# tokenize
docs = docs.apply(nlp)

In [16]:
# stop words
from spacy.lang.en.stop_words import STOP_WORDS

# remove stop words
docs = docs.apply(lambda x: [token for token in x if not token.is_stop])

In [17]:
# lemmatize the text

docs = docs.apply(lambda x: [token.lemma_ for token in x])

In [18]:
docs

0      [financial, intelligence, Great, ,, great, rea...
1      [Unlock, psychology, personal, finance, ", psy...
2      [quick, review, decision, make, money, like, b...
3      [wish, read, book, soon, investor, solid, book...
4      [read, book, timeless, lesson, great, insight,...
                             ...                        
877    [product, look, disappointed, quality, product...
878    [worth, money, piece, LEGO, come, piece, build...
879    [terrible, ., gearbot, gear, ., little, Lego, ...
885    [beautiful, book, easy, recipe, beautiful, boo...
886    [great, inspiration, great, meal, inspiration,...
Name: title_body, Length: 874, dtype: object

In [20]:
# remove punctuation and non-alphabetic characters
docs = docs.apply(lambda x: [token for token in x if token.isalpha()])

In [21]:
# lower case
docs = docs.apply(lambda x: [token.lower() for token in x])

In [22]:
from gensim import corpora, models
from gensim.models import CoherenceModel

In [23]:
dictionary = corpora.Dictionary(docs)

In [24]:
corpus = [dictionary.doc2bow(doc) for doc in docs]

Model 

In [25]:
# Calculate coherence scores for different numbers of topics
coherence_scores = []
for num_topics in range(2, 50, 2):
    lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary)
    coherence_model = CoherenceModel(model=lda_model, texts=docs, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model.get_coherence()
    coherence_scores.append(coherence_score)

# Find the number of topics with the highest coherence score
optimal_num_topics = range(2, 50, 2)[coherence_scores.index(max(coherence_scores))]

# Create the LDA model with the optimal number of topics
lda_model = models.LdaModel(corpus, num_topics=optimal_num_topics, id2word=dictionary, passes=15)

In [27]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Prepare the visualization
vis = gensimvis.prepare(lda_model, corpus, dictionary)

# Display the visualization
pyLDAvis.display(vis)

  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()


In [29]:
# coherence_scores of the optimal number of topics
coherence_model = CoherenceModel(model=lda_model, texts=docs, dictionary=dictionary, coherence='c_v')
coherence_score = coherence_model.get_coherence()
print(coherence_score)

0.3247869701008909


In [31]:
# how can we improve the model?
# 1. increase the number of passes
# 2. increase the number of topics
# 3. increase the number of words per topic
# 4. increase the number of documents
# 5. increase the number of words per document
# 6. increase the number of words per topic
# 7. increase the number of words per document
# 8. increase the number of words per topic

# let's try increasing the number of topics
