# Amazon Review Clustering

import libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import json
import seaborn as sns  
import matplotlib.pyplot as plt 
import pandas as pd
from sklearn import preprocessing
import pickle
import sklearn
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

Load the Dataset

In [2]:
df_data = pd.read_json('/content/drive/MyDrive/Colab Notebooks/data_1pct.json', lines=True, orient='records')

#Removing the unwanted columns
df_new = df_data.loc[:, ['reviewText', 'summary']]

In [3]:
!pip install --user -U nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
print('The nltk version is {}.'.format(nltk.__version__))
print('The scikit-learn version is {}.'.format(sklearn.__version__))

The nltk version is 3.8.1.
The scikit-learn version is 1.2.1.


Download necessary NLTK resources

In [5]:
# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [6]:
# Define stop words
stop_words = set(stopwords.words('english'))

# Define stemmer
stemmer = PorterStemmer()

Preprocessing the Text

In [7]:
def preprocess_text(text):
    # Convert input to string
    text = str(text)
    # Tokenize text into words
    words = word_tokenize(text.lower())
    # Remove stop words and words with length <= 2
    words = [word for word in words if word not in stop_words and len(word) > 2]
    # Stem words
    words = [stemmer.stem(word) for word in words]
    # Join words back into text
    text = ' '.join(words)
    return text
# Apply preprocessing to 'reviewText' and 'summary' columns
df_new['reviewText'] = df_new['reviewText'].apply(preprocess_text)
df_new['summary'] = df_new['summary'].apply(preprocess_text)

Choose and fit the LDA model to the corpus

In [8]:
import gensim
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.models import CoherenceModel

In [9]:

docs = df_new['reviewText'].apply(lambda x: x.split())

# Create a dictionary of the words in the corpus
dictionary = Dictionary(docs)

# Filter out words that appear in less than 10 documents or more than 50% of the documents
dictionary.filter_extremes(no_below=10, no_above=0.5)

# Convert the corpus to a bag of words format
corpus = [dictionary.doc2bow(doc) for doc in docs]

# Choose the number of topics for the LDA model
num_topics = 10

# Fit the LDA model to the corpus
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, passes=10)


Coherence Score:  0.4588494582321574
Topic 0: ['...', 'open', '....', 'oven', 'manual', 'class=', 'reliabl', 'function', '.....', 'ie=utf8']
Topic 1: ['set', 'excel', 'product', 'look', 'perk', 'qualiti', 'order', 'color', 'inch', 'box']
Topic 2: ['knife', 'use', 'handl', 'cut', 'stainless', 'steel', 'well', "n't", 'like', 'blade']
Topic 3: ["n't", 'filter', 'get', 'would', 'like', 'use', 'time', 'review', 'work', 'thing']
Topic 4: ['pan', 'egg', 'use', 'cook', 'make', 'cake', "n't", 'food', 'heat', 'time']
Topic 5: ['love', 'use', 'easi', 'great', 'perfect', 'clean', 'one', 'make', 'recommend', 'size']
Topic 6: ['one', 'year', 'use', 'replac', "n't", 'bought', 'last', 'time', 'purchas', 'buy']
Topic 7: ['great', 'work', 'good', 'well', 'price', 'product', 'qualiti', 'nice', 'expect', 'made']
Topic 8: ['best', 'ever', 'electr', "'ve", 'kitchen', 'poach', 'thing', 'awesom', 'amaz', 'make']
Topic 9: ['coffe', 'cup', 'make', 'water', 'maker', 'pot', 'brew', 'hot', 'mug', 'use']


Model Evaluation

In [None]:
# Evaluate the model's performance using coherence score
coherence_model_lda = CoherenceModel(model=lda_model, texts=docs, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

# Inspect the topics
topics = lda_model.show_topics(num_topics=num_topics, formatted=False)
for i, topic in enumerate(topics):
    print('Topic {}: {}'.format(i, [word[0] for word in topic[1]]))

Write the mode in the pickel file

In [10]:
with open('lda_model.pkl', 'wb') as f:
    pickle.dump(lda_model, f)
    

In [11]:
with open('lda_model.pkl', 'rb') as f:
    lda_model = pickle.load(f)