In [19]:
# !pip install bertopic



In [2]:
import pandas as pd
import numpy as np
import seaborn as sns

In [3]:
import nltk
nltk.download('stopwords')
from umap import UMAP # Dimension reduction
from hdbscan import HDBSCAN # Clustering
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
import os
import json
import gzip
from urllib.request import urlopen

In [5]:
def open_to_df(file):
  data = []
  with gzip.open(file) as f:
      for l in f:
          data.append(json.loads(l.strip()))

  df = pd.DataFrame.from_dict(data)
  return df

In [6]:
!wget https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2/metaFiles2/meta_Appliances.json.gz

--2023-12-09 06:17:32--  https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2/metaFiles2/meta_Appliances.json.gz
Resolving datarepo.eng.ucsd.edu (datarepo.eng.ucsd.edu)... 132.239.8.30
Connecting to datarepo.eng.ucsd.edu (datarepo.eng.ucsd.edu)|132.239.8.30|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 59884788 (57M) [application/x-gzip]
Saving to: ‘meta_Appliances.json.gz.3’


2023-12-09 06:17:32 (201 MB/s) - ‘meta_Appliances.json.gz.3’ saved [59884788/59884788]



In [7]:
metadata = open_to_df('meta_Appliances.json.gz')
metadata.head()

Unnamed: 0,category,tech1,description,fit,title,also_buy,tech2,brand,feature,rank,also_view,details,main_cat,similar_item,date,price,asin,imageURL,imageURLHighRes
0,"[Appliances, Refrigerators, Freezers & Ice Mak...","class=""a-keyvalue prodDetTable"" role=""present...",[],,Tupperware Freezer Square Round Container Set ...,[],,Tupperware,[Each 3-pc. set includes two 7/8-cup/200 mL an...,"[>#39,745 in Appliances (See top 100)]",[],{},Appliances,,"November 19, 2008",,7301113188,[],[]
1,"[Appliances, Refrigerators, Freezers & Ice Mak...","class=""a-keyvalue prodDetTable"" role=""present...",[2 X Tupperware Pure & Fresh Unique Covered Co...,,2 X Tupperware Pure &amp; Fresh Unique Covered...,[],,Tupperware,[2 X Tupperware Pure & Fresh Unique Covered Co...,"[>#6,118 in Appliances (See top 100)]",[B004RUGHJW],{},Appliances,,"June 5, 2016",$3.62,7861850250,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...
2,"[Appliances, Parts &amp; Accessories]",,[],,The Cigar - Moments of Pleasure,[],,The Cigar Book,[],"[>#1,861,816 in Home &amp; Kitchen (See Top 10...","[B01HCAVSLK, 1632206579]",{},Amazon Home,,,$150.26,8792559360,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...
3,"[Appliances, Parts & Accessories]","class=""a-keyvalue prodDetTable"" role=""present...","[Multi purpost descaler, especially suited to ...",,Caraselle 2X 50G Appliance Descalene,[],,Caraselle,[],"[>#1,654,505 in Tools & Home Improvement (See ...",[],{},Tools & Home Improvement,,"December 17, 2014",.a-box-inner{background-color:#fff}#alohaBuyBo...,9792954481,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...
4,"[Appliances, Parts & Accessories, Range Parts ...","class=""a-keyvalue prodDetTable"" role=""present...",[Full gauge and size beveled-edge; furnished w...,,EATON Wiring 39CH-SP-L Arrow Hart 1-Gang Chrom...,[],,EATON Wiring,[Returns will not be honored on this closeout ...,"[>#3,066,990 in Tools & Home Improvement (See ...",[],{},Tools & Home Improvement,,"January 16, 2007",$3.43,B00002N5EL,[],[]


In [8]:
!wget https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2/categoryFiles/Appliances.json.gz

--2023-12-09 06:17:43--  https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2/categoryFiles/Appliances.json.gz
Resolving datarepo.eng.ucsd.edu (datarepo.eng.ucsd.edu)... 132.239.8.30
Connecting to datarepo.eng.ucsd.edu (datarepo.eng.ucsd.edu)|132.239.8.30|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 69677301 (66M) [application/x-gzip]
Saving to: ‘Appliances.json.gz.3’


2023-12-09 06:17:43 (247 MB/s) - ‘Appliances.json.gz.3’ saved [69677301/69677301]



In [9]:
userrating = open_to_df('Appliances.json.gz')
userrating = userrating.iloc[35:, :] # removing erroneous data
userrating.head(3)

Unnamed: 0,overall,vote,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,image
35,5.0,,True,"03 20, 2015",A3SHVDMM83IHJ4,B00002N7IL,{'Size:': ' Pack of 1'},steve crumpler,Just what I needed for my electric range. Matc...,Matched pigtail cord and works great.,1426809600,
36,5.0,,True,"04 20, 2013",A2OXDRWBASV91Y,B00004SQHD,"{'Size:': ' 6 ft', 'Style:': ' (Old)'}",Harold E. Ewing,I like the fact that the wire ends have mounti...,complete package,1366416000,
37,5.0,2.0,True,"03 16, 2013",A2KG6AWJSWILPR,B00004SQHD,"{'Size:': ' 6 ft', 'Style:': ' (Old)'}",Christopher J Park,"Needed another couple of feet with new dryer, ...",Perfect Fit,1363392000,


In [49]:
completedata = pd.merge(userrating, metadata, on='asin', how = 'inner')
completedata.head(5)

Unnamed: 0,overall,vote,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,...,feature,rank,also_view,details,main_cat,similar_item,date,price,imageURL,imageURLHighRes
0,5.0,,True,"03 20, 2015",A3SHVDMM83IHJ4,B00002N7IL,{'Size:': ' Pack of 1'},steve crumpler,Just what I needed for my electric range. Matc...,Matched pigtail cord and works great.,...,"[Fits all wallplates with 2.15"" diameter cente...","[>#411,187 in Tools & Home Improvement (See to...","[B00002N7HY, B074Q2KZGV, B000FPCL2K, B000FP8LK...",{},Tools & Home Improvement,"class=""a-bordered a-horizontal-stripes a-spa...","July 7, 2004",,[],[]
1,5.0,,True,"03 20, 2015",A3SHVDMM83IHJ4,B00002N7IL,{'Size:': ' Pack of 1'},steve crumpler,Just what I needed for my electric range. Matc...,Matched pigtail cord and works great.,...,"[Fits all wallplates with 2.15"" diameter cente...","[>#411,187 in Tools & Home Improvement (See to...","[B00002N7HY, B074Q2KZGV, B000FPCL2K, B000FP8LK...",{},Tools & Home Improvement,"class=""a-bordered a-horizontal-stripes a-spa...","July 7, 2004",,[],[]
2,5.0,,True,"03 26, 2018",A3TIWHNJXMSIU7,B00002N7IL,{'Size:': ' Pack of 1'},Torpex,this particular type is perfect and easily ada...,this particular type is perfect and easily ada...,...,"[Fits all wallplates with 2.15"" diameter cente...","[>#411,187 in Tools & Home Improvement (See to...","[B00002N7HY, B074Q2KZGV, B000FPCL2K, B000FP8LK...",{},Tools & Home Improvement,"class=""a-bordered a-horizontal-stripes a-spa...","July 7, 2004",,[],[]
3,5.0,,True,"03 26, 2018",A3TIWHNJXMSIU7,B00002N7IL,{'Size:': ' Pack of 1'},Torpex,this particular type is perfect and easily ada...,this particular type is perfect and easily ada...,...,"[Fits all wallplates with 2.15"" diameter cente...","[>#411,187 in Tools & Home Improvement (See to...","[B00002N7HY, B074Q2KZGV, B000FPCL2K, B000FP8LK...",{},Tools & Home Improvement,"class=""a-bordered a-horizontal-stripes a-spa...","July 7, 2004",,[],[]
4,5.0,,True,"03 20, 2018",AVP16JFIT6LPL,B00002N7IL,{'Size:': ' Pack of 1'},RAFAEL FERNANDEZ,excellent,Five Stars,...,"[Fits all wallplates with 2.15"" diameter cente...","[>#411,187 in Tools & Home Improvement (See to...","[B00002N7HY, B074Q2KZGV, B000FPCL2K, B000FP8LK...",{},Tools & Home Improvement,"class=""a-bordered a-horizontal-stripes a-spa...","July 7, 2004",,[],[]


In [50]:
missing_values = completedata['reviewText'].isnull().sum()
print("Number of missing values in 'reviewText':", missing_values)
completedata['reviewText'].fillna('', inplace=True)

Number of missing values in 'reviewText': 325


In [64]:
data = completedata.copy()

In [65]:
completedata = data.copy()

In [52]:
data.value_counts(['title'])

title                                                                                                                                      
Samsung Genuine DA29-00020B Refrigerator Water Filter, 3 Pack                                                                                  7404
General Electric MWF Refrigerator Water Filter                                                                                                 6510
 Gardus RLE202 LintEater Rotary Dryer Vent Cleaning System                                                                                     4048
GE MWF SmartWater Compatible Water Filter Cartridge - Refrigerator                                                                             4045
Broan 413004 ADA Capable Non-Ducted Under-Cabinet Range Hood, 30-Inch, Stainless Steel                                                         3056
                                                                                                                        

In [53]:
completedata = completedata.iloc[:10000, :]

In [48]:
# NLTK English stopwords
stopwords = nltk.corpus.stopwords.words('english')

In [24]:
len(stopwords)

179

In [25]:
amazon_rel = ['amazon', 'buy', 'deliver', 'pack', 'also']

In [26]:
stopwords.extend(amazon_rel)
print(f'There are {len(stopwords)} stopwords.')

There are 184 stopwords.


In [33]:
# Initiate UMAP
umap_model = UMAP(n_neighbors=15,
                  n_components=5,
                  min_dist=0.0,
                  metric='cosine',
                  random_state=100)

# Count vectorizer
vectorizer_model = CountVectorizer(stop_words=stopwords)

In [28]:
missing_values = completedata['reviewText'].isnull().sum()
print("Number of missing values in 'reviewText':", missing_values)
completedata['reviewText'].fillna('', inplace=True)

Number of missing values in 'reviewText': 0


In [47]:
topic_model = BERTopic(umap_model=umap_model,
                       vectorizer_model=vectorizer_model,
                       min_topic_size=200,
                       top_n_words=10,
                       language="multilingual",
                       calculate_probabilities=True)

topics = topic_model.fit_transform(completedata['reviewText'])
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1815,-1_great_good_love_knobs,"[great, good, love, knobs, perfect, nice, exac...","[Love the color..and made of good quality., Gr..."
1,0,2077,0_vent_dryer_air_lint,"[vent, dryer, air, lint, house, cold, one, roo...","[During a recent cold-snap (5 degrees), I real..."
2,1,1690,1_filter_humidifier_filters_holmes,"[filter, humidifier, filters, holmes, one, rep...",[I purchased this replacement filter for my Ho...
3,2,1525,2_stove_covers_burner_kitchen,"[stove, covers, burner, kitchen, look, burners...","[well made. great stove covers., Love my stove..."
4,3,1125,3_price_product_great_good,"[price, product, great, good, works, easy, ite...","[Good product and good price., Good product an..."
5,4,748,4_cord_dryer_needed_longer,"[cord, dryer, needed, longer, washer, one, out...",[We had just moved from a house into an older ...
6,5,666,5_works_fit_great_worked,"[works, fit, great, worked, perfect, perfectly...","[Works great!, Works great!, works great]"
7,6,354,6_machine_basket_washer_use,"[machine, basket, washer, use, washing, dishwa...","[We were skeptical about portable washers, but..."


In [66]:
from collections import defaultdict

# Create a dictionary to store topic information for each product title
topic_info_per_title = defaultdict()

# Get unique product titles
unique_titles = completedata['title'].unique()

# Set the minimum number of samples required for clustering
min_samples_for_clustering = 50  # Adjust as needed

# Loop through each product title
for title in unique_titles:
    # Filter data for the current title
    title_data = completedata[completedata['title'] == title]

    # Check if there are enough samples for clustering
    if len(title_data) >= min_samples_for_clustering:
        try:
            # Perform topic modeling for the reviews of the current title
            topic_model = BERTopic(umap_model=umap_model,
                                   vectorizer_model=vectorizer_model,
                                   min_topic_size=200,
                                   top_n_words=10,
                                   language="multilingual",
                                   calculate_probabilities=True)

            # Fit and transform the review text for the current title
            topics = topic_model.fit_transform(title_data['reviewText'])

            # Get topic information for the current title and store it in the dictionary
            topic_info_per_title[title] = topic_model.get_topic_info()
        except Exception as e:
            print(f"An error occurred for title '{title}': {str(e)}. Skipped.")
    else:
        print(f"Not enough data points for title '{title}'. Skipped.")


Not enough data points for title 'Leviton 5206 50 Amp, 125/250 Volt, NEMA 10-50R, 3P, 3W, Flush Mounting Receptacle, Straight Blade, Industrial Grade, Non-Grounding, Side Wired, Steel Strap, Black'. Skipped.
An error occurred for title 'Coleman Cable 09045 5-Foot Range Cord': k must be less than or equal to the number of training points. Skipped.
Not enough data points for title 'Dundas Jafine MLFVK48E Semi-Rigid Aluminum Dryer Vent Kit'. Skipped.
An error occurred for title 'Protec DynaFilter Humidifier Cartridge, Air Cleaning, 3 ct.': k must be less than or equal to the number of training points. Skipped.
An error occurred for title 'Sanyo BC1206 Kegerator Beer Cooler': k must be less than or equal to the number of training points. Skipped.
An error occurred for title 'HONEYWELL HAC-504V1 Humidifier Filter Pad': k must be less than or equal to the number of training points. Skipped.
An error occurred for title 'RANGE KLEEN RGP-300 Chrome Range Round Pan/Orange Label (8.26&quot;)': k 

In [76]:
for title, topic_info in topic_info_per_title.items():
    print(f"Title: {title}")
    print("Topic Information:")
    print(topic_info)
    print("\n")

Title: Dundas Jafine CHK100ZW CHK100ZW6 Vents, 4-Inch, White
Topic Information:
   Topic  Count                     Name  \
0     -1    522  -1_air_heat_dryer_house   

                                      Representation  \
0  [air, heat, dryer, house, lint, works, great, ...   

                                 Representative_Docs  
0  [This item does what it's intended for, and do...  


Title: Holmes &quot;C&quot; Humidifier Filter, HWF65PDQ-U
Topic Information:
   Topic  Count                                 Name  \
0     -1    500  -1_filter_humidifier_filters_holmes   

                                      Representation  \
0  [filter, humidifier, filters, holmes, works, o...   

                                 Representative_Docs  
0  [Works great to replace the original filter fo...  


Title: Holmes &quot;A&quot; Humidifier Filter, HWF62
Topic Information:
   Topic  Count                             Name  \
0     -1    543        -1_works_holmes_great_fit   
1      0    720