In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import umap
import hdbscan
from sklearn.preprocessing import StandardScaler

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data = pd.read_parquet('../data/marketing_sample_walmart.parq.gzip')

In [3]:
data.head()

Unnamed: 0,Uniq Id,Crawl Timestamp,Product Url,Product Name,Description,List Price,Sale Price,Brand,Item Number,Gtin,Package Size,Category,Postal Code,Available
0,51b010b871cde349bd32159a1cc1a15f,2020-01-24 16:08:36 +0000,https://www.walmart.com/ip/Allegiance-Economy-...,Allegiance Economy Dual-scale Digital Thermometer,We aim to show you accurate product informati...,11.11,11.11,Cardinal Health,,707389636164,,Health | Medicine Cabinet | Thermometers | Dig...,,True
1,d6a7f100e44a626a3701804e99236ad6,2020-01-24 15:54:21 +0000,https://www.walmart.com/ip/Kenneth-Cole-Reacti...,Kenneth Cole Reaction Eau De Parfum Spray For ...,We aim to show you accurate product informati...,23.99,23.99,Kenneth Cole,,191565696101,,Premium Beauty | Premium Fragrance | Premium P...,,True
2,99d2b7da7e3e427a942f864937dacd9d,2020-01-24 18:34:28 +0000,https://www.walmart.com/ip/Kid-Tough-Fitness-I...,Kid Tough Fitness Inflatable Free-Standing Pun...,We aim to show you accurate product informati...,30.76,30.76,BONK FIT,563852139.0,855523007070,,Sports & Outdoors | Outdoor Sports | Hunting |...,,True
3,4c76d170c2c6a759cbce812d790a0b88,2020-01-24 11:08:53 +0000,https://www.walmart.com/ip/THE-FIRST-YEARS/167...,THE FIRST YEARS,We aim to show you accurate product informati...,6.99,6.99,The First Years,553299941.0,71463046263,,Baby | Diapering | Baby Wipes,,True
4,8ac95837dc8baa01e504fd8f633ffaf2,2020-03-10 07:37:21 +0000,https://www.walmart.com/ip/4-Pack-MD-USA-Seaml...,4 Pack - MD USA Seamless Toe-Wave-In Mesh Diab...,We aim to show you accurate product informatio...,28.27,28.27,MD USA,,191897514500,,Health | Diabetes Care | Diabetic Socks,,True


In [4]:
data.shape

(30000, 14)

Many of these URLs are invalid (two years old), so I'm going to treat the `Product Name` as the title that would've been retrieved from URL HTML.  Otherwise, we would fetch the titles and/or actual HTML content.

In [5]:
products = data['Product Name'].to_list()

In [6]:
products[:10]

['Allegiance Economy Dual-scale Digital Thermometer',
 'Kenneth Cole Reaction Eau De Parfum Spray For Women 3.40 Oz',
 'Kid Tough Fitness Inflatable Free-Standing Punching Bag + Machine Washable Fabric Cover South Carolina Gamecocks Kids Workout Buddy by Bonk Fit',
 'THE FIRST YEARS',
 '4 Pack - MD USA Seamless Toe-Wave-In Mesh Diabetic Crew Socks, Black, Medium, 1 Pair',
 'Gerber 2nd Foods Apple Baby Food 4 oz. Tubs 2 Count',
 'Kushies Ultra-Lite All-In-One Form-Fitted Washable Cloth Diapers (Blue Whales, Infant)',
 'sunmark Stop Smoking Aid 14 mg Strength Transdermal Patch, 70677003101 - Box of 14',
 'Berkley PowerBait Glitter Chroma-Glow Dough Fishing Bait',
 'Mikasa Rubber Basketball, Intermediate, 28.5']

In [7]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embeddings = model.encode(products, show_progress_bar=True)

Batches: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 938/938 [02:46<00:00,  5.64it/s]


In [24]:
embeddings.shape

(30000, 384)

In [26]:
red = umap.UMAP(n_components=int(embeddings.shape[1]*.2), metric='cosine')
red_embed = red.fit_transform(embeddings)

In [27]:
sc = StandardScaler()
red_embed = sc.fit_transform(red_embed)

In [28]:
clust = hdbscan.HDBSCAN(min_cluster_size=15, cluster_selection_epsilon=.25)
clust.fit(red_embed)

In [122]:
res = pd.DataFrame({
    'product': products,
    'cluster': clust.labels_
})

In [123]:
res.groupby('cluster').count().sort_values('product', ascending=False)

Unnamed: 0_level_0,product
cluster,Unnamed: 1_level_1
-1,9757
209,410
161,313
62,247
35,241
...,...
0,16
213,16
191,16
261,15


In [129]:
mask = res['cluster'] == 277
res[mask]

Unnamed: 0,product,cluster
290,Swimline Inflatable 36-Inch Classic Rainbow Be...,277
381,Swimline Vinyl Sidebyside Inflatable Pool Floa...,277
641,Swimline Vinyl Ultimate Super-Sized Mattress P...,277
1670,SwimWays Spring Float Graphic Prints - Pink Tr...,277
1758,Swimline Water Wheel Inflatable Pool Toy,277
...,...,...
26073,Vp Components No Float Cleat,277
26567,Swimline Inflatable YOLO Swimming Pool Backyar...,277
28207,Intex Kiddie Pool - Kid\s Summer Sunset Glow D...,277
29798,"Intex Vinyl Sit N Inflatable Tube Pool Float, ...",277


In [130]:
[p for p in res.loc[mask, 'product']]

['Swimline Inflatable 36-Inch Classic Rainbow Beach Ball For Pool/Lake | 90036',
 'Swimline Vinyl Sidebyside Inflatable Pool Float, Purple',
 'Swimline Vinyl Ultimate Super-Sized Mattress Pool Float, Blue',
 'SwimWays Spring Float Graphic Prints - Pink Tropical Hibiscus',
 'Swimline Water Wheel Inflatable Pool Toy',
 'Intex Outdoor Inflatable Family and Kids Swimming Pool Swim Center, Mandarin',
 'Heavy-Duty 4 x 8 Winterizing Air Pillow for Above-Ground Swimming Pools',
 'California Sun Deluxe Oversized Unsinkable Foam Cushion Pool Float',
 'Swimline Giant LED Light-Up Swan Float for Swimming Pools',
 'Summer Waves Large Golden Giant Ride On Swan Inflatable Swimming Pool Float Raft',
 '59" Clear and Blue Inflatable Sea Life 3 Ring Swimming Pool for Children',
 'Inflatable Popsicle Pool Lounge',
 'SwimWays Baby Spring Float Activity Canopy',
 'Intex Palm Leaf Mat Floating Pool Lounge',
 'My Sunshine Animal Split Ring Float',
 'Swim Central 22" Water Sports Underwater Slalom Hoops Course

In [131]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [132]:
tf = TfidfVectorizer()
z = tf.fit_transform([p for p in res.loc[mask, 'product']])

In [133]:
topics = pd.DataFrame(
    z[0].T.todense(),
    index=tf.get_feature_names_out(),
    columns=["TF-IDF"]
).sort_values('TF-IDF',ascending=False)

topics.head()

Unnamed: 0,TF-IDF
36,0.349398
beach,0.349398
90036,0.349398
lake,0.349398
rainbow,0.3181
