# Prelimniaries

In [1]:
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
import umap
import hdbscan
from sklearn.preprocessing import StandardScaler

  from .autonotebook import tqdm as notebook_tqdm


# Data

In [2]:
data = pd.read_parquet('../data/marketing_sample_walmart.parq.gzip')

In [3]:
data.head()

Unnamed: 0,Uniq Id,Crawl Timestamp,Product Url,Product Name,Description,List Price,Sale Price,Brand,Item Number,Gtin,Package Size,Category,Postal Code,Available
0,51b010b871cde349bd32159a1cc1a15f,2020-01-24 16:08:36 +0000,https://www.walmart.com/ip/Allegiance-Economy-...,Allegiance Economy Dual-scale Digital Thermometer,We aim to show you accurate product informati...,11.11,11.11,Cardinal Health,,707389636164,,Health | Medicine Cabinet | Thermometers | Dig...,,True
1,d6a7f100e44a626a3701804e99236ad6,2020-01-24 15:54:21 +0000,https://www.walmart.com/ip/Kenneth-Cole-Reacti...,Kenneth Cole Reaction Eau De Parfum Spray For ...,We aim to show you accurate product informati...,23.99,23.99,Kenneth Cole,,191565696101,,Premium Beauty | Premium Fragrance | Premium P...,,True
2,99d2b7da7e3e427a942f864937dacd9d,2020-01-24 18:34:28 +0000,https://www.walmart.com/ip/Kid-Tough-Fitness-I...,Kid Tough Fitness Inflatable Free-Standing Pun...,We aim to show you accurate product informati...,30.76,30.76,BONK FIT,563852139.0,855523007070,,Sports & Outdoors | Outdoor Sports | Hunting |...,,True
3,4c76d170c2c6a759cbce812d790a0b88,2020-01-24 11:08:53 +0000,https://www.walmart.com/ip/THE-FIRST-YEARS/167...,THE FIRST YEARS,We aim to show you accurate product informati...,6.99,6.99,The First Years,553299941.0,71463046263,,Baby | Diapering | Baby Wipes,,True
4,8ac95837dc8baa01e504fd8f633ffaf2,2020-03-10 07:37:21 +0000,https://www.walmart.com/ip/4-Pack-MD-USA-Seaml...,4 Pack - MD USA Seamless Toe-Wave-In Mesh Diab...,We aim to show you accurate product informatio...,28.27,28.27,MD USA,,191897514500,,Health | Diabetes Care | Diabetic Socks,,True


In [4]:
data.shape

(30000, 14)

Many of these URLs are invalid (two years old), so I'm going to treat the `Product Name` as the title that would've been retrieved from URL HTML.  Otherwise, we would fetch the titles and/or actual HTML content.

In [5]:
products = data['Product Name'].to_list()

In [6]:
products[:10]

['Allegiance Economy Dual-scale Digital Thermometer',
 'Kenneth Cole Reaction Eau De Parfum Spray For Women 3.40 Oz',
 'Kid Tough Fitness Inflatable Free-Standing Punching Bag + Machine Washable Fabric Cover South Carolina Gamecocks Kids Workout Buddy by Bonk Fit',
 'THE FIRST YEARS',
 '4 Pack - MD USA Seamless Toe-Wave-In Mesh Diabetic Crew Socks, Black, Medium, 1 Pair',
 'Gerber 2nd Foods Apple Baby Food 4 oz. Tubs 2 Count',
 'Kushies Ultra-Lite All-In-One Form-Fitted Washable Cloth Diapers (Blue Whales, Infant)',
 'sunmark Stop Smoking Aid 14 mg Strength Transdermal Patch, 70677003101 - Box of 14',
 'Berkley PowerBait Glitter Chroma-Glow Dough Fishing Bait',
 'Mikasa Rubber Basketball, Intermediate, 28.5']

# Embed Product Names

In [22]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embeddings = model.encode(products, show_progress_bar=True)

Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 938/938 [02:46<00:00,  5.63it/s]


# Dimensionality Reduction and Clustering

In [74]:
red = umap.UMAP(n_components=int(embeddings.shape[1]*.2), metric='cosine')
red_embed = red.fit_transform(embeddings)

In [75]:
sc = StandardScaler()
red_embed = sc.fit_transform(red_embed)

In [76]:
clust = hdbscan.HDBSCAN(min_cluster_size=5, cluster_selection_epsilon=.25)
clust.fit(red_embed)

In [77]:
res = pd.DataFrame({
    'product': products,
    'cluster': clust.labels_
})

In [78]:
res.groupby('cluster').count().sort_values('product', ascending=False)[:20]

Unnamed: 0_level_0,product
cluster,Unnamed: 1_level_1
-1,8677
100,281
151,246
750,246
5,227
782,165
688,149
682,144
727,142
710,136


In [79]:
clust.condensed_tree_.to_pandas().head()

Unnamed: 0,parent,child,lambda_val,child_size
0,30000,30001,0.020398,29988
1,30000,30002,0.020398,12
2,30001,30003,0.025556,29974
3,30001,30004,0.025556,14
4,30002,11780,65.167837,1


# Explore a Example Cluster

In [80]:
mask = res['cluster'] == 797
res[mask]

Unnamed: 0,product,cluster
2556,"Dial Antibacterial Deodorant Bar Soap, Gold, 4...",797
2874,J.r. Watkins Foaming Hand Soap - Grapefruit - ...,797
2969,"Alaffia Authentic African Black Soap, Tangerin...",797
3301,money soap - it cleans! it brings wealth! real...,797
3316,"Joy 00614 Dishwashing Liquid, Lemon, 12.6 Oz B...",797
...,...,...
28936,(Pack of 4) Dial Antibacterial Liquid Hand Soa...,797
29081,"Mrs. Meyers 1237791 Liquid Hand Soap Refill, B...",797
29550,"Mrs. Meyers Hand Soap Lemon Verbena, 12.5 Flui...",797
29582,Assorted Flip Flop Decorative Hand Soap - Set ...,797


A list makes it easier to read the full product names

In [81]:
[p for p in res.loc[mask, 'product']]

['Dial Antibacterial Deodorant Bar Soap, Gold, 4 Ounce, 12 Bars',
 'J.r. Watkins Foaming Hand Soap - Grapefruit - Pack of 6 - 9 Fl Oz',
 'Alaffia Authentic African Black Soap, Tangerine Citrus 2 oz',
 'money soap - it cleans! it brings wealth! real money in every bar from 1$ to 50$ - 5 oz (141g)',
 'Joy 00614 Dishwashing Liquid, Lemon, 12.6 Oz Bottle, 25/carton',
 'Mrs. Meyers Clean Day Liquid Hand Soap Refill, Lemon Verbena, 33 fl oz',
 'One With Nature Dead Sea Mineral Olive Oil Soap - 7 oz',
 'MICRELL Antibacterial Lotion Soap, Gold, 800 mL Soap Refill for MICRELL Bag-in-Box Push-Style Dispenser (Pack of 6) - 9756-06',
 'Mrs. Meyers Rhubarb Liquid Hand Soap (6x12.5 Oz)',
 'Yardley London Soothing Luxurious Hand Soap, Oatmeal & Almond, 8.4 Oz Bar',
 'Rubbermaid Commercial FG750112 AutoFoam 1,100mL Moisturizing Hand Soap Refill (4-Pack)',
 'Areej 1 LB Clear Glycerin Melt and Pour Soap Base',
 '3 Pack - Dial Anti-Bacterial Hand Soap with Moisturizer, Sweet Watermelon 7.50 oz',
 'Softso

# Find Most Common Words in Cluster

These would be topics.  We're doing a simple frequency analysis (vice TF-IDF) as we expect documents to be similar, thus aren't interested in words that distinguish them from others in the clusters, but rather words that are common within the cluster.

Intuitively, these results make sense.

In [53]:
from collections import Counter
import re

In [83]:
mask = res['cluster'] == 797
bow = re.findall(r'\w+', ''.join([p for p in res.loc[mask, 'product']]))
c = Counter(bow)
c.most_common()[:5]

[('Soap', 87), ('Hand', 40), ('Pack', 24), ('oz', 24), ('Liquid', 24)]

In [84]:
from sklearn.feature_extraction.text import CountVectorizer

In [121]:
c = CountVectorizer(
    strip_accents='ascii',
    stop_words='english',
    ngram_range=(2,3)
)

In [122]:
x = c.fit_transform([p for p in res.loc[mask, 'product']])
x.shape

(98, 1302)

In [123]:
pd.DataFrame(x.toarray(), columns=c.get_feature_names_out()).sum(0).sort_values(ascending=False).head(5)

hand soap           34
liquid hand         15
liquid hand soap    15
fl oz               12
soap refill         10
dtype: int64

------------

In [124]:
red = umap.UMAP(n_components=int(embeddings.shape[1]*.2))
red_embed = red.fit_transform(embeddings)

In [125]:
sc = StandardScaler()
red_embed = sc.fit_transform(red_embed)

In [126]:
clust = hdbscan.HDBSCAN(min_cluster_size=5, cluster_selection_epsilon=.25)
clust.fit(red_embed)

In [127]:
res = pd.DataFrame({
    'product': products,
    'cluster': clust.labels_
})

In [128]:
res.groupby('cluster').count().sort_values('product', ascending=False)[:20]

Unnamed: 0_level_0,product
cluster,Unnamed: 1_level_1
-1,9112
236,237
5,224
97,183
705,172
120,166
783,153
737,142
565,139
110,128


In [136]:
mask = res['cluster'] == 679
res[mask]

Unnamed: 0,product,cluster
3351,Watermelon Cooler Flavor Fountain and Color (4...,679
3738,"Gatorade G2 Cool Blue Sports Drink, 12 Fl. Oz....",679
4302,"Gatorade Thirst Quencher Frost Sports Drink, G...",679
5500,"Gatorade Fierce Intense Melon Sports Drink, 28...",679
8269,"(1 Can) Red Bull Energy Drink, Plum-Twist, 12 ...",679
8633,"(24 Cans) Rockstar Recovery Energy Drink, Lemo...",679
9309,"Powerade Zero Sports Drink, Fruit Punch, 20 Fl...",679
10834,Snowy River Cocktail Sugar Red (1x4oz),679
11109,"POWERADE Strawberry Lemonade Bottle, 28 fl oz",679
11259,(2 pack) Mountain Lightning Energy Drink with ...,679


In [137]:
mask = res['cluster'] == 679
bow = re.findall(r'\w+', ''.join([p.lower() for p in res.loc[mask, 'product']]))
c = Counter(bow)
c.most_common()[:5]

[('fl', 19), ('drink', 18), ('oz', 15), ('energy', 14), ('12', 10)]

In [139]:
c = CountVectorizer(
    strip_accents='ascii',
    stop_words='english',
    ngram_range=(2,3)
)

x = c.fit_transform([p for p in res.loc[mask, 'product']])
pd.DataFrame(x.toarray(), columns=c.get_feature_names_out()).sum(0).sort_values(ascending=False).head(5)

fl oz           19
energy drink    10
sports drink     7
12 fl            5
12 fl oz         5
dtype: int64

In [140]:
mask = res['cluster'] == 21
res[mask]

Unnamed: 0,product,cluster
719,OneTouch Verio Test Strip (100 count)-Box of 100,21
918,Contour Next Blood Glucose Test Strips for Sel...,21
1002,FreeStyle Freedom Lite Blood Glucose Monitorin...,21
1102,Second Generation FIT At Home Colon Cancer Tes...,21
1318,2 Pack - Autolet Impression Lancing Device 1 Each,21
...,...,...
26596,"Domqga Blood Sugar Monitoring, Blood Sugar Tes...",21
26771,Glucocard Vital Test Strip (50 count)-Box of 50,21
26975,4 Pack Onetouch Verio Blood Glucose Monitoring...,21
29199,5 Pack Quality Choice Urinary Tract Infection ...,21


In [143]:
mask = res['cluster'] == 21
bow = re.findall(r'\w+', ''.join([p.lower() for p in res.loc[mask, 'product']]))
c = Counter(bow)
c.most_common()[:5]

[('test', 71), ('strips', 63), ('glucose', 54), ('blood', 53), ('50', 42)]

In [144]:
c = CountVectorizer(
    strip_accents='ascii',
    stop_words='english',
    ngram_range=(1,3)
)

x = c.fit_transform([p for p in res.loc[mask, 'product']])
pd.DataFrame(x.toarray(), columns=c.get_feature_names_out()).sum(0).sort_values(ascending=False).head(5)

test           72
strips         68
test strips    56
glucose        55
blood          54
dtype: int64