# Prelimniaries

In [1]:
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
import umap
import hdbscan
from sklearn.preprocessing import StandardScaler

  from .autonotebook import tqdm as notebook_tqdm


# Data

In [2]:
data = pd.read_parquet('../data/marketing_sample_walmart.parq.gzip')

In [3]:
data.head()

Unnamed: 0,Uniq Id,Crawl Timestamp,Product Url,Product Name,Description,List Price,Sale Price,Brand,Item Number,Gtin,Package Size,Category,Postal Code,Available
0,51b010b871cde349bd32159a1cc1a15f,2020-01-24 16:08:36 +0000,https://www.walmart.com/ip/Allegiance-Economy-...,Allegiance Economy Dual-scale Digital Thermometer,We aim to show you accurate product informati...,11.11,11.11,Cardinal Health,,707389636164,,Health | Medicine Cabinet | Thermometers | Dig...,,True
1,d6a7f100e44a626a3701804e99236ad6,2020-01-24 15:54:21 +0000,https://www.walmart.com/ip/Kenneth-Cole-Reacti...,Kenneth Cole Reaction Eau De Parfum Spray For ...,We aim to show you accurate product informati...,23.99,23.99,Kenneth Cole,,191565696101,,Premium Beauty | Premium Fragrance | Premium P...,,True
2,99d2b7da7e3e427a942f864937dacd9d,2020-01-24 18:34:28 +0000,https://www.walmart.com/ip/Kid-Tough-Fitness-I...,Kid Tough Fitness Inflatable Free-Standing Pun...,We aim to show you accurate product informati...,30.76,30.76,BONK FIT,563852139.0,855523007070,,Sports & Outdoors | Outdoor Sports | Hunting |...,,True
3,4c76d170c2c6a759cbce812d790a0b88,2020-01-24 11:08:53 +0000,https://www.walmart.com/ip/THE-FIRST-YEARS/167...,THE FIRST YEARS,We aim to show you accurate product informati...,6.99,6.99,The First Years,553299941.0,71463046263,,Baby | Diapering | Baby Wipes,,True
4,8ac95837dc8baa01e504fd8f633ffaf2,2020-03-10 07:37:21 +0000,https://www.walmart.com/ip/4-Pack-MD-USA-Seaml...,4 Pack - MD USA Seamless Toe-Wave-In Mesh Diab...,We aim to show you accurate product informatio...,28.27,28.27,MD USA,,191897514500,,Health | Diabetes Care | Diabetic Socks,,True


In [4]:
data.shape

(30000, 14)

Many of these URLs are invalid (two years old), so I'm going to treat the `Product Name` as the title that would've been retrieved from URL HTML.  Otherwise, we would fetch the titles and/or actual HTML content.

In [5]:
products = data['Product Name'].to_list()

In [6]:
products[:10]

['Allegiance Economy Dual-scale Digital Thermometer',
 'Kenneth Cole Reaction Eau De Parfum Spray For Women 3.40 Oz',
 'Kid Tough Fitness Inflatable Free-Standing Punching Bag + Machine Washable Fabric Cover South Carolina Gamecocks Kids Workout Buddy by Bonk Fit',
 'THE FIRST YEARS',
 '4 Pack - MD USA Seamless Toe-Wave-In Mesh Diabetic Crew Socks, Black, Medium, 1 Pair',
 'Gerber 2nd Foods Apple Baby Food 4 oz. Tubs 2 Count',
 'Kushies Ultra-Lite All-In-One Form-Fitted Washable Cloth Diapers (Blue Whales, Infant)',
 'sunmark Stop Smoking Aid 14 mg Strength Transdermal Patch, 70677003101 - Box of 14',
 'Berkley PowerBait Glitter Chroma-Glow Dough Fishing Bait',
 'Mikasa Rubber Basketball, Intermediate, 28.5']

# Embed Product Names

In [7]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embeddings = model.encode(products, show_progress_bar=True)

Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 938/938 [00:10<00:00, 88.66it/s]


# Dimensionality Reduction and Clustering

In [8]:
red = umap.UMAP(n_components=int(embeddings.shape[1]*.2), metric='cosine')
red_embed = red.fit_transform(embeddings)

In [9]:
sc = StandardScaler()
red_embed = sc.fit_transform(red_embed)

In [10]:
clust = hdbscan.HDBSCAN(min_cluster_size=5, cluster_selection_epsilon=.25)
clust.fit(red_embed)

In [11]:
res = pd.DataFrame({
    'product': products,
    'cluster': clust.labels_
})

In [12]:
res.groupby('cluster').count().sort_values('product', ascending=False)[:20]

Unnamed: 0_level_0,product
cluster,Unnamed: 1_level_1
-1,9208
187,244
8,227
837,133
823,121
7,120
182,114
710,113
577,111
100,103


In [13]:
clust.condensed_tree_.to_pandas().head()

Unnamed: 0,parent,child,lambda_val,child_size
0,30000,30001,0.019493,29984
1,30000,30002,0.019493,16
2,30001,30003,0.022214,29970
3,30001,30004,0.022214,14
4,30002,15352,1.917264,1


# Explore a Example Cluster

In [None]:
#mask = res['cluster'] == 797
print(res[mask])

A list makes it easier to read the full product names

In [15]:
[p for p in res.loc[mask, 'product']]

['Live Clean Shampoo And Wash - Tearless - Baby - 10 Fl Oz',
 'Pipette Baby Shampoo & Wash with Calming Aromas and Plant-Based Squalane, 12 fl oz',
 'Babyses Tearless pH Shampoo, 250ml',
 '4 Pack - JOHNSONS Calming Baby Shampoo with Soothing NaturalCalm Scent 13.6 oz',
 '(2 Pack) Batiste Dry Shampoo, Tropical Fragrance, Mini 1.6 fl. oz.',
 'Aloe VestaÂ 2-n-1 Body Wash and Shampoo-Size: 8 oz - UOM = Each 1',
 '4 Pack - Burts Bees Baby Shampoo & Wash, Original 21 oz',
 '4 Pack - JOHNSONS Head-To-Toe Gentle Tear- Free Baby Wash & Shampoo for Babys Sensitive Skin 1.70 oz',
 'Batiste Dry Shampoo, Fresh Fragrance, 6.73 fl. oz.',
 'Babyganics Gentle Shampoo & Body Wash, Chamomile Verbena, 16 fl oz',
 '2 Pack - AVEENO Baby Wash and Shampoo 8 oz',
 'Aussie Kids 3 in 1 Shampoo, Conditioner & Body Wash, Coral Reef Cupcake, 26.2 Fl Oz',
 'Johnsons CottonTouch Newborn Baby Wash & Shampoo, 6.8 fl. oz',
 'Ricitos de Oro Hypoallergenic Honey Baby Shampoo 400ml - Champu de Miel para Bebe (Pack of 6)',


# Find Most Common Words in Cluster

These would be topics.  We're doing a simple frequency analysis (vice TF-IDF) as we expect documents to be similar, thus aren't interested in words that distinguish them from others in the clusters, but rather words that are common within the cluster.

Intuitively, these results make sense.

In [16]:
from collections import Counter
import re

In [17]:
mask = res['cluster'] == 797
bow = re.findall(r'\w+', ''.join([p for p in res.loc[mask, 'product']]))
c = Counter(bow)
c.most_common()[:5]

[('Shampoo', 17), ('Wash', 10), ('Baby', 9), ('oz', 8), ('and', 6)]

In [18]:
from sklearn.feature_extraction.text import CountVectorizer

In [19]:
c = CountVectorizer(
    strip_accents='ascii',
    stop_words='english',
    ngram_range=(2,3)
)

In [20]:
x = c.fit_transform([p for p in res.loc[mask, 'product']])
x.shape

(19, 272)

In [21]:
pd.DataFrame(x.toarray(), columns=c.get_feature_names_out()).sum(0).sort_values(ascending=False).head(5)

fl oz                7
baby shampoo         5
wash shampoo         4
baby wash shampoo    3
shampoo wash         3
dtype: int64

------------

In [22]:
red = umap.UMAP(n_components=int(embeddings.shape[1]*.2))
red_embed = red.fit_transform(embeddings)

In [23]:
sc = StandardScaler()
red_embed = sc.fit_transform(red_embed)

In [24]:
clust = hdbscan.HDBSCAN(min_cluster_size=5, cluster_selection_epsilon=.25)
clust.fit(red_embed)

KeyboardInterrupt: 

In [None]:
res = pd.DataFrame({
    'product': products,
    'cluster': clust.labels_
})

In [None]:
res.groupby('cluster').count().sort_values('product', ascending=False)[:20]

In [None]:
mask = res['cluster'] == 679
res[mask]

In [None]:
mask = res['cluster'] == 679
bow = re.findall(r'\w+', ''.join([p.lower() for p in res.loc[mask, 'product']]))
c = Counter(bow)
c.most_common()[:5]

In [None]:
c = CountVectorizer(
    strip_accents='ascii',
    stop_words='english',
    ngram_range=(2,3)
)

x = c.fit_transform([p for p in res.loc[mask, 'product']])
pd.DataFrame(x.toarray(), columns=c.get_feature_names_out()).sum(0).sort_values(ascending=False).head(5)

In [None]:
mask = res['cluster'] == 21
res[mask]

In [None]:
mask = res['cluster'] == 21
bow = re.findall(r'\w+', ''.join([p.lower() for p in res.loc[mask, 'product']]))
c = Counter(bow)
c.most_common()[:5]

In [None]:
c = CountVectorizer(
    strip_accents='ascii',
    stop_words='english',
    ngram_range=(1,3)
)

x = c.fit_transform([p for p in res.loc[mask, 'product']])
pd.DataFrame(x.toarray(), columns=c.get_feature_names_out()).sum(0).sort_values(ascending=False).head(5)