In [1]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import nltk

# Download the NLTK 'stopwords' corpus; stopwords.words('english') is read
# from it further down when the cluster documents are filtered.
nltk.download('stopwords')

In [2]:
# Read the product -> cluster assignments and preview the first rows.
clusters_csv = '../data_versions/clusters.csv'
df = pd.read_csv(clusters_csv)
df.head()

Unnamed: 0,Description,Cluster
0,WHITE HANGING HEART T-LIGHT HOLDER,21
1,WHITE METAL LANTERN,22
2,CREAM CUPID HEARTS COAT HANGER,23
3,KNITTED UNION FLAG HOT WATER BOTTLE,1
4,RED WOOLLY HOTTIE WHITE HEART.,6


In [3]:
# Construct one "document" per cluster by concatenating the description strings
# of all products that belong to it.
# Only the 'Description' column is aggregated: the frame also carries the
# numeric 'Unnamed: 0' column, and ' '.join() on integers raises TypeError
# (pandas 2.x no longer drops such columns silently during agg).
# The double brackets keep the result a one-column DataFrame indexed by
# 'Cluster', so positional access such as cluster_docs.iloc[i, 0] still works.
cluster_docs = df.groupby('Cluster')[['Description']].agg(lambda x: ' '.join(x))
cluster_docs.head()

Unnamed: 0_level_0,Description
Cluster,Unnamed: 1_level_1
1,KNITTED UNION FLAG HOT WATER BOTTLE HAND WARME...
2,RETROSPOT LARGE MILK JUG SET OF 6 FUNKY BEAKER...
3,WOOD 2 DRAWER CABINET WHITE FINISH WOOD S/3 CA...
4,VINTAGE UNION JACK CUSHION COVER FELTCRAFT CUS...
5,STARS GIFT TAPE PAPER CHAIN KIT 50'S CHRISTMA...


In [10]:
# Rank the terms of every cluster document by how often they occur
# (term frequency, i.e. node degree), ignoring English stop words.
# Result: terms_ranking_per_cluster[i] is a pandas Index holding the distinct
# terms of the i-th row of cluster_docs, most frequent first.
terms_ranking_per_cluster = []
stop_words = set(stopwords.words('english'))
for doc in cluster_docs['Description']:
    # str.split() without a separator collapses runs of whitespace, so
    # consecutive spaces in a description do not produce empty-string "terms"
    # that would be counted and ranked like real words.
    tokens = doc.split()
    # Match against the stop-word set on the lower-cased token, but keep the
    # term in its original spelling.
    filtered = [token for token in tokens if token.lower() not in stop_words]
    # value_counts() orders terms by descending frequency; only that ordered
    # list of terms (its index) is kept. The explicit dtype keeps an empty
    # token list from being inferred as a non-string Series.
    term_counts = pd.Series(filtered, dtype=object).value_counts()
    terms_ranking_per_cluster.append(term_counts.index)

In [11]:
# Label each cluster with its most frequent non-empty term.
# Exactly one entry is appended per cluster so that cluster_labels stays
# position-aligned with terms_ranking_per_cluster: a cluster whose ranking
# holds no usable term gets None instead of being skipped (skipping would
# shift every later label onto the wrong cluster).
cluster_labels = []
for ranking in terms_ranking_per_cluster:
    # Rankings are ordered most-frequent-first, so the first non-empty term
    # wins; empty-string terms (e.g. from splitting on consecutive spaces)
    # are ignored.
    top_term = next((term for term in ranking if len(term) > 0), None)
    cluster_labels.append(top_term)

In [12]:
# Sanity check: number of labels collected (one is expected per cluster).
len(cluster_labels)

26

In [13]:
# Inspect the chosen labels, listed in the order the clusters were processed.
cluster_labels

['MUG',
 'GLASS',
 'DRAWER',
 'CUSHION',
 'WRAP',
 'PINK',
 'SIGN',
 'CLOCK',
 'KEY',
 'MIRROR',
 'WALL',
 'NOTEBOOK',
 'STAND',
 'NUMBER',
 'HOME',
 'BAG',
 'CARD',
 'SET',
 'BOX',
 'NECKLACE',
 'T-LIGHT',
 'CANDLE',
 'HEART',
 'FLOWER',
 'CHRISTMAS',
 'EGG']

In [14]:
# Save the cluster labels in a dataframe, so we can easily join with other
# datasets to attach labels to clusters.
# The ids are taken from cluster_docs' index (the groupby keys the labels were
# derived from, in the same order) rather than a hard-coded 1..26 range, so the
# id -> label mapping stays correct if the number or numbering of clusters
# changes. A length mismatch between ids and labels raises ValueError here.
df_labels = pd.DataFrame(
    {
        'Cluster': cluster_docs.index.to_numpy(),
        'Label': cluster_labels
    }
)

In [16]:
# Preview the first rows of the cluster-id -> label table.
df_labels.head()

Unnamed: 0,Cluster,Label
0,1,MUG
1,2,GLASS
2,3,DRAWER
3,4,CUSHION
4,5,WRAP


In [17]:
# Persist the label table; index=False keeps the row index out of the CSV so
# the file contains only the 'Cluster' and 'Label' columns.
df_labels.to_csv('../data_versions/cluster_labels.csv', index=False)