<a href="https://colab.research.google.com/github/flaviorv/ml_clustering/blob/main/clusterization_tp1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

###Question 2 - Clusterization with K-Means

In [26]:
import kagglehub
import pandas as pd
import numpy as np
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer

# loading the dataset and setting column names
path = kagglehub.dataset_download("lakritidis/product-classification-and-categorization")
dataset = pd.read_csv(path + '/shopmania.csv', header=None)
dataset.columns = ['product_id', 'product', 'category_id', 'category']

# tfidf vectorization
vectorizer = TfidfVectorizer()
vec_product = vectorizer.fit_transform(dataset['product'])

# encoding the nominal categorical feature
enc_category = pd.get_dummies(dataset['category'], drop_first=True, dtype=int).to_numpy()

# concatenating the two preprocessed features
x = hstack([enc_category, vec_product])

In [27]:
from sklearn.cluster import KMeans

# clusterization
k = 10
kmeans = KMeans(n_clusters=k, random_state=42)
dataset['cluster'] = kmeans.fit_predict(x)

In [28]:
# getting the n terms with the highest weight in each cluster
terms = np.array(vectorizer.get_feature_names_out())

def top_terms_per_cluster(kmeans, category_cols_len, n_terms=10):
    for i, center in enumerate(kmeans.cluster_centers_[:, category_cols_len:]):
        top_indices = center.argsort()[::-1][:n_terms]
        print(f"\nCluster {i}:")
        print(", ".join(terms[top_indices]))

print(top_terms_per_cluster(kmeans, enc_category.shape[1]))


Cluster 0:
iphone, plus, case, for, 6s, samsung, galaxy, phone, hybrid, protective

Cluster 1:
accessories, beanie, 00, size, hat, women, 039, unbranded, cap, winter

Cluster 2:
letter, tab, box, pack, 11, white, sheets, binder, legal, file

Cluster 3:
size, girls, junior, bottoms, tops, dresses, mo, dress, skirts, blue

Cluster 4:
accessories, 039, women, size, unbranded, 00, oz, black, belt, scarf

Cluster 5:
men, sleeve, 039, shirt, tops, size, boys, long, short, junior

Cluster 6:
na, bags, women, 039, size, bag, shoulder, leather, handbags, crossbody

Cluster 7:
vibrator, anal, silicone, vibrating, black, cock, fantasy, 039, vibe, plug

Cluster 8:
chair, table, cb2, by, mattress, sofa, with, furniture, black, grey

Cluster 9:
qty, part, no, per, box, jobst, model, knee, toe, 30
None


###Question 5 - Applying PCA to get dimensionality reduction

In [39]:
from sklearn.decomposition import PCA

# transforming category dummies in 2 components
components = 2
pca = PCA(n_components=components)
pca_category = pca.fit_transform(enc_category)

x_pca = hstack([pca_category, vec_product])

# creating clusters with pca components
k = 10
kmeans_pca = KMeans(n_clusters=k, random_state=42)
dataset['pca_cluster'] = kmeans_pca.fit_predict(x_pca)

print(top_terms_per_cluster(kmeans_pca, 1))


Cluster 0:
iphone6, plusbar, casebound, 6sct, for0910, samudra, 00, galaxys, protecto, hybridlx

Cluster 1:
oz06, ml2010, de0240, eau0047, toilettte, of290, casebound, organics, sprayed, fl1060004

Cluster 2:
accessorires, belt0111, womens, size3, 03906, unbreakable, 00, 000, beaniebk, black360

Cluster 3:
underwire, womens, size3, 03906, briefcase, qu, part8, no1, black360, topseller

Cluster 4:
00, jewelrybadger, 000, unbreakable, clothmf, 03906, womens, setlakwe, necklaces, size3

Cluster 5:
mena, 03906, dogcrate, foodie, shortbread, pet618, bag0111, slim2, teeter, cat5

Cluster 6:
00, na00971pp, baguette, womens, 03906, size3, bag0111, shoulderdolly, leathercraft, handblown

Cluster 7:
accessorires, scarfand, 000, unbreakable, womens, size3, 03906, duckie, oz06, grayboe

Cluster 8:
black360, inch101, snowboardjacke, packable, 03906, with10, model1006a, whiteblack, jacketed, red16

Cluster 9:
size3, 00, topseller, girly, boysenberry, juniors, sleeved, mo260, botton, shirts
None


###Question 4 - K-means to vector quantization

In [63]:
from sklearn.datasets import make_blobs

# dataset params
n_samples = 100
n_features = 2
n_clusters = 5

# creating the dataset
x, y_true = make_blobs(n_samples=n_samples, n_features=n_features, centers=n_clusters, random_state=42)

# grouping into clusters
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans.fit(x)

# getting the centroids
centroids = kmeans.cluster_centers_

# new samples
new_points = np.array([[0, 0], [5, 5], [10, 10], [0 ,1], [3, 20]])

# comparing with centróids to grouping the new points too
quantized_labels = kmeans.predict(new_points)

print("New points:\n", new_points)
print("Quantizated labels:", quantized_labels)

New points:
 [[ 0  0]
 [ 5  5]
 [10 10]
 [ 0  1]
 [ 3 20]]
Quantizated labels: [4 2 2 4 0]
