In [1]:
import pandas as pd
from datasets import load_dataset

# Load the AG News corpus from the Hugging Face hub and keep its training split.
dataset = load_dataset("ag_news")
train_data = dataset['train']

# Dataset.to_pandas() is the library's own conversion to a DataFrame; it avoids
# iterating the dataset row by row, which is what pd.DataFrame(train_data) does.
df = train_data.to_pandas()


In [2]:
# df = df.groupby('label').apply(lambda x: x.sample(1000)).reset_index(drop=True)


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Bag-of-words baseline: fit TF-IDF on the article text, capping the vocabulary
# at 1000 terms (max_features) so the resulting matrix has 1000 columns.
vectorizer = TfidfVectorizer(max_features=1000)
# fit_transform learns the vocabulary and returns the document-term matrix;
# later cells call .toarray() on it before projecting.
X_tfidf = vectorizer.fit_transform(df['text'])


In [4]:
from sentence_transformers import SentenceTransformer

# Pretrained sentence encoder used to embed every article text.
model = SentenceTransformer('all-MiniLM-L12-v2')
# encode() is documented for a string or a list of strings. It indexes its
# input by integer position, which on a pandas Series is a label lookup and
# therefore only works while the index is exactly 0..n-1. Passing a plain list
# keeps this cell correct even if df is filtered or sampled without a reset.
X_sbert = model.encode(df['text'].tolist(), show_progress_bar=True)


Batches:   0%|          | 0/3750 [00:00<?, ?it/s]

In [6]:
import numpy as np
# Write the SBERT embedding matrix to disk in NumPy's .npy format
# (presumably so it can be reloaded with np.load instead of re-encoding).
np.save("sbert_embeddings.npy", X_sbert)

In [7]:
# Add a human-readable topic name next to each integer label.
# enumerate() pairs the names with ids 0..3 in the order listed here.
df["label_name"] = df["label"].map(
    dict(enumerate(("World", "Sports", "Business", "Sci/Tech")))
)



In [None]:
from pathlib import Path

import kagglehub
import pandas as pd
import plotly.express as px
import plotly.io as pio
import torch
from PIL import Image
from numpy.typing import NDArray
from sklearn import cluster
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.mixture import GaussianMixture
from sklearn.neighbors import NearestNeighbors
from tqdm import tqdm
from transformers import AutoFeatureExtractor, AutoModel
from umap import UMAP
from sklearn.metrics.cluster import adjusted_rand_score

In [None]:
def project_vectors(data: NDArray, technique: str = "tsne", **options) -> NDArray:
    """Project ``data`` with the chosen dimensionality-reduction technique.

    Parameters
    ----------
    data : NDArray
        Input passed unchanged to the reducer's ``fit_transform``.
    technique : str
        One of ``"pca"``, ``"tsne"`` or ``"umap"``.
    **options
        Keyword arguments forwarded to the reducer's constructor
        (for example ``n_components`` or ``random_state``).

    Returns
    -------
    NDArray
        Whatever the reducer's ``fit_transform`` returns for ``data``.

    Raises
    ------
    ValueError
        If ``technique`` is not one of the supported names.
    """
    # Reject unknown names up front, before any reducer is constructed.
    if technique not in ("pca", "tsne", "umap"):
        raise ValueError(
            f"Invalid technique: {technique}. Choose from 'pca', 'tsne', or 'umap'."
        )

    # Factories are wrapped in lambdas so only the selected reducer is built.
    builders = {
        "pca": lambda: PCA(**options),
        "tsne": lambda: TSNE(**options),
        "umap": lambda: UMAP(**options),
    }
    reducer = builders[technique]()
    return reducer.fit_transform(data)

In [None]:
def plot_embeddings(embeddings: NDArray,
                    clustering_results: NDArray,
                    symbol: str = "class",
                    color: str = "clustering_results",
                    reduction_techniques: str = "tsne",
                    classes: "list[str] | None" = None,
                    text: "list[str] | None" = None,
                    plot_3d: bool = False):
    """Scatter-plot embeddings in 2-D or 3-D, reducing their dimensionality if needed.

    Parameters
    ----------
    embeddings : NDArray
        2-D array of shape (n_samples, n_features).
    clustering_results : NDArray
        Per-sample cluster assignment stored in the ``"clustering_results"``
        column. Callers in this notebook also pass a scalar placeholder, which
        the DataFrame constructor repeats for every row.
    symbol : str
        Column of the plotting frame used for the marker symbol.
    color : str
        Column of the plotting frame used for the marker colour.
    reduction_techniques : str
        Technique name forwarded to ``project_vectors``.
    classes : sequence, optional
        Per-sample class labels. Defaults to the notebook-level ``df["label"]``,
        looked up when the function is called.
    text : sequence, optional
        Per-sample text. Defaults to the notebook-level ``df["text"]``, looked
        up when the function is called.
    plot_3d : bool
        Draw a 3-D scatter instead of a 2-D one.

    Raises
    ------
    ValueError
        If ``embeddings`` has fewer feature columns than the plot needs.
    """
    n_plot_dims = 3 if plot_3d else 2
    n_features = embeddings.shape[1]
    if n_features < n_plot_dims:
        # Previously this surfaced later as an IndexError on reduced_embeddings[:, 2].
        raise ValueError(
            f"Need at least {n_plot_dims} embedding dimensions to plot, got {n_features}"
        )

    # Resolve the defaults at call time. Binding df[...] in the signature froze
    # them to whatever df held when the function was defined, so rebuilding or
    # resampling df afterwards silently plotted stale labels.
    if classes is None:
        classes = df["label"]
    if text is None:
        text = df["text"]

    # Only reduce when there are more features than plot axes; input that is
    # already 3-D is plotted as-is in 3-D mode instead of being re-projected.
    if n_features > n_plot_dims:
        reduced_embeddings = project_vectors(embeddings, technique=reduction_techniques,
                                             n_components=n_plot_dims, random_state=6)
        print(f"Reduced embeddings from {n_features} to {reduced_embeddings.shape[1]}")
    else:
        reduced_embeddings = embeddings

    # Named plot_df (not df) so the notebook-level df stays reachable above.
    plot_df = pd.DataFrame({
        "x": reduced_embeddings[:, 0],
        "y": reduced_embeddings[:, 1],
        "class": classes,
        # Kept in the frame; the figure only uses it if color/symbol is "text".
        "text": list(map(str, text)),
        "clustering_results": clustering_results
    })

    if plot_3d:
        plot_df["z"] = reduced_embeddings[:, 2]

    # Categorical dtype makes plotly use discrete colours/symbols for labels.
    plot_df = plot_df.astype({
        "class": "category",
        "clustering_results": "category"
    })

    # NOTE(review): the title says "Image Embeddings" although this notebook
    # embeds news text; left unchanged here, confirm the intended wording.
    title = f"{reduction_techniques} reduction technique. Visualization of Image Embeddings"
    if plot_3d:
        fig = px.scatter_3d(plot_df, x="x", y="y", z="z", color=color, symbol=symbol,
                            title=title)
    else:
        fig = px.scatter(plot_df, x="x", y="y", color=color, symbol=symbol,
                         title=title)

    fig.update_traces(textfont_size=25, marker=dict(size=3))
    fig.update_layout(template="plotly")
    fig.show()


In [None]:
# Earlier direct-plot version, kept for reference (X_tfidf_pca / X_sbert_pca are
# not defined in the cells shown here):
# px.scatter(x=X_tfidf_pca[:,0], y=X_tfidf_pca[:,1], color=df["label_name"], title="TF-IDF PCA").show()
# px.scatter(x=X_sbert_pca[:,0], y=X_sbert_pca[:,1], color=df["label_name"], title="SBERT PCA").show()
# PCA projection of both representations, coloured and marked by class label.
# The positional 0 is a placeholder for clustering_results; the sparse TF-IDF
# matrix is converted to a dense array before being passed in.
plot_embeddings(X_tfidf.toarray(), 0, symbol="class", reduction_techniques="pca", color="class")
plot_embeddings(X_sbert, 0, symbol="class", reduction_techniques="pca", color="class")


Reduced embeddings from 1000 to 2


Reduced embeddings from 384 to 2


In [None]:
# Earlier direct-plot version, kept for reference (X_tfidf_tsne / X_sbert_tsne are
# not defined in the cells shown here):
# px.scatter(x=X_tfidf_tsne[:,0], y=X_tfidf_tsne[:,1], color=df["label_name"], title="TF-IDF TSNE").show()
# px.scatter(x=X_sbert_tsne[:,0], y=X_sbert_tsne[:,1], color=df["label_name"], title="SBERT TSNE").show()
# t-SNE projection of both representations, coloured and marked by class label.
# The positional 0 is a placeholder for clustering_results; the sparse TF-IDF
# matrix is converted to a dense array before being passed in.
plot_embeddings(X_tfidf.toarray(), 0, symbol="class", reduction_techniques="tsne", color="class")
plot_embeddings(X_sbert, 0, symbol="class", reduction_techniques="tsne", color="class")

Reduced embeddings from 1000 to 2


Reduced embeddings from 384 to 2


In [None]:
# Earlier direct-plot version, kept for reference (X_tfidf_umap / X_sbert_umap are
# not defined in the cells shown here):
# px.scatter(x=X_tfidf_umap[:,0], y=X_tfidf_umap[:,1], color=df["label_name"], title="TF-IDF UMAP").show()
# px.scatter(x=X_sbert_umap[:,0], y=X_sbert_umap[:,1], color=df["label_name"], title="SBERT UMAP").show()
# UMAP projection of both representations, coloured and marked by class label.
# The positional 0 is a placeholder for clustering_results; the sparse TF-IDF
# matrix is converted to a dense array before being passed in.
plot_embeddings(X_tfidf.toarray(), 0, symbol="class", reduction_techniques="umap", color="class")
plot_embeddings(X_sbert, 0, symbol="class", reduction_techniques="umap", color="class")


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



Reduced embeddings from 1000 to 2



'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



Reduced embeddings from 384 to 2


In [None]:


def cluster_embeddings(embeddings: NDArray, algorithm_name: str = 'KMeans', **kwargs):
    """
    Fit a clustering estimator on ``embeddings`` and label every row.

    Parameters:
    -----------
    embeddings : numpy.ndarray
        2-D array of shape (n_samples, n_features) to cluster.
    algorithm_name : str
        Either ``'GaussianMixture'`` or the name of an attribute of
        ``sklearn.cluster`` (for example ``'KMeans'``).
    **kwargs :
        Keyword arguments forwarded to the estimator's constructor.

    Returns:
    --------
    labels : numpy.ndarray
        Cluster assignment produced by the estimator for each row.
    model : object
        The fitted estimator instance.

    Raises:
    -------
    TypeError
        If ``embeddings`` is not a numpy array.
    ValueError
        If ``embeddings`` is not 2-D, the algorithm name cannot be resolved,
        or the estimator offers neither ``fit_predict`` nor ``fit`` + ``predict``.
    """
    # Input validation: only 2-D numpy arrays are accepted.
    if not isinstance(embeddings, np.ndarray):
        raise TypeError("Embeddings must be a numpy array")
    if embeddings.ndim != 2:
        raise ValueError(f"Embeddings must be 2D array, got shape {embeddings.shape}")

    # GaussianMixture is imported from sklearn.mixture, so it is special-cased;
    # every other name is looked up on the sklearn.cluster module.
    if algorithm_name != 'GaussianMixture':
        try:
            estimator_cls = getattr(cluster, algorithm_name)
        except AttributeError:
            raise ValueError(f"Algorithm '{algorithm_name}' not found in sklearn.cluster or is not GaussianMixture")
    else:
        estimator_cls = GaussianMixture

    estimator = estimator_cls(**kwargs)

    # Preferred path: a single fit_predict call.
    if hasattr(estimator, 'fit_predict'):
        return estimator.fit_predict(embeddings), estimator

    # Fallback path needs both fit and predict.
    if not (hasattr(estimator, 'fit') and hasattr(estimator, 'predict')):
        raise ValueError(f"Algorithm '{algorithm_name}' does not support required methods")

    estimator.fit(embeddings)
    return estimator.predict(embeddings), estimator
