In [18]:
from datasets import load_dataset

dataset = load_dataset("ag_news")
train_data = dataset['train']

import pandas as pd
df = pd.DataFrame(train_data)


In [38]:
from sklearn.model_selection import train_test_split

df = df.groupby('label').apply(lambda x: x.sample(1250)).reset_index(drop=True)
X = df['text']
y = df['label']





In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=1000)
X_tfidf = vectorizer.fit_transform(X)


In [None]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L12-v2')
X_sbert = model.encode(X, show_progress_bar=True)


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

In [23]:
import numpy as np
#np.save("sbert_embeddings.npy", X_sbert)

In [24]:
df["label_name"] = df["label"].map({
    0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"
})



In [25]:
from pathlib import Path

import kagglehub
import pandas as pd
import plotly.express as px
import plotly.io as pio
import torch
from PIL import Image
from numpy.typing import NDArray
from sklearn import cluster
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.mixture import GaussianMixture
from sklearn.neighbors import NearestNeighbors
from tqdm import tqdm
from transformers import AutoFeatureExtractor, AutoModel
from umap import UMAP
from sklearn.metrics.cluster import adjusted_rand_score

In [30]:
def project_vectors(data: NDArray, technique: str = "tsne", **options) -> NDArray:
    if technique == "pca":
        transformer = PCA(**options)
    elif technique == "tsne":
        transformer = TSNE(**options)
    elif technique == "umap":
        transformer = UMAP(**options)
    else:
        raise ValueError(
            f"Invalid technique: {technique}. Choose from 'pca', 'tsne', or 'umap'."
        )

    transformed_data = transformer.fit_transform(data)
    return transformed_data

In [35]:
def plot_embeddings(embeddings: NDArray,
                    clustering_results: NDArray,
                    symbol: str = "class",
                    color: str = "clustering_results",
                    reduction_techniques: str = "tsne",
                    classes: list[str] = y,
                    text: list[str] = X,
                    plot_3d: bool = False):
    if embeddings.shape[1] > 2:
        reduced_embeddings = project_vectors(embeddings, technique=reduction_techniques,
                                             n_components=3 if plot_3d else 2, random_state=6)
        print(f"Reduced embeddings from {embeddings.shape[1]} to {reduced_embeddings.shape[1]}")
    else:
        reduced_embeddings = embeddings

    df = pd.DataFrame({
        "x": reduced_embeddings[:, 0],
        "y": reduced_embeddings[:, 1],
        "class": classes,
        "text": list(map(str, text)),
        "clustering_results": clustering_results
    })

    if plot_3d:
        df["z"] = reduced_embeddings[:, 2]

    df = df.astype({
        "class": "category",
        "clustering_results": "category"
    })
    if plot_3d:
        fig = px.scatter_3d(df, x="x", y="y", z="z", color=color, symbol=symbol,
                            title=f"{reduction_techniques} reduction technique. Visualization of Image Embeddings")
    else:
        fig = px.scatter(df, x="x", y="y", color=color, symbol=symbol,
                         title=f"{reduction_techniques} reduction technique. Visualization of Image Embeddings")

    fig.update_traces(textfont_size=25, marker=dict(size=3))
    fig.update_layout(template="plotly")
    fig.show()


In [36]:
plot_embeddings(X_tfidf.toarray(), 0, symbol="class", reduction_techniques="pca", color="class")
plot_embeddings(X_sbert, 0, symbol="class", reduction_techniques="pca", color="class")


Reduced embeddings from 1000 to 2


Reduced embeddings from 384 to 2


In [37]:
plot_embeddings(X_tfidf.toarray(), 0, symbol="class", reduction_techniques="tsne", color="class")
plot_embeddings(X_sbert, 0, symbol="class", reduction_techniques="tsne", color="class")

Reduced embeddings from 1000 to 2


Reduced embeddings from 384 to 2


In [58]:
X_umap_tfidf = project_vectors(X_tfidf.toarray(), technique="umap", n_components=2)
X_umap_sbert = project_vectors(X_sbert, technique="umap", n_components=2)

plot_embeddings(X_umap_tfidf, 0, symbol="class", reduction_techniques="umap", color="class")
plot_embeddings(X_umap_sbert, 0, symbol="class", reduction_techniques="umap", color="class")


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.



In [62]:
from typing import Tuple
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

def classify_knn(X_train, X_test, y_train, y_test) -> Tuple[float, float]:
    knn_classifier = KNeighborsClassifier(n_neighbors=5, metric='cosine')
    knn_classifier.fit(X_train, y_train)
    y_pred = knn_classifier.predict(X_test)
    return f1_score(y_test, y_pred, average="weighted"), accuracy_score(y_test, y_pred)


In [63]:
X_full_sbert_train, X_full_sbert_test, y_full_sbert_train, y_full_sbert_test = train_test_split(X_sbert,y,test_size=0.2, random_state=42)
X_full_tfidf_train, X_full_tfidf_test, y_full_tfidf_train, y_full_tfidf_test = train_test_split(X_tfidf,y,test_size=0.2, random_state=42)
X_umap_sbert_train, X_umap_sbert_test, y_umap_sbert_train, y_umap_sbert_test = train_test_split(X_umap_sbert, y, test_size=0.2, random_state=42)
X_umap_tfidf_train, X_umap_tfidf_test, y_umap_tfidf_train, y_umap_tfidf_test = train_test_split(X_umap_tfidf, y, test_size=0.2, random_state=42)

In [64]:
f1, acc = classify_knn(X_full_sbert_train, X_full_sbert_test, y_full_sbert_train, y_full_sbert_test)
print(f"Full SBERT -> F1 Score: {f1:.4f}, Accuracy: {acc:.4f}")

f1, acc = classify_knn(X_full_tfidf_train, X_full_tfidf_test, y_full_tfidf_train, y_full_tfidf_test)
print(f"Full TF-IDF -> F1 Score: {f1:.4f}, Accuracy: {acc:.4f}")

f1, acc = classify_knn(X_umap_sbert_train, X_umap_sbert_test, y_umap_sbert_train, y_umap_sbert_test)
print(f"UMAP + SBERT -> F1 Score: {f1:.4f}, Accuracy: {acc:.4f}")

f1, acc = classify_knn(X_umap_tfidf_train, X_umap_tfidf_test, y_umap_tfidf_train, y_umap_tfidf_test)
print(f"UMAP + TF-IDF -> F1 Score: {f1:.4f}, Accuracy: {acc:.4f}")




Full SBERT -> F1 Score: 0.9024, Accuracy: 0.9030
Full TF-IDF -> F1 Score: 0.7832, Accuracy: 0.7850
UMAP + SBERT -> F1 Score: 0.7539, Accuracy: 0.7560
UMAP + TF-IDF -> F1 Score: 0.5688, Accuracy: 0.5810


In [None]:
import xgboost as xgb

def classify_xgboost(X_train, X_test, y_train, y_test):
    xgb_classifier = xgb.XGBRFClassifier(max_depth=5, random_state=42)
    xgb_classifier.fit(X_train, y_train)
    y_pred = xgb_classifier.predict(X_test)
    return f1_score(y_test, y_pred, average="weighted"), accuracy_score(y_test, y_pred)


In [66]:
f1, acc = classify_xgboost(X_full_sbert_train, X_full_sbert_test, y_full_sbert_train, y_full_sbert_test)
print(f"[XGBoost] Full SBERT -> F1 Score: {f1:.4f}, Accuracy: {acc:.4f}")

f1, acc = classify_xgboost(X_full_tfidf_train, X_full_tfidf_test, y_full_tfidf_train, y_full_tfidf_test)
print(f"[XGBoost] Full TF-IDF -> F1 Score: {f1:.4f}, Accuracy: {acc:.4f}")

f1, acc = classify_xgboost(X_umap_sbert_train, X_umap_sbert_test, y_umap_sbert_train, y_umap_sbert_test)
print(f"[XGBoost] UMAP + SBERT -> F1 Score: {f1:.4f}, Accuracy: {acc:.4f}")

f1, acc = classify_xgboost(X_umap_tfidf_train, X_umap_tfidf_test, y_umap_tfidf_train, y_umap_tfidf_test)
print(f"[XGBoost] UMAP + TF-IDF -> F1 Score: {f1:.4f}, Accuracy: {acc:.4f}")


[XGBoost] Full SBERT -> F1 Score: 0.7953, Accuracy: 0.7960
[XGBoost] Full TF-IDF -> F1 Score: 0.6369, Accuracy: 0.6270
[XGBoost] UMAP + SBERT -> F1 Score: 0.8862, Accuracy: 0.8870
[XGBoost] UMAP + TF-IDF -> F1 Score: 0.7679, Accuracy: 0.7690


In [None]:
results = []

# knn
f1, acc = classify_knn(X_full_sbert_train, X_full_sbert_test, y_full_sbert_train, y_full_sbert_test)
results.append(("KNN", "Full SBERT", f1, acc))

f1, acc = classify_knn(X_full_tfidf_train, X_full_tfidf_test, y_full_tfidf_train, y_full_tfidf_test)
results.append(("KNN", "Full TF-IDF", f1, acc))

f1, acc = classify_knn(X_umap_sbert_train, X_umap_sbert_test, y_umap_sbert_train, y_umap_sbert_test)
results.append(("KNN", "UMAP + SBERT", f1, acc))

f1, acc = classify_knn(X_umap_tfidf_train, X_umap_tfidf_test, y_umap_tfidf_train, y_umap_tfidf_test)
results.append(("KNN", "UMAP + TF-IDF", f1, acc))

# xgb
f1, acc = classify_xgboost(X_full_sbert_train, X_full_sbert_test, y_full_sbert_train, y_full_sbert_test)
results.append(("XGBoost", "Full SBERT", f1, acc))

f1, acc = classify_xgboost(X_full_tfidf_train, X_full_tfidf_test, y_full_tfidf_train, y_full_tfidf_test)
results.append(("XGBoost", "Full TF-IDF", f1, acc))

f1, acc = classify_xgboost(X_umap_sbert_train, X_umap_sbert_test, y_umap_sbert_train, y_umap_sbert_test)
results.append(("XGBoost", "UMAP + SBERT", f1, acc))

f1, acc = classify_xgboost(X_umap_tfidf_train, X_umap_tfidf_test, y_umap_tfidf_train, y_umap_tfidf_test)
results.append(("XGBoost", "UMAP + TF-IDF", f1, acc))






  Model   Feature Set  F1 Score  Accuracy
    KNN    Full SBERT  0.902434     0.903
    KNN   Full TF-IDF  0.783205     0.785
    KNN  UMAP + SBERT  0.753902     0.756
    KNN UMAP + TF-IDF  0.568809     0.581
XGBoost    Full SBERT  0.795331     0.796
XGBoost   Full TF-IDF  0.636861     0.627
XGBoost  UMAP + SBERT  0.886184     0.887
XGBoost UMAP + TF-IDF  0.767923     0.769


In [None]:
df = pd.DataFrame(results, columns=["Model", "Data", "F1 Score", "Accuracy"])

print(df.sort_values(by="F1 Score", ascending=False).to_string(index=False))

  Model          Data  F1 Score  Accuracy
    KNN    Full SBERT  0.902434     0.903
XGBoost  UMAP + SBERT  0.886184     0.887
XGBoost    Full SBERT  0.795331     0.796
    KNN   Full TF-IDF  0.783205     0.785
XGBoost UMAP + TF-IDF  0.767923     0.769
    KNN  UMAP + SBERT  0.753902     0.756
XGBoost   Full TF-IDF  0.636861     0.627
    KNN UMAP + TF-IDF  0.568809     0.581


In [15]:


def cluster_embeddings(embeddings: NDArray, algorithm_name: str = 'KMeans', **kwargs):
    """
    Clusters embeddings using a specified clustering algorithm from sklearn.
    
    Parameters:
    -----------
    embeddings : numpy.ndarray
        The embeddings to cluster, shape (n_samples, n_features)
    algorithm_name : str
        Name of the clustering algorithm to use (must be available in sklearn.cluster
        or be GaussianMixture)
    **kwargs : 
        Additional parameters to pass to the clustering algorithm
        
    Returns:
    --------
    labels : numpy.ndarray
        Cluster labels for each embedding, shape (n_samples,)
    model : object
        The fitted clustering model
    """
    if not isinstance(embeddings, np.ndarray):
        raise TypeError("Embeddings must be a numpy array")

    if len(embeddings.shape) != 2:
        raise ValueError(f"Embeddings must be 2D array, got shape {embeddings.shape}")

    if algorithm_name == 'GaussianMixture':
        algorithm_class = GaussianMixture
    else:
        try:
            algorithm_class = getattr(cluster, algorithm_name)
        except AttributeError:
            raise ValueError(f"Algorithm '{algorithm_name}' not found in sklearn.cluster or is not GaussianMixture")

    model = algorithm_class(**kwargs)

    if hasattr(model, 'fit_predict'):
        labels = model.fit_predict(embeddings)
    elif hasattr(model, 'fit') and hasattr(model, 'predict'):

        model.fit(embeddings)
        labels = model.predict(embeddings)
    else:
        raise ValueError(f"Algorithm '{algorithm_name}' does not support required methods")

    return labels, model
