In [None]:
import numpy as np
import pandas as pd
import polars as pl

import optuna
import torch

from cuml.cluster import HDBSCAN
# from cuml.manifold import UMAP
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.manifold import trustworthiness
from umap import UMAP
import pacmap

import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt

In [None]:
# Report how many CUDA devices torch can see, then pick a device index:
# 0 when CUDA is available, otherwise -1.
gpu_count = torch.cuda.device_count()
print(f"Доступно GPU: {gpu_count}")
if torch.cuda.is_available():
    device = 0
else:
    device = -1
device

In [None]:
# Load the sampled embedding matrix (the original note gives its shape as (10000, 768)).
embeddings = np.load('embeddings.npy')

# Load the accompanying table and extract the text and datetime columns as plain lists.
data = pd.read_csv("data.csv")
texts = data["text"].tolist()
timestamp = data["datetime"].tolist()

### Глобальная оптимизация гиперпараметров UMAP+HDBSCAN

In [None]:
def objective_global(trial):
    """Optuna objective for the coarse (global) UMAP + HDBSCAN search.

    Samples UMAP and HDBSCAN hyperparameters from wide ranges, reduces the
    module-level ``embeddings`` with UMAP, clusters the reduced vectors with
    HDBSCAN and scores the resulting partition.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Silhouette score computed in the UMAP space over non-noise points
        only, or -1.0 when fewer than 10 points are clustered or fewer than
        two clusters are found.
    """
    # UMAP hyperparameters
    umap_n_neighbors = trial.suggest_int("umap_n_neighbors", 5, 100)
    umap_min_dist = trial.suggest_float("umap_min_dist", 0.0, 0.5, step=0.01)
    umap_n_components = trial.suggest_int("umap_n_components", 10, 80)

    # Apply UMAP directly to the original embeddings
    umap_model = UMAP(
        n_neighbors=umap_n_neighbors,
        min_dist=umap_min_dist,
        n_components=umap_n_components,
        metric='cosine',
        random_state=42
    )
    embedding_intermediate = umap_model.fit_transform(embeddings)

    # HDBSCAN hyperparameters; min_samples is capped by the sampled min_cluster_size
    hdbscan_min_cluster_size = trial.suggest_int("hdbscan_min_cluster_size", 5, 100)
    hdbscan_min_samples = trial.suggest_int("hdbscan_min_samples", 1, hdbscan_min_cluster_size)
    # suggest_float replaces the deprecated suggest_uniform (same continuous range)
    cluster_selection_epsilon = trial.suggest_float("cluster_selection_epsilon", 0.0, 1.0)

    clusterer = HDBSCAN(
        min_cluster_size=hdbscan_min_cluster_size,
        min_samples=hdbscan_min_samples,
        cluster_selection_method='eom',
        cluster_selection_epsilon=cluster_selection_epsilon,
    )
    labels = clusterer.fit_predict(embedding_intermediate)

    # Silhouette is computed only on non-noise points (label != -1).
    # NOTE(review): the score therefore does not penalise trials that label
    # many points as noise — confirm this is the intended objective.
    valid = labels != -1
    if np.sum(valid) < 10 or len(np.unique(labels[valid])) < 2:
        # Degenerate clustering: return the worst possible silhouette value
        return -1.0
    score = silhouette_score(embedding_intermediate[valid], labels[valid])
    return score

print("Запускаем глобальную оптимизацию...")
# Seed the sampler explicitly so the sequence of sampled hyperparameters can be
# reproduced on a re-run (TPESampler is Optuna's default single-objective
# sampler, so the search algorithm itself is unchanged).
study_global = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_global.optimize(objective_global, n_trials=200, show_progress_bar=True)

print("Глобальная оптимизация завершена")
print("Лучшие глобальные гиперпараметры:")
for key, value in study_global.best_params.items():
    print(f"  {key}: {value}")
print("Лучший глобальный silhouette score:", study_global.best_value)

Глобальная оптимизация завершена 
Лучшие глобальные гиперпараметры: 
* umap_n_neighbors: 48
* umap_min_dist: 0.05
* umap_n_components: 10
* hdbscan_min_cluster_size: 15
* hdbscan_min_samples: 11
* cluster_selection_epsilon: 0.002172166608873129

Лучший глобальный silhouette score: 0.7065192461013794


### Локальная (refined) оптимизация гиперпараметров

In [None]:
# Best hyperparameters found by the global study; the refined ranges below are centred on them.
global_best = study_global.best_params

def refined_range(val, delta, low_bound, high_bound):
    """Return a (low, high) window of half-width ``delta`` around ``val``.

    The window is clipped so it never extends below ``low_bound`` or above
    ``high_bound``.
    """
    lower_candidate = val - delta
    upper_candidate = val + delta
    # Keep the bound itself unless the candidate lies strictly inside it.
    lower = lower_candidate if lower_candidate > low_bound else low_bound
    upper = upper_candidate if upper_candidate < high_bound else high_bound
    return (lower, upper)

# Refined search ranges centred on the best values from the global study.
n_neighbors_range = refined_range(global_best["umap_n_neighbors"], 20, 5, 100)
# Round the float bounds to two decimals so they stay on the 0.01 sampling
# step used for umap_min_dist: val +/- 0.05 can pick up binary floating-point
# noise (e.g. 0.07 + 0.05 == 0.12000000000000001), which would leave the range
# misaligned with the step.
min_dist_range = tuple(
    round(bound, 2)
    for bound in refined_range(global_best["umap_min_dist"], 0.05, 0.0, 0.5)
)
n_components_range = refined_range(global_best["umap_n_components"], 10, 10, 80)
hdbscan_cluster_range = refined_range(global_best["hdbscan_min_cluster_size"], 10, 5, 100)

def objective_refined(trial):
    """Optuna objective for the refined (local) UMAP + HDBSCAN search.

    Same pipeline as the global objective, but the UMAP parameters and the
    HDBSCAN ``min_cluster_size`` are sampled from the module-level refined
    ranges (``n_neighbors_range``, ``min_dist_range``, ``n_components_range``,
    ``hdbscan_cluster_range``) and ``cluster_selection_epsilon`` is limited
    to [0.0, 0.2].

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Silhouette score computed in the UMAP space over non-noise points
        only, or -1.0 when fewer than 10 points are clustered or fewer than
        two clusters are found.
    """
    # UMAP hyperparameters, restricted to the refined ranges
    umap_n_neighbors = trial.suggest_int("umap_n_neighbors", n_neighbors_range[0], n_neighbors_range[1])
    umap_min_dist = trial.suggest_float("umap_min_dist", min_dist_range[0], min_dist_range[1], step=0.01)
    umap_n_components = trial.suggest_int("umap_n_components", n_components_range[0], n_components_range[1])

    umap_model = UMAP(
        n_neighbors=umap_n_neighbors,
        min_dist=umap_min_dist,
        n_components=umap_n_components,
        metric="cosine",
        random_state=42
    )
    embedding_intermediate = umap_model.fit_transform(embeddings)

    # HDBSCAN hyperparameters; min_samples is capped by the sampled min_cluster_size
    hdbscan_min_cluster_size = trial.suggest_int("hdbscan_min_cluster_size", hdbscan_cluster_range[0], hdbscan_cluster_range[1])
    hdbscan_min_samples = trial.suggest_int("hdbscan_min_samples", 1, hdbscan_min_cluster_size)
    # suggest_float replaces the deprecated suggest_uniform (same continuous range)
    cluster_selection_epsilon = trial.suggest_float("cluster_selection_epsilon", 0.0, 0.2)

    clusterer = HDBSCAN(
        min_cluster_size=hdbscan_min_cluster_size,
        min_samples=hdbscan_min_samples,
        cluster_selection_method='eom',
        cluster_selection_epsilon=cluster_selection_epsilon,
    )
    labels = clusterer.fit_predict(embedding_intermediate)

    # Score only the non-noise points (label != -1)
    valid = labels != -1
    if np.sum(valid) < 10 or len(np.unique(labels[valid])) < 2:
        # Degenerate clustering: return the worst possible silhouette value
        return -1.0
    score = silhouette_score(embedding_intermediate[valid], labels[valid])
    return score

print("Запускаем уточненную оптимизацию...")
# Seed the sampler explicitly so the sequence of sampled hyperparameters can be
# reproduced on a re-run (TPESampler is Optuna's default single-objective
# sampler, so the search algorithm itself is unchanged).
study_refined = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_refined.optimize(objective_refined, n_trials=100, show_progress_bar=True)

print("Уточненная оптимизация завершена")
print("Лучшие уточненные гиперпараметры:")
for key, value in study_refined.best_params.items():
    print(f"  {key}: {value}")
print("Лучший refined silhouette score:", study_refined.best_value)

Уточненная оптимизация завершена 
Лучшие уточненные гиперпараметры:
* umap_n_neighbors: 48
* umap_min_dist: 0.05
* umap_n_components: 15
* hdbscan_min_cluster_size: 16
* hdbscan_min_samples: 15
* cluster_selection_epsilon: 0.022176326579653287

Лучший refined silhouette score: 0.7122868299484253

In [None]:
##############################################
# 3. Visualise the hyperparameter search with Plotly
##############################################
def _completed_trial_records(study):
    """Return one dict (sampled params + ``silhouette_score``) per COMPLETE trial of ``study``."""
    return [
        {**trial.params, "silhouette_score": trial.value}
        for trial in study.trials
        if trial.state == optuna.trial.TrialState.COMPLETE
    ]


# Collect the history of both studies (global first, then refined) into one table.
records = _completed_trial_records(study_global) + _completed_trial_records(study_refined)
df_trials = pd.DataFrame(records)

# Convert every column that can be parsed to a numeric dtype; columns that
# cannot be converted are left untouched. Only conversion errors are tolerated,
# so unrelated failures are no longer silently swallowed.
for col in df_trials.columns:
    try:
        df_trials[col] = pd.to_numeric(df_trials[col])
    except (ValueError, TypeError):
        pass

# Only trials above this silhouette score are plotted and listed.
MIN_SILHOUETTE_TO_SHOW = 0.64
df_top_trials = df_trials[df_trials["silhouette_score"] > MIN_SILHOUETTE_TO_SHOW]

# NOTE(review): the title says "global" optimisation, but the plotted table
# contains trials from both the global and the refined study.
fig_parallel = px.parallel_coordinates(
    df_top_trials,
    color="silhouette_score",
    labels={
        "umap_n_neighbors": "UMAP n_neighbors",
        "umap_min_dist": "UMAP min_dist",
        "umap_n_components": "UMAP n_components",
        "hdbscan_min_cluster_size": "HDBSCAN min_cluster_size",
        "hdbscan_min_samples": "HDBSCAN min_samples",
        "cluster_selection_epsilon": "HDBSCAN cluster_selection_epsilon",
        "silhouette_score": "Silhouette"
    },
    color_continuous_scale=px.colors.sequential.Inferno,
    title="Глобальная оптимизация гиперпараметров (UMAP + HDBSCAN)",
    template="plotly_dark",
    height=800
)
fig_parallel.show()
df_top_trials.sort_values("silhouette_score", ascending=False)

In [None]:
# Persist the collected trial table to hpo.csv (pandas also writes the row index as the first column by default).
df_trials.to_csv("hpo.csv")

### Финальное обучение модели с лучшими (refined) параметрами

In [None]:
##############################################
# 4. Final fit with the best (refined) hyperparameters
##############################################
refined_best = study_refined.best_params

# Final UMAP reduction of the original embeddings
umap_final = UMAP(
    metric="cosine",
    random_state=42,
    n_neighbors=refined_best["umap_n_neighbors"],
    min_dist=refined_best["umap_min_dist"],
    n_components=refined_best["umap_n_components"],
)
embedding_intermediate_final = umap_final.fit_transform(embeddings)

# Final HDBSCAN clustering of the reduced vectors
hdbscan_final = HDBSCAN(
    cluster_selection_method='eom',
    cluster_selection_epsilon=refined_best["cluster_selection_epsilon"],
    min_cluster_size=refined_best["hdbscan_min_cluster_size"],
    min_samples=refined_best["hdbscan_min_samples"],
)
final_labels = hdbscan_final.fit_predict(embedding_intermediate_final)

# Label -1 marks noise; count the distinct remaining labels and the noise points.
is_noise = final_labels == -1
n_clusters = len(np.unique(final_labels[~is_noise]))
n_noise = np.sum(is_noise)
print(f"Финальная кластеризация: кластеров = {n_clusters}, noise = {n_noise} точек")

Финальная кластеризация: кластеров = 86, noise = 3841 точек

In [None]:
# Keep only the non-noise points (label != -1) of the intermediate
# representation that was used for the final clustering.
mask_valid = final_labels != -1
X_valid = embedding_intermediate_final[mask_valid]
labels_valid = final_labels[mask_valid]

# Both metrics require at least 2 distinct labels and fewer labels than
# samples; checking only the number of points would still let a
# single-cluster result raise ValueError inside scikit-learn.
n_valid_clusters = len(np.unique(labels_valid))
if not (2 <= n_valid_clusters < len(labels_valid)):
    print("Недостаточно валидных точек для вычисления метрик.")
else:
    # Calinski-Harabasz index
    ch_score = calinski_harabasz_score(X_valid, labels_valid)
    # Davies-Bouldin index
    db_score = davies_bouldin_score(X_valid, labels_valid)

    print(f"Calinski-Harabasz Score: {ch_score:.2f}")
    print(f"Davies-Bouldin Score: {db_score:.2f}")

Calinski-Harabasz Score: 43867.78  
Davies-Bouldin Score: 0.34

### Визуализация итогового 2D графика кластеризации с Plotly

In [None]:
# Project the UMAP-reduced vectors to 2D with PaCMAP for plotting.
pacmap_mapper = pacmap.PaCMAP(
    n_components=2,
    random_state=42,
    MN_ratio=30,
    FP_ratio=20,
)
embedding_2d = pacmap_mapper.fit_transform(embedding_intermediate_final)
# Shift the 2D projection so that its per-axis mean sits at [0, 0]
embedding_2d_centered = embedding_2d - np.mean(embedding_2d, axis=0)

In [None]:
# Build the plotting table; labels are cast to strings so Plotly treats
# them as discrete categories rather than a continuous colour scale.
cluster_names = final_labels.astype(str)
df_vis = pd.DataFrame(
    {
        "Dim1": embedding_2d_centered[:, 0],
        "Dim2": embedding_2d_centered[:, 1],
        "Cluster": cluster_names,
    }
)

axis_titles = {"Dim1": "Dimension 1", "Dim2": "Dimension 2"}
fig_clusters = px.scatter(
    df_vis,
    x="Dim1",
    y="Dim2",
    color="Cluster",
    title="2D-проекция кластеризации (PaCMAP)",
    labels=axis_titles,
)
fig_clusters.update_layout(height=800, legend_title_text="Кластер")
fig_clusters.show()