# In [None]:
import torch.nn.functional as F
from tqdm import tqdm
import polars as pl
from torch import Tensor
import torch
from transformers import AutoModel, AutoTokenizer
import joblib
import numpy as np
from sklearn.impute import KNNImputer
from hdbscan import HDBSCAN

# Load the MTR dataset from a Parquet file
mtr = pl.read_parquet("../MTR.parquet")

# Load the labels dataset from a Parquet file
labels = pl.read_parquet("df_imputed_filled_labels.parquet")

# Inner-join item codes and names with the labels on the "код СКМТР" key.
# Only codes present in both frames survive, so `data` may have fewer rows
# than `mtr` and is the frame every embedding row corresponds to.
data = mtr.select(["код СКМТР", "Наименование"]).join(
    labels, on="код СКМТР", how="inner"
)

# Prepare the texts for embedding by adding the "passage: " prefix
# (presumably the E5 document-side convention -- see the model card).
texts = ["passage: " + text for text in data["Наименование"].to_list()]

# Load the tokenizer and model from the pretrained multilingual-e5-small model
tokenizer = AutoTokenizer.from_pretrained('intfloat/multilingual-e5-small')
model = AutoModel.from_pretrained('intfloat/multilingual-e5-small')

# Spread inference over every visible GPU when there is more than one.
# The device list is derived from the runtime count: a fixed [0, 1, 2, 3]
# list is invalid on hosts that expose only 2 or 3 GPUs.
gpu_count = torch.cuda.device_count()
if gpu_count > 1:
    print(f"Using {gpu_count} GPUs!")
    model = torch.nn.DataParallel(model, device_ids=list(range(gpu_count)))

# Move the model to the GPU
model.to('cuda')

def get_embeddings(texts, batch_size=128):
    """
    Embed `texts` batch by batch and return the stacked result.

    Every batch is tokenized with the module-level `tokenizer` (padding and
    truncation enabled), moved to CUDA and passed through the module-level
    `model` with gradient tracking disabled. The hidden state at sequence
    position 0 is kept as the embedding of each text.

    NOTE(review): the E5 model card appears to use mask-aware mean pooling
    rather than the first token -- confirm that first-token pooling is
    intended here.

    Args:
        texts (list): Texts to embed.
        batch_size (int): How many texts go through the model per step.

    Returns:
        Tensor: CPU tensor with one row per input text, in input order.
    """
    batch_starts = range(0, len(texts), batch_size)
    chunks = []
    with torch.no_grad():
        for start in tqdm(batch_starts):
            encoded = tokenizer(
                texts[start:start + batch_size],
                padding=True,
                truncation=True,
                return_tensors='pt',
            )
            encoded = encoded.to('cuda')
            hidden = model(**encoded).last_hidden_state
            # Keep only the vector at position 0 of each sequence and move it
            # off the GPU straight away so device memory stays bounded.
            chunks.append(hidden[:, 0].cpu())
    return torch.cat(chunks, dim=0)

# Generate embeddings for the texts
embeddings = get_embeddings(texts)

# Create a DataFrame from the embeddings and save it to a Parquet file.
# Built from a bare NumPy array, so polars auto-names the columns
# column_0, column_1, ...
df = pl.DataFrame(embeddings.numpy())
df.write_parquet("e5_small_emb.parquet")

# Embedding rows follow `data` (the frame `texts` was built from), not `mtr`:
# the join that produced `data` may have dropped or reordered rows. Take the
# item code from `data` and look "ОКПД2" up by that code instead of attaching
# the `mtr` columns positionally.
item_codes = data["код СКМТР"]
okpd2_by_code = dict(zip(mtr["код СКМТР"].to_list(), mtr["ОКПД2"].to_list()))
df_for_clustering = df.with_columns(
    pl.Series(
        "ОКПД2",
        [okpd2_by_code.get(code) for code in item_codes.to_list()],
        dtype=mtr["ОКПД2"].dtype,
    ),
    item_codes.alias("код СКМТР"),
)

# Separate the DataFrame into rows with null and non-null "ОКПД2" values
has_okpd2 = pl.col("ОКПД2").is_not_null()
df_for_clustering_null = df_for_clustering.filter(~has_okpd2)
df_for_clustering_not_null = df_for_clustering.filter(has_okpd2)

# Load the label encoder and replace known "ОКПД2" codes with their class
# index, stored as float so the column can also hold NaN.
le = joblib.load('label_encoder_e5_clusters.pkl')
encoded_okpd2 = np.asarray(
    le.transform(df_for_clustering_not_null["ОКПД2"].to_numpy()),
    dtype=np.float64,
)
df_encoded = df_for_clustering_not_null.with_columns(
    pl.Series("ОКПД2", encoded_okpd2)
)

# Rows without a code get NaN in the same float column.
df_null_encoded = df_for_clustering_null.with_columns(
    pl.Series(
        "ОКПД2",
        np.full(df_for_clustering_null.height, np.nan, dtype=np.float64),
    )
)

# Concatenate the encoded DataFrame with the null DataFrame
all_data_encoded = pl.concat([df_encoded, df_null_encoded], how="vertical")

# Everything except the item code goes to the imputer as one numeric matrix:
# the embedding columns followed by the encoded "ОКПД2" column, which holds
# NaN for the rows that had no code.
data = all_data_encoded.select(pl.exclude("код СКМТР")).to_numpy()

# Fill the NaN entries with KNNImputer, using a single nearest neighbour.
imputer = KNNImputer(n_neighbors=1)
data_imputed = imputer.fit_transform(data)

# Back to polars. The columns are auto-named column_0, column_1, ... and the
# item code is re-attached as the last column, in the same row order.
df_imputed = pl.DataFrame(data_imputed).with_columns(
    all_data_encoded["код СКМТР"]
)

# Get the unique (imputed, label-encoded) "ОКПД2" values; they live in
# "column_384", the column right after the embedding columns.
unique_okpd2 = df_imputed["column_384"].unique().to_numpy()

# Cluster the embeddings separately inside each "ОКПД2" group. Per-group
# results are collected in a list and concatenated once at the end, which
# avoids re-copying the accumulated frame on every iteration.
cluster_frames = []
for okpd2 in tqdm(unique_okpd2):
    group_df = df_imputed.filter(pl.col("column_384") == okpd2)
    features = group_df.drop(["column_384", "код СКМТР"])
    n_items = features.height

    if n_items < 2:
        # Too few items to cluster: label them -1, the same value HDBSCAN
        # uses for noise points.
        cluster_labels = [-1] * n_items
    else:
        if n_items > 1000:
            # Progress hint for the large (slow) groups.
            print(n_items)
        # hdbscan.HDBSCAN has no `n_jobs` parameter (unknown keyword
        # arguments are forwarded to the distance metric); the documented
        # parallelism knob is `core_dist_n_jobs`.
        clusterer = HDBSCAN(
            min_cluster_size=2,
            core_dist_n_jobs=-1,
            allow_single_cluster=True,
        )
        cluster_labels = clusterer.fit_predict(features.to_numpy())

    cluster_frames.append(
        group_df.select(["column_384", "код СКМТР"]).with_columns(
            # Explicit dtype keeps every group frame schema-compatible for
            # the vertical concat below.
            pl.Series("cluster", cluster_labels, dtype=pl.Int64)
        )
    )

full_data = pl.concat(cluster_frames, how="vertical")

# Save the clustered data to a Parquet file
full_data.write_parquet("e5_small_clusters_v2.parquet")

# Load the label encoder and decode the "ОКПД2" column
le = joblib.load('label_encoder_e5_clusters.pkl')

# "column_384" stores class indices as floats. Round and cast them in one
# vectorized step; rounding (rather than truncating) keeps a value such as
# 2.9999999 mapped to class 3.
okpd2_class_ids = np.rint(full_data["column_384"].to_numpy()).astype(np.int64)
full_data_clustered = full_data.with_columns(
    pl.Series("ОКПД2", le.inverse_transform(okpd2_class_ids))
).drop(["column_384"])

# Join the clustered data with the original MTR data (minus its own "ОКПД2"
# column, which the decoded one replaces) and save it to a Parquet file.
clustered_with_mtr = full_data_clustered.join(
    mtr.drop(["ОКПД2"]), on="код СКМТР"
)
clustered_with_mtr.write_parquet("e5_small_clustered_data_v2.parquet")
