In [1]:
import os
import dotenv
import pandas as pd
import numpy as np
import json
import re
from tqdm import tqdm
from itertools import product
import umap
import hdbscan
import pickle
from sklearn.metrics import silhouette_score
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
import torch
print("GPU is available:", torch.cuda.is_available())

if torch.cuda.is_available():
  print("Current device:", torch.cuda.get_device_name(0))

device = 'cuda' if torch.cuda.is_available() else 'cpu'

cwd = os.getcwd()
data_path = os.path.join(cwd,'data')
env_path = os.path.join(cwd,'.env')
models_path = os.path.join(cwd,'models')


GPU is available: True
Current device: NVIDIA GeForce RTX 3060


In [2]:
def get_embeddings(text_list):
    embeddings = []
    for text in tqdm(text_list, desc="Generating embeddings"):
        # Tokenize and prepare inputs
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Move input tensors to the same device as the model (GPU or CPU)
        inputs = {key: value.to(device) for key, value in inputs.items()}

        with torch.no_grad():
            # Forward pass to get model outputs
            outputs = model(**inputs, output_hidden_states=True)  # Ensure hidden states are returned

            # Extract the hidden states (usually from the last layer)
            hidden_states = outputs.hidden_states[-1]  # Last hidden layer
            # Compute embeddings (using mean pooling over the token embeddings)
            embeddings.append(hidden_states.mean(dim=1).squeeze().cpu().numpy())  # Move to CPU for numpy conversion
    return embeddings

def optimize_umap_hdbscan(embeddings, reviews_val, models_path, metric='euclidean', min_clusters=3):
    """
    Optimize UMAP and HDBSCAN hyperparameters for clustering embeddings using grid search.
    
    Parameters:
    - embeddings (np.array): Precomputed embeddings for clustering.
    - reviews_val (list): List of original review texts.
    - models_path (str): Path to load and save model data if necessary.
    - metric (str): Distance metric for HDBSCAN clustering.
    
    Returns:
    - DataFrame with text and cluster assignments for best hyperparameters.
    - Best UMAP and HDBSCAN parameters.
    """

    embedding_length = len(embeddings)

    # Define parameter grids for UMAP and HDBSCAN
    umap_params = {
        #'n_neighbors': [10, 25, 50, 100],  # Values to explore
        'n_neighbors': [int(embedding_length*0.01), int(embedding_length*0.5), int(embedding_length*0.1), int(embedding_length*0.15)],  # Values to explore
        'n_components': [2, 5, 10],  # Dimensionality reduction choices
        'min_dist': [0.0, 0.1, 0.25, 0.5, 0.75]  # Minimum distance values for UMAP
    }
    hdbscan_params = {
        #'min_cluster_size': [2, 5, 10],
        'min_cluster_size': [int(embedding_length*0.1), int(embedding_length*0.15), int(embedding_length*0.2)],
        'metric': ['euclidean', 'manhattan'],
        'cluster_selection_method': ['eom', 'leaf']
    }

    best_score = -np.inf
    best_params = None
    best_labels = None
    umap_reducer = None

    # Iterate over combinations of UMAP parameters
    for umap_combination in tqdm(product(*umap_params.values()), desc="UMAP parameter search"):
        umap_kwargs = dict(zip(umap_params.keys(), umap_combination))

        # Reduce dimensionality using UMAP
        umap_reducer = umap.UMAP(**umap_kwargs, random_state=42)
        reduced_embeddings = umap_reducer.fit_transform(embeddings)

        # Iterate over combinations of HDBSCAN parameters
        for hdbscan_combination in product(*hdbscan_params.values()):
            hdbscan_kwargs = dict(zip(hdbscan_params.keys(), hdbscan_combination))

            # Perform clustering
            clusterer = hdbscan.HDBSCAN(**hdbscan_kwargs)
            labels = clusterer.fit_predict(reduced_embeddings)

            # Evaluate clustering using silhouette score if more than minimum clusters found
            if len(set(labels)) >= min_clusters:
                score = silhouette_score(reduced_embeddings, labels, metric=metric)
                if score > best_score:
                    best_score = score
                    best_params = {'umap': umap_kwargs, 'hdbscan': hdbscan_kwargs}
                    best_labels = labels

    # Create DataFrame with best clustering results
    df = pd.DataFrame({"text": reviews_val, "cluster": best_labels})

    print(f"Best score: {best_score}")
    print(f"Best UMAP params: {best_params['umap']}")
    print(f"Best HDBSCAN params: {best_params['hdbscan']}")

    return df, best_params

In [3]:
import numpy as np
import pandas as pd
from itertools import product
from tqdm import tqdm
import umap
import hdbscan
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
import warnings
warnings.filterwarnings('ignore')

def evaluate_clustering(reduced_embeddings, labels, metrics=['silhouette', 'calinski', 'davies']):
    """
    Evaluate clustering using multiple metrics.
    """
    scores = []
    unique_labels = set(labels)
    n_clusters = len([x for x in unique_labels if x != -1])
    
    if n_clusters < 2:
        return -np.inf
        
    try:
        if 'silhouette' in metrics:
            sil_score = silhouette_score(reduced_embeddings, labels)
            scores.append(sil_score)
            
        if 'calinski' in metrics:
            cal_score = calinski_harabasz_score(reduced_embeddings, labels)
            scores.append(np.log1p(cal_score) / 10)
            
        if 'davies' in metrics:
            dav_score = davies_bouldin_score(reduced_embeddings, labels)
            scores.append(1 / (1 + dav_score))
            
        return np.mean(scores)
    except:
        return -np.inf

def optimize_umap_hdbscan(embeddings, reviews_val, models_path, metric='euclidean', min_clusters=3, 
                         random_seed=42):
    """
    Optimize UMAP and HDBSCAN hyperparameters with focused search.
    """
    np.random.seed(random_seed)
    embedding_length = len(embeddings)
    
    # Focused parameter grids
    umap_params = {
        'n_neighbors': [
            max(5, int(embedding_length * 0.01)),
            max(15, int(embedding_length * 0.05)),
            max(30, int(embedding_length * 0.1))
        ],
        'n_components': [2, 5, 8],
        'min_dist': [0.0, 0.1, 0.3, 0.5],
        'metric': ['euclidean', 'cosine']
    }
    
    hdbscan_params = {
        'min_cluster_size': [
            max(5, int(embedding_length * 0.05)),
            max(10, int(embedding_length * 0.1)),
            max(20, int(embedding_length * 0.15))
        ],
        'min_samples': [1, 5, 10],
        'cluster_selection_epsilon': [0.0, 0.5, 1.0],
        'cluster_selection_method': ['eom', 'leaf']
    }
    
    best_score = -np.inf
    best_params = None
    best_labels = None
    
    # Calculate total iterations for progress bar
    total_iterations = (len(list(product(*umap_params.values()))) * 
                       len(list(product(*hdbscan_params.values()))))
    
    print(f"Starting grid search with {total_iterations} combinations...")
    
    # Create progress bar
    pbar = tqdm(total=total_iterations, desc="Grid search progress")
    
    # Iterate over UMAP parameters
    for umap_combo in product(*umap_params.values()):
        umap_kwargs = dict(zip(umap_params.keys(), umap_combo))
        
        try:
            # Reduce dimensionality using UMAP
            umap_reducer = umap.UMAP(**umap_kwargs, random_state=random_seed)
            reduced_embeddings = umap_reducer.fit_transform(embeddings)
            
            # Iterate over HDBSCAN parameters
            for hdbscan_combo in product(*hdbscan_params.values()):
                hdbscan_kwargs = dict(zip(hdbscan_params.keys(), hdbscan_combo))
                
                try:
                    # Perform clustering
                    clusterer = hdbscan.HDBSCAN(**hdbscan_kwargs)
                    labels = clusterer.fit_predict(reduced_embeddings)
                    
                    # Evaluate if sufficient clusters found
                    if len(set(labels)) >= min_clusters:
                        score = evaluate_clustering(reduced_embeddings, labels)
                        
                        if score > best_score:
                            best_score = score
                            best_params = {
                                'umap': umap_kwargs,
                                'hdbscan': hdbscan_kwargs
                            }
                            best_labels = labels
                
                except Exception as e:
                    pass
                
                pbar.update(1)
                
        except Exception as e:
            pbar.update(len(list(product(*hdbscan_params.values()))))
    
    pbar.close()
    
    # Create DataFrame with best clustering results
    df = pd.DataFrame({
        "text": reviews_val,
        "cluster": best_labels,
    })
    
    # Add cluster statistics
    cluster_stats = df.groupby('cluster').agg({
        'text': 'count'
    }).rename(columns={'text': 'cluster_size'})
    
    print(f"\nBest score: {best_score:.4f}")
    print(f"Best UMAP params: {best_params['umap']}")
    print(f"Best HDBSCAN params: {best_params['hdbscan']}")
    print("\nCluster statistics:")
    print(cluster_stats)
    
    return df, best_params

In [4]:
COLAB_KEY = dotenv.dotenv_values(env_path)['COLAB_KEY']

In [5]:
!huggingface-cli login --token $COLAB_KEY

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
The token `Colab` has been saved to C:\Users\eduar\.cache\huggingface\stored_tokens
Your token has been saved to C:\Users\eduar\.cache\huggingface\token
Login successful.
The current active token is: `Colab`


In [6]:
size = 15000

file_path = os.path.join(data_path,'final_reviews.csv')
df = pd.read_csv(file_path).dropna(subset='review')

good_reviews = df.loc[df['voted_up'] == True]['review'].drop_duplicates()
bad_reviews = df.loc[df['voted_up'] == False]['review'].drop_duplicates()

good_reviews = list(good_reviews.sample(size,random_state=42))
bad_reviews = list(bad_reviews.sample(size,random_state=42))

good_reviews = [str(i) for i in good_reviews]
bad_reviews = [str(i) for i in bad_reviews]

In [7]:
#meta-llama/Meta-Llama-3-8B alternativa
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B")
model.to(device)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Use EOS token as padding token

In [None]:
n = 5000

reviews_dict = {
    'bad_reviews':bad_reviews[:n],
    #'good_reviews':good_reviews[:1000],
          }

for reviews_key,reviews_val in reviews_dict.items():

    embeddings = np.load(os.path.join(models_path, f'{reviews_key}_embeddings.npy'))[:n]
    
    df_optimized, best_params = optimize_umap_hdbscan(embeddings, reviews_val, models_path)
    
    df_optimized.to_csv(os.path.join(models_path,f'{reviews_key}_optimized_llama_topics.csv'), index=False)
    
    with open(os.path.join(models_path,f'{reviews_key}_best_params.json'), 'w') as file:
        json.dump(best_params, file)
    
    analysis_df = df_optimized.groupby('cluster').sample(10,random_state=42).sort_values('cluster').head(50)
    display(analysis_df)

In [8]:
cluster = 1
n = 5000

reviews_key = 'bad_reviews'

df_optimized = pd.read_csv(os.path.join(models_path,f'{reviews_key}_optimized_llama_topics.csv'))

cluster_df = df_optimized.loc[df_optimized['cluster'] == cluster]

text_list = list(cluster_df['text'])
cluster_indexes = cluster_df.index

embeddings = np.load(os.path.join(models_path, f'{reviews_key}_embeddings.npy'))[:n]

embedding_indexes = embeddings[cluster_indexes]

df_optimized, best_params = optimize_umap_hdbscan(embedding_indexes, text_list, models_path)
    
df_optimized.to_csv(os.path.join(models_path,f'{reviews_key}_cluster_{cluster}_optimized_llama_topics.csv'), index=False)

with open(os.path.join(models_path,f'{reviews_key}_cluster_{cluster}_best_params.json'), 'w') as file:
    json.dump(best_params, file)

analysis_df = df_optimized.groupby('cluster').sample(10,random_state=42).sort_values('cluster').head(50)
display(analysis_df)

Starting grid search with 3888 combinations...


Grid search progress: 100%|████████████████████████████████████████████████████████| 3888/3888 [13:41<00:00,  4.73it/s]


Best score: 0.4739
Best UMAP params: {'n_neighbors': 13, 'n_components': 2, 'min_dist': 0.1, 'metric': 'euclidean'}
Best HDBSCAN params: {'min_cluster_size': 205, 'min_samples': 1, 'cluster_selection_epsilon': 0.0, 'cluster_selection_method': 'eom'}

Cluster statistics:
         cluster_size
cluster              
-1                572
 0                236
 1                559





Unnamed: 0,text,cluster
1227,i love the souls series and bloodbourne specif...,-1
189,This game is simply full of bullshit. Every en...,-1
303,For those who look for entertaining gameplay a...,-1
965,[h1] NO ELDEN BUSSY [/h1]\n\n100+ HOURS INTO T...,-1
1296,Its with a heavy heart I cannot reccommend thi...,-1
761,Great game! A world you can lose yourself into...,-1
439,FIX YOUR GODDAMN MEMORY LEAK OF A FUCKING GAME...,-1
75,"I'm near the final boss, and I don't even care...",-1
210,"Oh boy, I've no idea where to start so let's b...",-1
731,This is a review from an experienced player of...,-1
