In [8]:
import os
import dotenv
import pandas as pd
import numpy as np
import json
from tqdm import tqdm
import itertools
from itertools import product
import umap
import hdbscan
import pickle
from sklearn.metrics import silhouette_score
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
import torch
# Report CUDA availability up front so a CPU-only run is obvious in the output.
print("GPU is available:", torch.cuda.is_available())

# Default to the CPU and switch to the first CUDA device when one is visible.
device = 'cpu'
if torch.cuda.is_available():
  print("Current device:", torch.cuda.get_device_name(0))
  device = 'cuda'

# All project locations are resolved relative to the notebook's working directory.
cwd = os.getcwd()
data_path, env_path, models_path = (
    os.path.join(cwd, leaf) for leaf in ('data', '.env', 'models')
)

GPU is available: True
Current device: NVIDIA GeForce RTX 3060


In [9]:
def optimize_bertopic_with_param_dict(docs, embedding_model=None):
    """Grid-search UMAP, HDBSCAN and CountVectorizer settings for BERTopic.

    Every combination of the parameter grids defined below is used to fit a
    BERTopic model on ``docs``. Each fit is scored with the silhouette score
    of the document embeddings, using the HDBSCAN cluster labels and ignoring
    points labelled as noise (``-1``). The highest-scoring model is kept.

    Args:
        docs: List of document strings to cluster.
        embedding_model: Embedding model handed to ``BERTopic``. When it is a
            ``SentenceTransformer`` the documents are embedded once and the
            result is reused for every fit; any other value (including
            ``None``) is passed through and BERTopic embeds the documents
            itself on each fit.

    Returns:
        Tuple ``(best_model, best_params)``. ``best_model`` is ``None`` and
        ``best_params`` is ``{}`` when no combination produced at least two
        clusters.
    """
    best_model = None
    best_score = -1
    best_params = {}

    umap_params = {
        'n_neighbors': [5, 10, 25, 50],
        'n_components': [2, 5, 10],
        'min_dist': [0.0, 0.1, 0.25, 0.5]
    }

    hdbscan_params = {
        'min_cluster_size': [2, 5, 10],
        'metric': ['euclidean', 'manhattan'],
        'cluster_selection_method': ['eom', 'leaf']
    }

    vectorizer_params = [
        {'ngram_range': (1, 1)},
        {'ngram_range': (1, 2)}
    ]

    # Expand each grid into a list of keyword-argument dictionaries.
    umap_combinations = [
        dict(zip(umap_params.keys(), values))
        for values in itertools.product(*umap_params.values())
    ]
    hdbscan_combinations = [
        dict(zip(hdbscan_params.keys(), values))
        for values in itertools.product(*hdbscan_params.values())
    ]

    # The embeddings do not depend on any tuned parameter, so compute them a
    # single time instead of twice per candidate model.
    precomputed_embeddings = None
    if isinstance(embedding_model, SentenceTransformer):
        precomputed_embeddings = np.asarray(
            embedding_model.encode(docs, show_progress_bar=False)
        )

    # Same visiting order as nested loops: vectorizer -> UMAP -> HDBSCAN.
    search_space = itertools.product(
        vectorizer_params, umap_combinations, hdbscan_combinations
    )
    for vectorizer_param, umap_param, hdbscan_param in search_space:
        # Fresh components for every candidate so that the model kept as
        # "best" is never refitted in place by a later candidate.
        vectorizer = CountVectorizer(**vectorizer_param)
        umap_model = umap.UMAP(**umap_param, random_state=42)
        hdbscan_model = hdbscan.HDBSCAN(**hdbscan_param)

        topic_model = BERTopic(
            embedding_model=embedding_model,
            vectorizer_model=vectorizer,
            umap_model=umap_model,
            hdbscan_model=hdbscan_model
        )
        topics, probs = topic_model.fit_transform(
            docs, embeddings=precomputed_embeddings
        )

        if precomputed_embeddings is not None:
            embeddings = precomputed_embeddings
        else:
            # Fallback for non-SentenceTransformer models: let BERTopic embed
            # the documents. ``method`` is passed by keyword because the
            # positional layout of this private helper is not stable.
            embeddings = topic_model._extract_embeddings(docs, method="document")
        if embeddings is None:
            continue  # Skip if no embeddings available
        embeddings = np.asarray(embeddings)

        # Score on the raw HDBSCAN labels, leaving out noise points (-1).
        cluster_labels = hdbscan_model.labels_
        valid_idx = cluster_labels != -1
        kept_labels = cluster_labels[valid_idx]
        # silhouette_score needs at least two distinct clusters; this also
        # covers the case of fewer than two non-noise points.
        if np.unique(kept_labels).size < 2:
            continue

        score = silhouette_score(embeddings[valid_idx], kept_labels)

        # Track the best performing model
        if score > best_score:
            best_score = score
            best_model = topic_model
            best_params = {
                'vectorizer_params': vectorizer_param,
                'umap_params': umap_param,
                'hdbscan_params': hdbscan_param
            }

    print(f"Best Silhouette Score: {best_score}")
    print(f"Best Parameters: {best_params}")
    return best_model, best_params

In [10]:
# Read the access token from the project's .env file; a missing
# 'COLAB_KEY' entry raises KeyError here rather than failing later.
COLAB_KEY = dotenv.dotenv_values(dotenv_path=env_path)['COLAB_KEY']

In [11]:
# Log in to the Hugging Face Hub through the CLI using the token held in COLAB_KEY.
# NOTE(review): the token is expanded into the shell command line, and the captured
# output below records local cache paths -- consider clearing it before sharing.
!huggingface-cli login --token $COLAB_KEY

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
The token `Colab` has been saved to C:\Users\eduar\.cache\huggingface\stored_tokens
Your token has been saved to C:\Users\eduar\.cache\huggingface\token
Login successful.
The current active token is: `Colab`


In [12]:
file_path = os.path.join(data_path,'final_reviews.csv')
# Rows without review text cannot be used downstream, so drop them at load time.
df = pd.read_csv(file_path).dropna(subset=['review'])


def _sample_reviews(frame, voted_up, n=1000, seed=665):
    """Return up to ``n`` unique review texts with the given ``voted_up`` flag.

    Args:
        frame: DataFrame with ``voted_up`` and ``review`` columns.
        voted_up: Flag value to select (``True`` or ``False``).
        n: Maximum number of reviews to draw.
        seed: Random state for the draw, so the sample is reproducible.

    Returns:
        List of review texts as ``str``. It holds fewer than ``n`` items when
        the class has fewer than ``n`` unique reviews, instead of raising.
    """
    unique_reviews = frame.loc[frame['voted_up'] == voted_up, 'review'].drop_duplicates()
    # Series.sample raises when asked for more rows than exist, so cap the size.
    sample_size = min(n, len(unique_reviews))
    sampled = unique_reviews.sample(sample_size, random_state=seed)
    return [str(text) for text in sampled]


good_reviews = _sample_reviews(df, True)
bad_reviews = _sample_reviews(df, False)

In [14]:
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Optimize and generate topics for good reviews
print("Optimizing topics for good reviews...")
optimized_good_reviews_model, optimized_good_reviews_params = optimize_bertopic_with_param_dict(
    good_reviews,
    embedding_model=embedding_model,
)

# Optimize and generate topics for bad reviews
print("Optimizing topics for bad reviews...")
optimized_bad_reviews_model, optimized_bad_reviews_params = optimize_bertopic_with_param_dict(
    bad_reviews,
    embedding_model=embedding_model,
)

# Persist the results before displaying anything, so a failure while printing
# cannot discard the outcome of the long search. The optimizer returns None
# when no parameter combination produced a scorable clustering.
os.makedirs(models_path, exist_ok=True)

if optimized_good_reviews_model is not None:
    optimized_good_reviews_model.save(os.path.join(models_path,'BERTOPIC_optimized_good_reviews_model'))

if optimized_bad_reviews_model is not None:
    optimized_bad_reviews_model.save(os.path.join(models_path,'BERTOPIC_optimized_bad_reviews_model'))

# Display topics
print("Topics for Good Reviews:")
if optimized_good_reviews_model is not None:
    print(optimized_good_reviews_model.get_topics())
else:
    print("No valid model was found for good reviews.")

print("Topics for Bad Reviews:")
if optimized_bad_reviews_model is not None:
    print(optimized_bad_reviews_model.get_topics())
else:
    print("No valid model was found for bad reviews.")

Optimizing topics for good reviews...
Best Silhouette Score: 0.1164262667298317
Best Parameters: {'vectorizer_params': {'ngram_range': (1, 1)}, 'umap_params': {'n_neighbors': 50, 'n_components': 10, 'min_dist': 0.5}, 'hdbscan_params': {'min_cluster_size': 10, 'metric': 'manhattan', 'cluster_selection_method': 'eom'}}
Optimizing topics for bad reviews...




Best Silhouette Score: 0.11897186189889908
Best Parameters: {'vectorizer_params': {'ngram_range': (1, 1)}, 'umap_params': {'n_neighbors': 25, 'n_components': 5, 'min_dist': 0.5}, 'hdbscan_params': {'min_cluster_size': 10, 'metric': 'manhattan', 'cluster_selection_method': 'leaf'}}
Topics for Good Reviews:
{-1: [('the', 0.049086629349647574), ('game', 0.04543562059336187), ('and', 0.04196893583599362), ('of', 0.04047611679748806), ('to', 0.03817950123758635), ('you', 0.03650701434223236), ('it', 0.03448669782507999), ('is', 0.032950811827304485), ('this', 0.03218870102955126), ('die', 0.02669914883432965)], 0: [('souls', 0.06058079881208062), ('the', 0.05144011420049065), ('game', 0.04805193603643161), ('to', 0.04732877043360757), ('and', 0.04583350920951867), ('this', 0.045417303588121155), ('of', 0.04139342189707563), ('it', 0.03634622106962502), ('you', 0.03599868657229931), ('in', 0.03438875214938815)], 1: [('the', 0.05419852500502299), ('and', 0.04918110526853857), ('elden', 0.0488

