In [5]:
import os
import dotenv
import pandas as pd
import numpy as np
import json
import re
from tqdm import tqdm
from itertools import product
import umap
import hdbscan
import pickle
from sklearn.metrics import silhouette_score
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
import torch
print("GPU is available:", torch.cuda.is_available())

if torch.cuda.is_available():
  print("Current device:", torch.cuda.get_device_name(0))

device = 'cuda' if torch.cuda.is_available() else 'cpu'

cwd = os.getcwd()
data_path = os.path.join(cwd,'data')
env_path = os.path.join(cwd,'.env')
models_path = os.path.join(cwd,'models')

Error importing huggingface_hub.hf_api: cannot import name 'logging' from 'huggingface_hub' (C:\Users\eduar\AppData\Local\Programs\Python\Python311\Lib\site-packages\huggingface_hub\__init__.py)


ImportError: cannot import name 'logging' from 'huggingface_hub' (C:\Users\eduar\AppData\Local\Programs\Python\Python311\Lib\site-packages\huggingface_hub\__init__.py)

In [None]:
def get_embeddings(text_list):
    embeddings = []
    for text in tqdm(text_list, desc="Generating embeddings"):
        # Tokenize and prepare inputs
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Move input tensors to the same device as the model (GPU or CPU)
        inputs = {key: value.to(device) for key, value in inputs.items()}

        with torch.no_grad():
            # Forward pass to get model outputs
            outputs = model(**inputs, output_hidden_states=True)  # Ensure hidden states are returned

            # Extract the hidden states (usually from the last layer)
            hidden_states = outputs.hidden_states[-1]  # Last hidden layer
            # Compute embeddings (using mean pooling over the token embeddings)
            embeddings.append(hidden_states.mean(dim=1).squeeze().cpu().numpy())  # Move to CPU for numpy conversion
    return embeddings

def optimize_umap_hdbscan(embeddings, reviews_val, models_path, metric='euclidean', min_clusters=3):
    """
    Optimize UMAP and HDBSCAN hyperparameters for clustering embeddings using grid search.
    
    Parameters:
    - embeddings (np.array): Precomputed embeddings for clustering.
    - reviews_val (list): List of original review texts.
    - models_path (str): Path to load and save model data if necessary.
    - metric (str): Distance metric for HDBSCAN clustering.
    
    Returns:
    - DataFrame with text and cluster assignments for best hyperparameters.
    - Best UMAP and HDBSCAN parameters.
    """

    embedding_length = len(embeddings)

    # Define parameter grids for UMAP and HDBSCAN
    umap_params = {
        #'n_neighbors': [10, 25, 50, 100],  # Values to explore
        'n_neighbors': [int(embedding_length*0.01), int(embedding_length*0.5), int(embedding_length*0.1), int(embedding_length*0.15)],  # Values to explore
        'n_components': [2, 5, 10],  # Dimensionality reduction choices
        'min_dist': [0.0, 0.1, 0.25, 0.5, 0.75]  # Minimum distance values for UMAP
    }
    hdbscan_params = {
        #'min_cluster_size': [2, 5, 10],
        'min_cluster_size': [int(embedding_length*0.1), int(embedding_length*0.15), int(embedding_length*0.2)],
        'metric': ['euclidean', 'manhattan'],
        'cluster_selection_method': ['eom', 'leaf']
    }

    best_score = -np.inf
    best_params = None
    best_labels = None
    umap_reducer = None

    # Iterate over combinations of UMAP parameters
    for umap_combination in tqdm(product(*umap_params.values()), desc="UMAP parameter search"):
        umap_kwargs = dict(zip(umap_params.keys(), umap_combination))

        # Reduce dimensionality using UMAP
        umap_reducer = umap.UMAP(**umap_kwargs, random_state=42)
        reduced_embeddings = umap_reducer.fit_transform(embeddings)

        # Iterate over combinations of HDBSCAN parameters
        for hdbscan_combination in product(*hdbscan_params.values()):
            hdbscan_kwargs = dict(zip(hdbscan_params.keys(), hdbscan_combination))

            # Perform clustering
            clusterer = hdbscan.HDBSCAN(**hdbscan_kwargs)
            labels = clusterer.fit_predict(reduced_embeddings)

            # Evaluate clustering using silhouette score if more than minimum clusters found
            if len(set(labels)) >= min_clusters:
                score = silhouette_score(reduced_embeddings, labels, metric=metric)
                if score > best_score:
                    best_score = score
                    best_params = {'umap': umap_kwargs, 'hdbscan': hdbscan_kwargs}
                    best_labels = labels

    # Create DataFrame with best clustering results
    df = pd.DataFrame({"text": reviews_val, "cluster": best_labels})

    print(f"Best score: {best_score}")
    print(f"Best UMAP params: {best_params['umap']}")
    print(f"Best HDBSCAN params: {best_params['hdbscan']}")

    return df, best_params

In [3]:
COLAB_KEY = dotenv.dotenv_values(env_path)['COLAB_KEY']

NameError: name 'env_path' is not defined

In [4]:
!huggingface-cli login --token $COLAB_KEY

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Traceback (most recent call last):
  File "C:\Users\eduar\anaconda3\envs\deep\Lib\site-packages\huggingface_hub\utils\_http.py", line 406, in hf_raise_for_status
    response.raise_for_status()
  File "C:\Users\eduar\anaconda3\envs\deep\Lib\site-packages\requests\models.py", line 1024, in raise_for_status
    raise HTTPError(http_error_msg, response=self)
requests.exceptions.HTTPError: 401 Client Error: Unauthorized for url: https://huggingface.co/api/whoami-v2

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "C:\Users\eduar\anaconda3\envs\deep\Lib\site-packages\huggingface_hub\hf_api.py", line 1670, in whoami
    hf_raise_for_status(r)
  File "C:\Users\eduar\anaconda3\envs\deep\Lib\site-packages\hug

In [None]:
size = 15000

file_path = os.path.join(data_path,'final_reviews.csv')
df = pd.read_csv(file_path).dropna(subset='review')

good_reviews = df.loc[df['voted_up'] == True]['review'].drop_duplicates()
bad_reviews = df.loc[df['voted_up'] == False]['review'].drop_duplicates()

good_reviews = list(good_reviews.sample(size,random_state=42))
bad_reviews = list(bad_reviews.sample(size,random_state=42))

good_reviews = [str(i) for i in good_reviews]
bad_reviews = [str(i) for i in bad_reviews]

In [None]:
#meta-llama/Meta-Llama-3-8B alternativa
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B")
model.to(device)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Use EOS token as padding token

In [None]:
reviews_dict = {
    'bad_reviews':bad_reviews,
    'good_reviews':good_reviews,
          }

In [None]:
for reviews_key,reviews_val in reviews_dict.items():

    #embeddings = get_embeddings(reviews_val)
    
    embeddings_path = os.path.join(models_path,f'{reviews_key}_embeddings.npy')

    embeddings = np.load(embeddings_path)

In [None]:
for reviews_key,reviews_val in reviews_dict.items():

    #embeddings = get_embeddings(reviews_val)
    embeddings_path = os.path.join(models_path,f'{reviews_key}_embeddings.npy')
    
    #np.save(embeddings_path, embeddings)
    #print(f"Embeddings saved to: {embeddings_path}")

    embeddings = np.load(embeddings_path)
    print(f"Embeddings loaded from to: {embeddings_path}")
    
    # Reduce dimensionality with UMAP
    umap_reducer = umap.UMAP(n_neighbors=15, n_components=5, random_state=42)
    reduced_embeddings = umap_reducer.fit_transform(embeddings)
    
    # Cluster with HDBSCAN
    clusterer = hdbscan.HDBSCAN(min_cluster_size=2, metric='euclidean', cluster_selection_method='eom')
    labels = clusterer.fit_predict(reduced_embeddings)
    
    hdbscan_path = os.path.join(models_path,f'{reviews_key}_hdbscan.pkl')
    with open(hdbscan_path, 'wb') as f:
        pickle.dump(clusterer, f)
    
    print(f"Model saved to: {hdbscan_path}")
    
    # Visualize or print the results
    df = pd.DataFrame({"text": reviews_val, "cluster": labels})
    df.to_csv(os.path.join(models_path,f'{reviews_key}_llama_topics.csv'), index=False)

In [11]:
reviews_dict = {
    'bad_reviews':bad_reviews[:1000],
    'good_reviews':good_reviews[:1000],
          }

for reviews_key,reviews_val in reviews_dict.items():

    embeddings = np.load(os.path.join(models_path, f'{reviews_key}_embeddings.npy'))[:1000]
    
    df_optimized, best_params = optimize_umap_hdbscan(embeddings, reviews_val, models_path)
    
    df_optimized.to_csv(os.path.join(models_path,f'{reviews_key}_optimized_llama_topics.csv'), index=False)
    
    with open(os.path.join(models_path,f'{reviews_key}_best_params.json'), 'w') as file:
        json.dump(best_params, file)
    
    analysis_df = df_optimized.groupby('cluster').sample(10,random_state=42).sort_values('cluster').head(50)
    display(analysis_df)

UMAP parameter search: 60it [03:51,  3.85s/it]

Best score: 0.23197989165782928
Best UMAP params: {'n_neighbors': 150, 'n_components': 5, 'min_dist': 0.0}
Best HDBSCAN params: {'min_cluster_size': 100, 'metric': 'manhattan', 'cluster_selection_method': 'eom'}





Unnamed: 0,text,cluster
630,Title: Elden Ring's Tragic Descent: When Commu...,-1
34,Esse jogo eu não recomendo para jogadores como...,-1
676,STRIAGHT ASSSS good game but ASS,-1
219,would be amazing with multi player working rig...,-1
438,This game is utter trash and i wont be recomme...,-1
724,Dark souls 2 rip off. I will not elaborate.,-1
804,"THE GAME IS FUCKING SHIT, NEVER PLAY. GENERAL ...",-1
993,Hard game but not actually that interesting.,-1
41,"I love this game, it´s easily one of my favori...",-1
487,"this game is horrible lol, Trash. \n\n\n\n\n\n...",-1


UMAP parameter search: 60it [03:46,  3.78s/it]

Best score: 0.21578027307987213
Best UMAP params: {'n_neighbors': 10, 'n_components': 2, 'min_dist': 0.0}
Best HDBSCAN params: {'min_cluster_size': 100, 'metric': 'euclidean', 'cluster_selection_method': 'eom'}





Unnamed: 0,text,cluster
735,A M A Z I N G game\n,-1
315,This game is wayyyyyyyyyyyy toooooooooo harrrr...,-1
785,Fantastically bleak as one would expect from F...,-1
18,Boulder\n\,-1
267,Dark souls but open world,-1
685,⠀⠘⠀ you've been Bing Bonged... ...,-1
858,once you get mad enough u become Goku and it a...,-1
605,zdcasdasda,-1
626,GOAT + Big PP,-1
288,still not Elden lord :c,-1
