In [1]:
import json
import os
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.datasets import fetch_20newsgroups
import torch
from transformers import RobertaTokenizer, RobertaModel
import tools as tl
import tqdm

### For Macs

In [2]:
if torch.backends.mps.is_available():
    mps_device = torch.device("mps")
    x = torch.ones(1, device=mps_device)
    print (x)
else:
    print ("MPS device not found.")

tensor([1.], device='mps:0')


### Loading dataset

In [5]:
# Load the 20 newsgroups dataset
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

# Create a DataFrame from the dataset
df = pd.DataFrame({'text': newsgroups.data, 'label': newsgroups.target})

# Map target labels to target names
df['label'] = df['label'].map(lambda x: newsgroups.target_names[x])

# df = df[df['label'].isin(['rec.motorcycles', 'rec.sport.baseball'])]
df = df.head(5000)


### Creating Embeddings

In [7]:
# Initialize distilroberta tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained('distilroberta-base')
model = RobertaModel.from_pretrained('distilroberta-base')
# Generate embeddings
print("Generating embeddings...")
embeddings = tl.generate_embeddings(df['text'].tolist(), tokenizer, model)
print("Embeddings generated!")

Generating embeddings...


Generating Embeddings: 100%|██████████| 313/313 [04:51<00:00,  1.07batch/s]

Embeddings generated!





### Clustering 

In [8]:
reduced_embeddings = tl.reduce_dimensionality(embeddings, n_components=50, algo='umap')
print("Clustering...")
reduced_embeddings = torch.tensor(reduced_embeddings)
clusters = tl.perform_clustering(reduced_embeddings, algo='hdbscan', n_clusters=20)
df['cluster'] = clusters
df['embedding'] = reduced_embeddings.tolist()
print("Clustering complete!")

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


Clustering...
Clustering complete!




### Evaluation


In [10]:
from sklearn.metrics import silhouette_score

X = df['embedding'].apply(lambda x: np.array(x)).to_list()
y = df['cluster']
ss = silhouette_score(X, y)
print(f"Silhouette Score: {ss}")

Silhouette Score: 0.7322524671957974


### Saving

In [10]:
# Save results to a CSV for further analysis
df[['text', 'cluster']].to_csv('../outputs/amazon_reviews/clustered_reviews.csv', index=False)
print("Results saved to clustered_reviews.csv")

Results saved to clustered_reviews.csv
