### SBERT

Dataset: "Amazon Reviews: Musical Instruments" from https://cseweb.ucsd.edu/~jmcauley/datasets/amazon_v2/

In [8]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from transformers import RobertaTokenizer, RobertaModel

import tools as tl

### For Mac: check that the MPS (Apple GPU) backend is available

In [2]:
# Smoke test for the MPS backend: when it is available, allocate a one-element
# tensor on the "mps" device and print it; otherwise report that it is missing.
if not torch.backends.mps.is_available():
    print("MPS device not found.")
else:
    mps_device = torch.device("mps")
    x = torch.ones(1, device=mps_device)
    print(x)

tensor([1.], device='mps:0')


### Defining Functions

In [3]:
def read_json_array(path):
  """Load a JSON Lines file into a list of decoded objects.

  Despite the name, the file is not parsed as a single JSON array: every
  non-blank line is decoded as an independent JSON document.

  Args:
    path: Location of the file to read.

  Returns:
    A list with one decoded object per non-blank line, in file order.

  Raises:
    json.JSONDecodeError: If a non-blank line is not valid JSON.
  """
  data = []
  # Explicit UTF-8 so decoding does not depend on the platform's locale default.
  with open(path, 'r', encoding='utf-8') as file:
    for line in file:
      record = line.strip()
      # Blank lines (e.g. a stray newline at end of file) carry no record.
      if not record:
        continue
      data.append(json.loads(record))
  return data

### Loading dataset

In [4]:
# Paths of the review file and its metadata file, both under the dataset folder
dataset_path = '../datasets/amazon_reviews/'
data_path = f"{dataset_path}Musical_Instruments.json"
meta_path = f"{dataset_path}meta_Musical_Instruments.json"

# Parse the review file line by line and turn the records into a DataFrame
data = read_json_array(data_path)
df = pd.DataFrame(data)
# Expose the review body under the `text` column used by the later cells
df['text'] = df['reviewText']

In [5]:
# Keep only the first 1000 reviews so the demo runs quickly.
# .copy() detaches the subset from the full frame: later cells add columns to
# `df`, and doing that on a bare head() slice makes pandas emit
# SettingWithCopyWarning.
df = df.head(1000).copy()

### Creating Embeddings

In [6]:
# Initialize distilroberta tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained('distilroberta-base')
model = RobertaModel.from_pretrained('distilroberta-base')
# Generate embeddings
print("Generating embeddings...")
embeddings = tl.generate_embeddings(df['text'].tolist(), tokenizer, model)
print("Embeddings generated!")


Generating embeddings...


Generating Embeddings: 100%|██████████| 63/63 [00:27<00:00,  2.27batch/s]

Embeddings generated!





### Clustering 

In [None]:
# Reduce the embeddings with the project helper (algo='umap', 300 components requested)
projected = tl.reduce_dimensionality(embeddings, n_components=300, algo='umap')
print("Clustering...")

# The clustering helper is given the reduced vectors as a torch tensor
reduced_embeddings = torch.tensor(projected)
clusters = tl.perform_clustering(reduced_embeddings, n_clusters=20, algo='kmeans')

# Record the result on the frame: the cluster assignment and the reduced vector as a plain list
df['cluster'] = clusters
df['embedding'] = reduced_embeddings.tolist()
print("Clustering complete!")



Clustering...
Clustering complete!


### Evaluation

In [19]:
from sklearn.metrics import silhouette_score

# Turn each stored list-valued embedding back into a NumPy vector
X = [np.array(vector) for vector in df['embedding']]
y = df['cluster']

# Silhouette score of the cluster assignment over the reduced embeddings
ss = silhouette_score(X, y)
print(f"Silhouette Score: {ss}")

Silhouette Score: 0.3377364548651243


In [None]:
# Write each review text with its cluster label to CSV.
# The output folder is created first so a fresh checkout without
# ../outputs/amazon_reviews/ does not make to_csv fail.
output_csv = Path('../outputs/amazon_reviews/clustered_reviews.csv')
output_csv.parent.mkdir(parents=True, exist_ok=True)
df[['text', 'cluster']].to_csv(output_csv, index=False)
print("Results saved to clustered_reviews.csv")

Cluster distribution:
cluster
0    436
1    220
2    192
4     78
3     74
Name: count, dtype: int64
Results saved to clustered_reviews.csv
