This notebook explores the use of SentenceBERT to generate representations of sequences (sentences, documents) and to cluster those representations using K-means.

In [None]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-2.3.1-py3-none-any.whl.metadata (11 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Downloading torch-2.2.0-cp311-none-macosx_10_9_x86_64.whl.metadata (25 kB)
Collecting sentencepiece (from sentence-transformers)
  Downloading sentencepiece-0.1.99-cp311-cp311-macosx_10_9_x86_64.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting sympy (from torch>=1.11.0->sentence-transformers)
  Downloading sympy-1.12-py3-none-any.whl (5.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.7/5.7 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting networkx (from torch>=1.11.0->sentence-transformers)
  Downloading networkx-3.2.1-py3-none-any.whl.metadata (5.2 kB)
Collecting mpmath>=0.19 (from sympy->torch>=1.11.0->sentence-transformers)
  Downloading mpmath-1.3.0-py3-non

In [None]:
# Get movies summaries and book titles to cluster
!wget https://raw.githubusercontent.com/dbamman/anlp23/main/data/plot_summaries.txt
!wget https://raw.githubusercontent.com/dbamman/anlp23/main/data/loc/dev.tsv -O book_titles.txt

In [None]:
from sklearn.cluster import KMeans
from math import sqrt
from sentence_transformers import SentenceTransformer
import numpy as np

In [None]:
def read_data(filename):
    """Read a tab-separated file into a list of (id, text) pairs.

    Each line is stripped of trailing whitespace and split on tabs; the first
    column is used as the id and the second as the text. Any further columns
    are ignored.

    Blank lines and lines with fewer than two columns are skipped instead of
    raising IndexError, so a stray empty line (e.g. at the end of the file)
    does not abort the whole load.

    Args:
        filename: Path of the UTF-8 encoded tab-separated file to read.

    Returns:
        A list of (id, text) tuples of strings, in file order.
    """
    data = []
    with open(filename, encoding="utf-8") as file:
        for line in file:
            cols = line.rstrip().split("\t")
            if len(cols) < 2:
                # Blank line or a row without a text column: nothing to keep.
                continue
            data.append((cols[0], cols[1]))
    return data

In [None]:
# Load (id, text) pairs from the two data files via read_data.
movies=read_data("plot_summaries.txt")
book_titles=read_data("book_titles.txt")

In [None]:
# Instantiate the sentence-embedding model that is passed to run_all below.
sentence_model = SentenceTransformer('sentence-transformers/all-distilroberta-v1')

In [None]:
# Sanity check: encode a single sentence and print the shape of its embedding.
embedding=sentence_model.encode("this is a sentence")
print(embedding.shape)

In [None]:
def cosine(one, two):
    """Return the cosine similarity between the vectors `one` and `two`.

    Computed as dot(one, two) divided by the product of the two vectors'
    Euclidean norms, where each norm is the square root of the vector's dot
    product with itself.
    """
    dot_product = np.dot(one, two)
    norm_one = sqrt(np.dot(one, one))
    norm_two = sqrt(np.dot(two, two))
    return dot_product / (norm_one * norm_two)

In [None]:
def run_all(data, model, num_clusters=10, top_n=5):
    """Embed documents, cluster them with K-means, and print each cluster's
    most central documents.

    Args:
        data: Sequence of (id, text) pairs; only the text is embedded.
        model: Object whose ``encode`` method accepts a list of strings and
            returns one embedding per string (e.g. a SentenceTransformer).
        num_clusters: Number of K-means clusters to fit.
        top_n: Maximum number of documents printed per cluster. Defaults to
            5, the value that was previously hard-coded.

    For every cluster, prints up to ``top_n`` lines of the form
    ``<index into data> <cosine similarity to cluster center> <text>``,
    most similar first, followed by a blank line. Returns None.
    """
    docs = [doc for _, doc in data]

    # Get sentence embeddings for all docs in one call so the model can
    # batch them, rather than invoking it once per document.
    X = np.asarray(model.encode(docs))

    # Run K-means (random_state=0 fixes the seed)
    kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(X)

    # Group document indices by their assigned cluster label
    clusters = {}
    for idx, label in enumerate(kmeans.labels_):
        clusters.setdefault(label, []).append(idx)

    # For each cluster, print out the top_n documents closest to the cluster center
    for label, members in clusters.items():
        cluster_center = kmeans.cluster_centers_[label]
        sims = {idx: cosine(cluster_center, X[idx]) for idx in members}
        ranked = sorted(sims.items(), key=lambda item: item[1], reverse=True)
        for k, v in ranked[:top_n]:
            print(k, "%.3f" % v, docs[k])

        print()


# Book titles

In [None]:
# Cluster the first 1000 book titles into 10 clusters and print the top documents of each.
run_all(book_titles[:1000], sentence_model, num_clusters=10)

# Movie summaries

In [None]:
# Cluster the first 100 movie summaries into 10 clusters and print the top documents of each.
run_all(movies[:100], sentence_model, num_clusters=10)

A1: Play around with this method and vary the number of movies clustered, along with the number of clusters. How would you rate the coherence and interpretability of these clusters? Try to label some of the clusters and discuss their overall coherence with your neighbors.