In [None]:
!pip install sentence-transformers

In [2]:
import requests

# Download the raw text that is split into paragraphs and clustered below.
# The timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(
    'https://raw.githubusercontent.com/ddmitov/magna-carta/master/magna-carta.txt',
    timeout=30
)

# Stop here on an HTTP error instead of silently clustering an error page.
response.raise_for_status()
text = response.text

In [3]:
# Split the document into paragraphs on blank lines. Each paragraph is
# stripped of surrounding whitespace, and chunks that end up empty (e.g. from
# runs of several blank lines) are dropped so they are not embedded and
# clustered as if they were real passages.
corpus = [
    paragraph.strip()
    for paragraph in text.split('\n\n')
    if paragraph.strip()
]

In [None]:
from sentence_transformers import SentenceTransformer, util
from sklearn.cluster import KMeans, AgglomerativeClustering

# Load the sentence-embedding model by its identifier; judging by the name it
# is a multilingual paraphrase model fine-tuned for medieval Latin.
model_name = (
    'silencesys/paraphrase-xlm-r-multilingual-v1-fine-tuned-for-medieval-latin'
)
model = SentenceTransformer(model_name)

In [5]:
# Embed every paragraph of the corpus in one call. The result is requested as
# a tensor (convert_to_tensor=True) and is what the clustering cells consume.
encode_options = {'batch_size': 1024, 'convert_to_tensor': True}
corpus_embeddings = model.encode(corpus, **encode_options)

In [6]:
# Method 1: community detection from sentence-transformers.
# The result is iterated below as a list of clusters, each holding indices
# into `corpus`. min_community_size=1 and threshold=0.75 are the tuning knobs
# passed to the library (see its documentation for their exact meaning).
clusters = util.community_detection(
    corpus_embeddings, threshold=0.75, min_community_size=1
)

In [None]:
# Show each Method 1 cluster under a 1-based numbered banner, followed by the
# corpus paragraphs whose indices belong to it.
for position, members in enumerate(clusters, start=1):
    print('==========')
    print("Cluster ", position)
    print('==========')

    for member_index in members:
        display(corpus[member_index])

    print("")

In [None]:
# Method 2: agglomerative clustering (scikit-learn) with a cosine-distance
# cut-off instead of a fixed number of clusters.
import inspect

import numpy as np

# scikit-learn and NumPy operate on arrays, while `corpus_embeddings` was
# produced with convert_to_tensor=True. Convert explicitly (moving the data
# to the CPU first) because NumPy cannot read a tensor that lives on a GPU.
if hasattr(corpus_embeddings, 'detach'):
    embeddings_array = corpus_embeddings.detach().cpu().numpy()
else:
    embeddings_array = np.asarray(corpus_embeddings)

# Scale every embedding to unit length. The result is kept under its own name
# so `corpus_embeddings` is not overwritten and the Method 1 cell can still be
# re-run against the original tensor.
normalized_embeddings = embeddings_array / np.linalg.norm(
    embeddings_array,
    axis=1,
    keepdims=True
)

# scikit-learn renamed the `affinity` argument to `metric` (deprecated in 1.2,
# removed in 1.4). Pick whichever keyword the installed version accepts so the
# cell runs on both old and new releases.
distance_keyword = (
    'metric'
    if 'metric' in inspect.signature(AgglomerativeClustering).parameters
    else 'affinity'
)

clustering_model = AgglomerativeClustering(
    n_clusters=None,          # let distance_threshold decide the cluster count
    linkage='average',
    distance_threshold=0.25,
    **{distance_keyword: 'cosine'}
)

clustering_model.fit(normalized_embeddings)

# One cluster label per corpus paragraph, in corpus order.
cluster_assignment = clustering_model.labels_

In [None]:
import collections

# Group the corpus paragraphs by the cluster label assigned to them:
# cluster label -> list of paragraphs, in corpus order.
clustered_sentences = {}

for sentence_id, cluster_id in enumerate(cluster_assignment):
    clustered_sentences.setdefault(cluster_id, []).append(corpus[sentence_id])

# Same mapping, with the clusters arranged by ascending label for display.
clustered_sentences_ordered = collections.OrderedDict(
    sorted(clustered_sentences.items())
)

# Print each cluster under a banner; labels are shifted by one so the
# numbering shown to the reader starts at 1.
for label, sentences in clustered_sentences_ordered.items():
    print('==========')
    print("Cluster ", label + 1)
    print('==========')
    display(sentences)
    print("")