# Задание 1 (5 балла)

Имплементируйте алгоритм Леска (описание есть в семинаре) и оцените качество его работы на датасете `data/corpus_wsd_50k.txt`

В качестве метрики близости вы должны попробовать два подхода:

1) Jaccard score на множествах слов (определений и контекста)
2) Cosine distance на эмбедингах sentence_transformers

В качестве метрики используйте accuracy (% правильных ответов). Предсказывайте только многозначные слова в датасете

Контекст вы можете определить самостоятельно (окно вокруг целевого слова или все предложение). Также можете поэкспериментировать с предобработкой для обоих методов.

In [None]:
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
import re
from nltk import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
stopwords = set(stopwords.words('english'))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [5]:
!python -m pip install torch torchvision torchaudio
!python -m pip install sentence_transformers transformers accelerate -U

Collecting sentence_transformers
  Downloading sentence_transformers-3.1.1-py3-none-any.whl.metadata (10 kB)
Collecting transformers
  Downloading transformers-4.45.1-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.21,>=0.20 (from transformers)
  Downloading tokenizers-0.20.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading sentence_transformers-3.1.1-py3-none-any.whl (245 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.3/245.3 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading transformers-4.45.1-py3-none-any.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m81.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.20.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [6]:
from sklearn.metrics.pairwise import cosine_distances
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [None]:
corpus_wsd = []
corpus = open('/content/corpus_wsd_50k.txt').read().split('\n\n')
for sent in corpus:
    corpus_wsd.append([s.split('\t') for s in sent.split('\n')])

In [None]:
def extract_polysemy_words(corpus):
    words = [word for word in corpus if word[0] != '']
    return words

In [None]:
import typing as tp

In [None]:
def lesk(definition, context):
    intersect = len(set(definition) & context)
    return intersect

def tokenization(definition):
    sentences = sent_tokenize(definition)
    tokenized_definition = [word_tokenize(sentence) for sentence in sentences]
    tokenized_definition = [[token.lower() for token in sentence if not re.match(r'\W+', token) and token.lower() not in stopwords]
                       for sentence in tokenized_definition]
    return tokenized_definition[0]

def fake_tokenization(definition):
    return [definition]

def find_best_sense(word: str, context: tp.Set[str], scorefunc=lesk, tokenization = tokenization):
    best_sense = 0
    max_score = 0
    for i, synset in enumerate(wn.synsets(word)):
        definition = synset.definition()
        definition = tokenization(definition)
        result = scorefunc(definition, context)
        if result > max_score:
            max_score = result
            best_sense = i
    return best_sense


In [None]:
def jaccard(definition, context):
    intersection = (set(definition) & context)
    union = (set(definition) | context)
    jaccard =  len(intersection) / len(union)
    return jaccard

In [7]:
from tqdm import tqdm

In [None]:
#Леск
definition_indexes = []
for x in tqdm(corpus_wsd):
    if len(x) < 3:
        continue
    context = [i[2].lower() for i in x if not re.match(r'\W+', i[2])]
    context = set(context)
    words = extract_polysemy_words(x)
    wordlist = [i[1] for i in words]
    wordlist = set(wordlist)
    wlasl = list(wordlist)
    wlasl = sorted(wlasl)
    sentence_indexes = [find_best_sense(word, context) for word in wlasl]
    definition_indexes += sentence_indexes

100%|██████████| 49453/49453 [04:49<00:00, 170.89it/s]


In [None]:
#Жаккард
definition_indexes_jaccard = []
for x in tqdm(corpus_wsd):
    if len(x) < 3:
        continue
    context = [i[2].lower() for i in x if not re.match(r'\W+', i[2])]
    context = set(context)
    words = extract_polysemy_words(x)
    wordlist = [i[1] for i in words]
    wordlist = set(wordlist)
    wlasl = list(wlasl)
    wlasl = sorted(wlasl)
    sentence_indexes = [find_best_sense(word, context, jaccard) for word in wlasl]
    definition_indexes_jaccard += sentence_indexes


100%|██████████| 49453/49453 [04:48<00:00, 171.32it/s]


In [None]:
# accuracy Жаккард
acc = 0
for j, e in zip(definition_indexes_jaccard, definition_indexes):
    if j == e:
        acc += 1
acc/len(definition_indexes_jaccard)

0.8951515814818868

In [8]:
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
embed = model.encode

def cossim(definition: tp.List[str], context: tp.List[str]):
    #to do: don't ignore stopwords
    definition = ' '.join(definition)
    context = ' '.join(context)
    definition_emb = embed(definition)
    context_emb = embed(context)
    result = cosine_distances(context_emb.reshape(1, -1), definition_emb.reshape(1, -1)) #import sklearn
    return result

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

NameError: name 'tp' is not defined

In [None]:
corpus_for_emb = corpus_wsd[:100] # берем только первые 100

In [None]:
# косинусная близость с помощью трансформеров
definition_indexes_cossim = []
for x in tqdm(corpus_for_emb):
    if len(x) < 3:
        continue
    context = [i[2].lower() for i in x if not re.match(r'\W+', i[2])]
    words = extract_polysemy_words(x)
    wordlist = [i[1] for i in words]
    wordlist = set(wordlist)
    wlasl = list(wordlist)
    wlasl = sorted(wlasl)
    sentence_indexes = [find_best_sense(word, context, cossim, fake_tokenization) for word in wlasl]
    definition_indexes_cossim += sentence_indexes

100%|██████████| 100/100 [02:51<00:00,  1.71s/it]


In [None]:
# accuracy косинусная близость
definition_indexes_cossim
acc = 0
for j, e in zip(definition_indexes_cossim, definition_indexes):
    if j == e:
        acc += 1
acc/len(definition_indexes_cossim)

0.27471116816431324

# Задание 2 (5 балла)
Попробуйте разные алгоритмы кластеризации на датасете - `https://github.com/nlpub/russe-wsi-kit/blob/initial/data/main/wiki-wiki/train.csv`

Используйте код из семинара как основу. Используйте ARI как метрику качества.

Попробуйте все 4 алгоритма кластеризации, про которые говорилось на семинаре. Для каждого из алгоритмов попробуйте настраивать гиперпараметры (посмотрите их в документации). Прогоните как минимум 5 экспериментов (не обязательно успешных) с разными параметрами на каждый алгоритме кластеризации и оцените: качество кластеризации, скорость работы, интуитивность параметров.

Помимо этого также выберите 1 дополнительный алгоритм кластеризации отсюда - https://scikit-learn.org/stable/modules/clustering.html , опишите своими словами принцип его работы  и проделайте аналогичные эксперименты.

In [9]:
from sklearn.cluster import KMeans, DBSCAN, AffinityPropagation, AgglomerativeClustering, SpectralClustering
import numpy as np
from sklearn.metrics import adjusted_rand_score

from IPython.display import Image
from IPython.core.display import HTML

In [10]:
import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/nlpub/russe-wsi-kit/initial/data/main/wiki-wiki/train.csv', sep='\t')
grouped_df = df.groupby('word')[['word', 'context', 'gold_sense_id']]

DBSCAN

In [11]:
ARI = []

# 1
for key, _ in tqdm(grouped_df):
    # вытаскиваем контексты
    texts = grouped_df.get_group(key)['context'].values

    # создаем пустую матрицу для векторов
    X = np.zeros((len(texts), 768))

    # переводим тексты в векторы и кладем в матрицу
    for i, text in enumerate(texts):
        X[i] = embed(text)

    cluster = DBSCAN(min_samples=1, eps=0.1)

    cluster.fit(X)
    labels = np.array(cluster.labels_)+1

    # расчитываем метрику для отдельного слова
    ARI.append(adjusted_rand_score(grouped_df.get_group(key)['gold_sense_id'], labels))

print('\n', np.mean(ARI)) # усредненная метрика

100%|██████████| 4/4 [00:13<00:00,  3.33s/it]


 0.001053019960000099





In [12]:
# 2
ARI = []
for key, _ in grouped_df:
    texts = grouped_df.get_group(key)['context'].values

    X = np.zeros((len(texts), 768))

    for i, text in enumerate(texts):
        X[i] = embed(text)

    cluster = DBSCAN(min_samples=3, eps=0.1)

    cluster.fit(X)
    labels = np.array(cluster.labels_)+1

    ARI.append(adjusted_rand_score(grouped_df.get_group(key)['gold_sense_id'], labels))

print(np.mean(ARI))

-0.0021290615824144776


In [13]:
# 3
ARI = []
for key, _ in grouped_df:
    texts = grouped_df.get_group(key)['context'].values

    X = np.zeros((len(texts), 768))

    for i, text in enumerate(texts):
        X[i] = embed(text)

    cluster = DBSCAN(min_samples=1, eps=5)

    cluster.fit(X)
    labels = np.array(cluster.labels_)+1

    ARI.append(adjusted_rand_score(grouped_df.get_group(key)['gold_sense_id'], labels))

print(np.mean(ARI))

0.0


In [14]:
# 4
ARI = []
for key, _ in tqdm(grouped_df):
    texts = grouped_df.get_group(key)['context'].values

    X = np.zeros((len(texts), 768))

    for i, text in enumerate(texts):
        X[i] = embed(text)

    cluster = DBSCAN(min_samples=1, eps=500)

    cluster.fit(X)
    labels = np.array(cluster.labels_)+1

    ARI.append(adjusted_rand_score(grouped_df.get_group(key)['gold_sense_id'], labels))

print('\n', np.mean(ARI))

100%|██████████| 4/4 [00:11<00:00,  2.91s/it]


 0.0





In [15]:
# 5
ARI = []
for key, _ in tqdm(grouped_df):
    texts = grouped_df.get_group(key)['context'].values

    X = np.zeros((len(texts), 768))

    for i, text in enumerate(texts):
        X[i] = embed(text)

    cluster = DBSCAN(min_samples=5, eps=1000)

    cluster.fit(X)
    labels = np.array(cluster.labels_)+1

    ARI.append(adjusted_rand_score(grouped_df.get_group(key)['gold_sense_id'], labels))

print('\n', np.mean(ARI))

100%|██████████| 4/4 [00:11<00:00,  2.96s/it]


 0.0





Лучший результат: min_samples=1, eps=0.1 (№1)
Время: ~11 sec

Affinity Propagation

In [16]:
# 1
ARI = []
for key, _ in tqdm(grouped_df):
    texts = grouped_df.get_group(key)['context'].values

    X = np.zeros((len(texts), 768))

    for i, text in enumerate(texts):
        X[i] = embed(text)

    cluster = AffinityPropagation(damping=0.9)

    cluster.fit(X)
    labels = np.array(cluster.labels_)+1

    ARI.append(adjusted_rand_score(grouped_df.get_group(key)['gold_sense_id'], labels))

print('\n', np.mean(ARI))

100%|██████████| 4/4 [00:13<00:00,  3.29s/it]


 0.05297560306165972





In [17]:
# 2
ARI = []
for key, _ in grouped_df:
    texts = grouped_df.get_group(key)['context'].values

    X = np.zeros((len(texts), 768))

    for i, text in enumerate(texts):
        X[i] = embed(text)

    cluster = AffinityPropagation(damping=0.5)

    cluster.fit(X)
    labels = np.array(cluster.labels_)+1

    ARI.append(adjusted_rand_score(grouped_df.get_group(key)['gold_sense_id'], labels))

print(np.mean(ARI))

0.042740969848549505


In [18]:
# 3
ARI = []
for key, _ in grouped_df:
    texts = grouped_df.get_group(key)['context'].values

    X = np.zeros((len(texts), 768))

    for i, text in enumerate(texts):
        X[i] = embed(text)

    cluster = AffinityPropagation(damping=0.7)

    cluster.fit(X)
    labels = np.array(cluster.labels_)+1

    ARI.append(adjusted_rand_score(grouped_df.get_group(key)['gold_sense_id'], labels))

print(np.mean(ARI))

0.04154515818974152


In [19]:
# 4
ARI = []
for key, _ in tqdm(grouped_df):
    texts = grouped_df.get_group(key)['context'].values

    X = np.zeros((len(texts), 768))

    for i, text in enumerate(texts):
        X[i] = embed(text)

    cluster = AffinityPropagation(damping=0.8)

    cluster.fit(X)
    labels = np.array(cluster.labels_)+1

    ARI.append(adjusted_rand_score(grouped_df.get_group(key)['gold_sense_id'], labels))

print('\n', np.mean(ARI))

100%|██████████| 4/4 [00:12<00:00,  3.09s/it]


 0.04154515818974152





In [20]:
# 5
ARI = []
for key, _ in tqdm(grouped_df):
    texts = grouped_df.get_group(key)['context'].values

    X = np.zeros((len(texts), 768))

    for i, text in enumerate(texts):
        X[i] = embed(text)

    cluster = AffinityPropagation(damping=0.6)

    cluster.fit(X)
    labels = np.array(cluster.labels_)+1

    ARI.append(adjusted_rand_score(grouped_df.get_group(key)['gold_sense_id'], labels))

print('\n', np.mean(ARI))

100%|██████████| 4/4 [00:12<00:00,  3.07s/it]


 0.042740969848549505





Лучший результат: damping=0.9 (№1)
Среднее время 12 секунд


Agglomerative clustering

In [21]:
# 1
ARI = []
for key, _ in tqdm(grouped_df):
    texts = grouped_df.get_group(key)['context'].values

    X = np.zeros((len(texts), 768))

    for i, text in enumerate(texts):
        X[i] = embed(text)

    cluster = AgglomerativeClustering(n_clusters = 2)

    cluster.fit(X)
    labels = np.array(cluster.labels_)+1

    ARI.append(adjusted_rand_score(grouped_df.get_group(key)['gold_sense_id'], labels))

print('\n', np.mean(ARI))

100%|██████████| 4/4 [00:13<00:00,  3.35s/it]


 -0.011976265536517934





In [22]:
# 2
ARI = []
for key, _ in tqdm(grouped_df):
    texts = grouped_df.get_group(key)['context'].values

    X = np.zeros((len(texts), 768))

    for i, text in enumerate(texts):
        X[i] = embed(text)

    cluster = AgglomerativeClustering(n_clusters = 1)

    cluster.fit(X)
    labels = np.array(cluster.labels_)+1

    ARI.append(adjusted_rand_score(grouped_df.get_group(key)['gold_sense_id'], labels))

print('\n', np.mean(ARI))

100%|██████████| 4/4 [00:12<00:00,  3.08s/it]


 0.0





In [23]:
# 3
ARI = []
for key, _ in tqdm(grouped_df):
    texts = grouped_df.get_group(key)['context'].values

    X = np.zeros((len(texts), 768))

    for i, text in enumerate(texts):
        X[i] = embed(text)

    cluster = AgglomerativeClustering(n_clusters = 5)

    cluster.fit(X)
    labels = np.array(cluster.labels_)+1

    ARI.append(adjusted_rand_score(grouped_df.get_group(key)['gold_sense_id'], labels))

print('\n', np.mean(ARI))

100%|██████████| 4/4 [00:14<00:00,  3.71s/it]


 0.03379296274962468





In [24]:
# 4
ARI = []
for key, _ in tqdm(grouped_df):
    texts = grouped_df.get_group(key)['context'].values

    X = np.zeros((len(texts), 768))

    for i, text in enumerate(texts):
        X[i] = embed(text)

    cluster = AgglomerativeClustering(n_clusters = 10)

    cluster.fit(X)
    labels = np.array(cluster.labels_)+1

    ARI.append(adjusted_rand_score(grouped_df.get_group(key)['gold_sense_id'], labels))

print('\n', np.mean(ARI))

100%|██████████| 4/4 [00:12<00:00,  3.11s/it]


 0.05972280261375722





In [25]:
# 5
ARI = []
for key, _ in tqdm(grouped_df):
    texts = grouped_df.get_group(key)['context'].values

    X = np.zeros((len(texts), 768))

    for i, text in enumerate(texts):
        X[i] = embed(text)

    cluster = AgglomerativeClustering(n_clusters = 20)

    cluster.fit(X)
    labels = np.array(cluster.labels_)+1

    ARI.append(adjusted_rand_score(grouped_df.get_group(key)['gold_sense_id'], labels))

print('\n', np.mean(ARI))

100%|██████████| 4/4 [00:12<00:00,  3.13s/it]


 0.04406294885576405





Результаты:
Лучший - n_clusters = 10 (№4)
Среднее время - 13 сек

KMeans

In [26]:
# 1
ARI = []
for key, _ in tqdm(grouped_df):
    texts = grouped_df.get_group(key)['context'].values

    X = np.zeros((len(texts), 768))

    for i, text in enumerate(texts):
        X[i] = embed(text)

    cluster = KMeans(3)

    cluster.fit(X)
    labels = np.array(cluster.labels_)+1

    ARI.append(adjusted_rand_score(grouped_df.get_group(key)['gold_sense_id'], labels))

print('\n', np.mean(ARI))

100%|██████████| 4/4 [00:12<00:00,  3.07s/it]


 0.10585667504095987





In [27]:
# 2
ARI = []
for key, _ in tqdm(grouped_df):
    texts = grouped_df.get_group(key)['context'].values

    X = np.zeros((len(texts), 768))

    for i, text in enumerate(texts):
        X[i] = embed(text)

    cluster = KMeans(1)

    cluster.fit(X)
    labels = np.array(cluster.labels_)+1

    ARI.append(adjusted_rand_score(grouped_df.get_group(key)['gold_sense_id'], labels))

print('\n', np.mean(ARI))

100%|██████████| 4/4 [00:12<00:00,  3.07s/it]


 0.0





In [28]:
# 3
ARI = []
for key, _ in tqdm(grouped_df):
    texts = grouped_df.get_group(key)['context'].values

    X = np.zeros((len(texts), 768))

    for i, text in enumerate(texts):
        X[i] = embed(text)

    cluster = KMeans(7)

    cluster.fit(X)
    labels = np.array(cluster.labels_)+1

    ARI.append(adjusted_rand_score(grouped_df.get_group(key)['gold_sense_id'], labels))

print('\n', np.mean(ARI))

100%|██████████| 4/4 [00:12<00:00,  3.11s/it]


 0.06762425768051768





In [29]:
# 4
ARI = []
for key, _ in tqdm(grouped_df):
    texts = grouped_df.get_group(key)['context'].values

    X = np.zeros((len(texts), 768))

    for i, text in enumerate(texts):
        X[i] = embed(text)

    cluster = KMeans(11)

    cluster.fit(X)
    labels = np.array(cluster.labels_)+1

    ARI.append(adjusted_rand_score(grouped_df.get_group(key)['gold_sense_id'], labels))

print('\n', np.mean(ARI))

100%|██████████| 4/4 [00:12<00:00,  3.13s/it]


 0.04233854151058292





In [30]:
# 5
ARI = []
for key, _ in tqdm(grouped_df):
    texts = grouped_df.get_group(key)['context'].values

    X = np.zeros((len(texts), 768))

    for i, text in enumerate(texts):
        X[i] = embed(text)

    cluster = KMeans(8)

    cluster.fit(X)
    labels = np.array(cluster.labels_)+1

    ARI.append(adjusted_rand_score(grouped_df.get_group(key)['gold_sense_id'], labels))

print('\n', np.mean(ARI))

100%|██████████| 4/4 [00:12<00:00,  3.24s/it]


 0.05962759956136734





Результаты:
Лучший - KMeans(3) (№1)
Среднее время - 12 сек


Spectral Clustering

В спектральной кластеризации используется спектр матрицы сходства данных для того, чтобы снизить разномерность перед кластеризацией в пространствах меньших размерностей. На вход идет матрица сходства, состоящая из количественных оценок относительной схожести каждой пары точек в данных.

In [None]:
# 1
ARI = []
for key, _ in tqdm(grouped_df):
    texts = grouped_df.get_group(key)['context'].values

    X = np.zeros((len(texts), 768))

    for i, text in enumerate(texts):
        X[i] = embed(text)

    cluster = SpectralClustering(n_clusters = 8)

    cluster.fit(X)
    labels = np.array(cluster.labels_)+1

    ARI.append(adjusted_rand_score(grouped_df.get_group(key)['gold_sense_id'], labels))

print('\n', np.mean(ARI))

100%|██████████| 4/4 [00:14<00:00,  3.56s/it]


 0.04443416846309805





In [31]:
# 1
ARI = []
for key, _ in tqdm(grouped_df):
    texts = grouped_df.get_group(key)['context'].values

    X = np.zeros((len(texts), 768))

    for i, text in enumerate(texts):
        X[i] = embed(text)

    cluster = SpectralClustering(n_clusters = 4)

    cluster.fit(X)
    labels = np.array(cluster.labels_)+1

    ARI.append(adjusted_rand_score(grouped_df.get_group(key)['gold_sense_id'], labels))

print('\n', np.mean(ARI))

100%|██████████| 4/4 [00:12<00:00,  3.14s/it]


 0.06831081300040959





In [32]:
# 3
ARI = []
for key, _ in tqdm(grouped_df):
    texts = grouped_df.get_group(key)['context'].values

    X = np.zeros((len(texts), 768))

    for i, text in enumerate(texts):
        X[i] = embed(text)

    cluster = SpectralClustering(n_clusters = 12)

    cluster.fit(X)
    labels = np.array(cluster.labels_)+1

    ARI.append(adjusted_rand_score(grouped_df.get_group(key)['gold_sense_id'], labels))

print('\n', np.mean(ARI))

100%|██████████| 4/4 [00:12<00:00,  3.17s/it]


 0.06003911864303416





In [33]:
# 4
ARI = []
for key, _ in tqdm(grouped_df):
    texts = grouped_df.get_group(key)['context'].values

    X = np.zeros((len(texts), 768))

    for i, text in enumerate(texts):
        X[i] = embed(text)

    cluster = SpectralClustering(n_clusters = 2)

    cluster.fit(X)
    labels = np.array(cluster.labels_)+1

    ARI.append(adjusted_rand_score(grouped_df.get_group(key)['gold_sense_id'], labels))

print('\n', np.mean(ARI))

100%|██████████| 4/4 [00:12<00:00,  3.17s/it]


 0.028270047379386973





In [34]:
# 5
ARI = []
for key, _ in tqdm(grouped_df):
    texts = grouped_df.get_group(key)['context'].values

    X = np.zeros((len(texts), 768))

    for i, text in enumerate(texts):
        X[i] = embed(text)

    cluster = SpectralClustering(n_clusters = 5)

    cluster.fit(X)
    labels = np.array(cluster.labels_)+1

    ARI.append(adjusted_rand_score(grouped_df.get_group(key)['gold_sense_id'], labels))

print('\n', np.mean(ARI))

100%|██████████| 4/4 [00:12<00:00,  3.16s/it]


 0.07540576691096501





Результаты:
Лучший результат - n_clusters = 5 (№5)
Среднее время - 12 секунд

Лучшие результаты получились у Kmeans и Spectral clustering, средняя скорость выполнения почти везде одинаковая: +- 12 секунд. Kmeans кажется самым удобным и понятным типом кластеризации.