In [1]:
from search_clustering.pipeline import *
from search_clustering.client import *
from search_clustering.preprocessing import *
from search_clustering.embedding import *
from search_clustering.reduction import *
from search_clustering.clustering.spatial import *
from search_clustering.clustering.temporal import *
from search_clustering.labeling import *
from search_clustering.utils.odp_239 import *

# Load the ODP-239 data and pass it straight through the label-embedding step.
# NOTE(review): presumably this adds the "target_embeddings" entry that later
# cells read from each category - confirm against search_clustering.utils.odp_239.
data = embed_odp239_labels(load_odp239())

In [2]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import fowlkes_mallows_score
import numpy as np
from collections import Counter
from scipy.stats import describe
from search_clustering.pipeline import *
from flair.data import Sentence
from flair.embeddings import DocumentPoolEmbeddings, WordEmbeddings
from typing import Optional
from pprint import pprint
from tqdm import tqdm

In [3]:
# Pipeline stages, in order (all project classes): ColumnMerger over the
# "title" and "snippet" columns, SentenceMiniLM on the "merged" column,
# Umap, KMeans, and a FrequentPhrases labeler configured for English.
pipe = SpatialPipeline(
    ColumnMerger(["title", "snippet"]),
    SentenceMiniLM("merged"),
    Umap(),
    KMeans(),
    FrequentPhrases(language="english"),
)

# flair document embedding built from "en-glove" word embeddings; used below
# to embed every cluster label produced by the pipeline.
embeddings = DocumentPoolEmbeddings(WordEmbeddings("en-glove"))

# Per-category results, keyed by category name.
clusters = {}
label_embeddings = {}
# Keep the labels for every category as well. Previously only the loop
# variable `labels_cat` survived, i.e. the labels of the last category, and
# the rest were lost after a very expensive run.
cluster_labels = {}

# NOTE: the recorded run of this loop took roughly 76 minutes for 14 categories.
for category in tqdm(data):
    data_cat = data[category]
    _, clusters_cat, labels_cat = pipe.fit_transform(
        data_cat["data"], visualize=False, verbose=False
    )
    label_embeddings_cat = [embed_target_name(label, embeddings) for label in labels_cat]
    clusters[category] = clusters_cat
    cluster_labels[category] = labels_cat
    # Stack the per-label vectors into one 2-D array per category.
    label_embeddings[category] = np.vstack(label_embeddings_cat)

  0%|                                                    | 0/14 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


100%|████████████████████████████████████████| 14/14 [1:16:01<00:00, 325.80s/it]


In [4]:
# Each list starts with a placeholder 0 so that position k holds the statistic
# obtained with n_neighbors == k (k runs from 1 to 24 below).
mean_scores, median_scores, std_scores = [0], [0], [0]

for i in tqdm(range(1, 25)):
    # One score per category for the current neighbourhood size.
    scores = []
    for category in data:
        score_cat = score_knn(
            data[category],
            clusters[category],
            label_embeddings[category],
            n_neighbors=i,
        )
        scores += [score_cat]
    # Aggregate across categories.
    mean_scores += [np.mean(scores)]
    median_scores += [np.median(scores)]
    std_scores += [np.std(scores)]

100%|███████████████████████████████████████████| 24/24 [00:03<00:00,  7.81it/s]


In [10]:
# Mean score across categories for n_neighbors == 1 (index 0 is only the
# placeholder the list was initialised with).
mean_scores[1]

0.3298723613199628

In [6]:
# Collect, over all categories, how often each distinct value occurs in the
# category's "target_embeddings" mapping.
# NOTE(review): presumably each distinct value identifies one subtopic - confirm.
subtopics = []

for category in data:
    target_values = data[category]["target_embeddings"].values()
    counts = list(Counter(target_values).values())
    subtopics += counts

# Distribution of those occurrence counts (count -> how many times it appears).
Counter(subtopics)

In [14]:
from scipy.stats import describe

# Summary statistics of the mean score over the evaluated n_neighbors values.
# mean_scores[0] is the placeholder 0 the list was initialised with (so that
# index == n_neighbors); it is excluded so it does not distort the statistics.
# print() is required because a cell only displays its last expression, so a
# bare `describe(...)` here would be silently discarded.
print(describe(mean_scores[1:]))

# Labels left over from the final iteration of the pipeline loop, i.e. those
# of the last category processed.
labels_cat

['cycling, bicycle, bike (108)',
 'team, club, athletic (60)',
 'bowling, bowls, bowl (88)',
 'rodeo, association, rodeo association (97)',
 'golf, club, golf club (100)',
 'cricket, club, cricket club (121)',
 'skate, skateboarding, skateparks (96)',
 'fantasy, league, football (64)',
 'cheerleading, university, cheer (75)',
 'tennis, college, women s tennis (92)',
 'boxing, club, boxing club (82)',
 'paintball, games, paintball games (82)',
 'ryu, karate, goju (97)',
 'adventure, racing, adventure racing (176)',
 'sports, association, special (92)',
 'football, com, soccer (130)',
 'lacrosse, men, men s lacrosse (118)',
 'basketball, college, men s basketball (108)',
 'curly, horse, farm (92)',
 'university, college, baseball (118)',
 'com, rowing, surf (99)',
 'com, quot, jr (136)',
 'volleyball, club, volleyball club (50)',
 'baseball, league, com (100)',
 'track, field, track and field (128)',
 'hockey, men, university (110)',
 'racing, autocross, com (105)',
 'hash, harriers, hou