In [1]:
import ast
import pandas as pd

import src.edge_constructors as edge
import src.clustering as clu
import src.metrics_fr as fr
import src.utils as utils

[Taichi] version 1.7.1, llvm 15.0.7, commit 0f143b2f, osx, python 3.11.8


[I 05/27/24 18:32:23.466 4962603] [shell.py:_shell_pop_print@23] Graphical python shell detected, using wrapped sys.stdout


[Taichi] Starting on arch=metal


# Thesis Evaluation

## Helper Functions

In [2]:
def load_data(filepath, n=None):
    """Load a dataset CSV and return its columns as plain Python lists.

    Parameters
    ----------
    filepath : str or path-like
        Location of the dataset; its string form must end in ``.csv``.
    n : int, optional
        When given, only the first ``n`` rows are kept (``DataFrame.head(n)``),
        so ``n=0`` yields an empty dataset. ``None`` (default) keeps every row.

    Returns
    -------
    dict
        ``"titles"`` / ``"text"``: values of the ``title`` / ``text`` columns.
        ``"tags"``: the ``tags`` column, each cell parsed with ``ast.literal_eval``.
        ``"ids"``: the DataFrame index labels.
        ``"simplified_tags"``: present only when the CSV has that column;
        parsed the same way as ``"tags"``.

    Raises
    ------
    AssertionError
        If ``filepath`` does not end in ``.csv``.
    KeyError
        If one of the ``title``, ``text`` or ``tags`` columns is missing.
    """
    # str() lets pathlib.Path objects pass the suffix check too; slicing the
    # argument directly only worked for plain strings.
    assert str(filepath).endswith(".csv"), "Must be a .csv file"
    data = pd.read_csv(filepath)

    # Compare against None explicitly: a plain truthiness test would treat
    # n=0 as "no limit" and silently return the whole dataset.
    if n is not None:
        data = data.head(n)

    attrs = {
        "titles": data["title"].tolist(),
        "text": data["text"].tolist(),
        # Tags are stored in the CSV as Python-literal strings.
        "tags": data["tags"].apply(ast.literal_eval).tolist(),
        "ids": data.index.tolist(),
    }

    # Optional column: only some datasets ship simplified tags.
    if "simplified_tags" in data.columns:
        attrs["simplified_tags"] = data["simplified_tags"].apply(ast.literal_eval).tolist()

    return attrs

def load_embeddings(dataset_name, model_names, n=10000):
    """Load the stored embeddings of one dataset for several models.

    For every entry of ``model_names`` the file
    ``embeddings/{dataset_name}_{model}_n{n}.pickle`` is read through
    ``utils.load_from_pickle``.

    Parameters
    ----------
    dataset_name : str
        Dataset prefix used in the embedding file names.
    model_names : iterable of str
        Model names; one file is loaded per name, in the given order.
    n : int, optional
        Value of the ``_n<...>`` suffix in the file name. Defaults to 10000,
        the value that used to be hard-coded.
        NOTE(review): presumably the number of rows the embeddings were
        computed for -- confirm against the code that writes these files.

    Returns
    -------
    list
        Whatever ``utils.load_from_pickle`` returns for each file, in the
        order of ``model_names``.
    """
    # NOTE(review): load_from_pickle presumably unpickles the file; unpickling
    # can execute arbitrary code, so only point this at trusted files.
    return [
        utils.load_from_pickle(f"embeddings/{dataset_name}_{name}_n{n}.pickle")
        for name in model_names
    ]

## Data: interview_prep.csv

In [3]:
# Name of the dataset evaluated in this section; the CSV path is derived from it.
data_name = "interview_prep"
data = load_data("data/" + data_name + ".csv")

### Embedding and Similarity Scores

In [4]:
# Metric 1: similarity statistics for each embedding model on this dataset.
# The dataset label comes from `data_name` (set when the data was loaded)
# rather than a repeated string literal, so label and loaded data cannot drift.
df1 = fr.get_embedding_similarity_metrics_per_dataset(
    data_name,
    data["tags"],
    ["minilm", "mpnet", "nomic", "bert", "specter", "word2vec"],  # embedding models
    ["mean"],  # aggregation methods
)
# Persist the full table; only the first rows are displayed below.
df1.to_csv("analysis/metric1_interview.csv")
df1.head(10)

Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 165.34it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 64.78it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 102.42it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 515.71it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 45.88it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 289.74it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1032.06it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 47.52it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 1897.88it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 382.13it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 53.91it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 2746.76it/s]
Calculati

Unnamed: 0,data_source,embedding_model,agg_method,metric_name,metric,between_all_nodes,between_shared_tags
0,interview_prep,minilm,mean,cosine,mean,0.68491,0.706979
1,interview_prep,minilm,mean,cosine,median,0.674775,0.715765
2,interview_prep,minilm,mean,cosine,std_dev,0.069721,0.070315
3,interview_prep,minilm,mean,soft_cosine,mean,0.641387,0.656642
4,interview_prep,minilm,mean,soft_cosine,median,0.644162,0.661419
5,interview_prep,minilm,mean,soft_cosine,std_dev,0.12629,0.134193
6,interview_prep,minilm,mean,euclidean,mean,0.537721,0.56198
7,interview_prep,minilm,mean,euclidean,median,0.521811,0.566406
8,interview_prep,minilm,mean,euclidean,std_dev,0.075639,0.077599
0,interview_prep,mpnet,mean,cosine,mean,0.707133,0.725702


### Cluster Purity, Homogeneity, and Completeness

In [5]:
# Metric 2: clustering quality per embedding model and clusterer.
# The dataset label comes from `data_name` (set when the data was loaded)
# rather than a repeated string literal, so label and loaded data cannot drift.
df2 = fr.compare_cluster_metrics(
    data_name,
    ["minilm", "mpnet", "nomic", "bert", "specter", "word2vec"],  # embedding models
    ["mean"],  # aggregation methods
    {
        # NOTE(review): the "kmeans5" label assumes clu.kmeans defaults to
        # 5 clusters -- confirm in src/clustering.
        "kmeans5": lambda x: clu.kmeans(x),
        "kmeans2": lambda x: clu.kmeans(x, 2),
    },
    data["ids"],
    data["tags"],
    k=2,
)
# Persist the full table; only the first rows are displayed below.
df2.to_csv("analysis/metric2_interview.csv")
df2.head(10)

Unnamed: 0,embedding_model,agg_method,clusterer,homogeneity,completeness,tag_concentration_purity,cluster_tag_purity
0,minilm,mean,kmeans5,0.285,0.133,"{'haha': 0.833, 'hehe': 0.667}","{'haha': 0.357, 'hehe': 0.143}"
1,minilm,mean,kmeans2,0.205,0.223,"{'haha': 0.8, 'hehe': 0.75}","{'haha': 0.571, 'hehe': 0.214}"
2,mpnet,mean,kmeans5,0.335,0.139,"{'haha': 0.75, 'hehe': 0.667}","{'haha': 0.214, 'hehe': 0.143}"
3,mpnet,mean,kmeans2,0.084,0.106,"{'haha': 0.727, 'hehe': 0.273}","{'haha': 0.571, 'hehe': 0.214}"
4,nomic,mean,kmeans5,0.43,0.178,"{'haha': 1.0, 'hehe': 0.667}","{'haha': 0.286, 'hehe': 0.143}"
5,nomic,mean,kmeans2,0.084,0.106,"{'haha': 0.727, 'hehe': 0.273}","{'haha': 0.571, 'hehe': 0.214}"
6,bert,mean,kmeans5,0.191,0.086,"{'haha': 0.833, 'hehe': 0.5}","{'haha': 0.357, 'hehe': 0.071}"
7,bert,mean,kmeans2,0.121,0.305,"{'haha': 0.692, 'hehe': 0.308}","{'haha': 0.643, 'hehe': 0.286}"
8,specter,mean,kmeans5,0.544,0.247,"{'haha': 0.75, 'hehe': 0.5}","{'haha': 0.214, 'hehe': 0.214}"
9,specter,mean,kmeans2,0.084,0.106,"{'haha': 0.727, 'hehe': 0.273}","{'haha': 0.571, 'hehe': 0.214}"


### Edge Assignment Evaluation: Tag Connectivity and Degree of Separation

In [6]:
# Metric 3: edge-assignment evaluation for every combination of embedding
# model, clusterer and edge constructor.
# The dataset label comes from `data_name` (set when the data was loaded)
# rather than a repeated string literal, so label and loaded data cannot drift.
df3 = fr.compare_edge_assignment_metrics(
    data_name,
    ["minilm", "mpnet", "nomic", "bert", "specter", "word2vec"],  # embedding models
    ["mean"],  # aggregation methods
    {
        # NOTE(review): the "kmeans5" label assumes clu.kmeans defaults to
        # 5 clusters -- confirm in src/clustering.
        "kmeans5": lambda x: clu.kmeans(x),
        "kmeans2": lambda x: clu.kmeans(x, 2),
    },
    {
        # Edge constructors: k-nearest-neighbour graphs with k = 3 and k = 5.
        "knn3": lambda sim_mat, ids: edge.knn(sim_mat, ids, k=3),
        "knn5": lambda sim_mat, ids: edge.knn(sim_mat, ids, k=5),
    },
    data["ids"],
    data["tags"],
    data["titles"],
    max_depth=3,
)
# Persist the full table; only the first rows are displayed below.
df3.to_csv("analysis/metric3_interview.csv")
df3.head(10)

Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 263.76it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 108.79it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 628.93it/s]


graphs/interview_prep_minilm_cosine_mean_knn3_kmeans5


Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1786.33it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 2748.56it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1842.03it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 2579.52it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 4481.09it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 2841.67it/s]


graphs/interview_prep_minilm_cosine_mean_knn3_kmeans2


Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1675.04it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 2807.43it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 2513.06it/s]


graphs/interview_prep_minilm_cosine_mean_knn5_kmeans5


Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 2799.94it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 4032.98it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 2182.26it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 2501.08it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 4328.49it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 2336.66it/s]


graphs/interview_prep_minilm_cosine_mean_knn5_kmeans2


Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1550.00it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 4332.96it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1420.35it/s]

graphs/interview_prep_minilm_soft_cosine_mean_knn3_kmeans5



Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1481.56it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1054.91it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 485.17it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1094.26it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1515.28it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 477.06it/s]


graphs/interview_prep_minilm_soft_cosine_mean_knn3_kmeans2


Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 241.96it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 796.79it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1031.05it/s]

graphs/interview_prep_minilm_soft_cosine_mean_knn5_kmeans5



Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 948.72it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1187.85it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 549.78it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 944.24it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1451.32it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 551.59it/s]

graphs/interview_prep_minilm_soft_cosine_mean_knn5_kmeans2







Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 185.01it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 327.88it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1001.98it/s]

graphs/interview_prep_minilm_euclidean_mean_knn3_kmeans5







Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 3077.26it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 2206.37it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 4048.56it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 2468.69it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 3637.73it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 1726.76it/s]


graphs/interview_prep_minilm_euclidean_mean_knn3_kmeans2


Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 2142.14it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 1483.66it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 2924.90it/s]

graphs/interview_prep_minilm_euclidean_mean_knn5_kmeans5



Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 326.91it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 170.29it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 889.00it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 1272.16it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 2968.37it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 1485.76it/s]


graphs/interview_prep_minilm_euclidean_mean_knn5_kmeans2


Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 2590.68it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 1166.70it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 1785.57it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 784.42it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 60.93it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 972.71it/s]

graphs/interview_prep_mpnet_cosine_mean_knn3_kmeans5



Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 3204.20it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1400.44it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1462.45it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 3679.21it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1831.57it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 2294.48it/s]

graphs/interview_prep_mpnet_cosine_mean_knn3_kmeans2



Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1060.51it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 6364.65it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1521.88it/s]


graphs/interview_prep_mpnet_cosine_mean_knn5_kmeans5


Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 2974.68it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1927.53it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1368.01it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 785.01it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 3002.37it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1873.29it/s]

graphs/interview_prep_mpnet_cosine_mean_knn5_kmeans2



Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 970.01it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 2087.76it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 5882.61it/s]

graphs/interview_prep_mpnet_soft_cosine_mean_knn3_kmeans5



Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 998.88it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 207.59it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 400.33it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 639.86it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 268.56it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 171.11it/s]

graphs/interview_prep_mpnet_soft_cosine_mean_knn3_kmeans2



Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 63.76it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 162.94it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 275.63it/s]

graphs/interview_prep_mpnet_soft_cosine_mean_knn5_kmeans5



Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 358.64it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 327.86it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1022.75it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 566.87it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 265.66it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 289.02it/s]


graphs/interview_prep_mpnet_soft_cosine_mean_knn5_kmeans2


Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 77.23it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 561.19it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 663.55it/s]


graphs/interview_prep_mpnet_euclidean_mean_knn3_kmeans5


Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 593.51it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 2666.44it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 1451.82it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 1260.69it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 598.42it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 703.86it/s]

graphs/interview_prep_mpnet_euclidean_mean_knn3_kmeans2



Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 2770.35it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 681.56it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 3070.50it/s]


graphs/interview_prep_mpnet_euclidean_mean_knn5_kmeans5


Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 1343.90it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 2325.00it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 364.31it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 1253.15it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 479.84it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 1727.47it/s]


graphs/interview_prep_mpnet_euclidean_mean_knn5_kmeans2


Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 476.63it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 382.55it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 1779.51it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 185.94it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 49.32it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 1221.05it/s]

graphs/interview_prep_nomic_cosine_mean_knn3_kmeans5



Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 2202.89it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 196.15it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 3095.43it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1153.23it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1567.38it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1937.32it/s]

graphs/interview_prep_nomic_cosine_mean_knn3_kmeans2



Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 787.96it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1138.83it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 2251.37it/s]


graphs/interview_prep_nomic_cosine_mean_knn5_kmeans5


Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1135.13it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1372.03it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 2092.97it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 4760.84it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 2576.35it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1272.16it/s]


graphs/interview_prep_nomic_cosine_mean_knn5_kmeans2


Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1307.04it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1336.62it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1686.49it/s]


graphs/interview_prep_nomic_soft_cosine_mean_knn3_kmeans5


Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 629.30it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 819.68it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 343.43it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 248.99it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 493.62it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 210.61it/s]


graphs/interview_prep_nomic_soft_cosine_mean_knn3_kmeans2


Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 104.13it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 583.76it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 313.38it/s]

graphs/interview_prep_nomic_soft_cosine_mean_knn5_kmeans5







Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 321.62it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 177.79it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 197.94it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 660.10it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 889.57it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 170.56it/s]

graphs/interview_prep_nomic_soft_cosine_mean_knn5_kmeans2



Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 35.52it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 216.26it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 340.47it/s]

graphs/interview_prep_nomic_euclidean_mean_knn3_kmeans5



Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 865.70it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 393.39it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 1506.57it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 894.31it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 620.37it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 2577.94it/s]


graphs/interview_prep_nomic_euclidean_mean_knn3_kmeans2


Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 1262.58it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 2232.20it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 1639.68it/s]


graphs/interview_prep_nomic_euclidean_mean_knn5_kmeans5


Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 1230.00it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 1536.38it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 2785.06it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 1372.03it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 1104.93it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 530.19it/s]

graphs/interview_prep_nomic_euclidean_mean_knn5_kmeans2







Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 1358.70it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 1189.20it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 739.61it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 471.01it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 58.69it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 231.92it/s]


graphs/interview_prep_bert_cosine_mean_knn3_kmeans5


Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 2293.22it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1565.04it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 648.57it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 659.17it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1685.81it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 645.97it/s]


graphs/interview_prep_bert_cosine_mean_knn3_kmeans2


Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1111.07it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1519.68it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 2452.81it/s]


graphs/interview_prep_bert_cosine_mean_knn5_kmeans5


Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 240.66it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 2584.29it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 891.27it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1952.66it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 2114.06it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1927.53it/s]

graphs/interview_prep_bert_cosine_mean_knn5_kmeans2



Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1683.78it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 4032.98it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1302.17it/s]


graphs/interview_prep_bert_soft_cosine_mean_knn3_kmeans5


Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 271.90it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 497.01it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 266.95it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 267.51it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 438.41it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 236.85it/s]


graphs/interview_prep_bert_soft_cosine_mean_knn3_kmeans2


Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 67.57it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 67.66it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 376.64it/s]


graphs/interview_prep_bert_soft_cosine_mean_knn5_kmeans5


Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 175.78it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 768.61it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 466.60it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 265.97it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 424.65it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 301.57it/s]


graphs/interview_prep_bert_soft_cosine_mean_knn5_kmeans2


Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 69.25it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1007.28it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 719.31it/s]


graphs/interview_prep_bert_euclidean_mean_knn3_kmeans5


Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 1703.62it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 1912.59it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 2888.64it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 628.55it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 949.58it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 702.92it/s]


graphs/interview_prep_bert_euclidean_mean_knn3_kmeans2


Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 1176.19it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 1457.37it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 1398.57it/s]


graphs/interview_prep_bert_euclidean_mean_knn5_kmeans5


Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 2666.44it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 3813.00it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 1445.31it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 2263.52it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 1194.62it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 1186.17it/s]


graphs/interview_prep_bert_euclidean_mean_knn5_kmeans2


Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 1257.66it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 2644.58it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 2067.18it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 290.71it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 63.85it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 742.35it/s]


graphs/interview_prep_specter_cosine_mean_knn3_kmeans5


Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 2114.06it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 714.17it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1291.75it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 2381.77it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1964.55it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1804.78it/s]

graphs/interview_prep_specter_cosine_mean_knn3_kmeans2







Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 335.92it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 750.32it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1399.97it/s]

graphs/interview_prep_specter_cosine_mean_knn5_kmeans5



Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 936.23it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 6288.31it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 2381.77it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1560.38it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1555.75it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 3938.31it/s]


graphs/interview_prep_specter_cosine_mean_knn5_kmeans2


Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 969.78it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 936.23it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1647.41it/s]


graphs/interview_prep_specter_soft_cosine_mean_knn3_kmeans5


Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 449.26it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 158.99it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 528.45it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1463.98it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 182.61it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 109.89it/s]


graphs/interview_prep_specter_soft_cosine_mean_knn3_kmeans2


Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 42.75it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 765.10it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 346.15it/s]


graphs/interview_prep_specter_soft_cosine_mean_knn5_kmeans5


Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 384.76it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 715.87it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 786.78it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 406.78it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 152.72it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 190.88it/s]

graphs/interview_prep_specter_soft_cosine_mean_knn5_kmeans2



Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 64.03it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 722.53it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 682.00it/s]


graphs/interview_prep_specter_euclidean_mean_knn3_kmeans5


Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 1213.63it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 3097.71it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 1582.76it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 4116.10it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 2101.35it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 1133.90it/s]


graphs/interview_prep_specter_euclidean_mean_knn3_kmeans2


Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 1729.61it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 1168.33it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 2000.14it/s]


graphs/interview_prep_specter_euclidean_mean_knn5_kmeans5


Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 2212.19it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 2531.26it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 982.27it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 1351.26it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 2666.44it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 2624.72it/s]


graphs/interview_prep_specter_euclidean_mean_knn5_kmeans2


Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 2993.79it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 592.50it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 1527.42it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 551.81it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 94.45it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 1584.55it/s]

graphs/interview_prep_word2vec_cosine_mean_knn3_kmeans5



Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1845.27it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1781.78it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 3625.15it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1524.09it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 3647.22it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 3498.17it/s]

graphs/interview_prep_word2vec_cosine_mean_knn3_kmeans2







Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 2246.55it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1685.81it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 2024.28it/s]

graphs/interview_prep_word2vec_cosine_mean_knn5_kmeans5



Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1218.21it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 4466.78it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1703.62it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 2132.34it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 4369.07it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 2012.62it/s]


graphs/interview_prep_word2vec_cosine_mean_knn5_kmeans2


Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 3289.65it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1848.53it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 3758.34it/s]


graphs/interview_prep_word2vec_soft_cosine_mean_knn3_kmeans5


Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 238.03it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 86.80it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 536.49it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1552.87it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 705.76it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 484.55it/s]


graphs/interview_prep_word2vec_soft_cosine_mean_knn3_kmeans2


Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 173.50it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 752.34it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 608.66it/s]

graphs/interview_prep_word2vec_soft_cosine_mean_knn5_kmeans5







Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 373.82it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1400.90it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1051.73it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1075.19it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1472.20it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 393.68it/s]


graphs/interview_prep_word2vec_soft_cosine_mean_knn5_kmeans2


Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 238.65it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 501.95it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1069.43it/s]


graphs/interview_prep_word2vec_euclidean_mean_knn3_kmeans5


Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 2341.88it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 8050.49it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 3452.10it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 2732.45it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 3050.40it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 2659.67it/s]

graphs/interview_prep_word2vec_euclidean_mean_knn3_kmeans2







Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 2451.38it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 2273.34it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 1726.76it/s]


graphs/interview_prep_word2vec_euclidean_mean_knn5_kmeans5


Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 1424.70it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 1366.22it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 2664.74it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 2416.07it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 3086.32it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 2065.14it/s]

graphs/interview_prep_word2vec_euclidean_mean_knn5_kmeans2



Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 1582.16it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 2063.11it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 1211.88it/s]


Unnamed: 0,embedding_model,agg_method,similarity,edge constructor,clusterer,depth,connected_nodes,percentage_connected,degree_of_separation
0,minilm,mean,cosine,knn3,kmeans5,1,2,0.1,1.66821
1,minilm,mean,cosine,knn3,kmeans5,2,3,0.15,1.66821
2,minilm,mean,cosine,knn3,kmeans5,3,11,0.55,1.66821
3,minilm,mean,cosine,knn3,kmeans2,1,2,0.118,1.338812
4,minilm,mean,cosine,knn3,kmeans2,2,7,0.412,1.338812
5,minilm,mean,cosine,knn3,kmeans2,3,4,0.235,1.338812
6,minilm,mean,cosine,knn5,kmeans5,1,1,0.05,1.610567
7,minilm,mean,cosine,knn5,kmeans5,2,4,0.2,1.610567
8,minilm,mean,cosine,knn5,kmeans5,3,13,0.65,1.610567
9,minilm,mean,cosine,knn5,kmeans2,1,2,0.118,1.277447


## Data: Medium (n=10000)

In [7]:
# Load the first 5000 rows of the Medium CSV and pull out the columns that the
# evaluation cells below pass to the metric helpers.
# NOTE(review): the section heading says n=10000 while only 5000 rows are
# loaded here -- confirm which sample size is intended.
data_name = "medium_1k_tags_simplified"
data = load_data("data/" + data_name + ".csv", n=5000)
# The simplified tag lists (not the raw "tags" column) are used as `tags` here.
ids, titles, tags = data["ids"], data["titles"], data["simplified_tags"]

### Embedding and Similarity Scores

In [8]:
# Disabled: single run over all six embedding models, kept for reference.
# The same call is split across the next two cells ("minilm" alone, then the
# remaining five models), each writing its own CSV.
# df1 = fr.get_embedding_similarity_metrics_per_dataset("medium1k", tags,
#                                              ["minilm", "mpnet", "nomic", "bert", "specter", "word2vec"],
#                                              ["mean"])
# df1.to_csv("analysis/metric1_medium1k.csv")
# df1.head(10)

In [8]:
# Similarity statistics for the "minilm" embeddings only, with "mean"
# aggregation. The result is written to its own CSV; the other models are
# handled in a separate cell.
df1 = fr.get_embedding_similarity_metrics_per_dataset(
    "medium1k",
    tags,
    ["minilm"],
    ["mean"],
)
df1.to_csv("analysis/metric1_medium1k_minilm.csv")
df1.head(10)

Calculating cosine similarities: 100%|██████████| 20/20 [00:13<00:00,  1.48it/s]
Calculating soft_cosine similarities: 100%|██████████| 20/20 [14:04<00:00, 42.22s/it]
Calculating euclidean similarities: 100%|██████████| 20/20 [00:01<00:00, 11.82it/s]


Unnamed: 0,data_source,embedding_model,agg_method,metric_name,metric,between_all_nodes,between_shared_tags
0,medium1k,minilm,mean,cosine,mean,0.585432,0.607958
1,medium1k,minilm,mean,cosine,median,0.572935,0.59694
2,medium1k,minilm,mean,cosine,std_dev,0.070676,0.078426
3,medium1k,minilm,mean,soft_cosine,mean,0.577098,0.594107
4,medium1k,minilm,mean,soft_cosine,median,0.574291,0.591635
5,medium1k,minilm,mean,soft_cosine,std_dev,0.094922,0.102319
6,medium1k,minilm,mean,euclidean,mean,0.440976,0.462361
7,medium1k,minilm,mean,euclidean,median,0.425653,0.446588
8,medium1k,minilm,mean,euclidean,std_dev,0.06625,0.076233


In [9]:
# Similarity statistics for every embedding model except "minilm", with "mean"
# aggregation. Note that this reassigns df1; the result goes to its own CSV.
df1 = fr.get_embedding_similarity_metrics_per_dataset(
    "medium1k",
    tags,
    ["mpnet", "nomic", "bert", "specter", "word2vec"],
    ["mean"],
)
df1.to_csv("analysis/metric1_medium1k_rest.csv")
df1.head(10)

Calculating cosine similarities: 100%|██████████| 20/20 [00:37<00:00,  1.89s/it]
Calculating soft_cosine similarities: 100%|██████████| 20/20 [39:16<00:00, 117.82s/it]
Calculating euclidean similarities: 100%|██████████| 20/20 [00:00<00:00, 22.42it/s]
Calculating cosine similarities: 100%|██████████| 20/20 [00:42<00:00,  2.11s/it]
Calculating soft_cosine similarities: 100%|██████████| 20/20 [33:38<00:00, 100.94s/it]
Calculating euclidean similarities: 100%|██████████| 20/20 [00:00<00:00, 20.76it/s]
Calculating cosine similarities: 100%|██████████| 20/20 [00:37<00:00,  1.88s/it]
Calculating soft_cosine similarities: 100%|██████████| 20/20 [34:48<00:00, 104.43s/it]
Calculating euclidean similarities: 100%|██████████| 20/20 [00:00<00:00, 21.54it/s]
Calculating cosine similarities: 100%|██████████| 20/20 [00:34<00:00,  1.70s/it]
Calculating soft_cosine similarities: 100%|██████████| 20/20 [32:07<00:00, 96.35s/it]
Calculating euclidean similarities: 100%|██████████| 20/20 [00:01<00:00, 18.7

Unnamed: 0,data_source,embedding_model,agg_method,metric_name,metric,between_all_nodes,between_shared_tags
0,medium1k,mpnet,mean,cosine,mean,0.567329,0.590647
1,medium1k,mpnet,mean,cosine,median,0.554505,0.578365
2,medium1k,mpnet,mean,cosine,std_dev,0.06223,0.071342
3,medium1k,mpnet,mean,soft_cosine,mean,0.553276,0.577064
4,medium1k,mpnet,mean,soft_cosine,median,0.549274,0.572625
5,medium1k,mpnet,mean,soft_cosine,std_dev,0.080918,0.088043
6,medium1k,mpnet,mean,euclidean,mean,0.424325,0.445684
7,medium1k,mpnet,mean,euclidean,median,0.41025,0.430301
8,medium1k,mpnet,mean,euclidean,std_dev,0.056616,0.067507
0,medium1k,nomic,mean,cosine,mean,0.777387,0.792314


### Cluster Purity, Homogeneity, and Completeness

In [12]:
# Cluster counts / component counts swept for kmeans, gmm and birch.
k_values = [1, 2, 5, 10, 15, 20, 30, 40, 50, 65, 80, 100, 120, 150]

# Label -> callable taking the data to cluster. Each lambda captures its
# hyper-parameters through default arguments so every entry keeps the value
# from its own iteration rather than the last one.
# Entries are added in the order kmeans, dbscan, gmm, birch.
clustering_methods = {}
clustering_methods.update(
    (f"kmeans{k}", lambda x, k=k: clu.kmeans(x, k))
    for k in k_values
)
clustering_methods.update(
    (
        f"dbscan_eps{eps}_min{min_samples}",
        lambda x, eps=eps, min_samples=min_samples: clu.dbscan(x, eps=eps, min_samples=min_samples),
    )
    for eps in [0.1, 0.3, 0.5, 0.7, 1.0]
    for min_samples in [3, 5, 10, 15]
)
clustering_methods.update(
    (f"gmm{n_components}", lambda x, n_components=n_components: clu.gmm(x, n_components))
    for n_components in k_values
)
clustering_methods.update(
    (f"birch{k}", lambda x, k=k: clu.birch(x, k))
    for k in k_values
)

# Compare every clusterer on every embedding model ("mean" aggregation) and
# persist the resulting table.
df2 = fr.compare_cluster_metrics(
    "medium1k",
    ["minilm", "mpnet", "nomic", "bert", "specter", "word2vec"],
    ["mean"],
    clustering_methods,
    ids,
    tags,
    k=2,
)

df2.to_csv("analysis/metric2_medium1k.csv")
df2.head(10)

In [None]:
# Smaller, hand-picked clusterer comparison on the Medium data.
# NOTE(review): this reassigns df2 (also produced by the k-sweep cell) and the
# result is not written to disk.
df2 = fr.compare_cluster_metrics(
    "medium1k",
    ["minilm", "mpnet", "nomic", "bert", "specter", "word2vec"],
    ["mean"],
    {
        # Relies on clu.kmeans' own default cluster count
        # (presumably 5, given the label -- verify).
        "kmeans5": lambda x: clu.kmeans(x),
        "kmeans2": lambda x: clu.kmeans(x, 2),
        "kmeans10": lambda x: clu.kmeans(x, 10),
        # Fixed: eps was 0., a non-positive neighbourhood radius that cannot
        # yield usable clusters and contradicts the "default" label. 0.5 is
        # scikit-learn's DBSCAN default (assumes clu.dbscan forwards eps to
        # it -- TODO confirm).
        "dbscan_default": lambda x: clu.dbscan(x, eps=0.5),
        "dbscan_eps0.3": lambda x: clu.dbscan(x, eps=0.3),
        "dbscan_eps0.7": lambda x: clu.dbscan(x, eps=0.7),
        "gmm_default": lambda x: clu.gmm(x, n_components=3),
        "gmm_n4": lambda x: clu.gmm(x, n_components=4),
        "gmm_n5": lambda x: clu.gmm(x, n_components=5),
        "birch_default": lambda x: clu.birch(x),
        "birch_n5": lambda x: clu.birch(x, n_clusters=5),
        "birch_n10": lambda x: clu.birch(x, n_clusters=10),
    },
    ids,
    tags,
    k=2,
)


### Edge Assignment Evaluation: Tag Connectivity and Degree of Separation

In [None]:
# Neighbour counts / cluster counts and similarity cut-offs to sweep.
# Note: this reassigns the notebook-level k_values.
k_values = [1, 2, 5, 10, 15, 20, 30, 40, 50]
threshold_values = [0.1, 0.3, 0.5, 0.7, 0.9]

# Label -> callable taking (sim_mat, ids). Hyper-parameters are captured via
# default arguments so each lambda keeps the value from its own iteration.
edge_assignment_methods = {
    **{
        f"knn{k}": lambda sim_mat, ids, k=k: edge.knn(sim_mat, ids, k)
        for k in k_values
    },
    **{
        f"knn_mst{k}": lambda sim_mat, ids, k=k: edge.knn_mst(sim_mat, ids, k)
        for k in k_values
    },
    **{
        f"threshold{threshold}": lambda sim_mat, ids, threshold=threshold: edge.threshold_based_edge_assignment(sim_mat, ids, threshold)
        for threshold in threshold_values
    },
    **{
        f"mutual_knn{k}": lambda sim_mat, ids, k=k: edge.mutual_knn_edge_assignment(sim_mat, ids, k)
        for k in k_values
    },
    **{
        f"spectral{n_clusters}": lambda sim_mat, ids, n_clusters=n_clusters: edge.spectral_clustering_edge_assignment(sim_mat, ids, n_clusters)
        for n_clusters in k_values
    },
}

df3 = fr.compare_edge_assignment_metrics(
    # Fixed: was "interview_prep" (copy-paste from the previous section).
    # ids/tags/titles here come from the Medium data and the output file is
    # metric3_medium1k.csv, so the dataset name must match the other Medium
    # cells.
    "medium1k",
    ["minilm", "mpnet", "nomic", "bert", "specter", "word2vec"],
    ["mean"],
    {
        # No clusterers enabled yet.
        # "kmeans5": lambda x: clu.kmeans(x), # pick the best form above... to save time
        # "kmeans2": lambda x: clu.kmeans(x, 2)
    },
    edge_assignment_methods,
    ids,
    tags,
    titles,
    max_depth=3,
)

df3.to_csv("analysis/metric3_medium1k.csv")
# Bare last expression gives the rich DataFrame display, like the sibling cells.
df3.head(10)

In [None]:
# Disabled: earlier, smaller configuration of the edge-assignment comparison
# (kmeans5/kmeans2 clusterers with knn3/knn5 edge constructors), kept for reference.
# NOTE(review): it passes "interview_prep" as the dataset name but writes to
# metric3_medium1k.csv -- confirm the dataset name before re-enabling.
# df3 = fr.compare_edge_assignment_metrics("interview_prep",
#                                         ["minilm", "mpnet", "nomic", "bert", "specter", "word2vec"],
#                                         ["mean"],
#                                         {
#                                             "kmeans5": lambda x: clu.kmeans(x),
#                                             "kmeans2": lambda x: clu.kmeans(x, 2)
#                                         },
#                                         {
#                                             "knn3": lambda sim_mat, ids: edge.knn(sim_mat, ids, k=3),
#                                             "knn5": lambda sim_mat, ids: edge.knn(sim_mat, ids, k=5)
#                                         }, ids, tags, titles, max_depth=3)
# df3.to_csv("analysis/metric3_medium1k.csv")
# df3.head(10)