In [1]:
import ast
import pandas as pd

import src.edge_constructors as edge
import src.clustering as clu
import src.metrics_fr as fr
import src.utils as utils

from collections import Counter

  from .autonotebook import tqdm as notebook_tqdm


# Thesis Evaluation

## Helper Functions

In [2]:
def load_data(filepath, n=None):
    """Read a dataset CSV and return its columns as plain Python lists.

    Parameters
    ----------
    filepath : str or os.PathLike
        Location of the CSV file. The name must end in ``.csv``. The file
        must contain ``title``, ``text`` and ``tags`` columns; a
        ``simplified_tags`` column is optional.
    n : int, optional
        When given, only the first ``n`` rows are kept (``DataFrame.head``
        semantics, so ``n=0`` yields an empty dataset). ``None`` keeps
        every row.

    Returns
    -------
    dict
        ``"titles"`` and ``"text"`` hold the raw column values, ``"tags"``
        holds each cell parsed with ``ast.literal_eval``, and ``"ids"`` holds
        the DataFrame index values. ``"simplified_tags"`` (parsed the same
        way as ``"tags"``) is present only if the CSV has that column.

    Raises
    ------
    AssertionError
        If ``filepath`` does not end with ``.csv``.
    """
    # str() lets pathlib.Path objects through; slicing a Path would raise.
    assert str(filepath).endswith(".csv"), "Must be a .csv file"
    data = pd.read_csv(filepath)
    # Compare against None explicitly: a plain truthiness test would treat
    # n=0 as "no limit" and silently return the whole dataset.
    if n is not None:
        data = data.head(n)

    attrs = {
        "titles": data["title"].tolist(),
        "text": data["text"].tolist(),
        # Tag cells are stored as Python-literal strings (e.g. "['a', 'b']").
        "tags": data["tags"].apply(ast.literal_eval).tolist(),
        "ids": data.index.tolist()
    }

    if "simplified_tags" in data.columns:
        attrs["simplified_tags"] = data["simplified_tags"].apply(ast.literal_eval).tolist()

    return attrs

def load_embeddings(dataset_name, model_names):
    """Load one pickled embedding object per model for a dataset.

    For every entry of ``model_names`` (in order) the file
    ``embeddings/{dataset_name}_{name}_n10000.pickle`` is read with
    ``utils.load_from_pickle``; the loaded objects are returned as a list
    in the same order.
    """
    pickle_paths = (
        f"embeddings/{dataset_name}_{name}_n10000.pickle" for name in model_names
    )
    return [utils.load_from_pickle(path) for path in pickle_paths]

## Data: interview_prep.csv

In [3]:
# Pick the dataset by name and load its CSV from the data/ directory.
data_name = "interview_prep"
csv_path = f"data/{data_name}.csv"
data = load_data(csv_path)

### Embedding and Similarity Scores

In [4]:
# Similarity metrics for every embedding model on the interview_prep tags.
# The full table is saved under analysis/; the first ten rows are displayed.
embedding_models = ["minilm", "mpnet", "nomic", "bert", "specter", "word2vec"]
agg_methods = ["mean"]

df1 = fr.get_embedding_similarity_metrics_per_dataset(
    "interview_prep",
    data["tags"],
    embedding_models,
    agg_methods,
)
df1.to_csv("analysis/metric1_interview.csv")
df1.head(10)

Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 344.59it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 49.29it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 72.36it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 514.39it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 58.16it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 3226.39it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 939.16it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 56.66it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 1894.45it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1044.66it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 60.64it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 1321.04it/s]
Calculati

Unnamed: 0,data_source,embedding_model,agg_method,metric_name,metric,between_all_nodes,between_shared_tags
0,interview_prep,minilm,mean,cosine,mean,0.369821,0.413958
1,interview_prep,minilm,mean,cosine,median,0.34955,0.431531
2,interview_prep,minilm,mean,cosine,std_dev,0.139443,0.14063
3,interview_prep,minilm,mean,soft_cosine,mean,0.7115,0.755448
4,interview_prep,minilm,mean,soft_cosine,median,0.727371,0.784485
5,interview_prep,minilm,mean,soft_cosine,std_dev,0.134889,0.131459
6,interview_prep,minilm,mean,euclidean,mean,0.537721,0.56198
7,interview_prep,minilm,mean,euclidean,median,0.521811,0.566406
8,interview_prep,minilm,mean,euclidean,std_dev,0.075639,0.077599
0,interview_prep,mpnet,mean,cosine,mean,0.414266,0.451403


### Cluster Purity, Homogeneity, and Completeness

In [5]:
# Cluster metrics for every embedding model / clusterer combination on the
# interview_prep data. Results are saved under analysis/ and previewed.
embedding_models = ["minilm", "mpnet", "nomic", "bert", "specter", "word2vec"]
clusterers = {
    # NOTE(review): "kmeans5" relies on clu.kmeans's default cluster count,
    # presumably 5 — confirm against src.clustering.
    "kmeans5": lambda x: clu.kmeans(x),
    "kmeans2": lambda x: clu.kmeans(x, 2),
}

df2 = fr.compare_cluster_metrics(
    "interview_prep",
    embedding_models,
    ["mean"],
    clusterers,
    data["ids"],
    data["tags"],
    k=2,
)
df2.to_csv("analysis/metric2_interview.csv")
df2.head(10)

Unnamed: 0,embedding_model,agg_method,clusterer,homogeneity,completeness,tag_concentration_purity,cluster_tag_purity
0,minilm,mean,kmeans5,0.285,0.133,"{'haha': 0.833, 'hehe': 0.667}","{'haha': 0.357, 'hehe': 0.143}"
1,minilm,mean,kmeans2,0.205,0.223,"{'haha': 0.8, 'hehe': 0.75}","{'haha': 0.571, 'hehe': 0.214}"
2,mpnet,mean,kmeans5,0.335,0.139,"{'haha': 0.75, 'hehe': 0.667}","{'haha': 0.214, 'hehe': 0.143}"
3,mpnet,mean,kmeans2,0.084,0.106,"{'haha': 0.727, 'hehe': 0.273}","{'haha': 0.571, 'hehe': 0.214}"
4,nomic,mean,kmeans5,0.43,0.178,"{'haha': 1.0, 'hehe': 0.667}","{'haha': 0.286, 'hehe': 0.143}"
5,nomic,mean,kmeans2,0.084,0.106,"{'haha': 0.727, 'hehe': 0.273}","{'haha': 0.571, 'hehe': 0.214}"
6,bert,mean,kmeans5,0.191,0.086,"{'haha': 0.833, 'hehe': 0.5}","{'haha': 0.357, 'hehe': 0.071}"
7,bert,mean,kmeans2,0.121,0.305,"{'haha': 0.692, 'hehe': 0.308}","{'haha': 0.643, 'hehe': 0.286}"
8,specter,mean,kmeans5,0.544,0.247,"{'haha': 0.75, 'hehe': 0.5}","{'haha': 0.214, 'hehe': 0.214}"
9,specter,mean,kmeans2,0.084,0.106,"{'haha': 0.727, 'hehe': 0.273}","{'haha': 0.571, 'hehe': 0.214}"


### Edge Assignment Evaluation: Tag Connectivity and Degree of Separation

In [11]:
# Edge-assignment metrics on the interview_prep data for every combination
# of embedding model, clusterer and edge constructor, up to max_depth=3.
embedding_models = ["minilm", "mpnet", "nomic", "bert", "specter", "word2vec"]
clusterers = {
    # NOTE(review): "kmeans5" relies on clu.kmeans's default cluster count,
    # presumably 5 — confirm against src.clustering.
    "kmeans5": lambda x: clu.kmeans(x),
    "kmeans2": lambda x: clu.kmeans(x, 2),
}
edge_constructors = {
    "knn3": lambda sim_mat, ids: edge.knn(sim_mat, ids, k=3),
    "knn5": lambda sim_mat, ids: edge.knn(sim_mat, ids, k=5),
}

df3 = fr.compare_edge_assignment_metrics(
    "interview_prep",
    embedding_models,
    ["mean"],
    clusterers,
    edge_constructors,
    data["ids"],
    data["tags"],
    data["titles"],
    max_depth=3,
)
df3.to_csv("analysis/metric3_interview.csv")
df3.head(10)

Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 389.88it/s]


Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 51.15it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 916.59it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 7194.35it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 8473.34it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1751.28it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 2326.29it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 5777.28it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 5882.61it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1031.05it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 5236.33it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 5178.15it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 3650.40it/s]
Calculating cosine sim

Unnamed: 0,embedding_model,agg_method,similarity,edge constructor,clusterer,depth,connected_nodes,percentage_connected,degree_of_separation
0,minilm,mean,cosine,knn3,kmeans5,1,2,0.100,1.264983
1,minilm,mean,cosine,knn3,kmeans5,2,3,0.150,1.264983
2,minilm,mean,cosine,knn3,kmeans5,3,11,0.550,1.264983
3,minilm,mean,cosine,knn3,kmeans2,1,2,0.118,0.822570
4,minilm,mean,cosine,knn3,kmeans2,2,7,0.412,0.822570
...,...,...,...,...,...,...,...,...,...
211,word2vec,mean,euclidean,knn5,kmeans5,2,4,0.200,1.972377
212,word2vec,mean,euclidean,knn5,kmeans5,3,7,0.350,1.972377
213,word2vec,mean,euclidean,knn5,kmeans2,1,2,0.118,1.781784
214,word2vec,mean,euclidean,knn5,kmeans2,2,2,0.118,1.781784


## Data: Medium (n=10000)

In [12]:
# Load the Medium dataset and pull out the columns used by the cells below.
# Tags come from the "simplified_tags" column rather than the raw "tags".
data_name = "medium_1k_tags_simplified"
csv_path = f"data/{data_name}.csv"
data = load_data(csv_path)
ids, titles, tags = data["ids"], data["titles"], data["simplified_tags"]

### Embedding and Similarity Scores

In [13]:
# NOTE(review): the six-model run below is disabled; the following cell runs
# "minilm" only and writes to a separate CSV. The reason is not recorded here
# (possibly runtime/memory on the larger dataset) — confirm, then either
# restore this cell or delete it.
# df1 = fr.get_embedding_similarity_metrics_per_dataset("medium1k", tags,
#                                              ["minilm", "mpnet", "nomic", "bert", "specter", "word2vec"],
#                                              ["mean"])
# df1.to_csv("analysis/metric1_medium1k.csv")
# df1.head(10)

: 

In [None]:
# Similarity metrics on the Medium tags, restricted to the minilm model.
# Output goes to its own CSV so it is distinguishable from a full-model run.
df1 = fr.get_embedding_similarity_metrics_per_dataset(
    "medium1k",
    tags,
    ["minilm"],
    ["mean"],
)
df1.to_csv("analysis/metric1_medium1k_minilm.csv")
df1.head(10)

### Cluster Purity, Homogeneity, and Completeness

In [None]:
# Cluster metrics for every embedding model / clusterer combination on the
# Medium data. Results are saved under analysis/ and previewed.
embedding_models = ["minilm", "mpnet", "nomic", "bert", "specter", "word2vec"]
clusterers = {
    # NOTE(review): "kmeans5" relies on clu.kmeans's default cluster count,
    # presumably 5 — confirm against src.clustering.
    "kmeans5": lambda x: clu.kmeans(x),
    "kmeans2": lambda x: clu.kmeans(x, 2),
}

df2 = fr.compare_cluster_metrics(
    "medium1k",
    embedding_models,
    ["mean"],
    clusterers,
    ids,
    tags,
    k=2,
)
df2.to_csv("analysis/metric2_medium1k.csv")
df2.head(10)

### Edge Assignment Evaluation: Tag Connectivity and Degree of Separation

In [None]:
# Edge-assignment metrics on the Medium data for every combination of
# embedding model, clusterer and edge constructor, up to max_depth=3.
# The dataset key must be "medium1k" (as in the other Medium cells): this
# cell previously passed "interview_prep" while supplying Medium ids/tags/
# titles and writing to the medium1k CSV, mixing the two datasets.
df3 = fr.compare_edge_assignment_metrics("medium1k",
                                        ["minilm", "mpnet", "nomic", "bert", "specter", "word2vec"],
                                        ["mean"],
                                        {
                                            # NOTE(review): "kmeans5" relies on clu.kmeans's default
                                            # cluster count, presumably 5 — confirm.
                                            "kmeans5": lambda x: clu.kmeans(x),
                                            "kmeans2": lambda x: clu.kmeans(x, 2)
                                        },
                                        {
                                            "knn3": lambda sim_mat, ids: edge.knn(sim_mat, ids, k=3),
                                            "knn5": lambda sim_mat, ids: edge.knn(sim_mat, ids, k=5)
                                        }, ids, tags, titles, max_depth=3)
df3.to_csv("analysis/metric3_medium1k.csv")
df3.head(10)