In [1]:
import ast
import pandas as pd

import src.embeddings as emb
import src.similarity as sim
import src.clustering as clu
import src.metrics as met
import src.metrics_fr as fr
import src.utils as utils

from collections import Counter

  from .autonotebook import tqdm as notebook_tqdm


# Thesis Evaluation

## Helper Functions

In [2]:
def load_data(filepath, n=None):
    """Load a tagged-document CSV into a dict of parallel lists.

    Args:
        filepath: Path to a ``.csv`` file (``str`` or path-like). The file
            must contain ``title``, ``text`` and ``tags`` columns; a
            ``simplified_tags`` column is optional.
        n: Optional row limit, applied with ``DataFrame.head``. ``None``
            (the default) keeps every row; ``0`` yields empty lists.

    Returns:
        dict with keys ``"titles"``, ``"text"``, ``"tags"`` and ``"ids"``
        (the DataFrame index values), plus ``"simplified_tags"`` when that
        column exists. ``tags`` / ``simplified_tags`` cells are parsed from
        their string form with ``ast.literal_eval``.

    Raises:
        AssertionError: if ``filepath`` does not end in ``.csv``.
    """
    # str() lets pathlib.Path objects through; slicing a Path would raise TypeError.
    assert str(filepath).endswith(".csv"), "Must be a .csv file"
    data = pd.read_csv(filepath)
    # Compare against None explicitly: a plain truthiness test would treat
    # n=0 as "no limit" and silently return the whole dataset.
    if n is not None:
        data = data.head(n)

    attrs = {
        "titles": data["title"].tolist(),
        "text": data["text"].tolist(),
        # Tag lists are stored in the CSV as Python-literal strings.
        "tags": data["tags"].apply(ast.literal_eval).tolist(),
        "ids": data.index.tolist()
    }

    if "simplified_tags" in data.columns:
        attrs["simplified_tags"] = data["simplified_tags"].apply(ast.literal_eval).tolist()

    return attrs

def load_embeddings(dataset_name, model_names):
    """Load one pickled embedding object per model name.

    Each file is read from
    ``embeddings/{dataset_name}_{name}_n10000.pickle`` via
    ``utils.load_from_pickle``.

    Args:
        dataset_name: Dataset prefix used in the pickle file names.
        model_names: Iterable of model-name fragments, one per file.

    Returns:
        list of the loaded objects, in the same order as ``model_names``.
    """
    return [
        utils.load_from_pickle(f"embeddings/{dataset_name}_{model}_n10000.pickle")
        for model in model_names
    ]

## Data: interview_prep.csv

In [3]:
# Dataset identifier; also the stem of the CSV file under data/.
data_name = "interview_prep"
# Load every row (no n limit) into a dict of parallel lists
# ("titles", "text", "tags", "ids").
data = load_data(f"data/{data_name}.csv")

### Embedding and Similarity Scores

In [8]:
# Metric 1: embedding-similarity statistics for six embedding models with
# "mean" aggregation. Per the recorded output, each row holds one
# (model, similarity metric, statistic) combination compared across all
# nodes vs. nodes that share tags.
# NOTE(review): the dataset name is repeated as a literal here rather than
# reusing `data_name` from the loading cell - keep the two in sync.
df1 = fr.get_embedding_similarity_metrics_per_dataset("interview_prep", data["tags"],
                                             ["minilm", "mpnet", "nomic", "bert", "specter", "word2vec"],
                                             ["mean"])
# Persist the full table; to_csv needs the analysis/ directory to exist already.
df1.to_csv("analysis/metric1_interview.csv")
# Display only the first ten rows.
df1.head(10)

Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 266.61it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 38.11it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 840.21it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 283.30it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 32.67it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 437.45it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 600.22it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 43.09it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 449.79it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 375.83it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 27.35it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 741.44it/s]
Calculating 

Unnamed: 0,data_source,embedding_model,agg_method,metric_name,metric,between_all_nodes,between_shared_tags
0,interview_prep,minilm,mean,cosine,mean,0.369821,0.413958
1,interview_prep,minilm,mean,cosine,median,0.34955,0.431531
2,interview_prep,minilm,mean,cosine,std_dev,0.139443,0.14063
3,interview_prep,minilm,mean,soft_cosine,mean,0.7115,0.755448
4,interview_prep,minilm,mean,soft_cosine,median,0.727371,0.784485
5,interview_prep,minilm,mean,soft_cosine,std_dev,0.134889,0.131459
6,interview_prep,minilm,mean,euclidean,mean,0.537721,0.56198
7,interview_prep,minilm,mean,euclidean,median,0.521811,0.566406
8,interview_prep,minilm,mean,euclidean,std_dev,0.075639,0.077599
0,interview_prep,mpnet,mean,cosine,mean,0.414266,0.451403


### Cluster Purity, Homogeneity, and Completeness

In [9]:
# Metric 2: compare clustering quality (homogeneity, completeness and the two
# purity columns shown in the recorded output) for every embedding model under
# two k-means configurations.
# NOTE(review): the "kmeans5" entry does not pass a cluster count and relies
# on clu.kmeans' default - confirm that default is 5 so the label is accurate.
# NOTE(review): the meaning of k=2 is not visible here; presumably it limits
# how many tags are reported per purity dict - confirm in src.metrics_fr.
df2 = fr.compare_cluster_metrics("interview_prep",
                                    ["minilm", "mpnet", "nomic", "bert", "specter", "word2vec"],
                                    ["mean"],
                                    {
                                        "kmeans5": lambda x: clu.kmeans(x),
                                        "kmeans2": lambda x: clu.kmeans(x, 2)
                                    }, data["ids"], data["tags"], k=2)
# Persist the full table, then display the first ten rows.
df2.to_csv("analysis/metric2_interview.csv")
df2.head(10)

Unnamed: 0,embedding_model,agg_method,clusterer,homogeneity,completeness,tag_concentration_purity,cluster_tag_purity
0,minilm,mean,kmeans5,0.285,0.133,"{'haha': 0.833, 'hehe': 0.667}","{'haha': 0.357, 'hehe': 0.143}"
1,minilm,mean,kmeans2,0.205,0.223,"{'haha': 0.8, 'hehe': 0.75}","{'haha': 0.571, 'hehe': 0.214}"
2,mpnet,mean,kmeans5,0.335,0.139,"{'haha': 0.75, 'hehe': 0.667}","{'haha': 0.214, 'hehe': 0.143}"
3,mpnet,mean,kmeans2,0.084,0.106,"{'haha': 0.727, 'hehe': 0.273}","{'haha': 0.571, 'hehe': 0.214}"
4,nomic,mean,kmeans5,0.43,0.178,"{'haha': 1.0, 'hehe': 0.667}","{'haha': 0.286, 'hehe': 0.143}"
5,nomic,mean,kmeans2,0.084,0.106,"{'haha': 0.727, 'hehe': 0.273}","{'haha': 0.571, 'hehe': 0.214}"
6,bert,mean,kmeans5,0.191,0.086,"{'haha': 0.833, 'hehe': 0.5}","{'haha': 0.357, 'hehe': 0.071}"
7,bert,mean,kmeans2,0.121,0.305,"{'haha': 0.692, 'hehe': 0.308}","{'haha': 0.643, 'hehe': 0.286}"
8,specter,mean,kmeans5,0.544,0.247,"{'haha': 0.75, 'hehe': 0.5}","{'haha': 0.214, 'hehe': 0.214}"
9,specter,mean,kmeans2,0.084,0.106,"{'haha': 0.727, 'hehe': 0.273}","{'haha': 0.571, 'hehe': 0.214}"


### Edge Assignment Evaluation: Tag Connectivity and Degree of Separation

In [13]:
import src.edge_constructors as edge
import src.aggregation as agg
import src.pipeline as pipe

In [15]:
# Build one example graph (BERT embeddings, mean aggregation) that the
# edge-assignment metric cells below operate on.
# NOTE(review): `dataset_name` duplicates `data_name` from the loading cell.
dataset_name = "interview_prep"
embedding_model = "bert"
agg_method = "mean"
# NOTE(review): this path includes the aggregation method, unlike the
# pattern used by the load_embeddings helper - confirm which naming is current.
# Presumably this unpickles the file; only load pickles from trusted sources.
embeddings = utils.load_from_pickle(f"embeddings/{dataset_name}_{embedding_model}_{agg_method}_n10000.pickle")

metric = "cosine"
similarity_scores = sim.batch_similarity_scores(embeddings, metric)
# Edge rule: edge.knn with k=3 on the similarity matrix.
edge_constructor_f = lambda sim_mat, ids: edge.knn(sim_mat, ids, k=3)
# Clusterer: k-means with 5 clusters.
# NOTE(review): the lambda parameter `emb` shadows the `emb` module alias
# (src.embeddings) inside the lambda body; harmless here but easy to misread.
clusterer_f = lambda emb: clu.kmeans(emb, n_clusters=5)


# Build the graph from the embeddings and similarity scores, using the edge
# rule, clusterer and mean-pooling aggregator configured above; titles and
# tags are passed through to the pipeline.
G = pipe.cluster_and_connect(embeddings, similarity_scores, data["ids"],
                             metric,
                             edge_constructor_f,
                             clusterer_f,
                             aggregator_f=agg.mean_pooling,
                             titles=data["titles"], tags=data["tags"])

Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 130.79it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 652.71it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 4080.06it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 2882.68it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 1677.72it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 2008.77it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 312.98it/s]


In [16]:
# Tag connectivity of G. The recorded output is a dict keyed 1..3, each value
# a (count, fraction) pair - presumably keyed by BFS depth; confirm in src.metrics_fr.
fr.bfs_tag_connectivity(G)

{1: (1, 0.05), 2: (6, 0.3), 3: (8, 0.4)}

In [17]:
# Degree-of-separation score for G (a single float in the recorded output).
fr.degree_of_separation(G)

1.8678350097093828

In [18]:
# Combined call: its recorded output bundles the same tag-connectivity dict and
# degree-of-separation value that the two preceding cells display individually.
fr.calculate_edge_assignment_metrics(G)

{'tag_connectivity': {1: (1, 0.05), 2: (6, 0.3), 3: (8, 0.4)},
 'degree_of_separation': 1.8678350097093828}

In [19]:
# Compare edge-assignment metrics across every combination of embedding model,
# clusterer ("kmeans5"/"kmeans2") and edge constructor ("knn3"/"knn5").
# NOTE(review): the recorded output for this cell is a TypeError about an
# unexpected keyword argument 'k', but this call passes max_depth=3 and no `k`.
# The saved output looks stale - re-run the cell to confirm it now succeeds.
# NOTE(review): "kmeans5" relies on clu.kmeans' default cluster count - confirm it is 5.
df = fr.compare_edge_assignment_metrics("interview_prep",
                                        ["minilm", "mpnet", "nomic", "bert", "specter", "word2vec"],
                                        ["mean"],
                                        {
                                            "kmeans5": lambda x: clu.kmeans(x),
                                            "kmeans2": lambda x: clu.kmeans(x, 2)
                                        },
                                        {
                                            "knn3": lambda sim_mat, ids: edge.knn(sim_mat, ids, k=3),
                                            "knn5": lambda sim_mat, ids: edge.knn(sim_mat, ids, k=5)
                                        }, data["ids"], data["tags"], data["titles"], max_depth=3)

# Display the full comparison table.
df

TypeError: compare_edge_assignment_metrics() got an unexpected keyword argument 'k'

## Data: Medium (n=10000)