In [1]:
import ast
import pandas as pd

import src.embeddings as emb
import src.similarity as sim
import src.clustering as clu
import src.metrics as met
import src.metrics_fr as fr
import src.utils as utils

from collections import Counter

  from .autonotebook import tqdm as notebook_tqdm


# Thesis Evaluation

## Helper Functions

In [2]:
def load_data(filepath, n=None):
    """Load a dataset CSV into a dict of parallel Python lists.

    Args:
        filepath: Location of the CSV file, given as a ``str`` or a path-like
            object such as ``pathlib.Path``. Its name must end in ``.csv``.
        n: Optional row limit. When truthy, only the first ``n`` rows are
            kept (``DataFrame.head(n)``); ``None`` (default) or ``0`` keeps
            every row.

    Returns:
        dict with the keys ``"titles"``, ``"text"``, ``"tags"`` and ``"ids"``
        (the DataFrame index), plus ``"simplified_tags"`` when the CSV has a
        column of that name. The tag columns are parsed from their string
        form with ``ast.literal_eval``.

    Raises:
        AssertionError: If ``filepath`` does not end in ``.csv``.
    """
    # Normalise to str first: slicing a pathlib.Path raises TypeError, which
    # previously made path-like inputs fail before the extension check ran.
    filepath = str(filepath)
    assert filepath.endswith(".csv"), "Must be a .csv file"

    data = pd.read_csv(filepath)
    if n:
        data = data.head(n)

    attrs = {
        "titles": data["title"].tolist(),
        "text": data["text"].tolist(),
        # Tags are stored in the CSV as stringified Python literals.
        "tags": data["tags"].apply(ast.literal_eval).tolist(),
        "ids": data.index.tolist(),
    }

    # The simplified tag column is optional and only exported when present.
    if "simplified_tags" in data.columns:
        attrs["simplified_tags"] = data["simplified_tags"].apply(ast.literal_eval).tolist()

    return attrs

def load_embeddings(dataset_name, model_names):
    """Load the cached embeddings of one dataset for several models.

    For every entry of ``model_names`` the file
    ``embeddings/{dataset_name}_{model}_n10000.pickle`` is read through
    ``utils.load_from_pickle``. The loaded objects are returned as a list in
    the same order as ``model_names``.
    """
    # NOTE(review): presumably this unpickles the file — only load trusted caches.
    return [
        utils.load_from_pickle(f"embeddings/{dataset_name}_{model}_n10000.pickle")
        for model in model_names
    ]

## Data: interview_prep.csv

In [3]:
# Name of the dataset evaluated in this section; it also selects the CSV under data/.
data_name = "interview_prep"
data = load_data(f"data/{data_name}.csv")

### Embedding and Similarity Scores

In [4]:
def get_embedding_similarity_metrics_per_dataset(dataset_name, dataset_tags, model_names):
    """Compute similarity-based embedding metrics for each model of a dataset.

    For every model in ``model_names`` the cached embeddings are read from
    ``embeddings/{dataset_name}_{model}_n10000.pickle``, the three similarity
    results returned by ``sim.get_all_similarities`` (cosine, soft cosine,
    euclidean) are computed, and ``fr.calculate_embedding_metrics_for_all``
    is called with them together with ``dataset_tags``, the model name and
    the dataset name.

    Returns:
        The per-model results joined with ``pd.concat`` (original indices are
        kept, since ``ignore_index`` is not set).
    """
    def _metrics_for(model):
        # Load -> similarities -> metrics, for a single embedding model.
        vectors = utils.load_from_pickle(f"embeddings/{dataset_name}_{model}_n10000.pickle")
        cos, soft_cos, euclid = sim.get_all_similarities(vectors)
        return fr.calculate_embedding_metrics_for_all(
            cos, soft_cos, euclid, dataset_tags, model, dataset_name
        )

    per_model = [_metrics_for(model) for model in model_names]
    return pd.concat(per_model)

In [5]:
# Embedding models whose cached vectors are evaluated for this dataset.
embedding_models = ["minilm", "mpnet", "nomic", "bert", "specter", "word2vec"]

# One metrics frame per model, concatenated, then written out for later analysis.
df = get_embedding_similarity_metrics_per_dataset(
    "interview_prep", data["tags"], embedding_models
)
df.to_csv("analysis/metric1_interview.csv")

Calculating cosine similarities:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 373.96it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 42.78it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 220.60it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 235.29it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 58.88it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 2189.09it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 637.34it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 60.77it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 752.48it/s]
Calculating cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 983.19it/s]
Calculating soft_cosine similarities: 100%|██████████| 1/1 [00:00<00:00, 59.84it/s]
Calculating euclidean similarities: 100%|██████████| 1/1 [00:00<00:00, 1703.62it/s]
Calculatin

### Cluster Purity, Homogeneity, and Completeness

In [6]:
# Clustering callables, keyed by the label passed on to the comparison.
# NOTE(review): "kmeans5" relies on clu.kmeans' default cluster count — presumably 5; confirm.
clusterers = {
    "kmeans5": lambda x: clu.kmeans(x),
    "kmeans2": lambda x: clu.kmeans(x, 2),
}

# NOTE(review): the recorded output of this cell is a TypeError about a missing
# 'k' although k=2 is passed here — the output looks stale; re-run to confirm.
df = fr.compare_cluster_metrics(
    "interview_prep",
    ["minilm", "bert"],
    clusterers,
    data["ids"],
    data["tags"],
    k=2,
)
df.to_csv("analysis/metric2_interview.csv")

TypeError: compare_cluster_metrics() missing 1 required positional argument: 'k'

In [None]:
# Show the most recently assigned `df`.
# NOTE(review): if the cluster-metrics cell above failed, this is still the
# embedding-similarity frame from the earlier section — confirm after a re-run.
df

In [7]:
# Clustering callables, keyed by the label that appears in the `clusterer` column.
# NOTE(review): "kmeans5" relies on clu.kmeans' default cluster count — presumably 5; confirm.
clusterers = {
    "kmeans5": lambda x: clu.kmeans(x),
    "kmeans2": lambda x: clu.kmeans(x, 2),
}

# Last expression of the cell, so the returned table is displayed.
fr.compare_purity_metrics(
    "interview_prep",
    ["minilm", "bert"],
    clusterers,
    data["ids"],
    data["tags"],
    2,
)

Unnamed: 0,embedding_model,clusterer,tag_concentration_purity,cluster_tag_purity
0,minilm,kmeans5,"{'haha': 0.833, 'hehe': 0.667}","{'haha': 0.357, 'hehe': 0.143}"
1,minilm,kmeans2,"{'haha': 0.8, 'hehe': 0.75}","{'haha': 0.571, 'hehe': 0.214}"
2,bert,kmeans5,"{'haha': 0.833, 'hehe': 0.5}","{'haha': 0.357, 'hehe': 0.071}"
3,bert,kmeans2,"{'haha': 0.692, 'hehe': 0.308}","{'haha': 0.643, 'hehe': 0.286}"


## Data: Medium (n=1000)

In [8]:
# Fixed: the helper defined above is get_embedding_similarity_metrics_per_dataset;
# the previous name (..._per_data) does not exist and raised NameError on a fresh kernel.
# NOTE(review): `data` still holds the interview_prep dataset loaded earlier, so these
# tags presumably do not match the "medium1k" embeddings — TODO load the Medium CSV
# with load_data(...) and pass its tags instead.
df = get_embedding_similarity_metrics_per_dataset("medium1k", data["tags"],
                                             ["minilm"])
#  "mpnet", "nomic", "bert", "specter"]

In [None]:
# all_minilm_df = met.calculate_embedding_metrics_for_all(minilm_cosine_sim, minilm_soft_cosine_sim, minilm_euclidean_sim,
#                                         tags, "minilm", data_name)
# all_mpnet_df = met.calculate_embedding_metrics_for_all(mpnet_cosine_sim, mpnet_soft_cosine_sim, mpnet_euclidean_sim,
#                                         tags, "mpnet", data_name)
# nomic_df = met.calculate_embedding_metrics_for_all(nomic_cosine_sim, nomic_soft_cosine_sim, nomic_euclidean_sim,
#                                         tags, "nomic", data_name)
# bert_df = met.calculate_embedding_metrics_for_all(bert_cosine_sim, bert_soft_cosine_sim, bert_euclidean_sim,
#                                         tags, "bert", data_name)
# specter_df = met.calculate_embedding_metrics_for_all(specter_cosine_sim, specter_soft_cosine_sim, specter_euclidean_sim,
#                                         tags, "specter", data_name)

In [None]:
# combined_df = pd.concat([all_minilm_df, nomic_df, all_mpnet_df, bert_df, specter_df], ignore_index=True)
# melted_df = combined_df.melt(id_vars=['data_source', 'embedding_model', 'metric_name', 'metric'], 
#                              value_vars=['between_all_nodes', 'between_shared_tags'], 
#                              var_name='comparison_type', value_name='value')