In [2]:
import ast
import pandas as pd

import src.embeddings as emb
import src.similarity as sim
import src.clustering as clu
import src.metrics as met
import src.metrics_fr as fr
import src.utils as utils

from collections import Counter

# Thesis Evaluation

## Helper Functions

In [3]:
def load_data(filepath, n=None):
    """Load a CSV dataset and return its columns as plain Python lists.

    Parameters
    ----------
    filepath : str or path-like
        Location of the dataset; its string form must end in ``.csv``.
    n : int, optional
        When given, only the first ``n`` rows are kept (``DataFrame.head``
        semantics). ``None`` (the default) keeps every row.

    Returns
    -------
    dict
        ``"titles"`` and ``"text"`` hold the ``title`` and ``text`` columns,
        ``"tags"`` holds the ``tags`` column with every cell parsed by
        ``ast.literal_eval``, and ``"ids"`` holds the frame's index labels.
        ``"simplified_tags"`` (parsed the same way) is present only when the
        CSV contains that column.

    Raises
    ------
    AssertionError
        If ``filepath`` does not end in ``.csv``.
    """
    # str() lets pathlib.Path objects through the suffix check as well.
    assert str(filepath).endswith(".csv"), "Must be a .csv file"
    data = pd.read_csv(filepath)
    # Compare against None instead of relying on truthiness: with `if n:` a
    # request for n=0 rows silently returned the entire dataset.
    if n is not None:
        data = data.head(n)

    attrs = {
        "titles": data["title"].tolist(),
        "text": data["text"].tolist(),
        # Tag cells are stored as Python-literal strings; parse them back.
        "tags": data["tags"].apply(ast.literal_eval).tolist(),
        "ids": data.index.tolist()
    }

    # Optional column: only some datasets carry simplified tags.
    if "simplified_tags" in data.columns:
        attrs["simplified_tags"] = data["simplified_tags"].apply(ast.literal_eval).tolist()

    return attrs

def load_embeddings(dataset_name, model_names):
    """Load one pickled embedding object per model for a dataset.

    For every entry of ``model_names``, in order, the file
    ``embeddings/{dataset_name}_{name}_n10000.pickle`` is read through
    ``utils.load_from_pickle``; the loaded objects are returned as a list
    in that same order.
    """
    return [
        utils.load_from_pickle(f"embeddings/{dataset_name}_{name}_n10000.pickle")
        for name in model_names
    ]

## Data: interview_prep.csv

In [6]:
# Load the full dataset: no row limit is passed to load_data.
data_name = "interview_prep"
interview_csv_path = f"data/{data_name}.csv"
data = load_data(interview_csv_path)

### Embedding and Similarity Scores

In [8]:
# Compute the embedding/similarity metrics for the interview-prep tags and
# write the result to CSV.
# NOTE(review): the recorded output of this cell is a FileNotFoundError for
# 'embeddings/interview_prep_minilm_mean_pooling_n10000.pickle', so the
# embedding pickles apparently have to exist before this cell can run — confirm.
interview_models = ["minilm", "mpnet", "nomic", "bert", "specter", "word2vec"]
interview_poolings = ["mean_pooling", "sum_pooling", "max_pooling", "global_attention"]
df = fr.get_embedding_similarity_metrics_per_dataset(
    "interview_prep",
    data["tags"],
    interview_models,
    interview_poolings,
)
df.to_csv("analysis/metric1_interview.csv")

FileNotFoundError: [Errno 2] No such file or directory: 'embeddings/interview_prep_minilm_mean_pooling_n10000.pickle'

### Cluster Purity, Homogeneity, and Completeness

In [12]:
# Clustering callables passed to the comparison, keyed by label.
# NOTE(review): the "kmeans5" entry calls clu.kmeans with only the input, so it
# uses whatever default clu.kmeans defines; the label suggests 5 clusters —
# TODO confirm.
interview_clusterers = {
    "kmeans5": lambda x: clu.kmeans(x),
    "kmeans2": lambda x: clu.kmeans(x, 2),
}
df = fr.compare_cluster_metrics(
    "interview_prep",
    ["minilm", "bert"],
    interview_clusterers,
    data["ids"],
    data["tags"],
    k=2,
)
df.to_csv("analysis/metric2_interview.csv")

### Edge Assignment Evaluation: Tag Connectivity and Degree of Separation

## Data: Medium (n=10000)