In [7]:
import ast
import pandas as pd

import src.edge_constructors as edge
import src.clustering as clu
import src.metrics_fr as fr
import src.utils as utils

# Thesis Evaluation

## Helper Functions

In [8]:
def load_data(filepath, n=None):
    """Read a document CSV and return its columns as plain Python lists.

    Parameters
    ----------
    filepath : str or path-like
        Location of the file; its name must end in ".csv".
    n : int, optional
        Keep only the first ``n`` rows. ``None`` (the default) keeps every
        row; ``0`` yields empty lists.

    Returns
    -------
    dict
        Keys "titles", "text", "tags" and "ids". "tags" holds the result of
        ``ast.literal_eval`` applied to each cell of the "tags" column and
        "ids" holds the DataFrame index values. "simplified_tags" is added,
        parsed the same way, only when that column exists in the file.

    Raises
    ------
    AssertionError
        If ``filepath`` does not end in ".csv".
    """
    # str() lets pathlib.Path objects through; slicing a Path would raise TypeError.
    assert str(filepath).endswith(".csv"), "Must be a .csv file"
    data = pd.read_csv(filepath)
    # Compare against None so that n=0 is honoured instead of meaning "all rows".
    if n is not None:
        data = data.head(n)

    attrs = {
        "titles": data["title"].tolist(),
        "text": data["text"].tolist(),
        "tags": data["tags"].apply(ast.literal_eval).tolist(),
        "ids": data.index.tolist()
    }

    if "simplified_tags" in data.columns:
        attrs["simplified_tags"] = data["simplified_tags"].apply(ast.literal_eval).tolist()

    return attrs

def load_embeddings(dataset_name, model_names):
    """Load one pickled embedding object per model name, in the order given.

    Each entry is read with ``utils.load_from_pickle`` from
    ``embeddings/{dataset_name}_{name}_n10000.pickle``.
    """
    return [
        utils.load_from_pickle(f"embeddings/{dataset_name}_{name}_n10000.pickle")
        for name in model_names
    ]

## Data: interview_prep.csv

In [3]:
# Load every row of the interview-prep CSV (no row limit is passed).
data_name = "interview_prep"
data = load_data("data/" + data_name + ".csv")

### Embedding and Similarity Scores

In [4]:
# Similarity metrics for all six embedding models; the table is written to
# CSV and its first ten rows are shown as the cell output.
df1 = fr.get_embedding_similarity_metrics_per_dataset(
    "interview_prep",
    data["tags"],
    ["minilm", "mpnet", "nomic", "bert", "specter", "word2vec"],
    ["mean"],
)
df1.to_csv("analysis/metric1_interview.csv")
df1.head(10)

Unnamed: 0,data_source,embedding_model,agg_method,metric_name,metric,between_all_nodes,between_shared_tags
0,interview_prep,minilm,mean,cosine,mean,0.68491,0.706979
1,interview_prep,minilm,mean,cosine,median,0.674775,0.715765
2,interview_prep,minilm,mean,cosine,std_dev,0.069721,0.070315
3,interview_prep,minilm,mean,soft_cosine,mean,0.641387,0.656642
4,interview_prep,minilm,mean,soft_cosine,median,0.644162,0.661419
5,interview_prep,minilm,mean,soft_cosine,std_dev,0.12629,0.134193
6,interview_prep,minilm,mean,euclidean,mean,0.537721,0.56198
7,interview_prep,minilm,mean,euclidean,median,0.521811,0.566406
8,interview_prep,minilm,mean,euclidean,std_dev,0.075639,0.077599
0,interview_prep,mpnet,mean,cosine,mean,0.707133,0.725702


### Cluster Purity, Homogeneity, and Completeness

In [None]:
k_values = [1, 2, 5, 10, 15, 20]

# Label -> one-argument callable. Loop values are frozen through lambda
# default arguments so each callable keeps the setting it was created with.
clustering_methods = {
    **{
        f"kmeans{k}": lambda x, k=k: clu.kmeans(x, k)
        for k in k_values
    },
    **{
        f"dbscan_eps{eps}_min{min_samples}": (
            lambda x, eps=eps, min_samples=min_samples: clu.dbscan(x, eps=eps, min_samples=min_samples)
        )
        for eps in [0.1, 0.3, 0.5, 0.7, 1.0]
        for min_samples in [3, 5, 10, 15]
    },
    **{
        f"gmm{n_components}": lambda x, n_components=n_components: clu.gmm(x, n_components)
        for n_components in k_values
    },
    **{
        f"birch{k}": lambda x, k=k: clu.birch(x, k)
        for k in k_values
    },
}

# Cluster metrics for every model / clustering-method combination.
df2 = fr.compare_cluster_metrics(
    "interview_prep",
    ["minilm", "mpnet", "nomic", "bert", "specter", "word2vec"],
    ["mean"],
    clustering_methods,
    data["ids"],
    data["tags"],
    k=2,
)
df2.to_csv("analysis/metric2_interview.csv")
df2

### Edge Assignment Evaluation: Tag Connectivity and Degree of Separation

In [6]:
k_values = [1, 2, 5, 50]

# Label -> one-argument clustering callable (settings bound via lambda defaults).
clustering_methods = {
    **{
        f"kmeans{k}": lambda x, k=k: clu.kmeans(x, k)
        for k in k_values
    },
    **{
        f"dbscan_eps{eps}_min{min_samples}": (
            lambda x, eps=eps, min_samples=min_samples: clu.dbscan(x, eps=eps, min_samples=min_samples)
        )
        for eps in [0.1, 0.3, 0.5, 0.7, 1.0]
        for min_samples in [3, 5, 10, 15]
    },
    **{
        f"gmm{n_components}": lambda x, n_components=n_components: clu.gmm(x, n_components)
        for n_components in k_values
    },
    **{
        f"birch{k}": lambda x, k=k: clu.birch(x, k)
        for k in k_values
    },
}

# Label -> callable taking (sim_mat, document_ids); every entry shares that signature.
edge_connector_methods = {
    "random_edges": (
        lambda sim_mat, document_ids: edge.random_edges(sim_mat, document_ids, num_edges_per_node=5)
    ),
    **{
        f"knn{k}": lambda sim_mat, document_ids, k=k: edge.knn(sim_mat, document_ids, k)
        for k in [3, 5, 10, 15]
    },
    **{
        f"knn_mst{k}": lambda sim_mat, document_ids, k=k: edge.knn_mst(sim_mat, document_ids, k)
        for k in [3, 5, 10, 15]
    },
    **{
        f"threshold_{threshold}": (
            lambda sim_mat, document_ids, threshold=threshold: edge.threshold_based_edge_assignment(sim_mat, document_ids, threshold)
        )
        for threshold in [0.3, 0.5, 0.7, 0.9]
    },
    **{
        f"mutual_knn{k}": (
            lambda sim_mat, document_ids, k=k: edge.mutual_knn_edge_assignment(sim_mat, document_ids, k)
        )
        for k in [3, 5, 10, 15]
    },
    **{
        f"spectral_clustering{n_clusters}": (
            lambda sim_mat, document_ids, n_clusters=n_clusters: edge.spectral_clustering_edge_assignment(sim_mat, document_ids, n_clusters)
        )
        for n_clusters in [2, 3, 5, 10]
    },
}

df3 = fr.compare_edge_assignment_metrics(
    "interview_prep",
    ["minilm", "mpnet", "nomic", "bert", "specter", "word2vec"],
    ["mean"],
    clustering_methods,
    edge_connector_methods,
    data["ids"],
    data["tags"],
    data["titles"],
    max_depth=3,
)
df3.to_csv("analysis/metric3_interview.csv")
df3.head(10)

graphs/interview_prep_minilm_cosine_mean_random_edges_kmeans1
graphs/interview_prep_minilm_cosine_mean_random_edges_kmeans2
graphs/interview_prep_minilm_cosine_mean_random_edges_kmeans5
graphs/interview_prep_minilm_cosine_mean_random_edges_dbscan_eps0.1_min3
graphs/interview_prep_minilm_cosine_mean_random_edges_dbscan_eps0.1_min5
graphs/interview_prep_minilm_cosine_mean_random_edges_dbscan_eps0.1_min10
graphs/interview_prep_minilm_cosine_mean_random_edges_dbscan_eps0.1_min15
graphs/interview_prep_minilm_cosine_mean_random_edges_dbscan_eps0.3_min3
graphs/interview_prep_minilm_cosine_mean_random_edges_dbscan_eps0.3_min5
graphs/interview_prep_minilm_cosine_mean_random_edges_dbscan_eps0.3_min10
graphs/interview_prep_minilm_cosine_mean_random_edges_dbscan_eps0.3_min15
graphs/interview_prep_minilm_cosine_mean_random_edges_dbscan_eps0.5_min3
graphs/interview_prep_minilm_cosine_mean_random_edges_dbscan_eps0.5_min5
graphs/interview_prep_minilm_cosine_mean_random_edges_dbscan_eps0.5_min10
graphs

## Data: Medium (n=2000)

In [12]:
# First 2000 rows of the Medium CSV; the simplified tag lists are used as labels below.
data_name = "medium1k"
data = load_data("data/" + data_name + ".csv", n=2000)
ids, titles = data["ids"], data["titles"]
tags = data["simplified_tags"]

### Embedding and Similarity Scores

In [13]:
# Similarity metrics for all six embedding models on the Medium subset; the
# table is written to CSV and its first ten rows are shown as the cell output.
df1 = fr.get_embedding_similarity_metrics_per_dataset(
    "medium1k",
    tags,
    ["minilm", "mpnet", "nomic", "bert", "specter", "word2vec"],
    ["mean"],
)
df1.to_csv("analysis/metric1_medium1k.csv")
df1.head(10)

EOFError: Ran out of input

In [None]:
# df1 = fr.get_embedding_similarity_metrics_per_dataset("medium1k", tags,
#                                              ["mpnet", "nomic", "bert", "specter", "word2vec"],
#                                              ["mean"])
# df1.to_csv("analysis/metric1_medium1k_rest.csv")
# df1.head(10)

#### Analysis

In [3]:
# Combine the metric-1 results that were saved as separate CSV files.
# NOTE(review): both files are presumably written with df.to_csv(path), like
# every other to_csv call in this notebook, so their first column is the saved
# index. index_col=0 reads it back as the index instead of leaving a spurious
# "Unnamed: 0" data column. Confirm for the minilm file, whose writer is not
# in this notebook.
# pd.read_csv raises EmptyDataError when a file is empty; regenerate the
# inputs first if that happens.
df_minilm = pd.read_csv("analysis/metric1_medium1k_minilm.csv", index_col=0)
df_rest = pd.read_csv("analysis/metric1_medium1k_rest.csv", index_col=0)

# ignore_index gives the combined table a single unique 0..n-1 row index.
df_metric1 = pd.concat([df_minilm, df_rest], ignore_index=True)

EmptyDataError: No columns to parse from file

In [4]:
# Show the combined metric-1 table; df_metric1 is created by the preceding cell.
df_metric1

NameError: name 'df_metric1' is not defined

### Cluster Purity, Homogeneity, and Completeness

In [11]:
k_values = [1, 2, 5, 10, 15, 50, 100]

# Label -> one-argument clustering callable. Loop values are frozen through
# lambda default arguments so each callable keeps its own setting.
clustering_methods = {
    **{
        f"kmeans{k}": lambda x, k=k: clu.kmeans(x, k)
        for k in k_values
    },
    **{
        f"dbscan_eps{eps}_min{min_samples}": (
            lambda x, eps=eps, min_samples=min_samples: clu.dbscan(x, eps=eps, min_samples=min_samples)
        )
        for eps in [0.1, 0.3, 0.5, 0.7, 1.0]
        for min_samples in [3, 5, 10, 15]
    },
    **{
        f"gmm{n_components}": lambda x, n_components=n_components: clu.gmm(x, n_components)
        for n_components in k_values
    },
    **{
        f"birch{k}": lambda x, k=k: clu.birch(x, k)
        for k in k_values
    },
}

# df2 = fr.compare_cluster_metrics("medium1k",
#                                     ["minilm", "mpnet", "nomic", "bert", "specter", "word2vec"],
#                                     ["mean"],
#                                     clustering_methods,
#                                     ids, tags, k=2)

# df2.to_csv("analysis/metric2_medium1k.csv")
# df2.head(10)

In [None]:
# Cluster metrics for the minilm embeddings only; saved to a per-model CSV.
df2 = fr.compare_cluster_metrics(
    "medium1k",
    ["minilm"],
    ["mean"],
    clustering_methods,
    ids,
    tags,
    k=15,
)
df2.to_csv("analysis/metric2_medium1k_minilm.csv")

In [None]:
# Cluster metrics for the specter embeddings only; saved to a per-model CSV.
df2 = fr.compare_cluster_metrics(
    "medium1k",
    ["specter"],
    ["mean"],
    clustering_methods,
    ids,
    tags,
    k=15,
)
df2.to_csv("analysis/metric2_medium1k_specter.csv")

In [None]:
# Cluster metrics for the word2vec embeddings only; saved to a per-model CSV.
df2 = fr.compare_cluster_metrics(
    "medium1k",
    ["word2vec"],
    ["mean"],
    clustering_methods,
    ids,
    tags,
    k=15,
)
df2.to_csv("analysis/metric2_medium1k_word2vec.csv")

In [12]:
# Cluster metrics for the bert embeddings only; saved to a per-model CSV.
# TODO: run again
df2 = fr.compare_cluster_metrics(
    "medium1k",
    ["bert"],
    ["mean"],
    clustering_methods,
    ids,
    tags,
    k=15,
)
df2.to_csv("analysis/metric2_medium1k_bert.csv")

Error: Found array with 1 sample(s) (shape=(1, 768)) while a minimum of 2 is required by AgglomerativeClustering.
Skipping bert, mean, birch1 due to insufficient samples




#### Analysis

In [None]:
# Cluster metrics for the nomic embeddings only; saved to a per-model CSV.
df2 = fr.compare_cluster_metrics(
    "medium1k",
    ["nomic"],
    ["mean"],
    clustering_methods,
    ids,
    tags,
    k=15,
)
df2.to_csv("analysis/metric2_medium1k_nomic.csv")

In [13]:
# Cluster metrics for the mpnet embeddings only; saved to a per-model CSV.
# TODO: run again
df2 = fr.compare_cluster_metrics(
    "medium1k",
    ["mpnet"],
    ["mean"],
    clustering_methods,
    ids,
    tags,
    k=15,
)
df2.to_csv("analysis/metric2_medium1k_mpnet.csv")

### Edge Assignment Evaluation: Tag Connectivity and Degree of Separation

In [None]:
k_values = [1, 2, 5, 10, 15, 20, 30, 40, 50, 65, 80, 100, 120, 150]
threshold_values = [0.1, 0.3, 0.5, 0.7, 0.9]

# Label -> one-argument clustering callable. Loop values are frozen through
# lambda default arguments so each callable keeps its own setting.
clustering_methods = {
    **{f"kmeans{k}": lambda x, k=k: clu.kmeans(x, k) for k in k_values},
    **{f"dbscan_eps{eps}_min{min_samples}": lambda x, eps=eps, min_samples=min_samples: clu.dbscan(x, eps=eps, min_samples=min_samples) for eps in [0.1, 0.3, 0.5, 0.7, 1.0] for min_samples in [3, 5, 10, 15]},
    **{f"gmm{n_components}": lambda x, n_components=n_components: clu.gmm(x, n_components) for n_components in k_values},
    **{f"birch{k}": lambda x, k=k: clu.birch(x, k) for k in k_values}
}

# Label -> callable taking (sim_mat, ids).
edge_assignment_methods = {
    **{f"knn{k}": lambda sim_mat, ids, k=k: edge.knn(sim_mat, ids, k) for k in k_values},
    **{f"knn_mst{k}": lambda sim_mat, ids, k=k: edge.knn_mst(sim_mat, ids, k) for k in k_values},
    **{f"threshold{threshold}": lambda sim_mat, ids, threshold=threshold: edge.threshold_based_edge_assignment(sim_mat, ids, threshold) for threshold in threshold_values},
    **{f"mutual_knn{k}": lambda sim_mat, ids, k=k: edge.mutual_knn_edge_assignment(sim_mat, ids, k) for k in k_values},
    **{f"spectral{n_clusters}": lambda sim_mat, ids, n_clusters=n_clusters: edge.spectral_clustering_edge_assignment(sim_mat, ids, n_clusters) for n_clusters in k_values}
}

# The dataset name must be "medium1k": ids, tags and titles come from the
# Medium data and the result is saved as metric3_medium1k.csv. The previous
# "interview_prep" argument was a copy-paste leftover from the section above.
df3 = fr.compare_edge_assignment_metrics("medium1k",
                                        ["minilm", "mpnet", "nomic", "bert", "specter", "word2vec"],
                                        ["mean"],
                                        clustering_methods,
                                        edge_assignment_methods,
                                        ids, tags, titles, max_depth=3)

df3.to_csv("analysis/metric3_medium1k.csv")
# Bare last expression so the notebook renders the table instead of plain text.
df3.head(10)

### Run: Edge Assignment Evaluation per Embedding Model

In [5]:
threshold_values = [0.1, 0.3, 0.5, 0.7, 0.9]

# Label -> callable taking (sim_mat, ids). k_values is not defined in this
# cell; it must already exist from an earlier cell.
edge_assignment_methods = {
    # Same (sim_mat, ids) signature as every other entry. The earlier
    # one-argument form `lambda x, k=k` let a second positional argument
    # overwrite k, so the id list replaced the requested edge count.
    **{f"random{k}": lambda sim_mat, ids, k=k: edge.random_edges(sim_mat, ids, num_edges_per_node=k) for k in k_values},
    **{f"knn{k}": lambda sim_mat, ids, k=k: edge.knn(sim_mat, ids, k) for k in k_values},
    **{f"knn_mst{k}": lambda sim_mat, ids, k=k: edge.knn_mst(sim_mat, ids, k) for k in k_values},
    **{f"threshold{threshold}": lambda sim_mat, ids, threshold=threshold: edge.threshold_based_edge_assignment(sim_mat, ids, threshold) for threshold in threshold_values},
    **{f"mutual_knn{k}": lambda sim_mat, ids, k=k: edge.mutual_knn_edge_assignment(sim_mat, ids, k) for k in k_values},
    **{f"spectral{n_clusters}": lambda sim_mat, ids, n_clusters=n_clusters: edge.spectral_clustering_edge_assignment(sim_mat, ids, n_clusters) for n_clusters in k_values}
}

In [14]:
# Edge-assignment metrics for the minilm embeddings only; saved to a per-model CSV.
df3 = fr.compare_edge_assignment_metrics(
    "medium1k",
    ["minilm"],
    ["mean"],
    clustering_methods,
    edge_assignment_methods,
    ids,
    tags,
    titles,
    max_depth=8,
)
df3.to_csv("analysis/metric3_medium1k_minilm.csv")

KeyboardInterrupt: 

In [None]:
# Edge-assignment metrics for the specter embeddings only; saved to a per-model CSV.
df3 = fr.compare_edge_assignment_metrics(
    "medium1k",
    ["specter"],
    ["mean"],
    clustering_methods,
    edge_assignment_methods,
    ids,
    tags,
    titles,
    max_depth=8,
)
df3.to_csv("analysis/metric3_medium1k_specter.csv")


: 

In [None]:
# Edge-assignment metrics for the word2vec embeddings only; saved to a per-model CSV.
df3 = fr.compare_edge_assignment_metrics(
    "medium1k",
    ["word2vec"],
    ["mean"],
    clustering_methods,
    edge_assignment_methods,
    ids,
    tags,
    titles,
    max_depth=8,
)
df3.to_csv("analysis/metric3_medium1k_word2vec.csv")

: 

In [None]:
# Edge-assignment metrics for the bert embeddings only; saved to a per-model CSV.
df3 = fr.compare_edge_assignment_metrics(
    "medium1k",
    ["bert"],
    ["mean"],
    clustering_methods,
    edge_assignment_methods,
    ids,
    tags,
    titles,
    max_depth=8,
)
df3.to_csv("analysis/metric3_medium1k_bert.csv")


: 

In [None]:
# Edge-assignment metrics for the nomic embeddings only; saved to a per-model CSV.
df3 = fr.compare_edge_assignment_metrics(
    "medium1k",
    ["nomic"],
    ["mean"],
    clustering_methods,
    edge_assignment_methods,
    ids,
    tags,
    titles,
    max_depth=8,
)
df3.to_csv("analysis/metric3_medium1k_nomic.csv")

: 

In [None]:
# Edge-assignment metrics for the mpnet embeddings only; saved to a per-model CSV.
df3 = fr.compare_edge_assignment_metrics(
    "medium1k",
    ["mpnet"],
    ["mean"],
    clustering_methods,
    edge_assignment_methods,
    ids,
    tags,
    titles,
    max_depth=8,
)
df3.to_csv("analysis/metric3_medium1k_mpnet.csv")

: 

In [None]:
# k_values = [1, 2, 5, 10, 15, 20, 30, 40, 50, 65, 80, 100, 120, 150]

# clustering_methods = {
#     **{f"kmeans{k}": lambda x, k=k: clu.kmeans(x, k) for k in k_values},
#     **{f"dbscan_eps{eps}_min{min_samples}": lambda x, eps=eps, min_samples=min_samples: clu.dbscan(x, eps=eps, min_samples=min_samples) for eps in [0.1, 0.3, 0.5, 0.7, 1.0] for min_samples in [3, 5, 10, 15]},
#     **{f"gmm{n_components}": lambda x, n_components=n_components: clu.gmm(x, n_components) for n_components in k_values},
#     **{f"birch{k}": lambda x, k=k: clu.birch(x, k) for k in k_values}
# }

# df2 = fr.compare_cluster_metrics("medium1k",
#                                     ["minilm", "mpnet", "nomic", "bert", "specter", "word2vec"],
#                                     ["mean"],
#                                     clustering_methods,
#                                     ids, tags, k=15)

# df2.to_csv("analysis/metric2_medium1k.csv")

: 

In [None]:
# k_values = [1, 2, 5, 10, 15, 20, 30, 40, 50, 65, 80, 100, 120, 150]
# threshold_values = [0.1, 0.3, 0.5, 0.7, 0.9]

# clustering_methods = {
#     **{f"kmeans{k}": lambda x, k=k: clu.kmeans(x, k) for k in k_values},
#     **{f"dbscan_eps{eps}_min{min_samples}": lambda x, eps=eps, min_samples=min_samples: clu.dbscan(x, eps=eps, min_samples=min_samples) for eps in [0.1, 0.3, 0.5, 0.7, 1.0] for min_samples in [3, 5, 10, 15]},
#     **{f"gmm{n_components}": lambda x, n_components=n_components: clu.gmm(x, n_components) for n_components in k_values},
#     **{f"birch{k}": lambda x, k=k: clu.birch(x, k) for k in k_values}
# }

# edge_assignment_methods = {
#     **{f"knn{k}": lambda sim_mat, ids, k=k: edge.knn(sim_mat, ids, k) for k in k_values},
#     **{f"knn_mst{k}": lambda sim_mat, ids, k=k: edge.knn_mst(sim_mat, ids, k) for k in k_values},
#     **{f"threshold{threshold}": lambda sim_mat, ids, threshold=threshold: edge.threshold_based_edge_assignment(sim_mat, ids, threshold) for threshold in threshold_values},
#     **{f"mutual_knn{k}": lambda sim_mat, ids, k=k: edge.mutual_knn_edge_assignment(sim_mat, ids, k) for k in k_values},
#     **{f"spectral{n_clusters}": lambda sim_mat, ids, n_clusters=n_clusters: edge.spectral_clustering_edge_assignment(sim_mat, ids, n_clusters) for n_clusters in k_values}
# }

# df3 = fr.compare_edge_assignment_metrics("interview_prep",
#                                         ["minilm", "mpnet", "nomic", "bert", "specter", "word2vec"],
#                                         ["mean"],
#                                         clustering_methods,
#                                         edge_assignment_methods,
#                                         ids, tags, titles, max_depth=8)

# df3.to_csv("analysis/metric3_medium1k.csv")
# df3.head(10)

: 