In [None]:
import ast
import pandas as pd
import pickle

import src.embeddings as emb
import src.similarity as sim
import src.metrics_fr as met

In [None]:
def load_data(filepath, n=None):
    """Read a CSV dataset and return selected columns as plain Python lists.

    Args:
        filepath: Path string ending in ".csv"; anything else fails the
            assertion below.
        n: Optional row limit. When truthy, only the first ``n`` rows are
            kept. ``None`` and ``0`` both keep every row, because the check
            is a truthiness test.

    Returns:
        dict with the keys ``"titles"``, ``"text"``, ``"tags"`` and ``"ids"``
        (in that order), plus ``"simplified_tags"`` when the CSV has a column
        of that name. ``"tags"`` / ``"simplified_tags"`` hold the result of
        ``ast.literal_eval`` on each cell; ``"ids"`` is the DataFrame index.

    Raises:
        AssertionError: if ``filepath`` does not end with ".csv".
        KeyError: if the "title", "text" or "tags" column is missing.
    """
    assert filepath[-4:] == ".csv", "Must be a .csv file"

    frame = pd.read_csv(filepath)
    # Truthiness test on purpose: n=None and n=0 leave the frame untouched.
    frame = frame.head(n) if n else frame

    def parsed(column):
        # Cells store Python literals as text (e.g. "['a', 'b']"); decode them.
        return frame[column].apply(ast.literal_eval).tolist()

    attrs = dict(
        titles=frame["title"].tolist(),
        text=frame["text"].tolist(),
        tags=parsed("tags"),
        ids=frame.index.tolist(),
    )

    # The simplified tag column is optional in the input file.
    if "simplified_tags" in frame.columns:
        attrs["simplified_tags"] = parsed("simplified_tags")

    return attrs


def save_to_pickle(object, filename):
    """Serialize ``object`` to ``filename`` using pickle.

    Args:
        object: Any picklable Python object. The parameter name shadows the
            builtin but is kept so existing keyword callers keep working.
        filename: Path of the file to write. It is opened in binary write
            mode, so an existing file is overwritten.

    Returns:
        None.
    """
    # The context manager closes (and therefore flushes) the handle even if
    # pickling raises part-way through.
    with open(filename, 'wb') as handle:
        pickle.dump(object, handle)


def load_from_pickle(filename):
    """Load and return the object stored in a pickle file.

    Args:
        filename: Path of the pickle file to read (opened in binary mode).

    Returns:
        The unpickled Python object.
    """
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only call this on files produced by a trusted source.
    with open(filename, 'rb') as handle:
        # The context manager closes the handle once the object is loaded.
        return pickle.load(handle)

In [None]:
# Dataset selection. data_name is also passed to the metrics calls in the
# model cells further down.
data_name = "medium1k"
# Keep at most the first 2000 rows of the CSV (load_data applies
# DataFrame.head when n is truthy).
data = load_data(f"data/{data_name}.csv", 2000)
text = data["text"]
# load_data only adds "simplified_tags" when the CSV has that column;
# otherwise this lookup raises KeyError.
tags = data["simplified_tags"]

In [None]:
# Run emb.process_embeddings on the loaded text with the "word2vec" model and
# pickle the result. No similarity or metric computation happens in this cell.
model_name = "word2vec"
word2vec_embeddings = emb.process_embeddings(text, model_name)
# The output path hard-codes the dataset name and row count instead of
# deriving them from data_name / the n passed to load_data.
save_to_pickle(word2vec_embeddings, "embeddings/medium1k_word2vec_mean_n2000.pickle")

In [None]:
# Run emb.process_embeddings on the loaded text with the "minilm" model.
# Unlike the mpnet/nomic/bert/specter cells, no similarities or metrics are
# computed for this model here.
model_name = "minilm"
minilm_embeddings = emb.process_embeddings(text, model_name)

In [None]:
# Persist the minilm embeddings computed in the previous cell.
# NOTE(review): the filename says "n10000" but the data cell loads with
# n=2000 (and the word2vec file uses "n2000") - confirm the intended name.
save_to_pickle(minilm_embeddings, "embeddings/medium1k_minilm_n10000.pickle")

In [None]:
# mpnet pipeline: embeddings -> three similarity results -> metrics.
model_name = "mpnet"
mpnet_embeddings = emb.process_embeddings(text, model_name)
# cosine_sim / soft_cosine_sim / euclidean_sim are notebook globals that every
# model cell reassigns, so they always hold the most recently run model.
cosine_sim, soft_cosine_sim, euclidean_sim = sim.get_all_similarities(mpnet_embeddings)
# NOTE(review): the result is named all_mpnet_df while the sibling cells use
# <model>_df (nomic_df, bert_df, specter_df) - confirm this is intentional.
all_mpnet_df = met.calculate_embedding_metrics_for_all(cosine_sim, soft_cosine_sim, euclidean_sim,
                                        tags, model_name, data_name)

In [None]:
# Persist the mpnet embeddings computed in the previous cell.
# NOTE(review): the filename says "n10000" but the data cell loads with
# n=2000 - confirm the intended name.
save_to_pickle(mpnet_embeddings, "embeddings/medium1k_mpnet_n10000.pickle")

In [None]:
# nomic pipeline: embeddings -> three similarity results -> metrics.
model_name = "nomic"
nomic_embeddings = emb.process_embeddings(text, model_name)
# Reassigns the shared similarity globals used by every model cell.
cosine_sim, soft_cosine_sim, euclidean_sim = sim.get_all_similarities(nomic_embeddings)
# tags and data_name come from the data-loading cell.
nomic_df = met.calculate_embedding_metrics_for_all(cosine_sim, soft_cosine_sim, euclidean_sim,
                                        tags, model_name, data_name)

In [None]:
# Persist the nomic embeddings computed in the previous cell.
# NOTE(review): the filename says "n10000" but the data cell loads with
# n=2000 - confirm the intended name.
save_to_pickle(nomic_embeddings, "embeddings/medium1k_nomic_n10000.pickle")

In [None]:
# bert pipeline: embeddings -> three similarity results -> metrics.
model_name = "bert"
bert_embeddings = emb.process_embeddings(text, model_name)
# Reassigns the shared similarity globals used by every model cell.
cosine_sim, soft_cosine_sim, euclidean_sim = sim.get_all_similarities(bert_embeddings)
# tags and data_name come from the data-loading cell.
bert_df = met.calculate_embedding_metrics_for_all(cosine_sim, soft_cosine_sim, euclidean_sim,
                                        tags, model_name, data_name)

In [None]:
# Persist the bert embeddings computed in the previous cell.
# NOTE(review): the filename says "n10000" but the data cell loads with
# n=2000 - confirm the intended name.
save_to_pickle(bert_embeddings, "embeddings/medium1k_bert_n10000.pickle")

In [None]:
# specter pipeline: embeddings -> three similarity results -> metrics.
model_name = "specter"
specter_embeddings = emb.process_embeddings(text, model_name)
# Reassigns the shared similarity globals used by every model cell.
cosine_sim, soft_cosine_sim, euclidean_sim = sim.get_all_similarities(specter_embeddings)
# tags and data_name come from the data-loading cell.
specter_df = met.calculate_embedding_metrics_for_all(cosine_sim, soft_cosine_sim, euclidean_sim,
                                        tags, model_name, data_name)

In [None]:
# Persist the specter embeddings computed in the previous cell.
# NOTE(review): the filename says "n10000" but the data cell loads with
# n=2000 - confirm the intended name.
save_to_pickle(specter_embeddings, "embeddings/medium1k_specter_n10000.pickle")