In [2]:
import ast
import pandas as pd
import pickle

import src.embeddings as emb
import src.similarity as sim
import src.metrics_fr as met

  from .autonotebook import tqdm as notebook_tqdm


[Taichi] version 1.7.1, llvm 15.0.7, commit 0f143b2f, osx, python 3.11.8


[I 05/29/24 18:22:36.180 117979] [shell.py:_shell_pop_print@23] Graphical python shell detected, using wrapped sys.stdout


[Taichi] Starting on arch=metal


In [6]:
def load_data(filepath, n=None):
    """Read an article CSV and return its columns as plain Python lists.

    Parameters
    ----------
    filepath : str or os.PathLike
        Location of the CSV file; its name must end in ".csv".
    n : int, optional
        Keep only the first ``n`` rows. Any falsy value (``None`` or ``0``)
        keeps every row.

    Returns
    -------
    dict
        ``"titles"``  - values of the "title" column,
        ``"text"``    - values of the "abstract" column,
        ``"tags"``    - the "tags" column with each cell parsed by
                        ``ast.literal_eval``,
        ``"ids"``     - the DataFrame index labels,
        ``"simplified_tags"`` - only present when the CSV has that column;
                        parsed the same way as ``"tags"``.

    Raises
    ------
    AssertionError
        If ``filepath`` does not end in ".csv".
    KeyError
        If the "title", "abstract" or "tags" column is missing.
    """
    # Go through str() so pathlib.Path (or any os.PathLike) is accepted;
    # slicing a Path object directly raises TypeError.
    assert str(filepath).endswith(".csv"), "Must be a .csv file"
    data = pd.read_csv(filepath)
    if n:
        data = data.head(n)

    attrs = {
        "titles": data["title"].tolist(),
        "text": data["abstract"].tolist(),
        # Tag cells are stored as Python-literal strings; literal_eval parses
        # them without executing arbitrary code.
        "tags": data["tags"].apply(ast.literal_eval).tolist(),
        "ids": data.index.tolist()
    }

    # Optional column: only some datasets ship a simplified tag set.
    if "simplified_tags" in data.columns:
        attrs["simplified_tags"] = data["simplified_tags"].apply(ast.literal_eval).tolist()

    return attrs


def save_to_pickle(object, filename):
    """Serialize ``object`` to ``filename`` with pickle.

    Parameters
    ----------
    object : any picklable value
        The value to store. (The name shadows the builtin; it is kept so
        existing keyword callers keep working.)
    filename : str or os.PathLike
        Destination path; an existing file is overwritten.
    """
    # The context manager flushes and closes the handle even if dump() raises;
    # the previous bare open() left that to the garbage collector.
    with open(filename, 'wb') as handle:
        pickle.dump(object, handle)


def load_from_pickle(filename):
    """Load and return the object pickled at ``filename``.

    Parameters
    ----------
    filename : str or os.PathLike
        Path of a file written with pickle.

    Returns
    -------
    The unpickled object.
    """
    # SECURITY: unpickling can execute arbitrary code. Only load files that
    # this project produced itself.
    # The context manager closes the handle deterministically, including when
    # load() raises on a corrupt file.
    with open(filename, 'rb') as handle:
        return pickle.load(handle)

### Medium

In [None]:
# Load the "medium1k" CSV; the second argument caps the load at the first 2000 rows.
data_name = "medium1k"
data = load_data(f"data/{data_name}.csv", 2000)
# `text` is the list built from the "abstract" column.
text = data["text"]
# load_data only adds "simplified_tags" when the CSV has that column, so this
# lookup raises KeyError for datasets without it.
tags = data["simplified_tags"]

In [None]:
# Embed `text` with the "word2vec" model and pickle the result.
# NOTE(review): `text` is whatever the most recently run load cell produced;
# run the Medium load cell first so the "medium1k" filename below is accurate.
model_name = "word2vec"
word2vec_embeddings = emb.process_embeddings(text, model_name)
save_to_pickle(word2vec_embeddings, "embeddings/medium1k_word2vec_mean_n2000.pickle")

In [None]:
# Embed `text` with the "minilm" model. Unlike the later model cells in this
# section, no similarities or metrics are computed here.
model_name = "minilm"
minilm_embeddings = emb.process_embeddings(text, model_name)

In [None]:
# Pickle the MiniLM embeddings.
# NOTE(review): the filename says "n10000", but the Medium load cell requests
# only 2000 rows and the word2vec cell labels its file "n2000". Confirm the
# label before running, or the file will be mislabelled or overwritten.
save_to_pickle(minilm_embeddings, "embeddings/medium1k_minilm_n10000.pickle")

In [None]:
# Embed `text` with the "mpnet" model, compute the three similarity results
# returned by sim.get_all_similarities, and pass them together with `tags`,
# the model name and the dataset name to the metrics routine.
# NOTE(review): `cosine_sim`, `soft_cosine_sim` and `euclidean_sim` are
# overwritten by every model cell, so they always belong to the last model run.
model_name = "mpnet"
mpnet_embeddings = emb.process_embeddings(text, model_name)
cosine_sim, soft_cosine_sim, euclidean_sim = sim.get_all_similarities(mpnet_embeddings)
# NOTE(review): the sibling cells name this result `<model>_df`; `all_mpnet_df`
# breaks that pattern. Check downstream uses before renaming.
all_mpnet_df = met.calculate_embedding_metrics_for_all(cosine_sim, soft_cosine_sim, euclidean_sim,
                                        tags, model_name, data_name)

In [None]:
# Pickle the MPNet embeddings.
# NOTE(review): the filename says "n10000", but the Medium load cell requests
# only 2000 rows. Confirm the label before running.
save_to_pickle(mpnet_embeddings, "embeddings/medium1k_mpnet_n10000.pickle")

In [None]:
# Embed `text` with the "nomic" model, compute the three similarity results,
# and pass them with `tags`, the model name and the dataset name to the
# metrics routine.
# NOTE(review): depends on `text`, `tags` and `data_name` from the most
# recently run load cell.
model_name = "nomic"
nomic_embeddings = emb.process_embeddings(text, model_name)
cosine_sim, soft_cosine_sim, euclidean_sim = sim.get_all_similarities(nomic_embeddings)
nomic_df = met.calculate_embedding_metrics_for_all(cosine_sim, soft_cosine_sim, euclidean_sim,
                                        tags, model_name, data_name)

In [None]:
# Pickle the Nomic embeddings.
# NOTE(review): the filename says "n10000", but the Medium load cell requests
# only 2000 rows. Confirm the label before running.
save_to_pickle(nomic_embeddings, "embeddings/medium1k_nomic_n10000.pickle")

In [None]:
# Embed `text` with the "bert" model, compute the three similarity results,
# and pass them with `tags`, the model name and the dataset name to the
# metrics routine.
# NOTE(review): depends on `text`, `tags` and `data_name` from the most
# recently run load cell.
model_name = "bert"
bert_embeddings = emb.process_embeddings(text, model_name)
cosine_sim, soft_cosine_sim, euclidean_sim = sim.get_all_similarities(bert_embeddings)
bert_df = met.calculate_embedding_metrics_for_all(cosine_sim, soft_cosine_sim, euclidean_sim,
                                        tags, model_name, data_name)

In [None]:
# Pickle the BERT embeddings.
# NOTE(review): the filename says "n10000", but the Medium load cell requests
# only 2000 rows. Confirm the label before running.
save_to_pickle(bert_embeddings, "embeddings/medium1k_bert_n10000.pickle")

In [None]:
# Embed `text` with the "specter" model, compute the three similarity results,
# and pass them with `tags`, the model name and the dataset name to the
# metrics routine.
# NOTE(review): depends on `text`, `tags` and `data_name` from the most
# recently run load cell.
model_name = "specter"
specter_embeddings = emb.process_embeddings(text, model_name)
cosine_sim, soft_cosine_sim, euclidean_sim = sim.get_all_similarities(specter_embeddings)
specter_df = met.calculate_embedding_metrics_for_all(cosine_sim, soft_cosine_sim, euclidean_sim,
                                        tags, model_name, data_name)

In [None]:
# Pickle the SPECTER embeddings.
# NOTE(review): the filename says "n10000", but the Medium load cell requests
# only 2000 rows. Confirm the label before running.
save_to_pickle(specter_embeddings, "embeddings/medium1k_specter_n10000.pickle")

### arXiv

In [8]:
# Load the "arXiv0_tags" CSV, capped at the first 2000 rows. This rebinds the
# `data_name`, `data`, `text` and `tags` globals used by the Medium section.
data_name = "arXiv0_tags"
data = load_data(f"data/{data_name}.csv", 2000)
text = data["text"]
# Unlike the Medium section, this reads the "tags" key rather than "simplified_tags".
tags = data["tags"]

  data = pd.read_csv(filepath)


In [9]:
# Embed the current `text` with the "minilm" model and pickle the result.
# The filename prefix "arXiv0" is hard-coded and differs from `data_name`
# ("arXiv0_tags"); "n2000" matches the row cap in the arXiv load cell.
model_name = "minilm"
minilm_embeddings = emb.process_embeddings(text, model_name)
save_to_pickle(minilm_embeddings, "embeddings/arXiv0_minilm_mean_n2000.pickle")

Initializing sentence-transformers/all-MiniLM-L6-v2 Model


Processing batch:   Solid-state superi...: 100%|██████████| 63/63 [01:13<00:00,  1.16s/it]


In [10]:
# Embed the current `text` with the "mpnet" model and pickle the result.
# No similarities or metrics are computed in this section's cells.
model_name = "mpnet"
mpnet_embeddings = emb.process_embeddings(text, model_name)
save_to_pickle(mpnet_embeddings, "embeddings/arXiv0_mpnet_mean_n2000.pickle")

Initializing sentence-transformers/all-mpnet-base-v2 Model


Processing batch:   Solid-state superi...: 100%|██████████| 63/63 [06:40<00:00,  6.36s/it]


In [12]:
# Embed the current `text` with the "nomic" model and pickle the result.
model_name = "nomic"
nomic_embeddings = emb.process_embeddings(text, model_name)
save_to_pickle(nomic_embeddings, "embeddings/arXiv0_nomic_mean_n2000.pickle")

Initializing Nomic Model


<All keys matched successfully>
Processing batch:   Solid-state superi...: 100%|██████████| 63/63 [10:09<00:00,  9.68s/it]


In [13]:
# Embed the current `text` with the "specter" model and pickle the result.
model_name = "specter"
specter_embeddings = emb.process_embeddings(text, model_name)
save_to_pickle(specter_embeddings, "embeddings/arXiv0_specter_mean_n2000.pickle")

Initializing AllenAI Specter Model


Processing batch:   Solid-state superi...: 100%|██████████| 63/63 [08:09<00:00,  7.77s/it]


In [14]:
# Embed the current `text` with the "bert" model and pickle the result.
model_name = "bert"
bert_embeddings = emb.process_embeddings(text, model_name)
save_to_pickle(bert_embeddings, "embeddings/arXiv0_bert_mean_n2000.pickle")

Initializing Google BERT Model


Processing batch:   Solid-state superi...: 100%|██████████| 63/63 [10:42<00:00, 10.19s/it]


In [15]:
# Embed the current `text` with the "word2vec" model and pickle the result.
model_name = "word2vec"
word2vec_embeddings = emb.process_embeddings(text, model_name)
save_to_pickle(word2vec_embeddings, "embeddings/arXiv0_word2vec_mean_n2000.pickle")

Initializing Word2Vec Model


Processing batch:     S...: 100%|██████████| 63/63 [00:01<00:00, 32.03it/s]
