In [12]:
import ast
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

import src.embeddings as emb
import src.similarity as sim
import src.metrics as met

In [13]:
def load_data(filepath, n=None):
    """Read an article CSV and return its columns as plain Python lists.

    Parameters
    ----------
    filepath : str or os.PathLike
        Location of the input file. Its string form must end in ".csv".
    n : int, optional
        When given, only ``data.head(n)`` is kept, so ``0`` yields empty
        lists and a negative value drops rows from the end (pandas ``head``
        semantics). ``None`` (the default) keeps every row.

    Returns
    -------
    dict
        ``"titles"`` and ``"text"``: values of the "title" and "text" columns.
        ``"tags"``: the "tags" column with each cell parsed by
        ``ast.literal_eval``.
        ``"ids"``: the DataFrame index values.
        ``"simplified_tags"``: present only when the CSV has that column;
        parsed the same way as ``"tags"``.

    Raises
    ------
    AssertionError
        If ``filepath`` does not end in ".csv".
    KeyError
        If one of the "title", "text" or "tags" columns is missing.
    """
    # Raised explicitly instead of through an `assert` statement so the check
    # is not stripped under `python -O`; the exception type and message are
    # kept so existing callers see the same failure. `str()` lets
    # pathlib.Path objects through as well as plain strings.
    if not str(filepath).endswith(".csv"):
        raise AssertionError("Must be a .csv file")

    data = pd.read_csv(filepath)
    # Compare against None: a truthiness test would silently treat n=0 as
    # "no limit" and return the whole file.
    if n is not None:
        data = data.head(n)

    attrs = {
        "titles": data["title"].tolist(),
        "text": data["text"].tolist(),
        # Tag cells are stored as Python-literal strings, hence literal_eval.
        "tags": data["tags"].apply(ast.literal_eval).tolist(),
        "ids": data.index.tolist()
    }

    if "simplified_tags" in data.columns:
        attrs["simplified_tags"] = data["simplified_tags"].apply(ast.literal_eval).tolist()

    return attrs


def save_to_pickle(object, filename):
    """Serialize ``object`` to the file at ``filename`` using pickle.

    Parameters
    ----------
    object : any picklable value
        The value to store. The parameter name shadows the builtin but is
        kept so keyword callers keep working.
    filename : str or os.PathLike
        Destination path; opened in binary write mode, so an existing file
        is overwritten.

    Returns
    -------
    None
    """
    # The context manager guarantees the handle is flushed and closed even if
    # pickling fails part-way, instead of relying on garbage collection.
    with open(filename, 'wb') as handle:
        pickle.dump(object, handle)


def load_from_pickle(filename):
    """Load and return the object stored in the pickle file ``filename``.

    Parameters
    ----------
    filename : str or os.PathLike
        Path of the pickle file; opened in binary read mode.

    Returns
    -------
    object
        Whatever value was pickled into the file.
    """
    # SECURITY: unpickling can execute arbitrary code. Only use this on files
    # produced by a trusted source.
    # The context manager closes the handle deterministically instead of
    # leaving it to garbage collection.
    with open(filename, 'rb') as handle:
        return pickle.load(handle)

In [14]:
# Dataset selection: load the first 10000 rows of the chosen CSV and pull out
# the two fields the embedding cells use.
data_name = "medium_1k_tags_simplified"
data = load_data(f"data/{data_name}.csv", n=10000)
text, tags = data["text"], data["simplified_tags"]

In [15]:
# MiniLM run: embed `text`, derive the cosine / soft-cosine / euclidean
# similarity outputs, then compute the metrics for them using `tags`.
# Long-running: see the progress output below.
model_name = "minilm"
minilm_embeddings = emb.process_embeddings(text, model_name)
(
    cosine_sim,
    soft_cosine_sim,
    euclidean_sim,
) = sim.get_all_similarities(minilm_embeddings)
all_mini_df = met.calculate_embedding_metrics_for_all(
    cosine_sim,
    soft_cosine_sim,
    euclidean_sim,
    tags,
    model_name,
    data_name,
)

Initializing sentence-transformers/all-MiniLM-L6-v2 Model


Processing batch: The year is 2573:You...:  11%|█         | 33/313 [04:08<33:17,  7.13s/it]

Processing batch: The year is 2573:6<33:17,  7.13s/it]

Processing batch: Business Modelsurce ...:  14%|█▍        | 45/313 [05:37<35:17,  7.90s/it]

Processing batch: Business Models05:42<35:17,  7.90s/it]

Processing batch: #1 — Deon building a...:  20%|██        | 63/313 [07:46<28:52,  6.93s/it]

Processing batch: #1 — Deon  | 63/313 [07:53<28:52,  6.93s/it]

Processing batch: About      | 64/313 [07:53<29:08,  7.02s/it]

Processing batch: About██        | 64/313 [08:01<29:08,  7.02s/it]

Processing batch: Not As Easy As ABC5/313 [08:01<30:04,  7.28s/it]

Processing batch: Not As Easy As ABC30:04,  7.28s/it]

Processing batch: From hard to softela...:  22%|██▏       | 68/313 [08:22<29:42,  7.28s/it]

Processing batch: From hard to soft0<29:42,  7.28s/it]

Processing batch: Digital Marketing a ...:  24%|██▎       | 74/313 [09:12<32:49,  8.24s/it]

Processing batch: Digital Marketing0<32:49,  8

In [16]:
# Write the MiniLM embeddings to disk.
save_to_pickle(
    minilm_embeddings,
    filename="embeddings/medium1k_minilm_n10000.pickle",
)

In [17]:
# MPNet run: embed `text`, derive the cosine / soft-cosine / euclidean
# similarity outputs, then compute the metrics for them using `tags`.
# Long-running: see the progress output below.
model_name = "mpnet"
mpnet_embeddings = emb.process_embeddings(text, model_name)
(
    cosine_sim,
    soft_cosine_sim,
    euclidean_sim,
) = sim.get_all_similarities(mpnet_embeddings)
all_mpnet_df = met.calculate_embedding_metrics_for_all(
    cosine_sim,
    soft_cosine_sim,
    euclidean_sim,
    tags,
    model_name,
    data_name,
)

Initializing sentence-transformers/all-mpnet-base-v2 Model


Processing batch: The year is 2573:You...:  11%|█         | 33/313 [13:49<1:51:56, 23.99s/it]

Processing batch: The year is 2573:5<1:51:56, 23.99s/it]

Processing batch: Business Modelsurce ...:  14%|█▍        | 45/313 [19:15<2:08:24, 28.75s/it]

Processing batch: Business Models19:36<2:08:24, 28.75s/it]

Processing batch: #1 — Deon building a...:  20%|██        | 63/313 [27:19<1:49:39, 26.32s/it]

Processing batch: #1 — Deon  | 63/313 [27:46<1:49:39, 26.32s/it]

Processing batch: About      | 64/313 [27:46<1:50:21, 26.59s/it]

Processing batch: About██        | 64/313 [28:16<1:50:21, 26.59s/it]

Processing batch: Not As Easy As ABC5/313 [28:16<1:53:12, 27.39s/it]

Processing batch: Not As Easy As ABC1:53:12, 27.39s/it]

Processing batch: From hard to softela...:  22%|██▏       | 68/313 [29:33<1:50:04, 26.96s/it]

Processing batch: From hard to soft3<1:50:04, 26.96s/it]

Processing batch: Digital Marketing a ...:  24%|██▎       | 74/313 [32:40<1:57:48, 29.57s/it]

Processing batch: Di

In [18]:
# Write the MPNet embeddings to disk.
save_to_pickle(
    mpnet_embeddings,
    filename="embeddings/medium1k_mpnet_n10000.pickle",
)

In [19]:
# Nomic run: embed `text`, derive the cosine / soft-cosine / euclidean
# similarity outputs, then compute the metrics for them using `tags`.
# Long-running: see the progress output below.
model_name = "nomic"
nomic_embeddings = emb.process_embeddings(text, model_name)
(
    cosine_sim,
    soft_cosine_sim,
    euclidean_sim,
) = sim.get_all_similarities(nomic_embeddings)
nomic_df = met.calculate_embedding_metrics_for_all(
    cosine_sim,
    soft_cosine_sim,
    euclidean_sim,
    tags,
    model_name,
    data_name,
)

Initializing Nomic Model


<All keys matched successfully>
Processing batch: The year is 2573:You...:  11%|█         | 33/313 [17:03<2:17:22, 29.44s/it]

Processing batch: The year is 2573:6<2:17:22, 29.44s/it]

Processing batch: Business Modelsurce ...:  14%|█▍        | 45/313 [23:43<2:33:31, 34.37s/it]

Processing batch: Business Models24:09<2:33:31, 34.37s/it]

Processing batch: #1 — Deon building a...:  20%|██        | 63/313 [33:15<2:08:17, 30.79s/it]

Processing batch: #1 — Deon  | 63/313 [33:46<2:08:17, 30.79s/it]

Processing batch: About      | 64/313 [33:46<2:07:36, 30.75s/it]

Processing batch: About██        | 64/313 [34:20<2:07:36, 30.75s/it]

Processing batch: Not As Easy As ABC5/313 [34:20<2:10:47, 31.64s/it]

Processing batch: Not As Easy As ABC2:10:47, 31.64s/it]

Processing batch: From hard to softela...:  22%|██▏       | 68/313 [35:55<2:12:24, 32.43s/it]

Processing batch: From hard to soft9<2:12:24, 32.43s/it]

Processing batch: Digital Marketing a ...:  24%|██▎       | 74/313 [39:31<2:19:44, 

In [20]:
# Write the Nomic embeddings to disk.
save_to_pickle(
    nomic_embeddings,
    filename="embeddings/medium1k_nomic_n10000.pickle",
)

In [21]:
# BERT run: embed `text`, derive the cosine / soft-cosine / euclidean
# similarity outputs, then compute the metrics for them using `tags`.
# Long-running: see the progress output below.
model_name = "bert"
bert_embeddings = emb.process_embeddings(text, model_name)
(
    cosine_sim,
    soft_cosine_sim,
    euclidean_sim,
) = sim.get_all_similarities(bert_embeddings)
bert_df = met.calculate_embedding_metrics_for_all(
    cosine_sim,
    soft_cosine_sim,
    euclidean_sim,
    tags,
    model_name,
    data_name,
)

Initializing Google BERT Model


Processing batch: The year is 2573:You...:  11%|█         | 33/313 [13:02<1:45:39, 22.64s/it]

Processing batch: The year is 2573:8<1:45:39, 22.64s/it]

Processing batch: Business Modelsurce ...:  14%|█▍        | 45/313 [18:07<1:59:33, 26.77s/it]

Processing batch: Business Models18:25<1:59:33, 26.77s/it]

Processing batch: #1 — Deon building a...:  20%|██        | 63/313 [25:36<1:40:11, 24.05s/it]

Processing batch: #1 — Deon  | 63/313 [26:00<1:40:11, 24.05s/it]

Processing batch: About      | 64/313 [26:00<1:39:29, 23.98s/it]

Processing batch: About██        | 64/313 [26:26<1:39:29, 23.98s/it]

Processing batch: Not As Easy As ABC5/313 [26:26<1:41:40, 24.60s/it]

Processing batch: Not As Easy As ABC1:41:40, 24.60s/it]

Processing batch: From hard to softela...:  22%|██▏       | 68/313 [27:40<1:44:05, 25.49s/it]

Processing batch: From hard to soft8<1:44:05, 25.49s/it]

Processing batch: Digital Marketing a ...:  24%|██▎       | 74/313 [30:28<1:47:42, 27.04s/it]

Processing batch: Di

In [22]:
# Write the BERT embeddings to disk.
save_to_pickle(
    bert_embeddings,
    filename="embeddings/medium1k_bert_n10000.pickle",
)

In [23]:
# Specter run: embed `text`, derive the cosine / soft-cosine / euclidean
# similarity outputs, then compute the metrics for them using `tags`.
# Long-running: see the progress output below.
model_name = "specter"
specter_embeddings = emb.process_embeddings(text, model_name)
(
    cosine_sim,
    soft_cosine_sim,
    euclidean_sim,
) = sim.get_all_similarities(specter_embeddings)
specter_df = met.calculate_embedding_metrics_for_all(
    cosine_sim,
    soft_cosine_sim,
    euclidean_sim,
    tags,
    model_name,
    data_name,
)

Initializing AllenAI Specter Model


Processing batch: The year is 2573:You...:  11%|█         | 33/313 [14:24<1:57:33, 25.19s/it]

Processing batch: The year is 2573:2<1:57:33, 25.19s/it]

Processing batch: Business Modelsurce ...:  14%|█▍        | 45/313 [20:52<2:21:49, 31.75s/it]

Processing batch: Business Models21:14<2:21:49, 31.75s/it]

Processing batch: #1 — Deon building a...:  20%|██        | 63/313 [29:13<1:53:11, 27.17s/it]

Processing batch: #1 — Deon  | 63/313 [29:41<1:53:11, 27.17s/it]

Processing batch: About      | 64/313 [29:41<1:53:19, 27.31s/it]

Processing batch: About██        | 64/313 [30:11<1:53:19, 27.31s/it]

Processing batch: Not As Easy As ABC5/313 [30:11<1:56:11, 28.11s/it]

Processing batch: Not As Easy As ABC1:56:11, 28.11s/it]

Processing batch: From hard to softela...:  22%|██▏       | 68/313 [31:34<1:56:10, 28.45s/it]

Processing batch: From hard to soft5<1:56:10, 28.45s/it]

Processing batch: Digital Marketing a ...:  24%|██▎       | 74/313 [34:45<2:02:14, 30.69s/it]

Processing batch: Di

In [24]:
# Write the Specter embeddings to disk.
save_to_pickle(
    specter_embeddings,
    filename="embeddings/medium1k_specter_n10000.pickle",
)