# Pipeline Experiment Time

In [1]:
import ast
import pandas as pd

import src.utils as utils
import src.embeddings as emb
import src.similarity as ss
import src.edge_constructors as edge
import src.aggregation as agg
import src.clustering as cluster
import src.graph_construction as gc
import src.pipeline as pipe
import src.metrics as m

  from .autonotebook import tqdm as notebook_tqdm


Each graph is labeled with the following components:

- **Data**: Description of the dataset or source
- **Embedding Model**: Type of embedding model used, optionally followed by parameters
- **Comparison Metric**: Similarity metric used to compare embeddings (e.g. cosine)
- **Edge Assignment**: Method of edge assignment, optionally followed by parameters
- **Aggregator**: Method of aggregating embeddings. Used for cluster nodes.
- **Clustering Method**: Method used for clustering, optionally followed by parameters
- **Small World**: Method used for assigning new edges between nodes.

Example:
- **Data**: interview
- **Embedding Model**: all-MiniLM-L6-v2
- **Comparison Metric**: cosine
- **Edge Assignment**: knn2
- **Aggregator**: mean_pooling
- **Clusterer**: None
- **Small World**: None

`interview_all-MiniLM-L6-v2_cosine_knn2_mean_x_x.pickle`

# Helper Functions

In [2]:
def load_data(filepath, n=None):
    """Read a CSV of documents and return its columns as plain Python lists.

    Args:
        filepath: Path to the CSV file (``str`` or path-like). Its string form
            must end in ".csv". The file must provide "title", "text" and
            "tags" columns; a "simplified_tags" column is optional.
        n: If not ``None``, keep only the rows selected by ``DataFrame.head(n)``.
            ``None`` (the default) keeps every row.

    Returns:
        dict with the keys "titles", "text", "tags" and "ids", plus
        "simplified_tags" when that column is present. Each "tags" /
        "simplified_tags" cell is parsed with ``ast.literal_eval``; "ids" is
        the DataFrame index of the retained rows.

    Raises:
        AssertionError: If ``filepath`` does not end in ".csv".
    """
    # str() lets pathlib.Path objects through; plain strings behave as before.
    assert str(filepath).endswith(".csv"), "Must be a .csv file"
    frame = pd.read_csv(filepath)

    # Compare against None explicitly: a bare truthiness test would treat
    # n=0 as "no limit" and silently return every row.
    if n is not None:
        frame = frame.head(n)

    attrs = {
        "titles": frame["title"].tolist(),
        "text": frame["text"].tolist(),
        # Tag cells are stored as Python-literal strings (e.g. "['a', 'b']").
        "tags": frame["tags"].apply(ast.literal_eval).tolist(),
        "ids": frame.index.tolist(),
    }

    if "simplified_tags" in frame.columns:
        attrs["simplified_tags"] = frame["simplified_tags"].apply(ast.literal_eval).tolist()

    return attrs

# Data: interview_prep.csv

My Study.com presentation prep: [[write-up](https://docs.google.com/document/d/14gn6bOk_FW9pkEgEESlip1B_zXMUKVgQeM3tP_fTx5A/edit?usp=sharing)]
- Split by section headers
- Placeholder tags: ["haha"]

In [3]:
# Load every row of the interview-prep CSV (no `n` limit); later cells read
# data["text"], data["ids"], data["titles"] and data["tags"].
data = load_data("data/interview_prep.csv")

## Embeddings: sentence-transformers/all-MiniLM-L6-v2
Metric: Cosine Similarity

In [4]:
# Initialise the tokenizer and model for this model id via the project helper.
model_id = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer, model = emb.initialize_embedding_model(model_id)
# Embed the "text" entries of the currently loaded dataset.
embeddings = emb.batch_embeddings(data["text"], tokenizer, model) # pyright: ignore
# Similarity scores derived from the embeddings; the same metric name ("cosine")
# is passed to the pipeline calls below.
similarity_scores = ss.batch_similarity_scores(embeddings, metric="cosine")

Processing batch: This is a project I ...: 100%|██████████| 1/1 [00:02<00:00,  2.42s/it]
Similarity batch: 0/14: 100%|██████████| 1/1 [00:00<00:00, 352.26it/s]


### Pipeline 1
- similarity metric: cosine
- edge constructor: knn (k=2)
- aggregator: mean pooling
- clusterer: None
- small world: None

In [5]:
# Pipeline 1: build the graph with pipe.connect_directly and persist it.
pickle_name = "graphs/interview_all-MiniLM-L6-v2_cosine_knn2_mean_x_x.pickle"


def knn_edge_constructor(sim_mat, ids):
    """Call edge.knn on the given similarity matrix and ids with k=2."""
    return edge.knn(sim_mat, ids, k=2)


G = pipe.connect_directly(
    embeddings,
    similarity_scores,
    data["ids"],
    similarity_metric="cosine",
    edge_constructor_f=knn_edge_constructor,
    aggregator_f=agg.mean_pooling,
    titles=data["titles"],
    tags=data["tags"],
)
utils.save_graph_to_pickle(G, pickle_name)

Similarity batch: 0/14: 100%|██████████| 1/1 [00:00<00:00, 432.89it/s]


In [6]:
# Convert the pickle written by the previous cell to JSON.
def encoding_f(x):
    """Apply utils.pca to x with n_components=5."""
    return utils.pca(x, n_components=5)


utils.pickle_to_json(pickle_name, encoding_f)

### Pipeline 2
- similarity metric: cosine
- edge constructor: knn (k=2)
- aggregator: mean pooling
- clusterer: None
- small world: Watts-Strogatz (p=0.2)

In [7]:
# Pipeline 2: same direct kNN construction as Pipeline 1, then a
# gc.watts_strogatz pass (p=0.2, fixed seed) before saving.
pickle_name = "graphs/interview_all-MiniLM-L6-v2_cosine_knn2_mean_x_watts.pickle"


def knn_edge_constructor(sim_mat, ids):
    """Call edge.knn on the given similarity matrix and ids with k=2."""
    return edge.knn(sim_mat, ids, k=2)


G = pipe.connect_directly(
    embeddings,
    similarity_scores,
    data["ids"],
    similarity_metric="cosine",
    edge_constructor_f=knn_edge_constructor,
    aggregator_f=agg.mean_pooling,
    titles=data["titles"],
    tags=data["tags"],
)
G = gc.watts_strogatz(G, similarity_scores, p=0.2, seed=42)
utils.save_graph_to_pickle(G, pickle_name)

Similarity batch: 0/14: 100%|██████████| 1/1 [00:00<00:00, 476.19it/s]
Watts-Strogatz: : 14it [00:00, 1221.99it/s]


In [8]:
def encoding_f(x):
    """Apply utils.pca to x with n_components=5."""
    return utils.pca(x, n_components=5)


# Export the graph pickle named by `pickle_name` to JSON.
utils.pickle_to_json(pickle_name, encoding_f)

### Pipeline 3
- similarity metric: cosine
- edge constructor: knn (k=2)
- aggregator: mean pooling
- clusterer: kmeans (n=2)
- small world: None

In [9]:
# Pipeline 3: cluster with kmeans (n_clusters=2), connect with kNN (k=2), save.
pickle_name = "graphs/interview_all-MiniLM-L6-v2_cosine_knn2_mean_kmeans2_x.pickle"


def knn_edge_constructor(sim_mat, ids):
    """Call edge.knn on the given similarity matrix and ids with k=2."""
    return edge.knn(sim_mat, ids, k=2)


def kmeans_clusterer(embeddings):
    """Call cluster.kmeans on the given embeddings with n_clusters=2."""
    return cluster.kmeans(embeddings, n_clusters=2)


G = pipe.cluster_and_connect(
    embeddings,
    similarity_scores,
    data["ids"],
    similarity_metric="cosine",
    edge_constructor_f=knn_edge_constructor,
    clusterer_f=kmeans_clusterer,
    aggregator_f=agg.mean_pooling,
    titles=data["titles"],
    tags=data["tags"],
)
utils.save_graph_to_pickle(G, pickle_name)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Similarity batch: 0/10: 100%|██████████| 1/1 [00:00<00:00, 23.04it/s]
Similarity batch: 0/4: 100%|██████████| 1/1 [00:00<00:00, 443.65it/s]
Similarity batch: 0/3: 100%|██████████| 1/1 [00:00<00:00, 311.45it/s]


In [10]:
def encoding_f(x):
    """Apply utils.pca to x with n_components=5."""
    return utils.pca(x, n_components=5)


# Export the graph pickle named by `pickle_name` to JSON.
utils.pickle_to_json(pickle_name, encoding_f)

# Data: medium_1k_tags.csv

Medium Articles: [huggingface dataset](https://huggingface.co/datasets/fabiochiu/medium-articles)
- Blog post's tags must appear >1k times.

In [11]:
# Rebinds `data`: from here on it holds the first 100 rows (n=100) of the
# Medium CSV instead of the interview data loaded earlier.
data = load_data("data/medium_1k_tags_simplified.csv", n=100)

## Embeddings: sentence-transformers/all-MiniLM-L6-v2
Metric: Cosine Similarity

In [12]:
# Same model id as in the interview section; `tokenizer`, `model`, `embeddings`
# and `similarity_scores` are all rebound here for the Medium data.
model_id = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer, model = emb.initialize_embedding_model(model_id)
# Embed the "text" entries of the currently loaded dataset.
embeddings = emb.batch_embeddings(data["text"], tokenizer, model) # pyright: ignore
# Similarity scores derived from the embeddings, using the "cosine" metric.
similarity_scores = ss.batch_similarity_scores(embeddings, metric="cosine")

Processing batch: Our focus today will...: 100%|██████████| 4/4 [00:20<00:00,  5.09s/it]
Similarity batch: 96/100: 100%|██████████| 4/4 [00:00<00:00, 76.28it/s]


### Pipeline 4
- similarity metric: cosine
- edge constructor: knn (k=3)
- aggregator: mean pooling
- clusterer: None
- small world: None

In [13]:
# Pipeline 4: direct kNN graph (k=3) on the Medium subset, saved to disk.
pickle_name = "graphs/medium1k_all-MiniLM-L6-v2_cosine_knn3_mean_x_x.pickle"


def knn_edge_constructor(sim_mat, ids):
    """Call edge.knn on the given similarity matrix and ids with k=3."""
    return edge.knn(sim_mat, ids, k=3)


G = pipe.connect_directly(
    embeddings,
    similarity_scores,
    data["ids"],
    similarity_metric="cosine",
    edge_constructor_f=knn_edge_constructor,
    aggregator_f=agg.mean_pooling,
    titles=data["titles"],
    tags=data["tags"],
    simplified_tags=data["simplified_tags"],
)
utils.save_graph_to_pickle(G, pickle_name)

Similarity batch: 96/100: 100%|██████████| 4/4 [00:00<00:00, 210.39it/s]


In [14]:
# Reload the graph saved by the Pipeline 4 cell from disk, so the metrics are
# computed on the persisted artifact. `utils` and `m` come from the notebook's
# first (import) cell, so they are not re-imported here.
G = utils.load_graph_from_pickle("graphs/medium1k_all-MiniLM-L6-v2_cosine_knn3_mean_x_x.pickle")

In [15]:
# NOTE(review): the saved output of this cell ends with
# "TypeError: '<=' not supported between instances of 'str' and 'int'".
# The traceback is truncated, so the failing comparison cannot be located from
# the notebook alone — TODO: reproduce and fix before relying on these metrics.
m.aggregate_metrics(G, sample_size=10, depth=2, n_tags=1)

Mental Note Vol. 24
Your Brain On Coronavirus
Mind Your Nose
The 4 Purposes of Dreams
Surviving a Rod Through the Head
Mentally, Young Adults Are Suffering Most From COVID
How to Turn Your Popular Blog Series Into a Bestselling Book
Dr Faisal Dar — Pioneer of Liver Transplantation in Pakistan
Sunlight — The Natural Supplement For Our Mental Health
Occam’s dice
To Quickly Build Trust, Tell Your Origin Story
Four Exercises to Strengthen Your Writing
Facing Three Fundamental Coronavirus Fears
For Creatives, Silence Isn’t Always Golden
This 10-Minute Routine Will Increase Your Clarity And Creativity
The Ted Talk That Changed My Life
How to Make Your Day Job Support Your Art
Exploring New York City Restaurants
A Social Worker Offered Mormon Lingo to Me When I Was in Crisis, Told Me to Think Happy Thoughts, and Hung Up on Me — While I Was Still in Crisis
An Effective Five-Step Process for Writing Captivating Headlines
Loss Aversion — how fear influences customer choice
The FDA Banned These C

TypeError: '<=' not supported between instances of 'str' and 'int'

In [42]:
# NOTE(review): the saved execution count of this cell (42) is out of sequence.
# It exports whatever `pickle_name` currently holds; under Restart & Run All
# that is the Pipeline 4 path assigned a few cells above.
def encoding_f(x):
    """Apply utils.pca to x with n_components=5."""
    return utils.pca(x, n_components=5)


utils.pickle_to_json(pickle_name, encoding_f)

### Pipeline 5
- similarity metric: cosine
- edge constructor: knn (k=3)
- aggregator: mean pooling
- clusterer: kmeans (n=5)
- small world: None

In [35]:
# Pipeline 5: cluster with kmeans (n_clusters=5), connect with kNN (k=3), save.
pickle_name = "graphs/medium1k_all-MiniLM-L6-v2_cosine_knn3_mean_kmeans5_x.pickle"


def knn_edge_constructor(sim_mat, ids):
    """Call edge.knn on the given similarity matrix and ids with k=3."""
    return edge.knn(sim_mat, ids, k=3)


def kmeans_clusterer(embeddings):
    """Call cluster.kmeans on the given embeddings with n_clusters=5."""
    return cluster.kmeans(embeddings, n_clusters=5)


G = pipe.cluster_and_connect(
    embeddings,
    similarity_scores,
    data["ids"],
    similarity_metric="cosine",
    edge_constructor_f=knn_edge_constructor,
    clusterer_f=kmeans_clusterer,
    aggregator_f=agg.mean_pooling,
    titles=data["titles"],
    tags=data["tags"],
    simplified_tags=data["simplified_tags"],
)
utils.save_graph_to_pickle(G, pickle_name)

Similarity batch: 0/20: 100%|██████████| 1/1 [00:00<00:00, 29.26it/s]
Similarity batch: 0/29: 100%|██████████| 1/1 [00:00<00:00, 38.58it/s]
Similarity batch: 0/30: 100%|██████████| 1/1 [00:00<00:00, 14.96it/s]
Similarity batch: 0/9: 100%|██████████| 1/1 [00:00<00:00, 696.38it/s]
Similarity batch: 0/12: 100%|██████████| 1/1 [00:00<00:00, 31.19it/s]
Similarity batch: 0/6: 100%|██████████| 1/1 [00:00<00:00, 512.56it/s]


In [36]:
def encoding_f(x):
    """Apply utils.pca to x with n_components=5."""
    return utils.pca(x, n_components=5)


# Export the graph pickle named by `pickle_name` to JSON.
utils.pickle_to_json(pickle_name, encoding_f)

### Pipeline 6
- similarity metric: cosine
- edge constructor: knn (k=3)
- aggregator: mean pooling
- clusterer: kmeans (n=20)
- small world: None

What happens when we use the same number of clusters as there are unique tags? Greater than? Less than?

In [37]:
# Pipeline 6: cluster with kmeans (n_clusters=20), connect with kNN (k=3), save.
pickle_name = "graphs/medium1k_all-MiniLM-L6-v2_cosine_knn3_mean_kmeans20_x.pickle"


def knn_edge_constructor(sim_mat, ids):
    """Call edge.knn on the given similarity matrix and ids with k=3."""
    return edge.knn(sim_mat, ids, k=3)


def kmeans_clusterer(embeddings):
    """Call cluster.kmeans on the given embeddings with n_clusters=20."""
    return cluster.kmeans(embeddings, n_clusters=20)


G = pipe.cluster_and_connect(
    embeddings,
    similarity_scores,
    data["ids"],
    similarity_metric="cosine",
    edge_constructor_f=knn_edge_constructor,
    clusterer_f=kmeans_clusterer,
    aggregator_f=agg.mean_pooling,
    titles=data["titles"],
    tags=data["tags"],
    simplified_tags=data["simplified_tags"],
)
utils.save_graph_to_pickle(G, pickle_name)

Similarity batch: 0/3: 100%|██████████| 1/1 [00:00<00:00, 283.86it/s]
Similarity batch: 0/17: 100%|██████████| 1/1 [00:00<00:00, 73.96it/s]
Similarity batch: 0/4: 100%|██████████| 1/1 [00:00<00:00, 257.34it/s]
Similarity batch: 0/1: 100%|██████████| 1/1 [00:00<00:00, 474.36it/s]
Similarity batch: 0/15: 100%|██████████| 1/1 [00:00<00:00, 31.57it/s]
Similarity batch: 0/4: 100%|██████████| 1/1 [00:00<00:00, 646.37it/s]
Similarity batch: 0/1: 100%|██████████| 1/1 [00:00<00:00, 407.69it/s]
Similarity batch: 0/4: 100%|██████████| 1/1 [00:00<00:00, 374.96it/s]
Similarity batch: 0/10: 100%|██████████| 1/1 [00:00<00:00, 23.96it/s]
Similarity batch: 0/4: 100%|██████████| 1/1 [00:00<00:00, 809.55it/s]
Similarity batch: 0/6: 100%|██████████| 1/1 [00:00<00:00, 488.28it/s]
Similarity batch: 0/1: 100%|██████████| 1/1 [00:00<00:00, 786.19it/s]
Similarity batch: 0/12: 100%|██████████| 1/1 [00:00<00:00, 596.63it/s]
Similarity batch: 0/5: 100%|██████████| 1/1 [00:00<00:00, 836.69it/s]
Similarity batch: 0

Similarity batch: 0/1: 100%|██████████| 1/1 [00:00<00:00, 914.19it/s]
Similarity batch: 0/1: 100%|██████████| 1/1 [00:00<00:00, 737.52it/s]
Similarity batch: 0/1: 100%|██████████| 1/1 [00:00<00:00, 818.88it/s]
Similarity batch: 0/5: 100%|██████████| 1/1 [00:00<00:00, 358.67it/s]
Similarity batch: 0/21: 100%|██████████| 1/1 [00:00<00:00, 441.32it/s]


In [38]:
def encoding_f(x):
    """Apply utils.pca to x with n_components=5."""
    return utils.pca(x, n_components=5)


# Export the graph pickle named by `pickle_name` to JSON.
utils.pickle_to_json(pickle_name, encoding_f)

### Pipeline 7
- similarity metric: cosine
- edge constructor: knn (k=3)
- aggregator: mean pooling
- clusterer: kmeans (n=5)
- small world: Watts-Strogatz (p=0.2)

What happens when we use the same number of clusters as there are unique tags? Greater than? Less than?

In [39]:
# Pipeline 7: kmeans (n_clusters=5) + kNN (k=3) as in Pipeline 5, followed by a
# gc.watts_strogatz pass (p=0.2, fixed seed) before saving.
pickle_name = "graphs/medium1k_all-MiniLM-L6-v2_cosine_knn3_mean_kmeans5_watts20.pickle"


def knn_edge_constructor(sim_mat, ids):
    """Call edge.knn on the given similarity matrix and ids with k=3."""
    return edge.knn(sim_mat, ids, k=3)


def kmeans_clusterer(embeddings):
    """Call cluster.kmeans on the given embeddings with n_clusters=5."""
    return cluster.kmeans(embeddings, n_clusters=5)


G = pipe.cluster_and_connect(
    embeddings,
    similarity_scores,
    data["ids"],
    similarity_metric="cosine",
    edge_constructor_f=knn_edge_constructor,
    clusterer_f=kmeans_clusterer,
    aggregator_f=agg.mean_pooling,
    titles=data["titles"],
    tags=data["tags"],
    simplified_tags=data["simplified_tags"],
)
G = gc.watts_strogatz(G, similarity_scores, p=0.2, seed=42)
utils.save_graph_to_pickle(G, pickle_name)

Similarity batch: 0/20: 100%|██████████| 1/1 [00:00<00:00, 16.86it/s]
Similarity batch: 0/29: 100%|██████████| 1/1 [00:00<00:00, 25.17it/s]
Similarity batch: 0/30:   0%|          | 0/1 [00:00<?, ?it/s]

Similarity batch: 0/30: 100%|██████████| 1/1 [00:00<00:00, 27.30it/s]
Similarity batch: 0/9: 100%|██████████| 1/1 [00:00<00:00, 780.77it/s]
Similarity batch: 0/12: 100%|██████████| 1/1 [00:00<00:00, 204.93it/s]
Similarity batch: 0/6: 100%|██████████| 1/1 [00:00<00:00, 411.00it/s]
Watts-Strogatz: : 100it [00:00, 1634.01it/s]


In [40]:
def encoding_f(x):
    """Apply utils.pca to x with n_components=5."""
    return utils.pca(x, n_components=5)


# Export the graph pickle named by `pickle_name` to JSON.
utils.pickle_to_json(pickle_name, encoding_f)