In [1]:
import sys
import torch
import numpy as np
import pandas as pd
# import plotly.express as px

from tqdm import tqdm
from collections import Counter

# Medium Blog Posts


In [2]:
# https://www.kaggle.com/datasets/fabiochiusano/medium-articles
medium = pd.read_csv("../data/medium_articles.csv")
medium = medium.dropna()
medium.loc[:, "tags"] = medium["tags"].apply(lambda tags: eval(tags))

medium

Unnamed: 0,title,text,url,authors,timestamp,tags
0,Mental Note Vol. 24,Photo by Josh Riemer on Unsplash\n\nMerry Chri...,https://medium.com/invisible-illness/mental-no...,['Ryan Fan'],2020-12-26 03:38:10.479000+00:00,"[Mental Health, Health, Psychology, Science, N..."
1,Your Brain On Coronavirus,Your Brain On Coronavirus\n\nA guide to the cu...,https://medium.com/age-of-awareness/how-the-pa...,['Simon Spichak'],2020-09-23 22:10:17.126000+00:00,"[Mental Health, Coronavirus, Science, Psycholo..."
2,Mind Your Nose,Mind Your Nose\n\nHow smell training can chang...,https://medium.com/neodotlife/mind-your-nose-f...,[],2020-10-10 20:17:37.132000+00:00,"[Biotechnology, Neuroscience, Brain, Wellness,..."
3,The 4 Purposes of Dreams,Passionate about the synergy between science a...,https://medium.com/science-for-real/the-4-purp...,['Eshan Samaranayake'],2020-12-21 16:05:19.524000+00:00,"[Health, Neuroscience, Mental Health, Psycholo..."
4,Surviving a Rod Through the Head,"You’ve heard of him, haven’t you? Phineas Gage...",https://medium.com/live-your-life-on-purpose/s...,['Rishav Sinha'],2020-02-26 00:01:01.576000+00:00,"[Brain, Health, Development, Psychology, Science]"
...,...,...,...,...,...,...
192363,Why do you need a cleaning service?,What could be more important than having a tid...,https://medium.com/@ozneedcleaningau/why-do-yo...,[],2021-11-16 08:17:08.950000+00:00,"[Cleaning, Cleaning Services, Cleaning Company..."
192364,Daily cleaning and maintenance of bedding,Daily cleaning and maintenance of bedding\n\nW...,https://medium.com/@a198blwt/daily-cleaning-an...,[],2021-11-16 05:27:05.359000+00:00,"[Bedding, Cleaning, Maintain]"
192365,Beneficial Advice on Bond Cleaning!,The most important chore at the end is bond cl...,https://medium.com/@princegohil/beneficial-adv...,['Prince Shrawan'],2021-11-26 08:20:27.660000+00:00,"[Cleaning, End Of Lease Cleaning, Cleaners]"
192366,How I Learned Romanian in 37 Easy Steps,How I Learned Romanian in 37 Easy Steps\n\nHey...,https://medium.com/@lifeinromania/how-i-learne...,['Sam Ursu'],2017-11-27 08:09:19.025000+00:00,"[Romania, Language Learning, Storyofmylife]"


In [3]:
medium.info()

<class 'pandas.core.frame.DataFrame'>
Index: 192361 entries, 0 to 192367
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   title      192361 non-null  object
 1   text       192361 non-null  object
 2   url        192361 non-null  object
 3   authors    192361 non-null  object
 4   timestamp  192361 non-null  object
 5   tags       192361 non-null  object
dtypes: object(6)
memory usage: 10.3+ MB


# Visual Exploration

## Individual Tag Frequencies

In [4]:
def get_all_tags(df):
    return [tag for tags_list in df["tags"] for tag in tags_list]

In [5]:
all_tags = get_all_tags(medium)
tag_counts = Counter(all_tags)
# tags, frequencies = list(zip(*tag_counts.most_common(n=100)))
tags, frequencies = list(zip(*tag_counts.most_common(n=1000)))

fig = px.bar(x=tags, y=frequencies)
fig.update_xaxes(title="tags", tickangle=45)
fig.update_yaxes(title="frequencies")
fig.update_layout(width=1200, height=400, title="Individual Tag Frequencies")
fig.show()

## Time of Publishing

In [6]:
medium["datetime"] = pd.to_datetime(medium["timestamp"].str[:-6], format='ISO8601')
medium["month"] = pd.to_datetime(medium["datetime"].dt.strftime('%Y-%m'))
medium = medium[(medium["datetime"].dt.year >= 2014) & (medium["datetime"].dt.year <= 2024)]
medium["month"]

0        2020-12-01
1        2020-09-01
2        2020-10-01
3        2020-12-01
4        2020-02-01
            ...    
192363   2021-11-01
192364   2021-11-01
192365   2021-11-01
192366   2017-11-01
192367   2017-06-01
Name: month, Length: 192340, dtype: datetime64[ns]

In [7]:
fig = px.histogram(medium, x="month")
fig.update_xaxes(title="month-year published", tickangle=45)
fig.update_yaxes(title="count")
fig.update_layout(width=1000, height=400, title="Date of Publishing")
fig.show()

## Length of Articles

In [None]:
fig = px.histogram(medium["text"].str.len())
fig.update_xaxes(title="count", tickangle=45)
fig.update_yaxes(title="article length")
fig.update_layout(width=1000, height=400, title="Length of Articles")
fig.show()

# Tag Simplification

In [8]:
def get_all_tags(df):
    return [tag for tags_list in df["tags"] for tag in tags_list]

def filter_tags(df, filtered_tags):
    # Remove rows with no tags in filtered_tags
    df = df[df["tags"].apply(lambda tags: any(tag in tags for tag in filtered_tags))]
    # Keep only the tags in filtered_tags
    df.loc[:, "tags"] = df["tags"].apply(lambda tags: [tag for tag in tags if tag in filtered_tags])

    return df 

def get_unique_tags(df):
    return df['tags'].explode().unique()

In [9]:
filtered_tags_5k = [tag for tag, count in tag_counts.items() if count > 5000]
filtered_medium_5k = filter_tags(medium, filtered_tags_5k)
filtered_medium_5k.reset_index(drop=True, inplace=True)

filtered_tags_1k = [tag for tag, count in tag_counts.items() if count > 1000]
filtered_medium_1k = filter_tags(medium, filtered_tags_1k)
filtered_medium_1k.reset_index(drop=True, inplace=True)

filtered_tags_500 = [tag for tag, count in tag_counts.items() if count > 500]
filtered_medium_500 = filter_tags(medium, filtered_tags_500)
filtered_medium_500.reset_index(drop=True, inplace=True)

filtered_tags_100 = [tag for tag, count in tag_counts.items() if count > 100]
filtered_medium_100 = filter_tags(medium, filtered_tags_100)
filtered_medium_100.reset_index(drop=True, inplace=True)

In [11]:
# filtered_medium_5k.to_csv("../data/medium_5k_tags.csv")
# filtered_medium_1k.to_csv("../data/medium_1k_tags.csv")
# filtered_medium_500.to_csv("../data/medium_500_tags.csv")
# filtered_medium_100.to_csv("../data/medium_100_tags.csv")

In [16]:
filtered_medium_1k.iloc[:5000].to_csv("../data/medium_1k_tags.csv_5000.csv")

In [10]:
print("Dataset length with each tag occuring at least X times:")
print("5k - {} rows with {} unique tags".format(len(filtered_medium_5k), len(get_unique_tags(filtered_medium_5k))))
print("1k - {} rows with {} unique tags".format(len(filtered_medium_1k), len(get_unique_tags(filtered_medium_1k))))
print("500 - {} rows with {} unique tags".format(len(filtered_medium_500), len(get_unique_tags(filtered_medium_500))))
print("100 - {} rows with {} unique tags".format(len(filtered_medium_100), len(get_unique_tags(filtered_medium_100))))

Dataset length with each tag occuring at least X times:
5k - 52123 rows with 11 unique tags
1k - 127300 rows with 98 unique tags
500 - 149936 rows with 223 unique tags
100 - 189839 rows with 1417 unique tags


In [11]:
get_unique_tags(filtered_medium_1k) # using this one

array(['Mental Health', 'Health', 'Psychology', 'Science', 'Coronavirus',
       'Society', 'Books', 'Entrepreneurship', 'Writing', 'Marketing',
       'Productivity', 'Storytelling', 'Self Improvement',
       'Machine Learning', 'Artificial Intelligence',
       'Personal Development', 'Startup', 'Fiction', 'Creativity',
       'Covid 19', 'Design', 'Life', 'Lifestyle', 'Work',
       'Data Visualization', 'Writing Tips', 'Business', 'Environment',
       'Art', 'Humor', 'Life Lessons', 'Social Media', 'AI', 'Technology',
       'Self-awareness', 'Leadership', 'Food', 'Inspiration', 'Money',
       'Climate Change', 'Music', 'Python', 'Data Science', 'Innovation',
       'Self', 'Software Development', 'Software Engineering',
       'Programming', 'Poetry', 'Advice', 'History', 'Philosophy', 'Love',
       'Racism', 'Culture', 'Learning', 'Education', 'Relationships',
       'LGBTQ', 'Gaming', 'UX', 'Blockchain', 'Family', 'Pandemic',
       'Mindfulness', 'Finance', 'Digital Marketi

## Embeddings

In [12]:
sys.path.append('../')
import src.embeddings as em


IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html



In [None]:
model_id = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer, model = initialize_embedding_model(model_id)

In [None]:
# Just text
reference_df = filtered_medium_1k[:5000]
text = list(reference_df["text"])
em.batch_embeddings(text, batch_size=50, save_path='embeddings_medium_1k_title_text.pt')

In [20]:
# Sample for testing
reference_df = filtered_medium_1k[:5000]
print(len(get_unique_tags(reference_df.iloc[:5000])), "unique tags")

96 unique tags


In [None]:
# Get embedding values
title_text = [title + " " + text for title, text in zip(reference_df["title"], reference_df["text"])]
em.batch_embeddings(title_text, batch_size=50, save_path='embeddings_medium_1k_title_text.pt')

# titles = reference_df["title"].to_list()
# em.batch_embeddings(titles, batch_size=50, save_path='embeddings_medium_1k_titles.pt')

# texts = reference_df["text"].to_list()
# em.batch_embeddings(texts, batch_size=50, save_path='embeddings_medium_1k_text.pt')

TypeError: batch_embeddings() missing 2 required positional arguments: 'tokenizer' and 'model'

In [14]:
# LOAD IT BACK -----------------------------------------------------------------------
embeddings_title_text = torch.load('embeddings_medium_1k_title_text.pt')
embeddings_title_text.size()

# embeddings_title = torch.load('embeddings_medium_1k_title.pt')
# embeddings_title

# embeddings_text = torch.load('embeddings_medium_1k_text.pt')
# embeddings_text

torch.Size([5000, 768])

In [22]:
reference_df["embeddings"] = embeddings_title_text.tolist()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [23]:
reference_df

Unnamed: 0,title,text,url,authors,timestamp,tags,embeddings
0,Mental Note Vol. 24,Photo by Josh Riemer on Unsplash\n\nMerry Chri...,https://medium.com/invisible-illness/mental-no...,['Ryan Fan'],2020-12-26 03:38:10.479000+00:00,"[Mental Health, Health, Psychology, Science]","[0.04866546392440796, 0.07017824798822403, -0...."
1,Your Brain On Coronavirus,Your Brain On Coronavirus\n\nA guide to the cu...,https://medium.com/age-of-awareness/how-the-pa...,['Simon Spichak'],2020-09-23 22:10:17.126000+00:00,"[Mental Health, Coronavirus, Science, Psychology]","[-0.010336386039853096, -0.022455617785453796,..."
2,Mind Your Nose,Mind Your Nose\n\nHow smell training can chang...,https://medium.com/neodotlife/mind-your-nose-f...,[],2020-10-10 20:17:37.132000+00:00,[Science],"[0.01623637229204178, 0.07212214171886444, -0...."
3,The 4 Purposes of Dreams,Passionate about the synergy between science a...,https://medium.com/science-for-real/the-4-purp...,['Eshan Samaranayake'],2020-12-21 16:05:19.524000+00:00,"[Health, Mental Health, Psychology, Science]","[0.04641677066683769, 0.047614991664886475, -0..."
4,Surviving a Rod Through the Head,"You’ve heard of him, haven’t you? Phineas Gage...",https://medium.com/live-your-life-on-purpose/s...,['Rishav Sinha'],2020-02-26 00:01:01.576000+00:00,"[Health, Psychology, Science]","[-0.004666609689593315, 0.00974504929035902, 0..."
...,...,...,...,...,...,...,...
4995,The Quest for an Ultimate Theory of Gravity,Without gravity the night sky would look very ...,https://medium.com/discourse/the-quest-for-an-...,['Alastair Isaacs'],2020-10-14 20:06:11.260000+00:00,[Science],"[0.03250093758106232, 0.02133387327194214, 0.0..."
4996,Convert Your Jupyter-notebook to Github pages ...,Convert Your Jupyter-notebook to Github pages ...,https://medium.com/analytics-vidhya/convert-yo...,[],2020-10-29 12:44:48.058000+00:00,[Data Science],"[0.006400084588676691, 0.025723956525325775, -..."
4997,Why 46 B.C. Is the Longest Recorded Year in Hi...,Why 46 B.C. Is the Longest Recorded Year in Hi...,https://medium.com/history-of-yesterday/why-46...,['Hossein Raspberry'],2020-12-28 09:04:05.759000+00:00,"[History, Life, Politics, Science, Culture]","[0.004997506737709045, -0.0050431108102202415,..."
4998,175. Why Legitimacy And Trust Are Huge Problem...,“Ladies and gentlemen. Welcome to this Board o...,https://clausraasted.medium.com/why-legitimacy...,['Claus Raasted'],2020-12-20 21:03:43.898000+00:00,"[Creativity, Business, Culture]","[0.0771794468164444, 0.051210034638643265, -0...."


# Tag Simplification FR

In [2]:
tag_mapping = {
    'Health': 'Health',
    'Science': 'Science',
    'Books': 'Books',
    'Writing': 'Writing',
    'Marketing': 'Marketing',
    'Productivity': 'Productivity',
    'Storytelling': 'Storytelling',
    'Self Improvement': 'Self Improvement',
    'Machine Learning': 'Technology',
    'Artificial Intelligence': 'Technology',
    'Personal Development': 'Self Improvement',
    'Startup': 'Business',
    'Fiction': 'Books',
    'Creativity': 'Art',
    'Design': 'Art',
    'Data Visualization': 'Technology',
    'Business': 'Business',
    'Environment': 'Lifestyle',
    'Art': 'Art',
    'Humor': 'Entertainment',
    'Social Media': 'Technology',
    'AI': 'Technology',
    'Technology': 'Technology',
    'Leadership': 'Business',
    'Food': 'Food',
    'Inspiration': 'Self Improvement',
    'Money': 'Finance',
    'Music': 'Entertainment',
    'Python': 'Technology',
    'Data Science': 'Technology',
    'Innovation': 'Technology',
    'Software Development': 'Technology',
    'Software Engineering': 'Technology',
    'Programming': 'Technology',
    'Poetry': 'Art',
    'Advice': 'Self Improvement',
    'History': 'Culture',
    'Philosophy': 'Culture',
    'Love': 'Relationships',
    'Racism': 'Culture',
    'Culture': 'Culture',
    'Learning': 'Education',
    'Education': 'Education',
    'Relationships': 'Relationships',
    'Gaming': 'Entertainment',
    'UX': 'Technology',
    'Blockchain': 'Technology',
    'Family': 'Family',
    'Mindfulness': 'Self Improvement',
    'Finance': 'Finance',
    'Digital Marketing': 'Marketing',
    'Feminism': 'Culture',
    'Politics': 'Politics',
    'Short Story': 'Books',
    'Parenting': 'Family',
    'Careers': 'Business',
    'News': 'News',
    'Tech': 'Technology',
    'Deep Learning': 'Technology',
    'JavaScript': 'Technology',
    'Women': 'Culture',
    'Web Development': 'Technology',
    'React': 'Technology',
    'Coding': 'Technology',
    'Spirituality': 'Culture',
    'Religion': 'Culture',
    'Data': 'Technology',
    'Movies': 'Entertainment',
    'Cryptocurrency': 'Finance',
    'Bitcoin': 'Finance',
    'Sports': 'Sports',
    'Trump': 'Politics',
    'Investing': 'Finance',
    'Christianity': 'Culture',
    'Poetry On Medium': 'Art',
    'Travel': 'Travel',
    'Poem': 'Art',
    'Ethereum': 'Finance',
    'Crypto': 'Finance',
    'Baby': 'Family',
    'Defi': 'Finance'
}

In [3]:
import ast

In [9]:
def simplify_tags(df, tag_mapping):
    df['tags'] = df['tags'].apply(ast.literal_eval)
    df['simplified_tags'] = df['tags'].apply(lambda tags: list({tag_mapping[tag] if tag in tag_mapping else tag for tag in tags}))
    return df

In [10]:
medium_1k_tags = pd.read_csv("medium_1k_tags.csv")
medium_1k_tags.head()

Unnamed: 0.1,Unnamed: 0,title,text,url,authors,timestamp,tags,datetime,month
0,0,Mental Note Vol. 24,Photo by Josh Riemer on Unsplash\n\nMerry Chri...,https://medium.com/invisible-illness/mental-no...,['Ryan Fan'],2020-12-26 03:38:10.479000+00:00,"['Mental Health', 'Health', 'Psychology', 'Sci...",2020-12-26 03:38:10.479,2020-12-01
1,1,Your Brain On Coronavirus,Your Brain On Coronavirus\n\nA guide to the cu...,https://medium.com/age-of-awareness/how-the-pa...,['Simon Spichak'],2020-09-23 22:10:17.126000+00:00,"['Mental Health', 'Coronavirus', 'Science', 'P...",2020-09-23 22:10:17.126,2020-09-01
2,2,Mind Your Nose,Mind Your Nose\n\nHow smell training can chang...,https://medium.com/neodotlife/mind-your-nose-f...,[],2020-10-10 20:17:37.132000+00:00,['Science'],2020-10-10 20:17:37.132,2020-10-01
3,3,The 4 Purposes of Dreams,Passionate about the synergy between science a...,https://medium.com/science-for-real/the-4-purp...,['Eshan Samaranayake'],2020-12-21 16:05:19.524000+00:00,"['Health', 'Mental Health', 'Psychology', 'Sci...",2020-12-21 16:05:19.524,2020-12-01
4,4,Surviving a Rod Through the Head,"You’ve heard of him, haven’t you? Phineas Gage...",https://medium.com/live-your-life-on-purpose/s...,['Rishav Sinha'],2020-02-26 00:01:01.576000+00:00,"['Health', 'Psychology', 'Science']",2020-02-26 00:01:01.576,2020-02-01


In [11]:
medium_1k_tags = medium_1k_tags.drop("Unnamed: 0", axis=1)

In [12]:
medium_1k_tags_simplified = simplify_tags(medium_1k_tags, tag_mapping)

In [13]:
medium_1k_tags_simplified.head(5)

Unnamed: 0,title,text,url,authors,timestamp,tags,datetime,month,simplified_tags
0,Mental Note Vol. 24,Photo by Josh Riemer on Unsplash\n\nMerry Chri...,https://medium.com/invisible-illness/mental-no...,['Ryan Fan'],2020-12-26 03:38:10.479000+00:00,"[Mental Health, Health, Psychology, Science]",2020-12-26 03:38:10.479,2020-12-01,"[Science, Health, Mental Health, Psychology]"
1,Your Brain On Coronavirus,Your Brain On Coronavirus\n\nA guide to the cu...,https://medium.com/age-of-awareness/how-the-pa...,['Simon Spichak'],2020-09-23 22:10:17.126000+00:00,"[Mental Health, Coronavirus, Science, Psychology]",2020-09-23 22:10:17.126,2020-09-01,"[Coronavirus, Science, Mental Health, Psychology]"
2,Mind Your Nose,Mind Your Nose\n\nHow smell training can chang...,https://medium.com/neodotlife/mind-your-nose-f...,[],2020-10-10 20:17:37.132000+00:00,[Science],2020-10-10 20:17:37.132,2020-10-01,[Science]
3,The 4 Purposes of Dreams,Passionate about the synergy between science a...,https://medium.com/science-for-real/the-4-purp...,['Eshan Samaranayake'],2020-12-21 16:05:19.524000+00:00,"[Health, Mental Health, Psychology, Science]",2020-12-21 16:05:19.524,2020-12-01,"[Science, Health, Mental Health, Psychology]"
4,Surviving a Rod Through the Head,"You’ve heard of him, haven’t you? Phineas Gage...",https://medium.com/live-your-life-on-purpose/s...,['Rishav Sinha'],2020-02-26 00:01:01.576000+00:00,"[Health, Psychology, Science]",2020-02-26 00:01:01.576,2020-02-01,"[Science, Health, Psychology]"


In [14]:
medium_1k_tags_simplified.to_csv("medium_1k_tags_simplified.csv")

In [26]:
reference_df = simplify_tags(reference_df, tag_mapping)
reference_df



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,title,text,url,authors,timestamp,tags,embeddings,simplified_tags
0,Mental Note Vol. 24,Photo by Josh Riemer on Unsplash\n\nMerry Chri...,https://medium.com/invisible-illness/mental-no...,['Ryan Fan'],2020-12-26 03:38:10.479000+00:00,"[Mental Health, Health, Psychology, Science]","[0.04866546392440796, 0.07017824798822403, -0....","[Mental Health, Health, Psychology, Science]"
1,Your Brain On Coronavirus,Your Brain On Coronavirus\n\nA guide to the cu...,https://medium.com/age-of-awareness/how-the-pa...,['Simon Spichak'],2020-09-23 22:10:17.126000+00:00,"[Mental Health, Coronavirus, Science, Psychology]","[-0.010336386039853096, -0.022455617785453796,...","[Mental Health, Coronavirus, Science, Psychology]"
2,Mind Your Nose,Mind Your Nose\n\nHow smell training can chang...,https://medium.com/neodotlife/mind-your-nose-f...,[],2020-10-10 20:17:37.132000+00:00,[Science],"[0.01623637229204178, 0.07212214171886444, -0....",[Science]
3,The 4 Purposes of Dreams,Passionate about the synergy between science a...,https://medium.com/science-for-real/the-4-purp...,['Eshan Samaranayake'],2020-12-21 16:05:19.524000+00:00,"[Health, Mental Health, Psychology, Science]","[0.04641677066683769, 0.047614991664886475, -0...","[Health, Mental Health, Psychology, Science]"
4,Surviving a Rod Through the Head,"You’ve heard of him, haven’t you? Phineas Gage...",https://medium.com/live-your-life-on-purpose/s...,['Rishav Sinha'],2020-02-26 00:01:01.576000+00:00,"[Health, Psychology, Science]","[-0.004666609689593315, 0.00974504929035902, 0...","[Health, Psychology, Science]"
...,...,...,...,...,...,...,...,...
4995,The Quest for an Ultimate Theory of Gravity,Without gravity the night sky would look very ...,https://medium.com/discourse/the-quest-for-an-...,['Alastair Isaacs'],2020-10-14 20:06:11.260000+00:00,[Science],"[0.03250093758106232, 0.02133387327194214, 0.0...",[Science]
4996,Convert Your Jupyter-notebook to Github pages ...,Convert Your Jupyter-notebook to Github pages ...,https://medium.com/analytics-vidhya/convert-yo...,[],2020-10-29 12:44:48.058000+00:00,[Data Science],"[0.006400084588676691, 0.025723956525325775, -...",[Technology]
4997,Why 46 B.C. Is the Longest Recorded Year in Hi...,Why 46 B.C. Is the Longest Recorded Year in Hi...,https://medium.com/history-of-yesterday/why-46...,['Hossein Raspberry'],2020-12-28 09:04:05.759000+00:00,"[History, Life, Politics, Science, Culture]","[0.004997506737709045, -0.0050431108102202415,...","[Culture, Life, Politics, Science, Culture]"
4998,175. Why Legitimacy And Trust Are Huge Problem...,“Ladies and gentlemen. Welcome to this Board o...,https://clausraasted.medium.com/why-legitimacy...,['Claus Raasted'],2020-12-20 21:03:43.898000+00:00,"[Creativity, Business, Culture]","[0.0771794468164444, 0.051210034638643265, -0....","[Art, Business, Culture]"


In [27]:
reference_df.to_csv("data_medium_1k_tags_5k_obs.csv")

## Cosine Similarity

In [None]:
similarity_matrix = em.batch_similarity_rankings_2d(embeddings_title_text, embeddings_title_text)
similarity_matrix.size()

100%|██████████| 157/157 [00:25<00:00,  6.24it/s]


torch.Size([5000, 5000])

## Clustering