In [42]:
import pickle

import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA


In [2]:
# Load the pickled bookmark table, then drop duplicate rows and rows with
# missing values so every remaining bookmark has text to embed.
# NOTE: read_pickle can execute arbitrary code; only load trusted files.
bookmark_file = 'bookmarks_df.p'
bookmark_data = (
    pd.read_pickle(bookmark_file)
    .drop_duplicates()
    .dropna()
    .reset_index(drop=True)  # keep a clean 0..n-1 index after filtering
)
bookmark_text = bookmark_data['url_text']

In [3]:
# Encode each bookmark's text with a pretrained sentence-transformer model.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
sentences = bookmark_text.tolist()
# One embedding per entry of `sentences`, in the same order.
embeddings = model.encode(sentences, show_progress_bar=True)

Batches: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:35<00:00,  1.24s/it]


In [105]:
# Agglomerative clustering of the embeddings. With n_clusters=None the
# distance_threshold decides how many clusters are formed.
cluster = AgglomerativeClustering(
    n_clusters=None,
    distance_threshold=2,
).fit(embeddings)
# Display how many clusters the threshold produced.
cluster.n_clusters_

58

In [106]:
# Result table: the bookmark frame without its text column, plus the cluster
# label of each row stored as a string (so it can be used as a categorical
# factor).
results = (
    bookmark_data
    .drop(columns=['url_text'])
    .assign(label=cluster.labels_.astype(str))
)

In [107]:
# Project the embeddings with t-SNE for plotting.
# t-SNE is stochastic: without a fixed random_state the layout changes on
# every run, so "Restart & Run All" would not reproduce the figure.
proj = TSNE(perplexity=5, random_state=42)
coords = proj.fit_transform(embeddings)



In [108]:
# Attach the first two projection columns to the result table as the plot
# coordinates, then display the table.
results = results.assign(x=coords[:, 0], y=coords[:, 1])
results

Unnamed: 0,title,url,label,x,y
0,Dashboard | edX,https://courses.edx.org/dashboard,41,-51.335945,53.772099
1,GT | GT Login,https://login.gatech.edu/cas/login,41,-47.139946,58.187729
2,BuzzPort Login,https://buzzport.gatech.edu/cp/home/displaylogin,8,-46.276409,59.355103
3,Georgia Tech :: Account Management :: Passport,https://passport.gatech.edu/home,41,-48.268669,57.535728
4,Georgia Tech OMS Analytics - Google+,https://plus.google.com/communities/1006647335...,41,-49.284851,54.685280
...,...,...,...,...,...
917,.gitignore file - ignoring files in Git | Atla...,https://www.atlassian.com/git/tutorials/saving...,48,-60.901543,1.917918
918,linux - Add previously ignored directory to Gi...,https://stackoverflow.com/questions/23296370/a...,48,-60.072048,0.744990
919,How to update your Git credentials on Windows,https://cmatskas.com/how-to-update-your-git-cr...,3,-56.516674,1.360955
920,Getting started with Git and GitHub: the compl...,https://towardsdatascience.com/getting-started...,3,-51.921165,1.411852


In [109]:
from bokeh.plotting import figure, output_file, show, output_notebook
from bokeh.models import ColorBar, ColumnDataSource
from bokeh.transform import factor_cmap
from bokeh.palettes import brewer, d3, Turbo256

In [110]:
# Configure Bokeh to display plots in the notebook output.
output_notebook()

In [113]:
# Interactive scatter plot of the projected bookmarks, coloured by cluster label.
clusters = np.unique(cluster.labels_)
tooltips = [
    ('Title', '@title'),
    ('Label', '@label'),
]
p = figure(
    width=800,
    height=800,
    tooltips=tooltips,
    title='Bookmark Clusters by Embeddings',
    x_axis_label='Component 1',
    y_axis_label='Component 2')
source = ColumnDataSource(data=results)

# Pick one colour per cluster from Turbo256 at evenly spaced positions, so
# colours are distinct whenever there are at most 256 clusters (the previous
# unseeded random draw with replacement could assign the same colour to two
# clusters and changed on every run). A fixed-seed permutation then spreads
# the hues so consecutive label numbers do not get near-identical colours.
palette_idx = np.rint(
    np.linspace(0, len(Turbo256) - 1, len(clusters))
).astype(int)
palette_idx = np.random.default_rng(0).permutation(palette_idx)
palette = [Turbo256[i] for i in palette_idx]

cmap = factor_cmap(
    'label',
    palette=palette,
    # Plain Python strings matching the string labels stored in `results`.
    factors=clusters.astype(str).tolist(),
)
# scatter() draws circle markers by default; circle(size=...) is deprecated
# in recent Bokeh releases.
p.scatter('x', 'y', source=source, size=8, color=cmap, alpha=0.5)
show(p)