In [None]:
import pandas as pd
# Load the activity export (path is relative to the notebook's working directory).
df = pd.read_json("data/MyActivity.json")

# Keep only search entries: discard any row whose title contains one of these
# activity verbs. One alternation replaces four separate full-frame passes and
# removes exactly the same rows. `na=True` routes rows with a missing title to
# the discard side, where the chained `~...str.contains(...)` form would raise
# on the NaN it produces.
# NOTE(review): the match is unanchored, so a search query that merely contains
# one of these words is discarded as well. That is unchanged from before;
# anchoring it would alter the row set, and a later cell attaches precomputed
# embeddings by position -- confirm how data/embeddings.npy was produced before
# changing the filter.
non_search = df['title'].str.contains('Visited|Viewed|Defined|Used', regex=True, na=True)
df = df[~non_search].copy()

# Strip the leading "Searched for " and lower-case the remainder, so the
# de-duplication below is case-insensitive; the first occurrence in the
# frame's current order is the one kept.
df['title'] = (
    df['title']
    .str.replace('^Searched for ', '', regex=True)
    .str.lower()
)
df = df.drop_duplicates(subset='title', keep='first')

# Parse timestamps (format='mixed' infers the format per element) and put the
# rows in chronological order.
df['time'] = pd.to_datetime(df['time'], format='mixed')
df = df.sort_values(by='time')

In [None]:
import numpy as np
# Precomputed embedding vectors stored next to the activity export.
# NOTE(review): presumably one row per cleaned title -- confirm against the
# script that wrote this file.
embeddings_path = 'data/embeddings.npy'
embeddings = np.load(embeddings_path)

In [None]:
# Attach one embedding vector per row. The pairing is purely positional:
# entry i of `embeddings` goes to the i-th row of `df` in its current order.
# NOTE(review): `df` has already been filtered, de-duplicated and sorted by time
# at this point -- confirm data/embeddings.npy was generated from this exact frame.
if len(embeddings) != len(df):
    # A later cell rebinds `embeddings` to a single day's subset, so re-running
    # this cell out of order lands here; fail with an actionable message.
    raise ValueError(
        f"embeddings has {len(embeddings)} rows but df has {len(df)}; "
        "reload data/embeddings.npy before re-running this cell"
    )
# assign() returns a new frame, so re-running the cell is idempotent.
df = df.assign(embeddings=list(embeddings))

In [None]:
# Restrict to a single calendar day, comparing on the date part of each timestamp.
# The explicit .copy() makes daily_df an independent frame: a later cell adds a
# 'cluster' column to it, which on a bare slice of df triggers pandas'
# SettingWithCopyWarning.
target_date = pd.to_datetime('2020-09-04').date()
daily_df = df.loc[df['time'].dt.date == target_date].copy()

In [None]:
# Pull the selected day's data out of the frame: titles as a plain Python list,
# and the per-row vectors collected into a single NumPy array.
# Note: this rebinds `embeddings`, replacing the full array loaded earlier, so
# the cell that attaches embeddings to `df` cannot be re-run after this one
# without reloading the file.
docs = daily_df['title'].to_list()
day_vectors = daily_df['embeddings'].to_list()
embeddings = np.array(day_vectors)

In [None]:
from umap import UMAP
from hdbscan import HDBSCAN

# Dimensionality reduction: 10 output components, cosine metric, fixed seed.
umap_model = UMAP(
    metric="cosine",
    n_components=10,
    random_state=42,
    verbose=True,
)

# Clustering on the reduced vectors.
hdbscan_model = HDBSCAN(
    min_cluster_size=2,
    min_samples=3,
    gen_min_span_tree=True,
    prediction_data=False,
)


In [None]:
# Two-stage pipeline: project the day's embeddings with the configured UMAP
# model, then cluster the projected points with HDBSCAN. `clusters` holds one
# label per input row, in the same order as `embeddings`.
umap_embeddings = umap_model.fit_transform(X=embeddings)
clusters = hdbscan_model.fit_predict(X=umap_embeddings)

In [None]:
# One cluster label per row, paired by position with the rows of daily_df.
# assign() builds a new frame instead of writing into daily_df, which was
# produced by slicing df; in-place column assignment on such a slice triggers
# pandas' SettingWithCopyWarning.
daily_df = daily_df.assign(cluster=clusters)

In [None]:
import plotly.express as px

# 3-D view of the same day's embeddings, coloured by cluster label.
# This is a separate UMAP fit from the 10-component one used for clustering.
# NOTE(review): it uses a different seed, random init and no `metric` argument
# (the clustering model passes metric="cosine") -- confirm that is intended.
umap_3d = UMAP(n_components=3, init="random", random_state=0)
proj_3d = umap_3d.fit_transform(embeddings)

# Labels are cast to strings before being handed to plotly as the colour
# dimension (presumably so they are coloured as discrete categories).
cluster_labels = daily_df["cluster"].astype(str)

# Columns 0, 1 and 2 of the projection become the x, y and z axes.
fig_3d = px.scatter_3d(
    proj_3d,
    x=0, y=1, z=2,
    color=cluster_labels,
)
fig_3d.update_traces(marker={"size": 5})
fig_3d.show()