In [1]:
from pathlib import Path
import pandas as pd
from itertools import zip_longest
import torch
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer

# Fail fast when no CUDA device is visible: the model is moved to "cuda:0" further down.
assert torch.cuda.is_available()


# from the itertools examples
def grouper(iterable, n, *, incomplete="fill", fillvalue=None):
    "Collect data into non-overlapping fixed-length chunks or blocks"
    # grouper('ABCDEFG', 3, fillvalue='x') --> ABC DEF Gxx
    # grouper('ABCDEFG', 3, incomplete='strict') --> ABC DEF ValueError
    # grouper('ABCDEFG', 3, incomplete='ignore') --> ABC DEF
    args = [iter(iterable)] * n
    if incomplete == "fill":
        return zip_longest(*args, fillvalue=fillvalue)
    if incomplete == "strict":
        return zip(*args, strict=True)
    if incomplete == "ignore":
        return zip(*args)
    else:
        raise ValueError("Expected fill, strict, or ignore")



In [2]:

# GPU that will hold the model weights.
device = "cuda:0"

# The small checkpoint is used here (as opposed to the base one).
model_name = "jinaai/jina-embeddings-v2-small-en"

# trust_remote_code is needed to use the encode method
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)

# Kept as the last expression so the notebook displays the moved model.
model.to(device)


JinaBertModel(
  (embeddings): JinaBertEmbeddings(
    (word_embeddings): Embedding(30528, 512, padding_idx=0)
    (token_type_embeddings): Embedding(2, 512)
    (LayerNorm): LayerNorm((512,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): JinaBertEncoder(
    (layer): ModuleList(
      (0-3): 4 x JinaBertLayer(
        (attention): JinaBertAttention(
          (self): JinaBertSelfAttention(
            (query): Linear(in_features=512, out_features=512, bias=True)
            (key): Linear(in_features=512, out_features=512, bias=True)
            (value): Linear(in_features=512, out_features=512, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (output): JinaBertSelfOutput(
            (dense): Linear(in_features=512, out_features=512, bias=True)
            (LayerNorm): LayerNorm((512,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
 

In [3]:
import datetime
import yaml

def strip_header(markdown: str) -> tuple[datetime.datetime, str]:
    """Replace the '---'-delineated header of the passed markdown with a title line.

    The text between the first two '---' markers is parsed as YAML and must
    provide ``title`` and ``mtime``.

    Args:
        markdown: full note text, starting with a '---' ... '---' front-matter block.

    Returns:
        ``(mtime, text)`` where ``mtime`` is the front matter's modification
        time as a ``datetime`` and ``text`` is ``"title: <title>"`` followed by
        everything after the closing '---'.

    Raises:
        ValueError: if the text does not contain two '---' markers, or ``mtime``
            is a string that is not in ISO format.
        KeyError: if ``title`` or ``mtime`` is missing from the front matter.
    """
    # maxsplit=2 so that any later '---' (e.g. a horizontal rule in the body) is left alone
    _, front_matter, body = markdown.split("---", 2)
    fm = yaml.safe_load(front_matter)
    text = f"title: {fm['title']}" + body

    raw_mtime = fm["mtime"]
    if isinstance(raw_mtime, datetime.datetime):
        # The YAML loader resolves some timestamp spellings (e.g. an offset written
        # as "+02:00") to datetime objects itself; fromisoformat() only takes strings.
        mtime = raw_mtime
    else:
        mtime = datetime.datetime.fromisoformat(raw_mtime)

    return mtime, text


# Quick check on a minimal note: a two-field front matter followed by a one-line body.
strip_header("---\ntitle: bleh\nmtime: 2022-09-24T15:42:52+0200\n---\nhello there")



(datetime.datetime(2022, 9, 24, 15, 42, 52, tzinfo=datetime.timezone(datetime.timedelta(seconds=7200))),
 'title: bleh\nhello there')

# Embed the exported notes

On this RTX2070 with 8GB of VRAM, via WSL2, processing 1700 org-roam nodes with jina small takes 1m15s.

In [7]:
# In the jina documentation they mention a batch size of 32.
# Before I knew that, I experimented with batching more than one doc from this side;
# for jina-base, batch_size=1 was the fastest.
batch_size = 1
notes_dir = Path("/tmp/bleh")

embs = {}
mtimes = {}

# Glob once, so that the progress-bar total and the loop see exactly the same files.
all_files = list(notes_dir.glob("*.md"))
# Ceiling division: a trailing partial batch still counts as one iteration.
n_batches = (len(all_files) + batch_size - 1) // batch_size

for chunk in tqdm(grouper(all_files, batch_size), desc="embedding docs", total=n_batches):
    # grouper() pads the final chunk with None when the number of files is not a
    # multiple of batch_size; drop that padding before touching any file.
    files = [f for f in chunk if f is not None]

    input_texts = []
    for f in files:
        # read_text() closes the file again, unlike a bare open().read()
        mtime, md = strip_header(f.read_text())
        input_texts.append(md)
        mtimes[f.stem] = mtime

    embs_ = model.encode(input_texts)
    for f, e in zip(files, embs_):
        embs[f.stem] = e


# dict key should be row index, hence orient="index".
# The keys are plain strings (file stems), so the index can go to parquet as-is.
df = pd.DataFrame.from_dict(embs, orient="index")

# write out the embeddings
df.to_parquet("embs_jina_small.parq")

# write out the modified times of the files, keyed the same way as the embeddings
df_mtimes = pd.DataFrame.from_dict(mtimes, orient="index")
df_mtimes.to_parquet("mtimes.parq")

embedding docs: 100%|██████████| 1747/1747 [03:55<00:00,  7.41it/s]


In [38]:
import numpy as np

embs_df = pd.read_parquet("embs_jina_small.parq")

# Cosine similarity is (a . b) / (|a| * |b|); once both vectors have unit
# length this is simply the dot product a . b.

# Query vector, scaled to unit length.
q = model.encode("large language models for embedding")
qn = q / np.linalg.norm(q)

# Document vectors, each row divided by its own L2 norm (done once for the whole table).
row_norms = np.linalg.norm(embs_df, axis=1)
meh = embs_df.div(row_norms, axis=0)

# One matrix-vector product scores every document against the query;
# print the 10 highest-scoring documents, best first.
scores = meh.dot(qn)
print(scores.sort_values(ascending=False).head(10))


9fb41906-6b86-11ee-938a-2f3e9027a68f    0.846584
810c3086-e3f6-414d-833d-07b8ccdc5e80    0.837450
21a523d7-d3e4-4e5f-be70-0390d5fdb7ac    0.832863
0f38c3ae-e988-469d-8a81-251b710d72bb    0.829302
6e2f3bc9-6e59-49a6-b007-308ff7a4d388    0.822630
EDE3D6D0-A370-4C4C-BA51-5195A4B8854A    0.822022
E9F88126-5E51-4564-8037-9D4BD6BBA213    0.820573
generative_ai_practitioners_view.md     0.819274
c4fbe172-6c55-407c-b872-4060bfc27766    0.817052
c0659255-bed1-41bb-8892-e7d6ae7ed747    0.813647
dtype: float32


In [6]:
import datetime
import pandas as pd
import umap
import umap.plot

df = pd.read_parquet("embs_jina_small.parq")
df_mtimes = pd.read_parquet("mtimes.parq")

# The two tables must list the same filenames in the same order, because the
# plotting code pairs embeddings and modification times by position.
assert df.index.equals(df_mtimes.index), "embeddings and mtimes are indexed differently"

# UMAP is stochastic: without a fixed seed the 2D layout changes on every run,
# so the plot could not be reproduced. The seed value itself is arbitrary.
mapper = umap.UMAP(random_state=42).fit(df)

In [9]:
from pathlib import Path

# The index of df holds bare file stems (the embedding cell keys every note by
# Path.stem), so the directory and the ".md" suffix have to be put back before a
# note can be opened.
notes_dir = Path("/tmp/bleh")

hover_data = pd.DataFrame({"filename": df.index})
previews = []
titles = []
for fn in hover_data["filename"]:
    # Read the whole note: cutting it off after a fixed number of characters can
    # slice through the front matter, which strip_header() cannot parse.
    # read_text() also closes the file again.
    _, md = strip_header((notes_dir / f"{fn}.md").read_text())
    # strip_header() puts "title: <title>" on the first line of what it returns;
    # maxsplit=1 keeps titles intact that themselves contain "title: ".
    first_line = md.split("\n", 1)[0]
    titles.append(first_line.split("title: ", 1)[1])
    previews.append(md[:512])

# DANGER!!!: comment out "previews" line and re-run this cell if you're going to share your notebook
#            or you might leak some private information!
#hover_data["preview"] = previews

hover_data["title"] = titles

umap.plot.output_notebook()

# calculate ages in days based on time now, using the timestamps in df_mtimes
note_age = - (df_mtimes[0] - datetime.datetime.now(tz=datetime.timezone.utc)).dt.total_seconds() / (24 * 60 * 60)
plot = umap.plot.interactive(mapper, values=note_age.values, point_size=5, hover_data=hover_data, tools=["pan","wheel_zoom","box_zoom","save","reset","help",])

# https://docs.bokeh.org/en/latest/docs/user_guide/basic/annotations.html#ug-basic-annotations-color-bars
# we need to get renderer r, which in our case is the result of the thing making the dots
# umap.plot.interactive does not store its renderer, so we take a chance on the first one
circle_renderer = plot.renderers[0]
color_bar = circle_renderer.construct_color_bar(padding=1)
plot.add_layout(color_bar, "right")

umap.plot.show(plot)

In [4]:

# Inspect the renderers held by the plot (the colour-bar code above uses the first one).
plot.renderers

[GlyphRenderer(id='p1041', ...)]