In [167]:
import sqlite3

# Open the notes database and obtain a cursor for running statements.
conn = sqlite3.connect('data.db')
cursor = conn.cursor()

# Pull every row of the `blocks` table into memory and report how many there are.
blocks = cursor.execute('SELECT * FROM blocks').fetchall()
print('Total blocks: ', len(blocks))

Total blocks:  15398


In [166]:
import sqlite3

# Open the notes database and obtain a cursor for running statements.
conn = sqlite3.connect('data.db')
cursor = conn.cursor()

# Pull every row of the `pages` table into memory and report how many there are.
pages = cursor.execute('SELECT * FROM pages').fetchall()
print('Total pages: ', len(pages))

Total pages:  356


In [99]:
# Peek at one raw row of `pages` to see the tuple layout returned by SQLite.
# Later cells read index 1 and index 3 of each page tuple when building text to embed.
pages[1]

(2,
 'P%3A Bonsol.md',
 '/Users/freddycaceres/Research/Second brain/P%3A Bonsol.md',
 'tags:: [[Projects]]\n',
 '{"tags": ["[[Projects]]"]}')

In [6]:
import pandas as pd
from fastembed.rerank.cross_encoder import TextCrossEncoder

# Catalogue of the cross-encoder (re-ranking) models fastembed supports,
# smallest first, with the bulky bookkeeping columns removed.
(
    pd.DataFrame(TextCrossEncoder.list_supported_models())
    .drop(["sources", "model_file"], axis="columns")
    .sort_values(by="size_in_GB")
    .reset_index(drop=True)
)

Unnamed: 0,model,size_in_GB,description,license
0,Xenova/ms-marco-MiniLM-L-6-v2,0.08,MiniLM-L-6-v2 model optimized for re-ranking t...,apache-2.0
1,Xenova/ms-marco-MiniLM-L-12-v2,0.12,MiniLM-L-12-v2 model optimized for re-ranking ...,apache-2.0
2,jinaai/jina-reranker-v1-tiny-en,0.13,Designed for blazing-fast re-ranking with 8K c...,apache-2.0
3,jinaai/jina-reranker-v1-turbo-en,0.15,Designed for blazing-fast re-ranking with 8K c...,apache-2.0
4,BAAI/bge-reranker-base,1.04,BGE reranker base model for cross-encoder re-r...,mit
5,jinaai/jina-reranker-v2-base-multilingual,1.11,A multi-lingual reranker model for cross-encod...,cc-by-nc-4.0


In [168]:
# Build one text string per row to feed the embedding model.
# Both lists keep the row order of `blocks` / `pages`, so position i in a list
# (and in the embeddings derived from it) refers back to row i of the source.
blocks_to_embed = ["{} {} {}".format(row[1], row[4], row[3]) for row in blocks]
pages_to_embed = ["{} {}".format(row[1], row[3]) for row in pages]

In [169]:
# Sanity-check the text that was assembled for the first block.
blocks_to_embed[0]

'2023_09_04.md#0 {} - [Investment research for everyone](https://openbb.co/) /tit'

In [25]:
from fastembed import TextEmbedding
# Catalogue of the text-embedding models fastembed supports, smallest first,
# with the bulky bookkeeping columns removed. This only lists metadata; no
# model is constructed in this cell.
(
    pd.DataFrame(TextEmbedding.list_supported_models())
    .drop(["sources", "model_file"], axis="columns")
    .sort_values(by="size_in_GB")
    .reset_index(drop=True)
)

Unnamed: 0,model,dim,description,license,size_in_GB,additional_files
0,BAAI/bge-small-en-v1.5,384,"Text embeddings, Unimodal (text), English, 512 input tokens truncation, Prefixes for queries/doc...",mit,0.067,
1,BAAI/bge-small-zh-v1.5,512,"Text embeddings, Unimodal (text), Chinese, 512 input tokens truncation, Prefixes for queries/doc...",mit,0.09,
2,snowflake/snowflake-arctic-embed-xs,384,"Text embeddings, Unimodal (text), English, 512 input tokens truncation, Prefixes for queries/doc...",apache-2.0,0.09,
3,sentence-transformers/all-MiniLM-L6-v2,384,"Text embeddings, Unimodal (text), English, 256 input tokens truncation, Prefixes for queries/doc...",apache-2.0,0.09,
4,jinaai/jina-embeddings-v2-small-en,512,"Text embeddings, Unimodal (text), English, 8192 input tokens truncation, Prefixes for queries/do...",apache-2.0,0.12,
5,BAAI/bge-small-en,384,"Text embeddings, Unimodal (text), English, 512 input tokens truncation, Prefixes for queries/doc...",mit,0.13,
6,snowflake/snowflake-arctic-embed-s,384,"Text embeddings, Unimodal (text), English, 512 input tokens truncation, Prefixes for queries/doc...",apache-2.0,0.13,
7,nomic-ai/nomic-embed-text-v1.5-Q,768,"Text embeddings, Multimodal (text, image), English, 8192 input tokens truncation, Prefixes for q...",apache-2.0,0.13,
8,BAAI/bge-base-en-v1.5,768,"Text embeddings, Unimodal (text), English, 512 input tokens truncation, Prefixes for queries/doc...",mit,0.21,
9,sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2,384,"Text embeddings, Unimodal (text), Multilingual (~50 languages), 512 input tokens truncation, Pre...",apache-2.0,0.22,


In [170]:
from fastembed import TextEmbedding
# Construct the embedding model with the library defaults (no model name given).
# This is the step that fetches the model files and initialises the model; every
# later embedding cell reuses this `embedding_model` object.
embedding_model = TextEmbedding()

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

In [171]:
# Embed every block text. Wrapping the result in list() materialises all vectors
# now, in the same order as `blocks_to_embed`.
blocks_embeddings = list(embedding_model.embed(blocks_to_embed))

In [172]:
# Embed every page text. Wrapping the result in list() materialises all vectors
# now, in the same order as `pages_to_embed`.
pages_embeddings = list(embedding_model.embed(pages_to_embed))

In [173]:
# Embed the second field of each page row on its own (treated as the page title),
# so a query can match a page by name independently of the page body.
pages_titles = [title for _, title, *_ in pages]
titles_embeddings = [*embedding_model.embed(pages_titles)]

In [178]:
# Embed the search query with the same model object that produced the corpus
# embeddings. Reusing `embedding_model` instead of constructing a second
# TextEmbedding avoids re-fetching / re-initialising the model on every query
# and keeps the query in the same vector space as the corpus.
query = [ 'Franklin happiness on writing' ]
query_embedding = list(embedding_model.embed(query))

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

In [175]:
# Report "<number of vectors> x <vector length>" for each embedding collection.
for label, vectors in (
    ("Query embedding", query_embedding),
    ("Blocks embeddings", blocks_embeddings),
    ("Pages embeddings", pages_embeddings),
    ("Titles embeddings", titles_embeddings),
):
    print(f"{label} dimensions: ", len(vectors), "x", len(vectors[0]))

Query embedding dimensions:  1 x 384
Blocks embeddings dimensions:  15398 x 384
Pages embeddings dimensions:  356 x 384
Titles embeddings dimensions:  356 x 384


In [179]:
from sklearn.metrics.pairwise import cosine_similarity
# Concatenate the three embedding lists into a single search space and score
# the query against all of it. The order (blocks, then pages, then page titles)
# matters: a result index is mapped back to its source by offset.
combined_embeddings = [*blocks_embeddings, *pages_embeddings, *titles_embeddings]
similarities = cosine_similarity(query_embedding, combined_embeddings)
print("Shape of similarities: ", similarities.shape)

Shape of similarities:  (1, 16110)


In [180]:
import numpy as np
import pandas as pd

# Number of best-scoring matches to display.
TOP_K = 20

# Segment sizes of `combined_embeddings`, which is laid out as
# [blocks..., pages..., page titles...].
blocks_length = len(blocks_embeddings)
pages_length = len(pages_embeddings)


def describe_match(i):
    """Map an index into `combined_embeddings` back to its source row.

    Returns a (type, content, path, score) tuple, where the type is "block",
    "page" or "page_title" depending on which segment index `i` falls into.
    """
    score = similarities[0, i]
    if i < blocks_length:
        return ("block", blocks[i][3], blocks[i][2], score)
    if i < blocks_length + pages_length:
        page = pages[i - blocks_length]
        return ("page", page[1], page[2], score)
    page = pages[i - blocks_length - pages_length]
    return ("page_title", page[1], page[2], score)


# argsort is ascending, so take the last TOP_K entries and reverse them to get
# the best match first.
top_indices = np.argsort(similarities[0])[-TOP_K:][::-1]
# Legacy alias: the original name said "5" although 20 indices were selected.
top_5_indices = top_indices
top_data = [describe_match(i) for i in top_indices]
pd.set_option('display.max_colwidth', 100)
pd.DataFrame(top_data, columns=["Type", "Content", "Path", "Score"])

Unnamed: 0,Type,Content,Path,Score
0,block,"- Great quote from [[persons/Benjamin Franklin]] about a happy meaningful life, and the art of j...",/Users/freddycaceres/Research/Second brain/2023_06_28.md#0,0.761746
1,block,"\t- Of the joy he found in writing, he said, ‘To sit at one’s table on a sunny morning, with fou...",/Users/freddycaceres/Research/Second brain/Churchill - Andrew Roberts.md#114,0.725263
2,block,"\t- I saw the justice of his remarks, and thence grew more attentive to the manner in writing, a...",/Users/freddycaceres/Research/Second brain/The Autobiography of Benjamin Franklin - Benjamin Fra...,0.707956
3,block,\t- [[Personality Traits]],/Users/freddycaceres/Research/Second brain/Books___The Autobiography of Benjamin Franklin.md#9,0.707479
4,block,"\t- In truth, I found myself incorrigible with respect to Order; and now I am grown old, and my ...",/Users/freddycaceres/Research/Second brain/The Autobiography of Benjamin Franklin - Benjamin Fra...,0.70675
5,block,\t- He quoted a poem from [[John Dryden]],/Users/freddycaceres/Research/Second brain/Books___The Autobiography of Benjamin Franklin.md#42,0.706448
6,block,\t\t- Persue of [[Honesty]],/Users/freddycaceres/Research/Second brain/Books___The Autobiography of Benjamin Franklin.md#10,0.704935
7,block,\t\t- **Tags**: #[[favorite]],/Users/freddycaceres/Research/Second brain/The Autobiography of Benjamin Franklin - Benjamin Fra...,0.697982
8,block,"\t- ==This gave me occasion to observe, that, when men are employed, they are best contented; fo...",/Users/freddycaceres/Research/Second brain/The Autobiography of Benjamin Franklin - Benjamin Fra...,0.690788
9,block,\t\t- **Tags**: #[[favorite]],/Users/freddycaceres/Research/Second brain/The Autobiography of Benjamin Franklin - Benjamin Fra...,0.690605


In [165]:
import sqlite3

# Open the notes database and obtain a cursor for running statements.
conn = sqlite3.connect('data.db')
cursor = conn.cursor()

# Plain keyword search, used as a baseline to compare with the semantic results.
# The pattern is bound as a parameter: a double-quoted "..." in SQL is an
# identifier, and SQLite only treats it as a string as a legacy fallback.
# The result goes into its own variable so the full `blocks` list (whose order
# must stay aligned with `blocks_embeddings`) is not overwritten.
cursor.execute('SELECT * FROM blocks WHERE content LIKE ?', ('%felicity%',))
felicity_blocks = cursor.fetchall()
print('Total blocks: ', len(felicity_blocks))
import pandas as pd

# Show only the id, content and category of each matching block.
blocks_df = pd.DataFrame(felicity_blocks, columns=["id", "path", "title", "content", "properties", "category"])
blocks_df = blocks_df[["id", "content", "category"]]
blocks_df

Total blocks:  5


Unnamed: 0,id,content,category
0,14393,\t\t\t- My final prescription to you for a life of fuzzy thinking and infelicity is to ignore a ...,pages
1,14758,"\t- That felicity, when I reflected on it, has induced me sometimes to say, that were it offered...",pages
2,14796,"\t- I grew convinced that truth, sincerity and integrity in dealings between man and man were of...",pages
3,14818,"\t- In truth, I found myself incorrigible with respect to Order; and now I am grown old, and my ...",pages
4,14851,"\t- Human felicity is produced not so much by great pieces of good fortune that seldom happen, a...",pages


In [184]:
import sqlite3

# Open the notes database and obtain a cursor for running statements.
conn = sqlite3.connect('data.db')
cursor = conn.cursor()

# Keep only the blocks whose category is "pages". The value is bound as a
# parameter: a double-quoted "pages" in SQL is an identifier, and SQLite only
# treats it as a string as a legacy fallback.
cursor.execute('SELECT * FROM blocks WHERE category = ?', ('pages',))
filtered_blocks = cursor.fetchall()

# Guard the column count so an empty result prints "0 x 0" instead of raising.
filtered_columns = len(filtered_blocks[0]) if filtered_blocks else 0
print('Shape of filtered blocks: ', len(filtered_blocks), 'x', filtered_columns)

Shape of filtered blocks:  9427 x 6


In [186]:
# Build the text for each filtered block with the same field layout used for
# the full block list, then embed it with the shared model.
filtered_blocks_to_embed = [
    "{} {} {}".format(row[1], row[4], row[3]) for row in filtered_blocks
]
filtered_blocks_embeddings = [*embedding_model.embed(filtered_blocks_to_embed)]


In [190]:
import sqlite3

import pickle

# Serialise the whole `combined_embeddings` list (blocks, pages and page
# titles) into a single binary blob.
embeddings_blob = pickle.dumps(combined_embeddings)

# Open the notes database and obtain a cursor for running statements.
conn = sqlite3.connect('data.db')
cursor = conn.cursor()

# Ensure the storage table exists; it holds one BLOB per row.
cursor.execute('''
CREATE TABLE IF NOT EXISTS embeddings (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    embedding BLOB
)
''')

# Upsert the blob under the fixed id 1, so re-running this cell replaces the
# stored value instead of adding another row.
cursor.execute('''
INSERT INTO embeddings (id, embedding)
VALUES (1, ?)
ON CONFLICT(id) DO UPDATE SET embedding=excluded.embedding
''', (embeddings_blob,))

# Persist the change.
conn.commit()


In [191]:
# Fresh cursor on the already-open connection.
cursor = conn.cursor()

# Read every stored row back and unpickle the blob held in column 1.
# NOTE: pickle.loads can execute arbitrary code, so this is only appropriate
# for blobs written by this notebook itself, never for an untrusted database.
retrieved_embeddings = [
    pickle.loads(row[1])
    for row in cursor.execute('SELECT * FROM embeddings').fetchall()
]

print('Shape of retrieved embeddings: ', len(retrieved_embeddings), 'x', len(retrieved_embeddings[0]))


Shape of retrieved embeddings:  1 x 16110


In [192]:
# Sizes of the first two segments (blocks, then pages) of the combined
# embedding list; these are the offsets used to map an index back to its source.
blocks_length, pages_length = len(blocks_embeddings), len(pages_embeddings)
print(blocks_length, pages_length)

15398 356
