In [200]:
import sqlite3

# Open the notebook's SQLite database file and obtain a cursor on it
conn = sqlite3.connect('data.db')
cursor = conn.cursor()

# Read every row of the `blocks` table into memory and report the row count
blocks = cursor.execute('SELECT * FROM blocks').fetchall()
print('Total blocks: ', len(blocks))

Total blocks:  15066


In [201]:
import sqlite3

# Connection and cursor used for the pages query
conn = sqlite3.connect('data.db')
cursor = conn.cursor()

# Read every row of the `pages` table into memory and report the row count
pages = cursor.execute('SELECT * FROM pages').fetchall()
print('Total pages: ', len(pages))

Total pages:  356


In [99]:
# Inspect one raw row of `pages` (index 1, i.e. the second row fetched) to see its column layout
pages[1]

(2,
 'P%3A Bonsol.md',
 '/Users/freddycaceres/Research/Second brain/P%3A Bonsol.md',
 'tags:: [[Projects]]\n',
 '{"tags": ["[[Projects]]"]}')

In [6]:
import pandas as pd
from fastembed.rerank.cross_encoder import TextCrossEncoder

# Table of the cross-encoder (reranker) models fastembed reports as supported,
# ordered by size_in_GB ascending, without the "sources" and "model_file" columns.
reranker_models = pd.DataFrame(TextCrossEncoder.list_supported_models())
reranker_models = reranker_models.sort_values(by="size_in_GB")
reranker_models = reranker_models.drop(columns=["sources", "model_file"])
reranker_models.reset_index(drop=True)

Unnamed: 0,model,size_in_GB,description,license
0,Xenova/ms-marco-MiniLM-L-6-v2,0.08,MiniLM-L-6-v2 model optimized for re-ranking t...,apache-2.0
1,Xenova/ms-marco-MiniLM-L-12-v2,0.12,MiniLM-L-12-v2 model optimized for re-ranking ...,apache-2.0
2,jinaai/jina-reranker-v1-tiny-en,0.13,Designed for blazing-fast re-ranking with 8K c...,apache-2.0
3,jinaai/jina-reranker-v1-turbo-en,0.15,Designed for blazing-fast re-ranking with 8K c...,apache-2.0
4,BAAI/bge-reranker-base,1.04,BGE reranker base model for cross-encoder re-r...,mit
5,jinaai/jina-reranker-v2-base-multilingual,1.11,A multi-lingual reranker model for cross-encod...,cc-by-nc-4.0


In [203]:
# Build the text that will be embedded for every block and every page.
# Each output list keeps the order of its source list, so position i can be
# matched back to blocks[i] / pages[i].
def _block_text(block):
    """Format one `blocks` row (columns 1, 4 and 3) as the string to embed."""
    return f"title: {block[1]}, properties: {block[4]}, content: {block[3]}"


def _page_text(page):
    """Format one `pages` row (columns 1 and 3) as the string to embed."""
    return f"title: {page[1]}, content: {page[3]}"


blocks_to_embed = list(map(_block_text, blocks))
pages_to_embed = list(map(_page_text, pages))

In [169]:
# Show the first string that will be sent to the embedding model, as a sanity check of the format
blocks_to_embed[0]

'2023_09_04.md#0 {} - [Investment research for everyone](https://openbb.co/) /tit'

In [25]:
from fastembed import TextEmbedding

# Table of the text-embedding models fastembed reports as supported, ordered by
# size_in_GB ascending. Only the model catalogue is listed here; no TextEmbedding
# instance is created in this cell.
embedding_models = pd.DataFrame(TextEmbedding.list_supported_models())
embedding_models = embedding_models.sort_values(by="size_in_GB")
embedding_models = embedding_models.drop(columns=["sources", "model_file"])
embedding_models.reset_index(drop=True)

Unnamed: 0,model,dim,description,license,size_in_GB,additional_files
0,BAAI/bge-small-en-v1.5,384,"Text embeddings, Unimodal (text), English, 512 input tokens truncation, Prefixes for queries/doc...",mit,0.067,
1,BAAI/bge-small-zh-v1.5,512,"Text embeddings, Unimodal (text), Chinese, 512 input tokens truncation, Prefixes for queries/doc...",mit,0.09,
2,snowflake/snowflake-arctic-embed-xs,384,"Text embeddings, Unimodal (text), English, 512 input tokens truncation, Prefixes for queries/doc...",apache-2.0,0.09,
3,sentence-transformers/all-MiniLM-L6-v2,384,"Text embeddings, Unimodal (text), English, 256 input tokens truncation, Prefixes for queries/doc...",apache-2.0,0.09,
4,jinaai/jina-embeddings-v2-small-en,512,"Text embeddings, Unimodal (text), English, 8192 input tokens truncation, Prefixes for queries/do...",apache-2.0,0.12,
5,BAAI/bge-small-en,384,"Text embeddings, Unimodal (text), English, 512 input tokens truncation, Prefixes for queries/doc...",mit,0.13,
6,snowflake/snowflake-arctic-embed-s,384,"Text embeddings, Unimodal (text), English, 512 input tokens truncation, Prefixes for queries/doc...",apache-2.0,0.13,
7,nomic-ai/nomic-embed-text-v1.5-Q,768,"Text embeddings, Multimodal (text, image), English, 8192 input tokens truncation, Prefixes for q...",apache-2.0,0.13,
8,BAAI/bge-base-en-v1.5,768,"Text embeddings, Unimodal (text), English, 512 input tokens truncation, Prefixes for queries/doc...",mit,0.21,
9,sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2,384,"Text embeddings, Unimodal (text), Multilingual (~50 languages), 512 input tokens truncation, Pre...",apache-2.0,0.22,


In [204]:
from fastembed import TextEmbedding
# Create the embedding model with fastembed's defaults (no model name is passed).
# Instantiation triggers the model download and initialization.
embedding_model = TextEmbedding()

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

In [205]:
# Embed every block text; list() materializes the result so it can be indexed and measured with len()
blocks_embeddings = list(embedding_model.embed(blocks_to_embed))

In [206]:
# Embed every page text; list() materializes the result so it can be indexed and measured with len()
pages_embeddings = list(embedding_model.embed(pages_to_embed))

In [207]:
# Embed the page titles on their own (column 1 of each `pages` row),
# in the same order as `pages`.
pages_titles = [row[1] for row in pages]
titles_embeddings = [vector for vector in embedding_model.embed(pages_titles)]

In [212]:
# The search question, wrapped in a list because embed() is called with a list of texts
query = ['What is the purpose of life']

# Reuse the `embedding_model` that embedded the blocks, pages and titles instead of
# constructing a second TextEmbedding(). This avoids repeating the model
# fetch/initialization and guarantees the query vector comes from the exact same
# model instance as the vectors it is compared against.
query_embedding = list(embedding_model.embed(query))

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

In [209]:
# Report "<count> x <vector length>" for each embedding collection
for label, vectors in (
    ("Query embedding", query_embedding),
    ("Blocks embeddings", blocks_embeddings),
    ("Pages embeddings", pages_embeddings),
    ("Titles embeddings", titles_embeddings),
):
    print(f"{label} dimensions: ", len(vectors), "x", len(vectors[0]))

Query embedding dimensions:  1 x 384
Blocks embeddings dimensions:  15066 x 384
Pages embeddings dimensions:  356 x 384
Titles embeddings dimensions:  356 x 384


In [213]:
from sklearn.metrics.pairwise import cosine_similarity
# Concatenate the three embedding lists in a fixed order: blocks first, then page
# contents, then page titles. Later index arithmetic relies on this order.
combined_embeddings = [*blocks_embeddings, *pages_embeddings, *titles_embeddings]

# Cosine similarity of the query against every stored embedding
similarities = cosine_similarity(query_embedding, combined_embeddings)
print("Shape of similarities: ", similarities.shape)

Shape of similarities:  (1, 15778)


In [214]:
import numpy as np
import pandas as pd

# Offsets needed to map a position in `combined_embeddings` back to its source row
blocks_length = len(blocks_embeddings)
pages_length = len(pages_embeddings)


def _describe_hit(i):
    """Translate a column index of `similarities` into (type, content, path, score).

    Indices are laid out as: blocks, then page contents, then page titles.
    """
    if i < blocks_length:
        return ("block", blocks[i][3], blocks[i][2], similarities[0, i])
    if i < blocks_length + pages_length:
        page = pages[i - blocks_length]
        return ("page", page[1], page[2], similarities[0, i])
    page = pages[i - blocks_length - pages_length]
    return ("page_title", page[1], page[2], similarities[0, i])


# Positions of the 20 highest similarity scores, best first
# (the variable name says "5" but the slice keeps 20).
top_5_indices = np.argsort(similarities[0])[-20:][::-1]
top_data = [_describe_hit(i) for i in top_5_indices]

pd.set_option('display.max_colwidth', 100)
pd.DataFrame(top_data, columns=["Type", "Content", "Path", "Score"])

Unnamed: 0,Type,Content,Path,Score
0,block,\t\t\t- Our purpose is to glorify God and find joy in him,/Users/freddycaceres/Research/Second brain/2022_12_25.md#9,0.763002
1,block,\t- The purpose driven life,/Users/freddycaceres/Research/Second brain/2023_05_13.md#3,0.756205
2,block,"\t- Purpose, meaning, significance—these are what make a successful life. ([Location 753](https:...","/Users/freddycaceres/Research/Second brain/The ONE Thing - Gary Keller, Jay Papasan.md#52",0.744293
3,block,"\t- Everything is here for a purpose, from horses to vine shoots. What’s surprising about that? ...",/Users/freddycaceres/Research/Second brain/Meditations - Marcus Aurelius.md#146,0.739771
4,block,"\t- When you have a definite purpose for your life,* clarity comes faster, which leads to more c...","/Users/freddycaceres/Research/Second brain/The ONE Thing - Gary Keller, Jay Papasan.md#97",0.724203
5,block,\t- A life lived on purpose is the most powerful of all—and the happiest. ([Location 1376](https...,"/Users/freddycaceres/Research/Second brain/The ONE Thing - Gary Keller, Jay Papasan.md#91",0.723209
6,block,"\t- Lastly, in a sense this is a religious course: I am preaching the message that, with apparen...",/Users/freddycaceres/Research/Second brain/The Art of Doing Science and Engineering - Richard W....,0.715406
7,block,"\t- Notice I leave it to you to pick your goals of excellence, but claim only a life without suc...",/Users/freddycaceres/Research/Second brain/The Art of Doing Science and Engineering - Richard W....,0.711666
8,block,\t- I often talk about how I want to live a purpose driven life but in the last years I have bee...,/Users/freddycaceres/Research/Second brain/2024_07_09.md#19,0.710987
9,block,\t- The act of living a full life by giving time to what matters is a balancing act. Extraordina...,"/Users/freddycaceres/Research/Second brain/The ONE Thing - Gary Keller, Jay Papasan.md#53",0.697588


In [165]:
import sqlite3

import pandas as pd

# Connect to the database and create a cursor
conn = sqlite3.connect('data.db')
cursor = conn.cursor()

# Plain keyword search over block content, for comparison with the embedding search.
# The pattern is passed as a bound parameter: a double-quoted "%felicity%" is an
# identifier in standard SQL and is only accepted as a string by SQLite's legacy
# double-quoted-string fallback.
cursor.execute('SELECT * FROM blocks WHERE content LIKE ?', ('%felicity%',))

# Keep the matches in their own variable. Assigning them to `blocks` would replace the
# full table that `blocks_embeddings` is index-aligned with.
felicity_blocks = cursor.fetchall()
print('Total blocks: ', len(felicity_blocks))

# Generate a table with the id and the content.
# Column 1 is labelled "title" and column 2 "path", matching how other cells use
# block[1] as the title and block[2] as the path; both are dropped below.
blocks_df = pd.DataFrame(
    felicity_blocks,
    columns=["id", "title", "path", "content", "properties", "category"],
)
blocks_df = blocks_df[["id", "content", "category"]]
blocks_df

Total blocks:  5


Unnamed: 0,id,content,category
0,14393,\t\t\t- My final prescription to you for a life of fuzzy thinking and infelicity is to ignore a ...,pages
1,14758,"\t- That felicity, when I reflected on it, has induced me sometimes to say, that were it offered...",pages
2,14796,"\t- I grew convinced that truth, sincerity and integrity in dealings between man and man were of...",pages
3,14818,"\t- In truth, I found myself incorrigible with respect to Order; and now I am grown old, and my ...",pages
4,14851,"\t- Human felicity is produced not so much by great pieces of good fortune that seldom happen, a...",pages


In [219]:
import sqlite3

import pickle

# DDL for the table that holds the serialized embeddings
CREATE_EMBEDDINGS_TABLE = '''
CREATE TABLE IF NOT EXISTS embeddings (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    embedding BLOB
)
'''

# Upsert into the fixed row id 1: insert it, or overwrite its blob if it already exists
UPSERT_EMBEDDINGS = '''
INSERT INTO embeddings (id, embedding)
VALUES (1, ?)
ON CONFLICT(id) DO UPDATE SET embedding=excluded.embedding
'''

# Open the database and get a cursor
conn = sqlite3.connect('data.db')
cursor = conn.cursor()

# Make sure the "embeddings" table exists
cursor.execute(CREATE_EMBEDDINGS_TABLE)

# Serialize the whole `combined_embeddings` list into a single bytes object
embeddings_blob = pickle.dumps(combined_embeddings)

# Store the blob and make the change durable
cursor.execute(UPSERT_EMBEDDINGS, (embeddings_blob,))
conn.commit()


In [191]:
# New cursor on the connection that is already open
cursor = conn.cursor()

# Fetch every row of the "embeddings" table
cursor.execute('SELECT * FROM embeddings')

# Column 1 of each row holds the pickled payload; unpickle each one.
# NOTE(review): pickle.loads can execute arbitrary code from its input, so this is
# only appropriate while the database is written exclusively by trusted code.
retrieved_embeddings = [pickle.loads(row[1]) for row in cursor.fetchall()]

print('Shape of retrieved embeddings: ', len(retrieved_embeddings), 'x', len(retrieved_embeddings[0]))


Shape of retrieved embeddings:  1 x 16110


In [192]:
# Number of block embeddings and page embeddings currently held in memory
blocks_length, pages_length = len(blocks_embeddings), len(pages_embeddings)

print(blocks_length, pages_length)

15398 356
