In [1]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from transformers import pipeline
import os
import glob

model = SentenceTransformer('paraphrase-distilroberta-base-v1', device="cpu")

def index_notes(directory):
    # List all markdown files in the directory (recursively)
    file_pattern = os.path.join(directory, "**", "*.md")
    markdown_files = glob.glob(file_pattern, recursive=True)

    # Read the content of all markdown files
    notes = {}
    for file_path in markdown_files:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
            notes[os.path.split(file_path)[1]] = content

    return notes

# Index notes from a directory
notes_directory = "example_vault"
notes_dict = index_notes(notes_directory)
notes = list(notes_dict.values())

Downloading .gitattributes:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading 1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading README.md:   0%|          | 0.00/3.74k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [2]:
# Embed notes
note_embeddings = model.encode(notes)

In [3]:
# Initialize FAISS index
index = faiss.IndexFlatL2(note_embeddings.shape[1])

# Add embeddings to the index
index.add(np.array(note_embeddings))

def find_top_matches(query, index, model, k=3):
    # Embed the query
    query_embedding = model.encode([query])

    # Search for the top k matches
    distances, indices = index.search(np.array(query_embedding), k)

    # Return indices and distances
    return indices, distances

summarizer = pipeline("summarization", model="t5-base", tokenizer="t5-base")

Downloading config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [4]:
#query = "How do I get started with Obsidian?"
#query = "What is PKM?"
query = "How do I create a daily notes template?"

In [5]:
# Find the top 3 matches
indices, distances = find_top_matches(query, index, model)
print(f"{len(notes)=} {indices=}")

# Retrieve the matched notes
matched_notes = [notes[i] for i in indices[0]]

len(notes)=30 indices=array([[19,  7, 17]])


In [None]:
# Summarize the matched notes
summary = summarizer(matched_notes, max_length=100, min_length=30)

Token indices sequence length is longer than the specified maximum sequence length for this model (916 > 512). Running this sequence through the model will result in indexing errors


In [None]:
# Print the summary
for s in summary:
    print("####################################")
    print(s["summary_text"])

In [1]:
from openai import OpenAI
client = OpenAI()
system_prompt = "You are a summarization assistant. Your role is to take a set of notes and use them as a basis for answering a user's query in the form of an abstractive summary no longer than one paragraph. Be sure to identify the title(s) of the note(s) from which the summarization is generated. If the notes do not have an answer, state the answer does not exist in the note and do not offer any further information."
#system_prompt = "You are a speculative fiction character creation assistant. Your job is to prophecy a dramatic fate for the author given their notes. Ensure their fate has sweeping implications for the world they inhabit."

The following experiment is to summarize all of the notes -- testing whether or not we hit the character limit.

In [None]:
set_of_notes = [f"Title: {note_name}\nContents:\n======\n{note_contents}======\n\n" for note_name, note_contents in notes_dict.items()]

In [None]:
completion = client.chat.completions.create(
  model="gpt-4-1106-preview",
  messages=[
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": f"""Question: {query}

Notes:
~~~~~~
{set_of_notes}
~~~~~~
Remember to summarize the notes with no more than 3 sentences. Remember not to contrive an answer but instead cite the notes you use.
"""}
  ]
)

In [None]:
print(completion.choices[0].message)

The following experiment is to summarize the notes before they were sent into the summarization pipeline.

In [None]:
set_of_matched_notes = [f"Contents:\n======\n{note_contents}======\n\n" for note_contents in matched_notes]

In [None]:
completion = client.chat.completions.create(
  model="gpt-4-1106-preview",
  messages=[
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": f"""Question: {query}

Notes:
~~~~~~
{set_of_matched_notes}
~~~~~~
"""}
  ]
)

In [None]:
print(completion.choices[0].message)

In [None]:
for note in set_of_matched_notes:
    print("-----------------------")
    print(note[:100])

The following experiment is to summarize the summaries created by the summarization pipeline.

In [None]:
completion = client.chat.completions.create(
  model="gpt-4-1106-preview",
  messages=[
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": f"""How do I get started with Obsidian?
---
{summary}
---
"""}
  ]
)

In [None]:
completion = client.chat.completions.create(
  model="gpt-4-1106-preview",
  messages=[
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": f"""Given the following note titles

Note titles:
~~~~~~
{notes_dict.keys()}
~~~~~~

Which would you want to read to answer this question: {query}
"""}
  ]
)

In [None]:
print(completion.choices[0].message)

In [None]:
completion = client.chat.completions.create(
  model="gpt-4-1106-preview",
  messages=[
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": f"""Here are the notes you wanted to answer this questions: {query}

Notes:
~~~~~~
###
Note title: Using Templates in Obsidian.md
Note content: {notes_dict['Using Templates in Obsidian.md']}
###
~~~~~~

The notes will either have content related to the question or links in the form of `[[<note_title>]]` where `<note_title>` represents another note.

Either answer the question in 3 or fewer sentences or request additional notes for review. Remember not to contrive an answer but instead *only* summarize the notes provided. Tell me which sentence or paragraph you are summarizing. If the answer to the question is not directly provided do not tell me the answer but may be contained in a linked note, ask for that note instead of answering the question.
"""}
  ]
)

In [None]:
print(completion.choices[0].message)

In [None]:
completion = client.chat.completions.create(
  model="gpt-4-1106-preview",
  messages=[
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": f"""Here are the notes you wanted to answer this questions: {query}

Notes:
~~~~~~
###
Note title: Intro to Personal Knowledge Management.md
Note content: {notes_dict['Intro to Personal Knowledge Management.md']}
###
~~~~~~

The notes will either have content related to the question or links in the form of `[[<note_title>]]` where `<note_title>` represents another note.

Either answer the question in 3 or fewer sentences or request additional notes for review. Remember not to contrive an answer but instead *only* summarize the notes provided. Tell me which sentence or paragraph you are summarizing. If the answer to the question is not directly provided do not tell me the answer but may be contained in a linked note, ask for that note instead of answering the question.
"""}
  ]
)

In [None]:
print(completion.choices[0].message)

In [None]:
completion = client.chat.completions.create(
  model="gpt-4-1106-preview",
  messages=[
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": f"""Here a note that either answers the following question or links to notes that do: {query}

Notes:
~~~~~~
###
Note title: Start Here.md
Note content: {notes_dict['Start Here.md']}
###
~~~~~~

The notes will either have content related to the question or links in the form of `[[<note_title>]]` where `<note_title>` represents another note.

Either answer the question in 3 or fewer sentences or request additional notes for review. Remember not to contrive an answer but instead *only* summarize the notes provided. Tell me which sentence or paragraph you are summarizing. If the answer to the question is not directly provided do not tell me the answer but may be contained in a linked note, ask for that note instead of answering the question.
"""}
  ]
)

In [None]:
print(completion.choices[0].message)

In [None]:
completion = client.chat.completions.create(
  model="gpt-4-1106-preview",
  messages=[
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": f"""Here a note that either answers the following question or links to notes that do: {query}

Notes:
~~~~~~
###
Note title: Journaling.md
Note content: {notes_dict['Journaling.md']}
###
~~~~~~

The notes will either have content related to the question or links in the form of `[[<note_title>]]` where `<note_title>` represents another note.

Either answer the question in 3 or fewer sentences or request additional notes for review. Remember not to contrive an answer but instead *only* summarize the notes provided. Tell me which sentence or paragraph you are summarizing. If the answer to the question is not directly provided do not tell me the answer but may be contained in a linked note, ask for that note instead of answering the question.
"""}
  ]
)

In [None]:
print(completion.choices[0].message)

In [None]:
completion = client.chat.completions.create(
  model="gpt-4-1106-preview",
  messages=[
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": f"""Here a note that either answers the following question or links to notes that do: {query}

Notes:
~~~~~~
###
Note title: Daily Questions in Obsidian.md
Note content: {notes_dict['Daily Questions in Obsidian.md']}
###

###
Note title: Journaling in Obsidian with QuickAdd.md
Note content: {notes_dict['Journaling in Obsidian with QuickAdd.md']}
###
~~~~~~

The notes will either have content related to the question or links in the form of `[[<note_title>]]` where `<note_title>` represents another note.

Either answer the question in 3 or fewer sentences or request additional notes for review. Remember not to contrive an answer but instead *only* summarize the notes provided. Tell me which sentence or paragraph you are summarizing. If the answer to the question is not directly provided do not tell me the answer but may be contained in a linked note, ask for that note instead of answering the question.
"""}
  ]
)

In [None]:
print(completion.choices[0].message)