In [1]:
import os
import requests

# Local path of the PDF and the URL it can be fetched from
pdf_path = '/mnt/c/Documents and Settings/Graham/Desktop/Buddhist Texts/Thanissaro/Individual/AnguttaraNikaya210825.pdf'
url = 'https://www.dhammatalks.org/Archive/Writings/Ebooks/AnguttaraNikaya210825.pdf'

# Download the PDF only when it is not already present locally
if not os.path.exists(pdf_path):
    print('[INFO] File does not exist, download...')

    # Local filename to save downloaded file
    filename = pdf_path

    # Send a GET request to the URL; the timeout keeps a stalled connection
    # from hanging the notebook indefinitely
    response = requests.get(url, timeout=60)

    # Check if request was successful
    if response.status_code == 200:
        # Make sure the destination folder exists before writing into it
        os.makedirs(os.path.dirname(filename) or ".", exist_ok=True)

        # Open file and save it
        with open(filename, 'wb') as file:
            file.write(response.content)
        print(f'[INFO] the file has been downloaded and saved as {filename}')
    else:
        print(f"[INFO] Failed to download the file. Status code: {response.status_code}")
else:
    print(f"File {pdf_path} exists.")

File /mnt/c/Documents and Settings/Graham/Desktop/Buddhist Texts/Thanissaro/Individual/AnguttaraNikaya210825.pdf exists.


In [5]:
import fitz
from tqdm.auto import tqdm

def text_formatter(text: str) -> str:
    """Turn every newline into a space and trim surrounding whitespace."""
    single_line = " ".join(text.split("\n"))
    return single_line.strip()

def open_and_read_pdf(pdf_path: str) -> list[dict]:
    """Read a PDF and collect the text plus simple size statistics per page.

    Args:
        pdf_path: Path of the PDF file to open with ``fitz``.

    Returns:
        One dict per page, in document order, with the keys ``page_number``
        (0-based index), ``page_char_count``, ``page_word_count``,
        ``page_sentence_count_raw``, ``page_token_count`` and ``text``.
    """
    pages_and_text = []

    # The context manager closes the document once every page has been read
    with fitz.open(pdf_path) as doc:
        # enumerate() has no length, so pass the page count explicitly to get
        # a real progress bar instead of a bare iteration counter
        for page_number, page in tqdm(enumerate(doc), total=len(doc)):
            text = page.get_text()
            text = text_formatter(text=text)
            pages_and_text.append(
                {
                    "page_number": page_number,
                    "page_char_count": len(text),
                    # Splitting on a single space yields 1 for an empty page
                    "page_word_count": len(text.split(" ")),
                    "page_sentence_count_raw": len(text.split(". ")),
                    "page_token_count": len(text) / 4, # 1 token = approx 4 chars
                    "text": text})
    return pages_and_text

# Parse the whole PDF once, then preview a few of the page records
pages_and_texts = open_and_read_pdf(pdf_path)
pages_and_texts[5:8]

0it [00:00, ?it/s]

[{'page_number': 5,
  'page_char_count': 623,
  'page_word_count': 96,
  'page_sentence_count_raw': 4,
  'page_token_count': 155.75,
  'text': '5 The Aṅguttara Nikāya, a collection of short to medium-length discourses, takes its name from the way the discourses are grouped by the number of their parts (aṅga), with the number growing progressively higher (uttara) with each group. No single English term can convey the full meaning of this name, although the translation Numerical Collection gives a workable idea of the principle behind it. The complete collection, counting all its formulaic expansions, contains more than 9,500 discourses. When these expansions are not counted, the total comes to approximately 2,300 discourses, of which 416 are translated here.'},
 {'page_number': 6,
  'page_char_count': 1440,
  'page_word_count': 246,
  'page_sentence_count_raw': 19,
  'page_token_count': 360.0,
  'text': '6 Ones A Single Thing Ekadhamma Suttas\xa0\xa0(AN\xa01:21–30, 39–40) 21. “I don’t e

In [6]:
import polars as pl

# Tabulate the per-page records and preview the first rows
df = pl.from_dicts(data=pages_and_texts)
df.head(n=5)



page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
i64,i64,i64,i64,f64,str
0,0,1,1,0.0,""""""
1,122,17,1,30.5,"""1 HANDFULof LEAVES VOLUME FOUR…"
2,1318,233,8,329.5,"""2 Once the Blessed One was sta…"
3,779,113,7,194.75,"""3 Copyright 2014 Ṭhānissaro Bh…"
4,659,107,5,164.75,"""4 Abbreviations AN Aṅguttara N…"


In [51]:
import polars.selectors as cs
# Summary statistics with numeric columns rounded to two decimals;
# the free-text column is dropped from the display
(
    df.describe()
    .with_columns(cs.numeric().round(2))
    .drop('text')
)

statistic,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,sentences,page_sentence_count_spacy,sentence_chunks,num_chunks
str,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""",800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0
"""null_count""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",399.5,1861.81,314.76,12.66,465.45,,17.14,,1.07
"""std""",231.08,412.91,74.92,5.25,103.23,,6.07,,0.26
"""min""",0.0,0.0,1.0,1.0,0.0,,0.0,,0.0
"""25%""",200.0,1652.0,272.0,9.0,413.0,,14.0,,1.0
"""50%""",400.0,1898.0,322.0,12.0,474.5,,17.0,,1.0
"""75%""",599.0,2199.0,372.0,16.0,549.75,,21.0,,1.0
"""max""",799.0,2578.0,537.0,36.0,644.5,,43.0,,2.0


In [20]:
from spacy.lang.en import English

# Build an English pipeline and add a sentencizer component to it
nlp = English()
nlp.add_pipe("sentencizer")

# Run the pipeline on a small example and check that it yields four sentences
doc = nlp("This is a sentence. I like elephants. Are you a Ph.D.? That was a test.")
assert sum(1 for _ in doc.sents) == 4

# Display the split sentences
list(doc.sents)

[This is a sentence., I like elephants., Are you a Ph.D.?, That was a test.]

In [26]:
for item in tqdm(pages_and_texts):
    # The pipeline yields spaCy objects; store each sentence as a plain string
    item["sentences"] = [str(sentence) for sentence in nlp(item["text"]).sents]

    # Number of sentences found on this page
    item["page_sentence_count_spacy"] = len(item["sentences"])

  0%|          | 0/800 [00:00<?, ?it/s]

In [27]:
# Spot-check a single page record (index 30) after the sentence split
pages_and_texts[30]

{'page_number': 30,
 'page_char_count': 2455,
 'page_word_count': 424,
 'page_sentence_count_raw': 14,
 'page_token_count': 613.75,
 'text': '30 Note 1. The Commentary identiﬁes “voice of another” (parato ghoso) as meaning, in the case of the ﬁrst sutta, the voice of another person teaching what is not true Dhamma, and in the case of the second sutta, the voice of another person teaching true Dhamma. However, Woodward’s translation for the PTS renders parato ghoso as “a voice from another world,” and in a footnote he interprets it as “clairaudience from another (world).” To summarize his reasoning: If ordinary speech were meant, the word vācā or vācī would have been used instead of ghoso; and if another person were meant, aññassa or aññatarassa would have been used instead of parato. Finally, he notes that this passage appears also in MN 43 following a statement of “abnormal powers,” which apparently is meant to show that, in context, this statement must refer to the type of psychic kn

In [100]:
# Chunking sentences together: number of sentences grouped into one chunk
num_sentence_chunk_size = 10


def split_list(input_list: list[str],
               slice_size: int=num_sentence_chunk_size) -> list[list[str]]:
    """Split a list into consecutive sub-lists of at most ``slice_size`` items.

    e.g. 25 items with ``slice_size=10`` -> sub-lists of 10, 10 and 5 items.

    Args:
        input_list: Items to group; their order is preserved.
        slice_size: Maximum number of items per sub-list.

    Returns:
        The sub-lists in order; an empty list when ``input_list`` is empty.

    Raises:
        ValueError: If ``slice_size`` is smaller than 1. Without this check a
            negative size would silently return ``[]`` and drop every item.
    """
    if slice_size < 1:
        raise ValueError(f"slice_size must be a positive integer, got {slice_size}")
    return [input_list[i : i + slice_size] for i in range(0, len(input_list), slice_size)]



In [101]:
# Attach the grouped sentences and the number of groups to every page record

for item in tqdm(pages_and_texts):
    item.update(sentence_chunks=split_list(item['sentences'], num_sentence_chunk_size))
    item["num_chunks"] = len(item["sentence_chunks"])

  0%|          | 0/800 [00:00<?, ?it/s]

In [102]:
# Spot-check a single page record (index 35) after chunking
pages_and_texts[35]

{'page_number': 35,
 'page_char_count': 1706,
 'page_word_count': 313,
 'page_sentence_count_raw': 12,
 'page_token_count': 426.5,
 'text': '35 See also: AN 5:254–259; Dhp XVIII The Chariot Maker Pacetana Sutta\xa0\xa0(AN\xa03:15) On one occasion the Blessed One was staying near Vārāṇasī in the Deer Park at Isipatana. There he addressed the monks: “Monks!” “Yes, lord,” the monks responded to him. The Blessed One said: “Once, monks, there was a king named Pacetana. One day King Pacetana said to his chariot maker, ‘My good chariot maker, in six months time from now a battle will take place. Can you make me a new pair of chariot wheels?’ “‘Yes, your majesty, I can,’ the chariot maker replied to the king. “Then in six months minus six days the chariot maker ﬁnished one wheel. King Pacetana said to him, ‘In six days time from now the battle will take place. Will the pair of chariot wheels be ﬁnished?’ “‘Your majesty, in these six months minus six days, I have ﬁnished one wheel.’ “‘But can y

In [103]:
import re

# Flatten every sentence chunk into its own record

pages_and_chunks = []

for item in tqdm(pages_and_texts):
    for sentence_chunk in item["sentence_chunks"]:
        # Merge the sentences of the chunk into one paragraph-like string
        joined_sentence_chunk = " ".join(sentence_chunk).replace("  ", " ").strip()

        # Keep the source page, the text and a few size statistics together
        chunk_dict = {
            "page_number": item["page_number"],
            "sentence_chunk": joined_sentence_chunk,
            "chunk_char_count": len(joined_sentence_chunk),
            "chunk_word_count": len(joined_sentence_chunk.split(" ")),
            "chunk_token_count": len(joined_sentence_chunk) / 4,  # character count divided by 4
        }

        pages_and_chunks.append(chunk_dict)

len(pages_and_chunks), len(pages_and_texts)

  0%|          | 0/800 [00:00<?, ?it/s]

(1733, 800)

In [104]:
# Rebuild the frame from the chunk records and summarise it
df = pl.from_dicts(data=pages_and_chunks)
(
    df.describe()
    .with_columns(cs.numeric().round(2))
)

statistic,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count
str,f64,str,f64,f64,f64
"""count""",1733.0,"""1733""",1733.0,1733.0,1733.0
"""null_count""",0.0,"""0""",0.0,0.0,0.0
"""mean""",398.27,,860.71,147.09,215.18
"""std""",227.75,,443.73,75.83,110.93
"""min""",1.0,"""* * There Ven. Sāriputta addre…",4.0,1.0,1.0
"""25%""",205.0,,550.0,95.0,137.5
"""50%""",395.0,,846.0,145.0,211.5
"""75%""",596.0,,1130.0,192.0,282.5
"""max""",799.0,"""”5 Notes""",2403.0,540.0,600.75


In [105]:
# Keep only chunks whose estimated token count is above the minimum
min_token_length = 9
pages_and_chunks_over_min_token_len = (
    df.filter(pl.col("chunk_token_count").gt(min_token_length))
    .to_dicts()
)

In [106]:
# Embedding our text chunks
import torch
from sentence_transformers import SentenceTransformer

# Use the GPU when one is available and fall back to the CPU otherwise,
# so this cell also runs on machines without CUDA
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2",
                                      device="cuda" if torch.cuda.is_available() else "cpu")
# Create list of sentences to embed
sentences = ["The sentence transformer library provides an easy way to create embeddings",
             "Sentences can be embedded one by one or in a list",
             "I like horses!"]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [107]:
# Encode the example sentences and map each sentence to its vector
embeddings = embedding_model.encode(sentences)
embedings_dict = {text: vector for text, vector in zip(sentences, embeddings)}

for sentence in embedings_dict:
    embedding = embedings_dict[sentence]
    print(f"""
        Sentence: {sentence}
        Embedding: {embedding}""")


        Sentence: The sentence transformer library provides an easy way to create embeddings
        Embedding: [-3.17512304e-02  3.37267853e-02 -2.52437647e-02  5.22287488e-02
 -2.35248804e-02 -6.19115215e-03  1.35026276e-02 -6.25500977e-02
  7.50827370e-03 -2.29684655e-02  2.98146866e-02  4.57555130e-02
 -3.26700322e-02  1.39847239e-02  4.18013707e-02 -5.92969656e-02
  4.26309742e-02  5.04660280e-03 -2.44552512e-02  3.98593862e-03
  3.55897620e-02  2.78742872e-02  1.84098538e-02  3.67700048e-02
 -2.29961146e-02 -3.01796924e-02  5.99531224e-04 -3.64503898e-02
  5.69104962e-02 -7.49941031e-03 -3.70004326e-02 -3.04357521e-03
  4.64354642e-02  2.36146804e-03  9.06849664e-07  7.00035505e-03
 -3.92289869e-02 -5.95697295e-03  1.38653144e-02  1.87111693e-03
  5.34202456e-02 -6.18613586e-02  2.19613202e-02  4.86051254e-02
 -4.25697677e-02 -1.69858914e-02  5.04178405e-02  1.54733825e-02
  8.12859759e-02  5.07106222e-02 -2.27497108e-02 -4.35721017e-02
 -2.18388741e-03 -2.14091744e-02 -2.017583

In [110]:
%%time

# Embed all retained chunks in batches of 16; the result is requested as a tensor
text_chunks = [record["sentence_chunk"] for record in pages_and_chunks_over_min_token_len]
text_chunk_embeddings = embedding_model.encode(text_chunks, batch_size=16, convert_to_tensor=True)


CPU times: user 14.5 s, sys: 328 ms, total: 14.8 s
Wall time: 9.84 s


In [117]:
## Combine the chunk records with their embeddings in one frame
text_chunks_and_embeddings_df = (
    pl.from_dicts(pages_and_chunks_over_min_token_len)
    .with_columns(pl.Series("embedding", text_chunk_embeddings.cpu().numpy()))
)
text_chunks_and_embeddings_df.head()

page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
i64,str,i64,i64,f64,"array[f32, 768]"
1,"""1 HANDFULof LEAVES VOLUME FOUR…",122,17,30.5,"[-0.001696, -0.055177, … -0.050128]"
2,"""2 Once the Blessed One was sta…",854,154,213.5,"[0.057702, 0.016415, … 0.004121]"
2,"""This is stress … This is the o…",468,84,117.0,"[-0.004768, -0.053286, … -0.033697]"
3,"""3 Copyright 2014 Ṭhānissaro Bh…",779,113,194.75,"[0.035766, 0.038918, … -0.029905]"
4,"""4 Abbreviations AN Aṅguttara N…",659,107,164.75,"[0.072552, -0.048293, … -0.023471]"


In [118]:
# Write the chunk table, including embeddings, as newline-delimited JSON
save_path = '/home/graham/projects/buddhism/rag/data/final_embeddings.jsonl'
text_chunks_and_embeddings_df.write_ndjson(file=save_path)

In [121]:
# Read the saved file back in and preview it
text_chunks_and_embeddings_df = pl.read_ndjson(source=save_path)
text_chunks_and_embeddings_df.head(n=5)

page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
i64,str,i64,i64,f64,list[f64]
1,"""1 HANDFULof LEAVES VOLUME FOUR…",122,17,30.5,"[-0.001696, -0.055177, … -0.050128]"
2,"""2 Once the Blessed One was sta…",854,154,213.5,"[0.057702, 0.016415, … 0.004121]"
2,"""This is stress … This is the o…",468,84,117.0,"[-0.004768, -0.053286, … -0.033697]"
3,"""3 Copyright 2014 Ṭhānissaro Bh…",779,113,194.75,"[0.035766, 0.038918, … -0.029905]"
4,"""4 Abbreviations AN Aṅguttara N…",659,107,164.75,"[0.072552, -0.048293, … -0.023471]"


## 2. RAG - Search and Answer

RAG goal: retrieve the passages most relevant to a query, then use those passages to augment the input to an LLM so that its output is grounded in them.



In [1]:
import random

import torch
import numpy as np
import polars as pl

from sentence_transformers import util, SentenceTransformer

# Run on the GPU when one is available, otherwise on the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# import texts and embedding df
text_chunks_and_embeddings_df = pl.read_ndjson('/home/graham/projects/buddhism/rag/data/final_embeddings.jsonl')

# Convert to dicts
page_and_chunks = text_chunks_and_embeddings_df.to_dicts()

# Create model
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2",
                                      device=device)

# Build the embedding matrix once and place it on the compute device up front,
# so later queries do not repeat the host-to-device copy
embeddings = torch.tensor(text_chunks_and_embeddings_df["embedding"].to_list(),
                          dtype=torch.float32,
                          device=device)
text_chunks_and_embeddings_df.head()

  from tqdm.autonotebook import tqdm, trange


page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
i64,str,i64,i64,f64,list[f64]
1,"""1 HANDFULof LEAVES VOLUME FOUR…",122,17,30.5,"[-0.001696, -0.055177, … -0.050128]"
2,"""2 Once the Blessed One was sta…",854,154,213.5,"[0.057702, 0.016415, … 0.004121]"
2,"""This is stress … This is the o…",468,84,117.0,"[-0.004768, -0.053286, … -0.033697]"
3,"""3 Copyright 2014 Ṭhānissaro Bh…",779,113,194.75,"[0.035766, 0.038918, … -0.029905]"
4,"""4 Abbreviations AN Aṅguttara N…",659,107,164.75,"[0.072552, -0.048293, … -0.023471]"


In [15]:
# 1. Define query
query = "consciousness"
print(f"Query: {query}")

# 2. Embed Query
query_embedding = embedding_model.encode(query, convert_to_tensor=True)

# 3. Get similarity scores with cosine similarity. The variables keep their
#    "dot" names because later cells refer to them.
from time import perf_counter as timer
if device == "cuda":
    torch.cuda.synchronize()  # finish pending GPU work so it is not included in the timing
start_time = timer()
dot_scores = util.cos_sim(a=query_embedding, b=embeddings.to(device))[0]
if device == "cuda":
    torch.cuda.synchronize()  # CUDA kernels run asynchronously; wait for the scores
end_time = timer()

print(f"[INFO] time taken to get scores on {len(embeddings)}: {end_time - start_time:.3f} seconds")

# 4. Get the top-k results (never ask for more results than there are scores)
k = 5
top_results_dot_product = torch.topk(dot_scores, k=min(k, len(dot_scores)))
top_results_dot_product


Query: consciousness
[INFO] time taken to get scores on 1714: 0.019 seconds


torch.return_types.topk(
values=tensor([0.5836, 0.5802, 0.5782, 0.5752, 0.5743], device='cuda:0'),
indices=tensor([ 554, 1399,  435,  979, 1558], device='cuda:0'))

In [16]:
# Print the text of each top-scoring chunk, followed by a separator line
for chunk_index in top_results_dot_product.indices:
    print(page_and_chunks[int(chunk_index)]['sentence_chunk'])
    print('*******************')

This is the cause, this is the reason, why some beings become totally unbound in the present life.” Note 1. A perception with a share in decline is one that causes the mind to fall from concentration. A perception with a share in stability is one that helps to maintain concentration. A perception with a share in distinction is one that leads to higher stages of concentration. A perception with a share in penetration is one leading to liberating insight. See also: MN 118; AN 4:49; AN 7:46; AN 10:60
*******************
Just as pain arises as an aﬄiction for a healthy person, even so the attention to perceptions dealing with forms that beset me was an aﬄiction for me. [ 6] “The thought occurred to me: ‘What if, with the complete transcending of the dimension of the inﬁnitude of space, (perceiving,) “Inﬁnite consciousness,” I were to enter & remain in the dimension of the inﬁnitude of consciousness?’ But my heart didn’t leap up at the dimension of the inﬁnitude of consciousness, didn’t gro

In [12]:
# Indices of the top-k chunks returned by torch.topk for the latest query
top_results_dot_product.indices

tensor([1315,  120,  485,  785,  568], device='cuda:0')