In [1]:
import os
import os

if "COLAB_GPU" in os.environ:
    print("[INFO] Running in Google Colab, installing requirements.")
    !pip install -U torch # requires torch 2.1.1+ (for efficient sdpa implementation)
    !pip install PyMuPDF # for reading PDFs with Python
    !pip install tqdm # for progress bars
    !pip install sentence-transformers # for embedding models
    !pip install accelerate # for quantization model loading
    !pip install bitsandbytes # for quantizing models (less storage space)
    !pip install flash-attn --no-build-isolation # for faster attention mechanism = faster LLM inference

[INFO] Running in Google Colab, installing requirements.
Collecting torch
  Downloading torch-2.7.1-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (29 kB)
Collecting sympy>=1.13.3 (from torch)
  Downloading sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.6.77 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.6.77-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.6.77 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.6.77-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.6.80 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.6.80-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.5.1.17 (from torch)
  Downloading nvidia_cudnn_cu12-9.5.1.17-py3-none-manylinux_2_28_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.6.4.1 (from torch)
  Downloading nvidia_cublas_cu12-12.6.4.

In [2]:
import os
import requests

# Get PDF document
pdf_path = "BTech corpus.pdf"

# Download PDF if it doesn't already exist
if not os.path.exists(pdf_path):
  print("File doesn't exist, downloading...")

  # The URL of the PDF you want to download
  url = "https://iiitd.ac.in/sites/default/files/docs/education/2024/2024-May-UG%20Regulations.pdf"

  # The local filename to save the downloaded file
  filename = pdf_path

  # Send a GET request to the URL
  response = requests.get(url)

  # Check if the request was successful
  if response.status_code == 200:
      # Open a file in binary write mode and save the content to it
      with open(filename, "wb") as file:
          file.write(response.content)
      print(f"The file has been downloaded and saved as {filename}")
  else:
      print(f"Failed to download the file. Status code: {response.status_code}")
else:
  print(f"File {pdf_path} exists.w2  ")

File doesn't exist, downloading...
The file has been downloaded and saved as BTech corpus.pdf


In [3]:
!pip install PyMuPDF

Collecting PyMuPDF
  Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m37.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.26.3


In [4]:
import fitz # (pymupdf, found this is better than pypdf for our use case, note: licence is AGPL-3.0, keep that in mind if you want to use any code commercially)
from tqdm.auto import tqdm # for progress bars, requires !pip install tqdm

def text_formatter(text: str) -> str:
    """Performs minor formatting on text."""
    cleaned_text = text.replace("\n", " ").strip() # note: this might be different for each doc (best to experiment)

    # Other potential text formatting functions can go here
    return cleaned_text

# Open PDF and get lines/pages
# Note: this only focuses on text, rather than images/figures etc
def open_and_read_pdf(pdf_path: str) -> list[dict]:
    """
    Opens a PDF file, reads its text content page by page, and collects statistics.

    Parameters:
        pdf_path (str): The file path to the PDF document to be opened and read.

    Returns:
        list[dict]: A list of dictionaries, each containing the page number
        (adjusted), character count, word count, sentence count, token count, and the extracted text
        for each page.
    """
    doc = fitz.open(pdf_path)  # open a document
    pages_and_texts = []
    for page_number, page in tqdm(enumerate(doc)):  # iterate the document pages
        text = page.get_text()  # get plain text encoded as UTF-8
        text = text_formatter(text)
        pages_and_texts.append({"page_number": page_number - 41,  # adjust page numbers since our PDF starts on page 42
                                "page_char_count": len(text),
                                "page_word_count": len(text.split(" ")),
                                "page_sentence_count_raw": len(text.split(". ")),
                                "page_token_count": len(text) / 4,  # 1 token = ~4 chars, see: https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
                                "text": text})
    return pages_and_texts

pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)
pages_and_texts[:2]

0it [00:00, ?it/s]

[{'page_number': -41,
  'page_char_count': 175,
  'page_word_count': 21,
  'page_sentence_count_raw': 2,
  'page_token_count': 43.75,
  'text': '1 REGULATIONS FOR B.TECH. PROGRAMS    For ORDINANCES details please visit at the following link:    https://iiitd.ac.in/sites/default/files/docs/education/BTech-Ordinances.pdf'},
 {'page_number': -40,
  'page_char_count': 2545,
  'page_word_count': 434,
  'page_sentence_count_raw': 26,
  'page_token_count': 636.25,
  'text': '2   REGULATIONS FOR B.TECH. PROGRAMS  1  General  (1) This document gives the general regulations applicable to all B.Tech. programs. Specific  requirements for a particular B.Tech. program (e.g. B.Tech. in Computer Science and  Engineering) are specified in regulations for that program.  (2) While the Senate is the main statutory body for all academic matters, the Academic  Affairs Committee (AAC), a standing committee of Senate, shall oversee matters related  to the undergraduate and postgraduate programs. This committe

In [5]:
import random

random.sample(pages_and_texts, k=3)

[{'page_number': -30,
  'page_char_count': 2540,
  'page_word_count': 490,
  'page_sentence_count_raw': 19,
  'page_token_count': 635.0,
  'text': '12 Grades: In general, the grades corresponding to the transferred credits will not be counted  towards the SGPA/CGPA calculation; however, if there is an MoU with the University  and the MoU permits, then the grades will be included in the SGPA/CGPA calculation.      6.8  B.Tech. students are allowed to do “extra credits" beyond 156 credits  B.Tech. students are allowed to do “extra credits" beyond 156 credits required for  completion of B.Tech. requirement. For N extra credits done, the student will be allowed  worst grades in N credits, with a maximum limit of at most 8 credits, to be not counted  towards CGPA computation. The impact of this decision will be reviewed after two  batches have graduated with this option. Following method will be used for calculating  the CGPA:    Method: The below method will automatically decide which extr

In [6]:
import pandas as pd

df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,-41,175,21,2,43.75,1 REGULATIONS FOR B.TECH. PROGRAMS For ORDI...
1,-40,2545,434,26,636.25,2 REGULATIONS FOR B.TECH. PROGRAMS 1 Gener...
2,-39,2465,426,18,616.25,"3 demos, etc. The summer term is about half th..."
3,-38,2393,444,20,598.25,4 (2) Those candidates unable to show the proo...
4,-37,2323,448,28,580.75,5 b) A 2-credit course. The course will have...


In [7]:
# Get stats
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,21.0,21.0,21.0,21.0,21.0
mean,-31.0,2179.81,402.76,17.71,544.95
std,6.2,720.74,134.38,7.34,180.19
min,-41.0,175.0,21.0,1.0,43.75
25%,-36.0,2063.0,385.0,16.0,515.75
50%,-31.0,2465.0,444.0,19.0,616.25
75%,-26.0,2545.0,468.0,21.0,636.25
max,-21.0,2973.0,549.0,28.0,743.25


In [8]:
from spacy.lang.en import English # see https://spacy.io/usage for install instructions

nlp = English()

# Add a sentencizer pipeline, see https://spacy.io/api/sentencizer/
nlp.add_pipe("sentencizer")

# Create a document instance as an example
doc = nlp("This is a sentence. This another sentence.")
assert len(list(doc.sents)) == 2

# Access the sentences of the document
list(doc.sents)

[This is a sentence., This another sentence.]

In [9]:
for item in tqdm(pages_and_texts):
    item["sentences"] = list(nlp(item["text"]).sents)

    # Make sure all sentences are strings
    item["sentences"] = [str(sentence) for sentence in item["sentences"]]

    # Count the sentences
    item["page_sentence_count_spacy"] = len(item["sentences"])

  0%|          | 0/21 [00:00<?, ?it/s]

In [10]:
# Inspect an example
random.sample(pages_and_texts, k=1)
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy
count,21.0,21.0,21.0,21.0,21.0,21.0
mean,-31.0,2179.81,402.76,17.71,544.95,17.67
std,6.2,720.74,134.38,7.34,180.19,7.24
min,-41.0,175.0,21.0,1.0,43.75,1.0
25%,-36.0,2063.0,385.0,16.0,515.75,17.0
50%,-31.0,2465.0,444.0,19.0,616.25,20.0
75%,-26.0,2545.0,468.0,21.0,636.25,22.0
max,-21.0,2973.0,549.0,28.0,743.25,28.0


In [11]:
# Define split size to turn groups of sentences into chunks
num_sentence_chunk_size = 10

# Create a function that recursively splits a list into desired sizes
def split_list(input_list: list,
               slice_size: int) -> list[list[str]]:
    """
    Splits the input_list into sublists of size slice_size (or as close as possible).

    For example, a list of 17 sentences would be split into two lists of [[10], [7]]
    """
    return [input_list[i:i + slice_size] for i in range(0, len(input_list), slice_size)]

# Loop through pages and texts and split sentences into chunks
for item in tqdm(pages_and_texts):
    item["sentence_chunks"] = split_list(input_list=item["sentences"],
                                         slice_size=num_sentence_chunk_size)
    item["num_chunks"] = len(item["sentence_chunks"])

  0%|          | 0/21 [00:00<?, ?it/s]

In [12]:
# Sample an example from the group (note: many samples have only 1 chunk as they have <=10 sentences total)
random.sample(pages_and_texts, k=1)

[{'page_number': -30,
  'page_char_count': 2540,
  'page_word_count': 490,
  'page_sentence_count_raw': 19,
  'page_token_count': 635.0,
  'text': '12 Grades: In general, the grades corresponding to the transferred credits will not be counted  towards the SGPA/CGPA calculation; however, if there is an MoU with the University  and the MoU permits, then the grades will be included in the SGPA/CGPA calculation.      6.8  B.Tech. students are allowed to do “extra credits" beyond 156 credits  B.Tech. students are allowed to do “extra credits" beyond 156 credits required for  completion of B.Tech. requirement. For N extra credits done, the student will be allowed  worst grades in N credits, with a maximum limit of at most 8 credits, to be not counted  towards CGPA computation. The impact of this decision will be reviewed after two  batches have graduated with this option. Following method will be used for calculating  the CGPA:    Method: The below method will automatically decide which extr

In [13]:
# Create a DataFrame to get stats
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy,num_chunks
count,21.0,21.0,21.0,21.0,21.0,21.0,21.0
mean,-31.0,2179.81,402.76,17.71,544.95,17.67,2.19
std,6.2,720.74,134.38,7.34,180.19,7.24,0.68
min,-41.0,175.0,21.0,1.0,43.75,1.0,1.0
25%,-36.0,2063.0,385.0,16.0,515.75,17.0,2.0
50%,-31.0,2465.0,444.0,19.0,616.25,20.0,2.0
75%,-26.0,2545.0,468.0,21.0,636.25,22.0,3.0
max,-21.0,2973.0,549.0,28.0,743.25,28.0,3.0


In [14]:
import re

# Split each chunk into its own item
pages_and_chunks = []
for item in tqdm(pages_and_texts):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]

        # Join the sentences together into a paragraph-like structure, aka a chunk (so they are a single string)
        joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk) # ".A" -> ". A" for any full-stop/capital letter combo
        chunk_dict["sentence_chunk"] = joined_sentence_chunk

        # Get stats about the chunk
        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4 # 1 token = ~4 characters

        pages_and_chunks.append(chunk_dict)

# How many chunks do we have?
len(pages_and_chunks)

  0%|          | 0/21 [00:00<?, ?it/s]

46

In [15]:
# View a random sample
random.sample(pages_and_chunks, k=1)

[{'page_number': -25,
  'sentence_chunk': '17 a) The student earns at least 12 extra discipline credits from in-class courses. b) The student’s program includes a B. Tech.project. c) The student has a CGPA of 8.0 or more at graduation.  8.4 Graduating with a Minor A student enrolled in a B. Tech.program may also take a minor in some other area. Requirements for a minor in an area will be as stated in regulations for that minor. A student can graduate with a minor if he/she satisfies the requirements for his/her program as well as requirements for the minor. Requirement for each approved minor will be specified separately.  8.5 Award of Degrees (1) The Senate recommends a student, who completes all graduation requirements, to the Board of Governors (BOG) for the award of degree in the convocation.',
  'chunk_char_count': 764,
  'chunk_word_count': 133,
  'chunk_token_count': 191.0}]

In [16]:
# Get stats about our chunks
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,46.0,46.0,46.0,46.0
mean,-31.72,974.85,164.13,243.71
std,5.66,443.79,75.89,110.95
min,-41.0,74.0,10.0,18.5
25%,-36.0,736.75,129.0,184.19
50%,-32.5,1006.5,167.0,251.62
75%,-27.0,1300.25,221.75,325.06
max,-21.0,1765.0,300.0,441.25


In [18]:
# Show random chunks with under 30 tokens in length
min_token_length = 30
for row in df[df["chunk_token_count"] <= min_token_length].sample(5).iterrows():
    print(f'Chunk token count: {row[1]["chunk_token_count"]} | Text: {row[1]["sentence_chunk"]}')

ValueError: a must be greater than 0 unless no samples are taken

In [19]:
pages_and_chunks_over_min_token_len = df[df["chunk_token_count"] > min_token_length].to_dict(orient="records")
pages_and_chunks_over_min_token_len[:2]

[{'page_number': -41,
  'sentence_chunk': '1 REGULATIONS FOR B. TECH. PROGRAMS  For ORDINANCES details please visit at the following link:  https://iiitd.ac.in/sites/default/files/docs/education/BTech-Ordinances.pdf',
  'chunk_char_count': 172,
  'chunk_word_count': 18,
  'chunk_token_count': 43.0},
 {'page_number': -40,
  'sentence_chunk': '2  REGULATIONS FOR B. TECH. PROGRAMS 1 General (1) This document gives the general regulations applicable to all B. Tech.programs. Specific requirements for a particular B. Tech.program (e.g. B. Tech.in Computer Science and Engineering) are specified in regulations for that program. (2) While the Senate is the main statutory body for all academic matters, the Academic Affairs Committee (AAC), a standing committee of Senate, shall oversee matters related to the undergraduate and postgraduate programs. This committee shall be appointed by the Senate and shall have a term of two years. It will consist of DOAA, AAC Chair, Chair PG Affairs, Chair UG Aff

In [24]:
# Requires !pip install sentence-transformers
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2",
                                      device="cpu") # choose the device to load the model to (note: GPU will often be *much* faster than CPU)

# Create a list of sentences to turn into numbers
sentences = [
    "The Sentences Transformers library provides an easy and open-source way to create embeddings.",
    "Sentences can be embedded one by one or as a list of strings.",
    "Embeddings are one of the most powerful concepts in machine learning!",
    "Learn to use embeddings well and you'll be well on your way to being an AI engineer."
]

# Sentences are encoded/embedded by calling model.encode()
embeddings = embedding_model.encode(sentences)
embeddings_dict = dict(zip(sentences, embeddings))

# See the embeddings
for sentence, embedding in embeddings_dict.items():
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")

Sentence: The Sentences Transformers library provides an easy and open-source way to create embeddings.
Embedding: [-2.07982697e-02  3.03164832e-02 -2.01217849e-02  6.86484650e-02
 -2.55256221e-02 -8.47686455e-03 -2.07225574e-04 -6.32377118e-02
  2.81606894e-02 -3.33353989e-02  3.02633960e-02  5.30721806e-02
 -5.03527038e-02  2.62288321e-02  3.33313718e-02 -4.51577231e-02
  3.63044813e-02 -1.37122418e-03 -1.20171458e-02  1.14947259e-02
  5.04510924e-02  4.70856987e-02  2.11913940e-02  5.14606535e-02
 -2.03746483e-02 -3.58889401e-02 -6.67763175e-04 -2.94393823e-02
  4.95859198e-02 -1.05639677e-02 -1.52014112e-02 -1.31758570e-03
  4.48197424e-02  1.56023465e-02  8.60379430e-07 -1.21392624e-03
 -2.37978697e-02 -9.09368275e-04  7.34484056e-03 -2.53933994e-03
  5.23370504e-02 -4.68043424e-02  1.66214760e-02  4.71579395e-02
 -4.15599644e-02  9.01976076e-04  3.60277519e-02  3.42214219e-02
  9.68227163e-02  5.94829023e-02 -1.64984372e-02 -3.51249315e-02
  5.92516130e-03 -7.07903586e-04 -2.4103

In [21]:
single_sentence = "Yo! How cool are embeddings?"
single_embedding = embedding_model.encode(single_sentence)
print(f"Sentence: {single_sentence}")
print(f"Embedding:\n{single_embedding}")
print(f"Embedding size: {single_embedding.shape}")

Sentence: Yo! How cool are embeddings?
Embedding:
[-1.97448079e-02 -4.51075705e-03 -4.98487940e-03  6.55444935e-02
 -9.87675507e-03  2.72835791e-02  3.66426073e-02 -3.30219418e-03
  8.50080699e-03  8.24952591e-03 -2.28497498e-02  4.02430184e-02
 -5.75200468e-02  6.33692369e-02  4.43207286e-02 -4.49506305e-02
  1.25284856e-02 -2.52011903e-02 -3.55292968e-02  1.29559524e-02
  8.67019407e-03 -1.92917567e-02  3.55633814e-03  1.89505871e-02
 -1.47128217e-02 -9.39846132e-03  7.64174713e-03  9.62186605e-03
 -5.98921767e-03 -3.90168838e-02 -5.47824688e-02 -5.67455497e-03
  1.11644398e-02  4.08067666e-02  1.76319099e-06  9.15302895e-03
 -8.77257995e-03  2.39382777e-02 -2.32784543e-02  8.04999843e-02
  3.19177285e-02  5.12598269e-03 -1.47708189e-02 -1.62525009e-02
 -6.03213087e-02 -4.35689837e-02  4.51211371e-02 -1.79053824e-02
  2.63366513e-02 -3.47866826e-02 -8.89171939e-03 -5.47675304e-02
 -1.24372775e-02 -2.38606706e-02  8.33497122e-02  5.71241677e-02
  1.13328574e-02 -1.49595020e-02  9.2037

In [25]:
%%time

# Uncomment to see how long it takes to create embeddings on CPU
# # Make sure the model is on the CPU
# embedding_model.to("cpu")

# # Embed each chunk one by one
# for item in tqdm(pages_and_chunks_over_min_token_len):
#     item["embedding"] = embedding_model.encode(item["sentence_chunk"])

CPU times: user 10 µs, sys: 0 ns, total: 10 µs
Wall time: 20.3 µs


In [53]:
# Turn text chunks into a single list
text_chunks = [item["sentence_chunk"] for item in pages_and_chunks_over_min_token_len]

In [54]:
%%time

# Embed all texts in batches
text_chunk_embeddings = embedding_model.encode(text_chunks,
                                               batch_size=5, # you can use different batch sizes here for speed/performance, I found 32 works well for this use case
                                               convert_to_tensor=True) # optional to return embeddings as tensor instead of array

text_chunk_embeddings

CPU times: user 27.9 s, sys: 1.14 s, total: 29 s
Wall time: 29.1 s


tensor([[ 0.0012, -0.0886,  0.0008,  ...,  0.0061, -0.0317, -0.0027],
        [ 0.0234, -0.0509, -0.0048,  ..., -0.0324, -0.0372, -0.0266],
        [ 0.0049, -0.0366, -0.0114,  ..., -0.0194, -0.0316, -0.0363],
        ...,
        [ 0.0127, -0.0358, -0.0062,  ..., -0.0530, -0.0655, -0.0387],
        [ 0.0197, -0.0206, -0.0188,  ..., -0.0500, -0.0413, -0.0386],
        [ 0.0065, -0.0465,  0.0058,  ..., -0.0033, -0.0560, -0.0319]])

In [55]:
# Save embeddings to file
text_chunks_and_embeddings_df = pd.DataFrame(pages_and_chunks_over_min_token_len)
# # Convert tensor to list of lists (detached from GPU if needed)
# text_chunk_embeddings_list = text_chunk_embeddings.cpu().numpy().tolist()

# # Add embeddings to the dataframe
# text_chunks_and_embeddings_df["embedding"] = text_chunk_embeddings_list

# # Now save to CSV
# text_chunks_and_embeddings_df.to_csv("text_chunks_and_embeddings_df.csv", index=False)
embeddings_df_save_path = "text_chunks_and_embeddings_df.csv"
text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path, index=False)

In [60]:

# Import saved file and view
text_chunks_and_embeddings_df_load = pd.read_csv(embeddings_df_save_path)
text_chunks_and_embeddings_df_load.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count
0,-41,1 REGULATIONS FOR B. TECH. PROGRAMS For ORDIN...,172,18,43.0
1,-40,2 REGULATIONS FOR B. TECH. PROGRAMS 1 General...,942,146,235.5
2,-40,(3) Any condition arising in the B. Tech.progr...,1143,176,285.75
3,-40,Starts after first week of January and ends ar...,416,70,104.0
4,-39,"3 demos, etc. The summer term is about half th...",1310,214,327.5


In [61]:
print(text_chunks_and_embeddings_df.columns)

Index(['page_number', 'sentence_chunk', 'chunk_char_count', 'chunk_word_count',
       'chunk_token_count'],
      dtype='object')


In [69]:
import random

import torch
import numpy as np
import pandas as pd

device = "cuda" if torch.cuda.is_available() else "cpu"

# Import texts and embedding df
text_chunks_and_embeddings_df = pd.read_csv("text_chunks_and_embeddings_df.csv")

# Convert embedding column back to np.array (it got converted to string when it got saved to CSV)
text_chunks_and_embeddings_df["embedding"] = text_chunks_and_embeddings_df["embeddings"].apply(lambda x: np.fromstring(x.strip("[]"), sep="  "))

# Convert texts and embedding df to list of dicts
pages_and_chunks = text_chunks_and_embeddings_df.to_dict(orient="records")

# Convert embeddings to torch tensor and send to device (note: NumPy arrays are float64, torch tensors are float32 by default)
embeddings = torch.tensor(np.array(text_chunks_and_embeddings_df["embedding"].tolist()), dtype=torch.float32).to(device)
embeddings.shape

KeyError: 'embeddings'