In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install PyMuPDF sentence-transformers transformers scikit-learn openai



In [None]:
import getpass
import os

import fitz
import numpy as np
from openai import OpenAI
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer

In [None]:
# Location of the PDF whose text is extracted and indexed below.
# NOTE(review): this path sits directly under /content, not under the Drive
# mount point /content/drive — confirm the file is actually placed here.
pdf_path = "/content/Human-Nutrition-2020.pdf"

In [None]:
def extract_text_from_pdf(pdf_path):
    """Return the concatenated plain text of every page in a PDF.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        A single string holding each page's ``get_text()`` output in page
        order, with nothing inserted between pages.
    """
    # The context manager closes the document even if a page fails to parse;
    # previously the file handle was left open for the life of the kernel.
    with fitz.open(pdf_path) as doc:
        # join() avoids re-copying the growing string once per page.
        return "".join(page.get_text() for page in doc)

# Pull the text of every page of the PDF into one string.
pdf_text = extract_text_from_pdf(pdf_path)

In [None]:
def clean_text(text):
    """Collapse every run of whitespace in ``text`` to a single space.

    Newlines, tabs and repeated spaces are all treated as separators, and
    leading/trailing whitespace is dropped.

    Args:
        text: The raw string to normalise.

    Returns:
        The words of ``text`` joined by single spaces.
    """
    # str.split() with no argument already splits on newlines as well as
    # spaces and tabs, so no separate newline replacement is needed.
    words = text.split()
    return ' '.join(words)

# Normalise whitespace before the text is split into chunks.
cleaned_text = clean_text(pdf_text)

In [None]:
def chunk_text(text, chunk_size=1000, overlap=100):
    """Split ``text`` into fixed-size character windows that overlap.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so
    each chunk repeats the last ``overlap`` characters of the previous one.
    The final chunk may be shorter than ``chunk_size``.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by neighbouring chunks; must be
            smaller than ``chunk_size``.

    Returns:
        A list of chunk strings in document order (empty for empty ``text``).

    Raises:
        ValueError: If ``chunk_size`` is not positive, or if ``overlap`` is
            not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    step = chunk_size - overlap
    # A non-positive step would never advance the window; without this check
    # the function would loop forever.
    if step <= 0:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]

# Split the cleaned text with the default chunk_size/overlap and report how
# many chunks that produced.
chunks = chunk_text(cleaned_text)
print(f"Total chunks: {len(chunks)}")

Total chunks: 1506


In [None]:
# Tokenize every chunk with the 'bert-base-uncased' tokenizer and print the
# shape of the resulting id tensor. AutoTokenizer is provided by the
# `transformers` import in the notebook's import cell; without that import this
# cell raises NameError on a fresh kernel.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# truncation + padding='max_length' force every row to exactly max_length ids,
# so the printed shape is (number of chunks, 512) whatever the chunk lengths.
tokens = tokenizer(
    chunks,
    truncation=True,
    padding='max_length',
    max_length=512,
    return_tensors='pt',
)

print(tokens['input_ids'].shape)

torch.Size([1506, 512])


In [None]:
# Load the 'all-MiniLM-L6-v2' sentence-transformer and embed every chunk,
# showing a progress bar while the batches are encoded.
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(chunks, show_progress_bar=True)

Batches:   0%|          | 0/48 [00:00<?, ?it/s]

In [None]:
# Persist the chunk embeddings to disk and read them straight back.
np.save('embeddings.npy', embeddings)

loaded_embeddings = np.load('embeddings.npy')
print(loaded_embeddings.shape)

# Ask for the question interactively and embed it with the same model.
# The query is wrapped in a one-item list so the result is a batch that can be
# compared against the chunk embeddings with cosine_similarity.
query = input("Enter your question: ")
query_embedding = model.encode([query])



(1506, 384)


In [None]:
# Score every chunk against the query; row 0 holds the scores for the single
# query that was embedded.
similarities = cosine_similarity(query_embedding, loaded_embeddings)
query_scores = similarities[0]

# Ascending argsort reversed gives best-first order; keep the five best.
top_k_idx = np.argsort(query_scores)[::-1][:5]

# Show a 500-character preview of each hit and collect the full chunks.
retrieved_chunks = []
for rank, idx in enumerate(top_k_idx, start=1):
    chunk = chunks[idx]
    retrieved_chunks.append(chunk)
    print(f"Chunk {rank} (score: {query_scores[idx]:.4f}): {chunk[:500]}...\n")

# The retrieved chunks, separated by blank lines, form the context block.
context = "\n\n".join(retrieved_chunks)

prompt = f"""Answer the following question based on the provided context.
If the answer cannot be found in the context, say you don't know.

Question: {query}

Context: {context}
"""


Chunk 1 (score: 0.6497): ins are macromolecules composed of chains of subunits called amino acids. Amino acids are simple subunits composed of carbon, oxygen, hydrogen, and nitrogen. Food sources of proteins include meats, dairy products, seafood, and a variety of different plant- based foods, most notably soy. The word protein comes from a Greek word meaning “of primary importance,” which is an apt description of these macronutrients; they are also known colloquially as the “workhorses” of life. Proteins provide four k...

Chunk 2 (score: 0.6375): al protein. The protein hemoglobin is an example of a protein that has quaternary structure. It is composed of four peptides that bond together to form a functional oxygen carrier. A protein’s structure also influences its nutritional quality. Large fibrous protein structures are more difficult to digest than smaller proteins and some, such as keratin, are indigestible. Because digestion of some fibrous proteins is incomplete, not all of the

In [None]:
# Never store the API key in the notebook. Read it from the OPENAI_API_KEY
# environment variable, and if that is unset or empty, prompt for it without
# echoing the input.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY") or getpass.getpass("OpenAI API key: ")
)

# Send the retrieval-augmented prompt as the user message of a single-turn
# chat; temperature=0 asks for the least random sampling.
response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt}
    ],
    temperature=0
)

# Print the text of the first (only requested) completion.
print(response.choices[0].message.content)

Proteins are macromolecules composed of chains of subunits called amino acids. They are crucial for the nourishment, renewal, and continuance of life, providing structure to bones, muscles, and skin, and playing a role in conducting chemical reactions in the body. Proteins are often referred to as the "workhorses" of life due to their involvement in various bodily functions, including muscle movement, immune system function, and digestion. Each protein has a unique structure determined by the sequence of amino acids, which influences its function.
