In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
# Calls kagglehub.login() so that later kagglehub calls in this kernel can
# reach private data sources (presumably by prompting for Kaggle
# credentials -- confirm against the kagglehub documentation).
import kagglehub
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# Imported here so this cell also runs on a fresh kernel when the separate
# login cell has been skipped or deleted (otherwise `kagglehub` would be an
# undefined name at the call below).
import kagglehub

# Downloads the 'biloxx/nutritiondataset' dataset and keeps the value
# returned by kagglehub in a module-level variable for later cells.
biloxx_nutritiondataset_path = kagglehub.dataset_download('biloxx/nutritiondataset')

print('Data source import complete.')


In [None]:
# =============================
# FULL RAG NOTEBOOK (single cell)
# =============================
# 1) Install dependencies
!pip install -q \
    langchain \
    langchain-community \
    faiss-cpu \
    sentence-transformers \
    transformers \
    pymupdf \
    accelerate \
    bitsandbytes \
    torch \
    tqdm

# 2) Imports
import os
import fitz   # PyMuPDF
from tqdm.auto import tqdm

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain_community.llms import HuggingFacePipeline
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

# 3) Auto-detect PDFs in ../input
pdf_files = []
for root, dirs, files in os.walk("../input"):
    for f in files:
        if f.lower().endswith(".pdf"):
            pdf_files.append(os.path.join(root, f))

if not pdf_files:
    raise FileNotFoundError("No PDF files found in ../input. Please attach a dataset with PDFs to the notebook.")

print("Found PDF files:")
for p in pdf_files:
    print(" -", p)

# 4) Extract text from all PDFs (concatenate)
all_text = ""
for pdf_path in pdf_files:
    print(f"\nExtracting from: {pdf_path}")
    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        print(f"  Failed to open {pdf_path}: {e}")
        continue
    for page in doc:
        # get_text() is simple and usually good; if you need layout-aware extraction, adapt here
        all_text += page.get_text()
    doc.close()

print(f"\nTotal extracted characters: {len(all_text)}")
if len(all_text) < 50:
    raise ValueError("Extracted very little text — check that the PDFs actually contain extractable text (not scanned images).")

# 5) Split text into chunks for retrieval
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
texts = splitter.split_text(all_text)
print("Total chunks:", len(texts))

# Optional: show a sample chunk
print("\n--- Sample chunk ---\n")
print(texts[0][:500])
print("\n--- End sample ---\n")

# 6) Create embeddings and FAISS vector store
print("Creating embeddings... (this may take a few minutes)")
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
# Build FAISS DB
db = FAISS.from_texts(texts, embeddings)
retriever = db.as_retriever(search_kwargs={"k": 4})
print("Vector store ready (FAISS).")

# 7) Load local LLM (google/flan-t5-base)
#    NOTE: This model is CPU-friendly but still may take time to download.
model_name = "google/flan-t5-base"
print(f"Loading model {model_name} ... (may take a minute)")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_length=512)
llm = HuggingFacePipeline(pipeline=pipe)
print("Model loaded.")

# 8) Build RetrievalQA RAG chain
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",   # 'stuff' works for shorter contexts; change if you want other chain types
)
print("RAG pipeline ready. You can now ask questions.")

# 9) Interactive query loop (type 'exit' to stop)
print("\nType queries to ask about the PDF(s). Type 'exit' to quit.")
while True:
    try:
        query = input("\nYour question: ").strip()
    except Exception:
        # In some Kaggle environments input() may not behave interactively - handle gracefully
        print("Interactive input not available in this environment. To run a one-off query, set 'query' variable and call qa.run(query).")
        break
    if query.lower() in ("exit", "quit"):
        print("Exiting interactive loop.")
        break
    if not query:
        print("Please type a question or 'exit'.")
        continue

    try:
        # Run retrieval + generation
        answer = qa.run(query)
        print("\nAnswer:\n", answer)
    except Exception as e:
        print("Error during QA:", e)
        # fall back to showing top retrieved chunks (useful for debugging)
        try:
            docs = retriever.get_relevant_documents(query)
            print("\nTop retrieved chunks (for debugging):")
            for i, d in enumerate(docs[:3], 1):
                print(f"\n--- chunk {i} ---\n{d.page_content[:800]}\n")
        except Exception as e2:
            print("Also failed to retrieve debug docs:", e2)

# 10) Optional: save FAISS index to disk for reuse
save_dir = "/kaggle/working/faiss_index"
os.makedirs(save_dir, exist_ok=True)
db.save_local(save_dir)
print(f"FAISS index saved to: {save_dir}")
print("Done.")


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m32.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m50.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m71.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.1/60.1 MB[0m [31m27.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m80.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m59.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━

2025-10-19 20:01:59.712401: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1760904119.990138      37 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1760904120.068247      37 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Found PDF files:
 - ../input/nutritiondataset/nutrition.pdf

Extracting from: ../input/nutritiondataset/nutrition.pdf

Total extracted characters: 3076893
Total chunks: 3582

--- Sample chunk ---

VITAMIN
VITAMIN
PANTOTHENIC
LIFE STAGE
VITAMIN A
VITAMIN C
VITAMIN D
VITAMIN E
VITAMIN K
THIAMIN
RIBOFLAVIN
NIACIN
B6
FOLATE
B12
ACID
BIOTIN
CHOLINEg
GROUP
(G/DAY)a
(MG/DAY)
(G/DAY)b,c
(MG/DAY)d
(G/DAY)
(MG/DAY)
(MG/DAY)
(MG/DAY)e
(MG/DAY)
(G/DAY)f
(G/DAY)
(MG/DAY)
(G/DAY)
(MG/DAY)
INFANTS
0–6 mo
400*
40*
5*
4*
2.0*
0.2*
0.3*
2*
0.1*
65*
0.4*
1.7*
5*
125*
7–12 mo
500*
50*
5*
5*
2.5*
0.3*
0.4*
4*
0.3*
80*
0.5*
1.8*
6*
150*
CHILDREN
1–3 y
300
15
5*
6
30*
0.5
0.5
6
0.5
150
0.9
2*
8*
200*
4–8 

--- End sample ---

Creating embeddings... (this may take a few minutes)


  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Vector store ready (FAISS).
Loading model google/flan-t5-base ... (may take a minute)


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device set to use cpu
  llm = HuggingFacePipeline(pipeline=pipe)


Model loaded.
RAG pipeline ready. You can now ask questions.

Type queries to ask about the PDF(s). Type 'exit' to quit.



Your question:  How many calories are in 100g of cooked quinoa?


  answer = qa.run(query)
Token indices sequence length is longer than the specified maximum sequence length for this model (1301 > 512). Running this sequence through the model will result in indexing errors



Answer:
 100g of cooked quinoa.



Your question:  What vitamins are especially important for bone health?



Answer:
 Vitamin D


KeyboardInterrupt: Interrupted by user