### ✅ SECTION 0: Setup & Imports + Globals


In [58]:
# 📦 Standard & Typing
import os
import time
from typing import List, Dict, Union, Any

# 📁 File Parsing
import fitz  # PyMuPDF for PDFs
import docx2txt
import pandas as pd
import langdetect

# 🤖 LLM & Embedding
from llama_cpp import Llama
import tiktoken
from tiktoken import get_encoding
from chromadb import PersistentClient
from sentence_transformers import SentenceTransformer
import torch

# 📊 Evaluation
from rouge_score import rouge_scorer

# 🌍 Translation (fallback if needed)
from transformers import pipeline

# 🧠 Memory
previous_qas = []

# ⚙️ Global Configuration
USE_GPU = True
GPU_LAYERS = 20  
ENCODER = get_encoding("cl100k_base")

# ⚡ Performance Logger
def log_tokens(text: str, seconds: float, label="LLM"):
    tokens = len(ENCODER.encode(text))
    tps = tokens / seconds if seconds > 0 else 0
    print(f"🔢 {label}: {tokens} tokens in {seconds:.2f}s → {tps:.2f} tokens/sec")

print("✅ All libraries loaded and configurations set.")




✅ All libraries loaded and configurations set.


### ✅ SECTION 1: Document Extraction & Chunking

#### Ensures proper text extraction from .pdf, .docx, .csv, .xls[x/m].

All extracted outputs are standardized into a consistent format.

Table content is flattened but readable.

Uses smarter chunking with overlap for better LLM context.

📥 1A. Document Extraction

In [59]:
def extract_text(file_path: str) -> List[Dict[str, Union[str, int]]]:
    ext = file_path.lower().split('.')[-1]
    filename = os.path.basename(file_path)

    if ext == 'pdf':
        doc = fitz.open(file_path)
        return [
            {
                "source": filename,
                "page": i + 1,
                "text": page.get_text()
            } for i, page in enumerate(doc)
        ]

    elif ext == 'docx':
        text = docx2txt.process(file_path)
        return [{
            "source": filename,
            "page": 1,
            "text": text
        }]

    elif ext in ['xls', 'xlsx', 'xlsm']:
        xls = pd.ExcelFile(file_path)
        results = []
        for sheet in xls.sheet_names:
            df = xls.parse(sheet).astype(str)
            results.append({
                "source": filename,
                "page": sheet,
                "text": df.to_string(index=False)
            })
        return results

    elif ext == 'csv':
        df = pd.read_csv(file_path).astype(str)
        return [{
            "source": filename,
            "page": 1,
            "text": df.to_string(index=False)
        }]

    else:
        return [{
            "source": filename,
            "page": 1,
            "text": f"[UNSUPPORTED FILE TYPE: {ext}]"
        }]


🧩 1B. Smarter Chunking with Overlap

In [None]:
def chunk_text(text: str, max_tokens: int = 500, overlap: int = 50) -> List[str]:
    tokens = ENCODER.encode(text)
    chunks = []
    start = 0
    while start < len(tokens):
        end = min(start + max_tokens, len(tokens))
        chunk = ENCODER.decode(tokens[start:end])
        chunks.append(chunk)
        start += max_tokens - overlap
    return chunks


def detect_table_like(text: str) -> bool:
    """Heuristic: if lots of tabs or linebreaks, likely a table."""
    return text.count('\t') > 5 or text.count('\n') > 10

def chunk_documents(docs: List[Dict[str, Union[str, int]]]) -> List[Dict[str, Any]]:
    all_chunks = []
    for doc in docs:
        file_name = doc["source"]
        page = doc["page"]
        chunk_type = "table" if detect_table_like(doc["text"]) else "text"
        split_chunks = chunk_text(doc["text"])
        for i, chunk in enumerate(split_chunks):
            all_chunks.append({
                "source": file_name,
                "page": page,
                "chunk_number": i + 1,
                "type": chunk_type,
                "text": chunk
            })
    return all_chunks


🗂️ 1C. Load All Files and Chunk

In [61]:
def load_and_chunk_all_documents(folder: str = "Dr.X Files") -> List[Dict[str, Any]]:
    all_docs = []
    for file in os.listdir(folder):
        path = os.path.join(folder, file)
        if os.path.isfile(path):
            extracted = extract_text(path)
            all_docs.extend(extracted)

    print(f"📄 Loaded and extracted {len(all_docs)} document entries.")
    chunks = chunk_documents(all_docs)
    print(f"🧩 Split into {len(chunks)} chunks.")
    return chunks

# Load and prepare chunks
chunks = load_and_chunk_all_documents("Dr.X Files")



  warn("""Cannot parse header or footer so it will be ignored""")

  warn("""Cannot parse header or footer so it will be ignored""")

  warn(msg)

  warn(f"Print area cannot be set to Defined name: {defn.value}.")



📄 Loaded and extracted 220 document entries.
🧩 Split into 508 chunks.


### ✅ SECTION 2: Embedding + Vector DB Storage (ChromaDB + nomic)

Connects to a persistent local ChromaDB client.

embedding model via embedding_functions.

Stores text chunks along with metadata (source, page, chunk_number).

Includes performance logging (tokens/sec).

#### 🧠 2A. Setup Vector Store with nomic

🧠 1. Custom Local Embedding Function

In [62]:
# class LocalEmbeddingFunction:

class LocalEmbeddingFunction:
    def __init__(self):
       
        device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = device
        # self.model = SentenceTransformer("all-MiniLM-L6-v2", device=device)
        self.model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5", trust_remote_code=True, device=device)

        print(f"🟢 SentenceTransformer loaded on {self.device}")

    def __call__(self, input: list[str]) -> list[list[float]]:
        return self.model.encode(input, convert_to_numpy=True, device="cuda" if torch.cuda.is_available() else "cpu").tolist()

    def embed_query(self, text: str) -> list[float]:
        return self.__call__([text])[0]



🧠 2. Use It in ChromaDB Setup

In [63]:
# 🌐 Initialize ChromaDB  localy embedding
embedding_fn = LocalEmbeddingFunction()

client = PersistentClient(path="vector_store")
collection = client.get_or_create_collection(
    name="dr_x_chunks",
    embedding_function=embedding_fn
)





🟢 SentenceTransformer loaded on cuda


🧠 2B. Index Chunks with Embeddings

In [64]:
from uuid import uuid4

def index_chunks_in_vector_db(chunks: List[Dict[str, Any]]):
    for chunk in chunks:
        uid = str(uuid4())
        metadata = {
            "source": chunk["source"],
            "page": str(chunk["page"]),
            "chunk_number": chunk["chunk_number"]
        }

        # Time embedding performance
        start_time = time.time()
        collection.add(
            documents=[chunk["text"]],
            metadatas=[metadata],
            ids=[uid]
        )
        duration = time.time() - start_time
        log_tokens(chunk["text"], duration, label="Embedding")

    print(f"✅ Indexed {len(chunks)} chunks into ChromaDB.")

# Index now
index_chunks_in_vector_db(chunks)


🔢 Embedding: 500 tokens in 0.03s → 16129.46 tokens/sec
🔢 Embedding: 500 tokens in 0.02s → 21590.08 tokens/sec
🔢 Embedding: 500 tokens in 0.02s → 21776.15 tokens/sec
🔢 Embedding: 500 tokens in 0.02s → 22809.51 tokens/sec
🔢 Embedding: 500 tokens in 0.02s → 21920.23 tokens/sec
🔢 Embedding: 500 tokens in 0.03s → 14350.29 tokens/sec
🔢 Embedding: 500 tokens in 0.02s → 22074.59 tokens/sec
🔢 Embedding: 500 tokens in 0.02s → 21436.92 tokens/sec
🔢 Embedding: 500 tokens in 0.02s → 21053.63 tokens/sec
🔢 Embedding: 500 tokens in 0.02s → 21146.19 tokens/sec
🔢 Embedding: 500 tokens in 0.03s → 16883.92 tokens/sec
🔢 Embedding: 500 tokens in 0.03s → 19183.96 tokens/sec
🔢 Embedding: 500 tokens in 0.02s → 20798.68 tokens/sec
🔢 Embedding: 500 tokens in 0.02s → 20601.52 tokens/sec
🔢 Embedding: 500 tokens in 0.03s → 19570.84 tokens/sec
🔢 Embedding: 500 tokens in 0.03s → 18218.20 tokens/sec
🔢 Embedding: 220 tokens in 0.02s → 9072.51 tokens/sec
🔢 Embedding: 500 tokens in 0.02s → 22677.31 tokens/sec
🔢 Embedding

✅ SECTION 3: RAG Q&A System (with LLaMA + Conversational Memory)

Uses question embedding to query ChromaDB.

Retrieves top-k relevant chunks using cosine similarity.

Generates the answer using local LLaMA.

Maintains multi-turn memory.




🧠 3A. Load LLaMA Model (via llama.cpp)

In [65]:
USE_GPU = True
GPU_LAYERS = 40  # You can adjust based on your GPU RAM // 32-40 is typical for 8GB 
llm = Llama(
    model_path="models/llama3-8B.gguf",
    # n_ctx=2048,
    n_ctx=3008,
    n_threads=16,
    n_gpu_layers=GPU_LAYERS if USE_GPU else 0,
    use_mlock=True,
    verbose=False
)

print("✅ LLaMA model loaded.")





llama_init_from_model: n_ctx_per_seq (3008) < n_ctx_train (1048576) -- the full capacity of the model will not be utilized


✅ LLaMA model loaded.


❓ 3B. Ask Question with Retrieval & Generation

In [66]:
def ask_question(query: str, k: int = 5) -> str:
    global previous_qas

    # 1️ Embed the query
    query_embedding = embedding_fn.embed_query(query)

    # 2️ Retrieve top-k documents
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=k,
        include=["documents", "metadatas"]
    )

    # 3️ Prepare prompt with context
    context_blocks = []
    for i in range(len(results["documents"][0])):
        meta = results["metadatas"][0][i]
        doc = results["documents"][0][i]
        block = f"Source: {meta['source']} (Page: {meta['page']}, Chunk: {meta['chunk_number']})\n{doc}"
        context_blocks.append(block)

    context_text = "\n\n".join(context_blocks)
    history_text = "\n".join([f"Q: {q}\nA: {a}" for q, a in previous_qas])
    prompt = f"""
# You are an intelligent assistant trained to answer questions using only the context provided.
You are a helpful AI researcher. Answer the user's question using ONLY the context below.


{history_text}

Context:
{context_text}

Q: {query}
A:""".strip()

    # 4️ Generate Answer
    start_time = time.time()
    output = llm(prompt, max_tokens=300, stop=["Q:", "User:"], echo=False)
    response = output['choices'][0]['text'].strip()
    duration = time.time() - start_time
    log_tokens(prompt, duration, label="LLM Generation")

    # 5️ Update memory
    previous_qas.append((query, response))

    return response


In [67]:
response = ask_question("What was Dr. X researching?")
print("\n🧠 Answer:\n", response)

🔢 LLM Generation: 2682 tokens in 10.71s → 250.52 tokens/sec

🧠 Answer:
 According to the context, Dr. X was researching new approaches and procedures for cancer treatment. Specifically, they were looking into gene therapy, targeted therapy, immunotherapy, and stem cell therapy as potential cancer treatments. They also analyzed clinical trials to gather information on these treatments and their potential outcomes. Dr. X was interested in developing safe and efficient cancer nanomedicines and investigating alternative treatments such as thermal ablation and magnetic hyperthermia. The research was aimed at improving prognosis and outcomes for cancer patients.


In [68]:
# # Reset memory
# reset_session()

### ✅ SECTION 4: Translation (Any Language → English or Arabic)

Language auto-detection using langdetect.

Translation into English or Arabic using a local LLM, or fallback to transformers pipeline if needed.

Option to preserve document format.

🌍 4A. Translation Tool (LLM-powered or Transformers)

In [69]:
def get_offline_translator(local_model_path="./models/opus-mt-mul-en", device=-1):
    from transformers import pipeline
    return pipeline("translation", model=local_model_path, device=device, local_files_only=True)

🔁 4B. Translate Text to English or Arabic

In [70]:
def translate_text(text: str, target_lang: str = "en") -> str:
    detected = langdetect.detect(text)
    if detected == target_lang:
        return text

    prompt = f"Translate this text from {detected} to {target_lang}:\n\n{text}\n\nTranslation:"
    try:
        # Try translating with LLaMA (simple prompt)
        start_time = time.time()
        output = llm(prompt, max_tokens=512, stop=["\n\n"], echo=False)
        translated = output['choices'][0]['text'].strip()
        duration = time.time() - start_time
        log_tokens(prompt, duration, label="Translation")
    except Exception as e:
        print(f"⚠️ LLaMA failed, using transformer fallback: {e}")
        translated = get_offline_translator(text, src_lang=detected, tgt_lang=target_lang)[0]['translation_text']

    return translated


📂 4C. Translate a Whole Document

In [71]:
def translate_document(doc: Dict[str, Any], target_lang: str = "en") -> Dict[str, Any]:
    translated_text = translate_text(doc["text"], target_lang=target_lang)
    return {
        "source": doc["source"],
        "page": doc["page"],
        "translated_text": translated_text
    }


🧪 Example: Translate Document to Arabic

In [72]:
sample_doc = chunks[0]  # You can pick any chunk or document
translated_doc = translate_document(sample_doc, target_lang="ar")

print("🌐 Translated Output:\n")
print(translated_doc["translated_text"])


🔢 Translation: 511 tokens in 0.67s → 766.14 tokens/sec
🌐 Translated Output:

نقدم هنا تفاصيل 10 دراسة منشورة حول إستخدام الأعشاب الوعرية في إزالة الكربون من التربة.


### ✅ SECTION 5: Summarization + ROUGE Evaluation
* LLM-based summarization using your local llama_cpp model.

* ROUGE metric to evaluate summary quality.

* Modular design: can summarize per document, per chunk, or whole corpus.

* Support for different summarization strategies (basic prompt engineering).



🧠 5A. Summarization Function via LLaMA

In [73]:
def summarize_text(text: str, strategy: str = "default") -> str:
    if strategy == "default":
        prompt = f"Summarize the following scientific text:\n\n{text}\n\nSummary:"
    elif strategy == "insight":
        prompt = f"Extract the key findings and main ideas from this text:\n\n{text}\n\nKey Findings:"
    else:
        prompt = f"Give a brief overview of the text:\n\n{text}\n\nOverview:"

    start_time = time.time()
    output = llm(prompt, max_tokens=256, stop=["\n\n"], echo=False)
    summary = output['choices'][0]['text'].strip()
    duration = time.time() - start_time
    log_tokens(prompt, duration, label="Summarization")
    return summary


🧪 5B. Run Summarization on a Chunk or Full Text

In [103]:
# Example: summarizing one of the document chunks
sample_text = chunks[0]['text']
summary = summarize_text(sample_text, strategy="insight")

print("📝 Summary:\n", summary)


🔢 Summarization: 516 tokens in 1.10s → 471.22 tokens/sec
📝 Summary:
 The studies in this meta-analysis were conducted in the Mediterranean climate region and were focused on the effects of turfgrass on soil organic carbon (SOC) levels. The findings suggest that the impact of turfgrass on SOC levels is dependent on the type of turfgrass used and the conditions under which it is grown.


📊 5C. Evaluate with ROUGE Score

In [75]:
def evaluate_summary(reference: str, generated: str):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference, generated)

    print("\n📈 ROUGE Evaluation:")
    for k, v in scores.items():
        print(f"{k}: P={v.precision:.3f}, R={v.recall:.3f}, F1={v.fmeasure:.3f}")

    return scores


📊 5C. Evaluate with ROUGE Score



In [76]:
def evaluate_summary(reference: str, generated: str):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference, generated)

    print("\n📈 ROUGE Evaluation:")
    for k, v in scores.items():
        print(f"{k}: P={v.precision:.3f}, R={v.recall:.3f}, F1={v.fmeasure:.3f}")

    return scores


🧪 5D. Test ROUGE on Manual Summary

In [107]:
# Reference summary
# reference_summary = "Dr. X focused on the fields of new cancer treatments, gene therapy, and stem cell therapy in his latest publications."
reference_summary = "The SOC stock in the top 10 cm of soil in turfgrass plots was 0.75 to 3.0 times higher than that in non-turf plots."


evaluate_summary(reference_summary, summary)



📈 ROUGE Evaluation:
rouge1: P=0.167, R=0.333, F1=0.222
rouge2: P=0.019, R=0.038, F1=0.025
rougeL: P=0.111, R=0.222, F1=0.148


{'rouge1': Score(precision=0.16666666666666666, recall=0.3333333333333333, fmeasure=0.2222222222222222),
 'rouge2': Score(precision=0.018867924528301886, recall=0.038461538461538464, fmeasure=0.02531645569620253),
 'rougeL': Score(precision=0.1111111111111111, recall=0.2222222222222222, fmeasure=0.14814814814814814)}

### ✅ SECTION 6: System Logging, Error Handling 

Token-per-second tracking 

Edge case handling: empty retrievals, hallucinations.

Safe fallback prompts.

Summary of utilities and final save/export options.

🛡️ 6A. Fallback for No Retrievals

In [78]:
def safe_ask_question(query: str, k: int = 5) -> str:
    global previous_qas

    query_embedding = embedding_fn.embed_query(query)

    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=k,
        include=["documents", "metadatas"]
    )

    if not results["documents"][0]:
        return "❌ Sorry, I couldn't find relevant information in Dr. X's documents."

    context_blocks = []
    for i in range(len(results["documents"][0])):
        meta = results["metadatas"][0][i]
        doc = results["documents"][0][i]
        context_blocks.append(f"Source: {meta['source']} (Page {meta['page']}, Chunk {meta['chunk_number']})\n{doc}")

    context_text = "\n\n".join(context_blocks)
    history_text = "\n".join([f"Q: {q}\nA: {a}" for q, a in previous_qas])
    prompt = f"""
You are a careful assistant. Respond using only the context provided.

{history_text}

Context:
{context_text}

Q: {query}
A:""".strip()

    try:
        start_time = time.time()
        output = llm(prompt, max_tokens=300, stop=["Q:", "User:"], echo=False)
        response = output['choices'][0]['text'].strip()
        duration = time.time() - start_time
        log_tokens(prompt, duration, label="Safe RAG")
    except Exception as e:
        response = f"❗ Generation failed: {e}"

    previous_qas.append((query, response))
    return response


🧾 6B. Utility: Reset Memory and Cache

In [79]:
def reset_session():
    global previous_qas
    previous_qas = []
    print("🧠 Conversation memory cleared.")


💾 6C. Optional: Save Summaries or Translations

In [80]:
def save_to_txt(text: str, filename: str):
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(text)
    print(f"💾 Saved to {filename}")


#### 🧠 6D. Full Pipeline Test

In [100]:
# Reset memory
reset_session()

# Ask a question from scratch
qa = safe_ask_question("What fields did Dr. X focus on in his latest publications?")
print("🧠 Answer:\n", qa)

# Translate a response to Arabic
print("\n🌍 Arabic Translation:\n", translate_text(qa, target_lang="ar"))

# Summarize that same answer
print("\n📄 Summary:\n", summarize_text(qa, strategy="insight"))


🧠 Conversation memory cleared.
🔢 Safe RAG: 2657 tokens in 1.59s → 1674.57 tokens/sec
🧠 Answer:
 Dr. X focused on the fields of cancer nanomedicines, gene therapy, stem cell therapy, and targeted therapy in his latest publications. He is particularly interested in exploring the potential benefits and limitations of these new approaches for the treatment of cancer.
🔢 Translation: 60 tokens in 1.26s → 47.46 tokens/sec

🌍 Arabic Translation:
 دكتور إكس تتمحور اهتماماته في مجال العلاج النانو لمعظم أنواع السرطان، والtherapy الجينية، والtherapy خلايا الجذع، والtherapy المستهدفة في آخر ما نشره. ويبدو أن دكتور إكس مهتم بشكل خاص في استكشاف المنافع المحتملة ونقاط الضعف لهذه الطرق الجديدة للعلاج للسرطان.
🔢 Summarization: 65 tokens in 1.75s → 37.20 tokens/sec

📄 Summary:
 Dr. X's research has shown that cancer nanomedicines can improve the delivery of chemotherapy agents to tumor cells while reducing systemic toxicity. Gene therapy, which involves delivering therapeutic genes directly to cancer cel

In [82]:
# #Testing Torch GPU acceleration availability
# import torch
# print("CUDA available:", torch.cuda.is_available())
# print("CUDA device:", torch.cuda.get_device_name(0))


# ✅ Extra Utilities for Enhanced Evaluation, Table Detection, and Creativity

#### 🧪 6E. Manual Q&A Evaluation

In [83]:
# Allows you to manually score LLM answers by relevance, factual accuracy, fluency, and confidence.
def evaluate_qa_manual(question: str, answer: str, reference: str = "") -> Dict[str, float]:
    print(f"Q: {question}\nA: {answer}\nReference: {reference}\n")
    relevance = float(input("Relevance (0-1): "))
    factual = float(input("Factual Accuracy (0-1): "))
    fluency = float(input("Fluency (0-1): "))
    confidence = float(input("Confidence (0-1): "))
    return {
        "Relevance": relevance,
        "Factual Accuracy": factual,
        "Fluency": fluency,
        "Confidence": confidence
    }

### 🧩 6F. Table Detection in Text (Used during chunking)

In [84]:
def detect_table_like(text: str) -> bool:
    """Heuristic: if lots of tabs or linebreaks, likely a table."""
    return text.count('\t') > 5 or text.count('\n') > 10

### 🔎 6G. Flag Chunk Type (text vs table) during chunking

In [85]:
# this to tag chunks

def chunk_documents(docs: List[Dict[str, Union[str, int]]]) -> List[Dict[str, Any]]:
    all_chunks = []
    for doc in docs:
        file_name = doc["source"]
        page = doc["page"]
        chunk_type = "table" if detect_table_like(doc["text"]) else "text"
        split_chunks = chunk_text(doc["text"])
        for i, chunk in enumerate(split_chunks):
            all_chunks.append({
                "source": file_name,
                "page": page,
                "chunk_number": i + 1,
                "type": chunk_type,
                "text": chunk
            })
    return all_chunks

#### 🎯 6H. Identify Data-Heavy Questions (e.g., referencing tables)

In [86]:
def is_data_query(query: str) -> bool:
    keywords = ["table", "dataset", "values", "results", "columns", "rows"]
    return any(kw in query.lower() for kw in keywords)

#### 🧠 6I. Sentence-Based Chunking (Alternative to token-based)

In [88]:
import nltk
nltk.download('punkt')

def chunk_by_sentence(text: str, max_tokens=500):
    from nltk.tokenize import sent_tokenize
    sents = sent_tokenize(text)
    chunks, current_chunk = [], ""
    for sent in sents:
        temp = current_chunk + " " + sent
        if len(ENCODER.encode(temp)) <= max_tokens:
            current_chunk = temp
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sent
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\GHOST2OM\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


#### ✨ 6J. Post-Translation Grammar Refinement

In [89]:
# Improves fluency of translated text using LLaMA

def refine_translation(text: str, lang: str = "en") -> str:
    prompt = f"""
You are a fluent editor. Improve the grammar and flow of this translated {lang.upper()} text:

{text}

Improved Version:"""
    output = llm(prompt, max_tokens=300, stop=["\n\n"], echo=False)
    return output['choices'][0]['text'].strip()

#### 🧠 6K. Display Retrieval Metadata for Debugging

In [90]:
def display_retrieved_chunks(results):
    print("\n📄 Top Retrieved Chunks:")
    for doc, meta in zip(results["documents"][0], results["metadatas"][0]):
        print(f"→ From {meta['source']} (Page {meta['page']}, Chunk {meta['chunk_number']})")

In [None]:
# ✅ Enhanced ask_question function with debug logging and table-aware logic

def ask_question(query: str, k: int = 5, verbose: bool = False) -> str:
    global previous_qas

    # 1️⃣ Embed the query
    query_embedding = embedding_fn.embed_query(query)

    # 2️⃣ Retrieve top-k documents
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=k,
        include=["documents", "metadatas"]
    )

    if verbose:
        display_retrieved_chunks(results)

    # 3️⃣ Prepare prompt with context
    context_blocks = []
    for i in range(len(results["documents"][0])):
        meta = results["metadatas"][0][i]
        doc = results["documents"][0][i]
        block = f"Source: {meta['source']} (Page: {meta['page']}, Chunk: {meta['chunk_number']})\n{doc}"
        context_blocks.append(block)

    context_text = "\n\n".join(context_blocks)
    history_text = "\n".join([f"Q: {q}\nA: {a}" for q, a in previous_qas])
    prompt = f"""
# You are an intelligent assistant trained to answer questions using only the context provided.
You are a helpful AI researcher. Answer the user's question using ONLY the context below.

{history_text}

Context:
{context_text}

Q: {query}
A:""".strip()

    # 4️⃣ Generate Answer
    start_time = time.time()
    output = llm(prompt, max_tokens=300, stop=["Q:", "User:"], echo=False)
    response = output['choices'][0]['text'].strip()
    duration = time.time() - start_time
    log_tokens(prompt, duration, label="LLM Generation")

    if verbose:
        print("\n🧠 Generated Answer:\n", response)

    # 5️⃣ Update memory
    previous_qas.append((query, response))

    return response


# Swap in sentence-based chunking instead of token-based

from nltk.tokenize import sent_tokenize
import nltk
nltk.download('punkt')

def chunk_text(text: str, max_tokens: int = 500, overlap: int = 50) -> List[str]:
    """Chunk text based on sentences while preserving max token limits."""
    tokens = ENCODER.encode(text)
    sents = sent_tokenize(text)
    chunks, current_chunk = [], ""
    for sent in sents:
        temp = current_chunk + " " + sent
        if len(ENCODER.encode(temp)) <= max_tokens:
            current_chunk = temp
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sent
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\GHOST2OM\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [92]:
response = ask_question("What tables mention  Cancer stem cells (CSCs)?", verbose=True)
print("Answer:", response)



📄 Top Retrieved Chunks:
→ From new-approaches-and-procedures-for-cancer-treatment.pdf (Page 3, Chunk 3)
→ From new-approaches-and-procedures-for-cancer-treatment.pdf (Page 3, Chunk 3)
→ From new-approaches-and-procedures-for-cancer-treatment.pdf (Page 3, Chunk 3)
→ From new-approaches-and-procedures-for-cancer-treatment.pdf (Page 3, Chunk 3)
→ From new-approaches-and-procedures-for-cancer-treatment.pdf (Page 3, Chunk 3)
🔢 LLM Generation: 1972 tokens in 0.91s → 2167.66 tokens/sec

🧠 Generated Answer:
 Table 1 mentions Cancer stem cells (CSCs) in the 3rd row.
Answer: Table 1 mentions Cancer stem cells (CSCs) in the 3rd row.


In [93]:
def reload_llm():
    global llm
    import gc
    del llm
    gc.collect()
    torch.cuda.empty_cache()
    llm = Llama(
        model_path="models/llama3-8B.gguf",
        n_ctx=3008,
        n_threads=16,
        n_gpu_layers=GPU_LAYERS,
        use_mlock=True,
        verbose=False
    )
    print("🔄 LLaMA reloaded and memory cleared.")


#### safely restart LLaMA inside code

In [94]:
reload_llm()
# clear_embedding_cache()
reset_session()

llama_init_from_model: n_ctx_per_seq (3008) < n_ctx_train (1048576) -- the full capacity of the model will not be utilized


🔄 LLaMA reloaded and memory cleared.
🧠 Conversation memory cleared.


In [95]:
response = ask_question("What Dr. X published??", verbose=True)
print("Answer:", response)

# Ask a question from scratch
qa = safe_ask_question("What fields did Dr. X focus on in his latest publications?")
print("🧠 Answer:\n", qa)

# Translate a response to Arabic
print("\n🌍 Arabic Translation:\n", translate_text(qa, target_lang="ar"))

# Summarize that same answer
print("\n📄 Summary:\n", summarize_text(qa, strategy="insight"))


📄 Top Retrieved Chunks:
→ From new-approaches-and-procedures-for-cancer-treatment.pdf (Page 1, Chunk 1)
→ From new-approaches-and-procedures-for-cancer-treatment.pdf (Page 1, Chunk 1)
→ From new-approaches-and-procedures-for-cancer-treatment.pdf (Page 1, Chunk 1)
→ From new-approaches-and-procedures-for-cancer-treatment.pdf (Page 1, Chunk 1)
→ From new-approaches-and-procedures-for-cancer-treatment.pdf (Page 1, Chunk 1)
🔢 LLM Generation: 2682 tokens in 1.70s → 1574.51 tokens/sec

🧠 Generated Answer:
 Dr. X published an article titled "New approaches and procedures for cancer treatment: Current perspectives" in the journal SAGE Open Medicine. The article discusses new developments in cancer treatment and the use of combinatorial strategies involving targeted therapies and traditional chemotherapeutics.
Answer: Dr. X published an article titled "New approaches and procedures for cancer treatment: Current perspectives" in the journal SAGE Open Medicine. The article discusses new developm

### Trying to reset the Cashe from GPU

In [96]:
# # killing llm
# import gc
# del llm
# gc.collect()
# torch.cuda.empty_cache()

In [97]:
# import gc

# def clear_embedding_cache():
#     gc.collect()
#     del llm
#     torch.cuda.empty_cache()
#     print("PyTorch embedding GPU cache cleared.")


In [98]:
# vec = index_chunks_in_vector_db.embed_query("Hello world")
# print(f"Vector dimension: {len(vec)}")


In [99]:
# import chromadb

# chroma_client = chromadb.Client()
# chroma_client.delete_collection("my_collection")  # delete if it already exists