# Medical Assistant RAG Model

In [None]:
%pip install -Uq "unstructured[all-docs]" pillow lxml pillow
%pip install -Uq chromadb tiktoken
%pip install -Uq python_dotenv

In [None]:
# Install the Poppler command-line utilities through apt (-y answers prompts automatically).
# NOTE(review): presumably required by unstructured's PDF handling — confirm it is
# still needed with the partitioning strategy used in this notebook.
!apt-get install -y poppler-utils


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
poppler-utils is already the newest version (22.02.0-2ubuntu0.9).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.


In [None]:
from unstructured.partition.pdf import partition_pdf

# Location of the source PDF that is split into chunks.
file_path = "/content/standard-treatment-guidelines.pdf"

# Partitioning / chunking options, gathered in one place so they are easy to tune.
_partition_options = dict(
    strategy="fast",
    chunking_strategy="by_title",
    max_characters=1000,
    combine_text_under_n_chars=200,
    new_after_n_chars=500,
)

# Split the PDF into chunk elements using the options above.
chunks = partition_pdf(filename=file_path, **_partition_options)

print(f"{len(chunks)} chunks extracted")



1828 chunks extracted


In [None]:

# Keep only the chunks whose type name contains "CompositeElement".
texts = [chunk for chunk in chunks if "CompositeElement" in str(type(chunk))]
print(len(texts))

1828


In [None]:
!pip install -U transformers

from transformers import pipeline

pipe = pipeline("summarization", model="facebook/bart-large-cnn",device=0)



Device set to use cuda:0


In [None]:
# Load the tokenizer and the seq2seq model for the BART CNN checkpoint directly,
# without going through the pipeline helper.
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

_checkpoint = "facebook/bart-large-cnn"

tokenizer = AutoTokenizer.from_pretrained(_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(_checkpoint)

In [None]:
# Collect the text of every element that has a non-empty `text` attribute.
pdf_texts = []
for element in texts:
    body = getattr(element, "text", None)
    if body:
        pdf_texts.append(body)

# Sanity check: number of texts, then the first 500 characters of the first one.
print(len(pdf_texts))
print(pdf_texts[0][:500])


1828
STANDARD TREATMENT GUIDELINES

A Manual for Medical Therapeutics

First Edition, 2013

Gujarat Medical Services Corporation Limited

Health & Family Welfare Department

Government of Gujarat

i

Message


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# Summarize every entry of `pdf_texts` with BART. The resulting `summaries` list
# has one entry per input text, in the same order.

MODEL_NAME = "facebook/bart-large-cnn"
MAX_INPUT_TOKENS = 1024     # longer inputs are truncated by the tokenizer
MAX_SUMMARY_TOKENS = 150    # upper bound on generated summary length
MIN_SUMMARY_TOKENS = 40     # lower bound on generated summary length

# Use the GPU when one is available; otherwise run on CPU instead of crashing.
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(device)

summaries = []
for t in pdf_texts:
    inputs = tokenizer(t, return_tensors="pt", truncation=True, max_length=MAX_INPUT_TOKENS)
    n_input_tokens = inputs["input_ids"].shape[1]

    # A chunk that is already no longer than the minimum summary length cannot be
    # condensed. Forcing the model to emit at least MIN_SUMMARY_TOKENS for such an
    # input pads the summary with text that is not in the source, which is not
    # acceptable for medical content. Keep these chunks verbatim.
    if n_input_tokens <= MIN_SUMMARY_TOKENS:
        summaries.append(t)
        continue

    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Generate summary. Both length bounds are capped by the input length so the
    # model is never asked to write more than it was given.
    summary_ids = model.generate(
        **inputs,
        max_length=min(MAX_SUMMARY_TOKENS, n_input_tokens),
        min_length=min(MIN_SUMMARY_TOKENS, n_input_tokens // 2),
        num_beams=4,
        early_stopping=True,
    )

    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    summaries.append(summary)

print(len(summaries))
print(summaries[0])


1828
STANDARD TREATMENT GUIDELINES - A Manual for Medical Therapeutics. First Edition, 2013. For confidential support call the Samaritans in the UK on 08457 90 90 90, visit a local Samaritans branch or see www.samaritans.org for details.


In [None]:
!pip install langchain
!pip install langchain-community
!pip install langchain-google-genai
!pip install chromadb
!pip install google-generativeai


Collecting langchain-community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.10.1-py3-none-any.whl.metadata (3.4 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Downloading langchain_community-0.3.27-py3-none-any.whl (2.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m53.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpx_sse-0.4.1-py3-none-any.whl (8.1 kB)
Downloading pydantic_settings-2.10.1-py3-none-any.whl (45 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.2/45.2 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: httpx-sse, pydantic-settings, langchain-community
Successfully installed httpx-sse-0.4.1 langchain-community-0.3.27 pydantic-settings-2.10.1
Collecting langchain-g

Collecting google-ai-generativelanguage==0.6.15 (from google-generativeai)
  Downloading google_ai_generativelanguage-0.6.15-py3-none-any.whl.metadata (5.7 kB)
Downloading google_ai_generativelanguage-0.6.15-py3-none-any.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m30.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: google-ai-generativelanguage
  Attempting uninstall: google-ai-generativelanguage
    Found existing installation: google-ai-generativelanguage 0.6.18
    Uninstalling google-ai-generativelanguage-0.6.18:
      Successfully uninstalled google-ai-generativelanguage-0.6.18
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain-google-genai 2.1.9 requires google-ai-generativelanguage<0.7.0,>=0.6.18, but you have google-ai-generativelanguage 0.6.15 which is incompatible.[0m[

In [None]:
import os
from getpass import getpass

from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Never hard-code the API key in the notebook: it ends up in version control and
# in every shared copy. Read it from the environment, and prompt for it (without
# echoing) only when it is not already set. Any key that was ever stored in a
# notebook should be revoked and replaced.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass("Google API key: ")

# Gemini embedding client, authenticated with the key obtained above.
embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    google_api_key=os.environ["GOOGLE_API_KEY"],
)


In [None]:
import uuid
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
from langchain_google_genai import GoogleGenerativeAIEmbeddings  # Gemini embeddings

# Gemini embedding client used by the vector store below. No key is passed here;
# presumably it is picked up from the GOOGLE_API_KEY environment variable.
embedding_fn = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

# Chroma collection stored on disk under ./chroma_db.
vectorstore = Chroma(
    collection_name="pdf_summaries",
    embedding_function=embedding_fn,
    persist_directory="./chroma_db",
)

# One UUID per summary. Ids are generated from `summaries` (not `texts`) and paired
# with zip, so ids and summaries cannot drift out of step if some chunk produced
# no text and was therefore never summarized.
doc_ids = [str(uuid.uuid4()) for _ in summaries]
summary_texts = [
    Document(page_content=summary_text, metadata={"source_id": doc_id, "type": "text"})
    for doc_id, summary_text in zip(doc_ids, summaries)
]

# No explicit persist() call: it is deprecated, and with Chroma 0.4+ documents
# added to a collection that has a persist_directory are written automatically.
# The trailing semicolon keeps the returned id list out of the cell output.
vectorstore.add_documents(summary_texts);



✅ All summaries stored in ChromaDB!


  vectorstore.persist()


In [None]:

# Look-up table pairing each entry of `doc_ids` with the `pdf_texts` entry at the
# same position.
id_to_fulltext = {doc_id: full_text for doc_id, full_text in zip(doc_ids, pdf_texts)}

In [None]:
# Retrieve the three stored summaries closest to the question, then print each one
# next to the first 500 characters of the full text registered under its source id.
query = "i am having cough and fever what is the treatment"
results = vectorstore.similarity_search(query, k=3)

for match in results:
    source_id = match.metadata["source_id"]
    full_text = id_to_fulltext[source_id]
    print(f"Summary match: {match.page_content}")
    print(f"Original text: {full_text[:500]} ...\n")


Summary match: Fever, productive cough, dyspnoea, chest pain. Bronchial breathing and presence of rales. Chest X-ray shows area of consolidation. Initial treatment can be amino glycoside and cephalosporin.
Original text: Salient features

.

Fever, productive cough, dyspnoea, chest pain. Bronchial breathing and presence of rales. Chest X-ray shows area of consolidation

Pharmacological treatment  Antibiotics: Depending upon sputum culture and sensitivity. Initial treatment can be

started with amino glycoside and cephalosporin. ...

Summary match: Oral Tab. paracetamol 500 mg for fever. Or Syr./Cap. cephalexin 25-50 mg/kg/day in 4 divided doses for 3-4 weeks.
Original text: Oral Tab. paracetamol 500 mg for fever.

Monitor therapy by clinical response. Favourable response characterized by decrease in swelling and fever, improvement in general well being and movements of limb, fall in ESR and C-Reactive protein (better indicator than ESR because CRP closely follows the clinical response

In [None]:
import shutil
import json

# Export the id -> summary and id -> full-text look-up tables as JSON.
# Ids are paired with their values via zip instead of indexing every list by
# len(texts): `summaries` and `pdf_texts` can be shorter than `texts` when a chunk
# had no text, which previously risked an IndexError or shifted pairs.
with open("id_to_summary.json", "w") as f:
    json.dump(dict(zip(doc_ids, summaries)), f)

with open("id_to_fulltext.json", "w") as f:
    json.dump(dict(zip(doc_ids, pdf_texts)), f)

# No vectorstore.persist() here: the call is deprecated, and with Chroma 0.4+ the
# collection is already written to its persist directory automatically.

# Zip the on-disk Chroma directory; the archive path is shown as the cell output.
shutil.make_archive("chroma_store", "zip", "chroma_db")


'/content/chroma_store.zip'

In [None]:
from google.colab import files

# Hand the zipped vector store and the full-text map to the Colab download helper.
for artifact in ("chroma_store.zip", "id_to_fulltext.json"):
    files.download(artifact)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>