In [None]:
!pip install nltk fitz
!pip install -U sentence-transformers
!pip install pdfminer.six|
!pip install autoawq
!pip install vllm

Collecting fitz
  Downloading fitz-0.0.1.dev2-py2.py3-none-any.whl.metadata (816 bytes)
Collecting configobj (from fitz)
  Downloading configobj-5.0.9-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting configparser (from fitz)
  Downloading configparser-7.2.0-py3-none-any.whl.metadata (5.5 kB)
Collecting nipype (from fitz)
  Downloading nipype-1.10.0-py3-none-any.whl.metadata (7.1 kB)
Collecting pyxnat (from fitz)
  Downloading pyxnat-1.6.3-py3-none-any.whl.metadata (5.4 kB)
Collecting prov>=1.5.2 (from nipype->fitz)
  Downloading prov-2.0.2-py3-none-any.whl.metadata (3.7 kB)
Collecting rdflib>=5.0.0 (from nipype->fitz)
  Downloading rdflib-7.1.4-py3-none-any.whl.metadata (11 kB)
Collecting traits>=6.2 (from nipype->fitz)
  Downloading traits-7.0.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.8 kB)
Collecting acres (from nipype->fitz)
  Downloading acres-0.5.0-py3-none-any.whl.metadata (6.2 kB)
Collecting etelemetry>=0.3.1

In [None]:
import pandas as pd
import numpy as np
from transformers import pipeline, AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
#from summarizer import Summarizer
from textblob import TextBlob
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

## Models

SapBERT-from-PubMedBert

In [None]:
# tokenizer = AutoTokenizer.from_pretrained("cambridgeltl/SapBERT-from-PubMedBERT-fulltext")  
# model = AutoModel.from_pretrained("cambridgeltl/SapBERT-from-PubMedBERT-fulltext").cuda()

BioMedBERT

In [None]:
# tokenizer = AutoTokenizer.from_pretrained("microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext")  
# model = AutoModel.from_pretrained("microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext").cuda()

OphthaBERT

In [None]:
# tokenizer = AutoTokenizer.from_pretrained("ShahRishi/OphthaBERT")  
# model = AutoModel.from_pretrained("ShahRishi/OphthaBERT").cuda()

#AdaptLLM

In [None]:
# tokenizer = AutoTokenizer.from_pretrained("EditorZ/medllm")  
# model = AutoModel.from_pretrained("EditorZ/medllm").cuda()

In [None]:
from vllm import LLM, SamplingParams

prompts = [
"What is ocular rosacea?",
"How to treat red eyes?"
]
prompt_template=f'''### User Input:
{prompt}

### Assistant Output:
'''

prompts = [prompt_template.format(prompt=prompt) for prompt in prompts]

sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

llm = LLM(model="TheBloke/medicine-LLM-AWQ", quantization="awq", dtype="auto")

outputs = llm.generate(prompts, sampling_params)

# Print the outputs.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

## The Wills Eye Manual Preprocessing


In [4]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from typing import List, Dict
import re

In [6]:
STOPWORDS = set(stopwords.words('english'))

def extract_text_by_page(pdf_path: str) -> List[Dict]:
    pages = []
    for i, page_layout in enumerate(extract_pages(pdf_path)):
        page_text = ""
        for element in page_layout:
            if isinstance(element, LTTextContainer):
                page_text += element.get_text()
        pages.append({"page": i + 1, "text": page_text})
    return pages

def clean_text(text: str) -> List[str]:
    """Cleans text by lowercasing, removing punctuation and stopwords, and tokenizing."""
    text = text.lower().translate(str.maketrans("", "", string.punctuation))
    words = word_tokenize(text)
    # return [word for word in words if word.isalnum() and word not in STOPWORDS]
    return [word for word in words if word.isalnum() and word not in STOPWORDS]

def chunk_tokens(tokens: List[str], chunk_size: int = 64) -> List[str]:
    """Chunks a list of tokens into segments with a max token size."""
    chunks = []
    for i in range(0, len(tokens), chunk_size):
        chunk = tokens[i:i + chunk_size]
        chunks.append(" ".join(chunk))
    return chunks

In [7]:
pdf_path = "/kaggle/input/willseyemanual/Kalla Gervasio Travis Peck - The Wills Eye Manual_ Office and Emergency Room Diagnosis and Treatment of Eye Disease (2021 LWW Wolters Kluwer) - libgen.li.pdf"
pages = extract_text_by_page(pdf_path)
pages_list = pages[31:1139] #Skipping the initial and final few pages since they only include possibly irrelevant stuff, such as contents, indices, et.


In [8]:
print(pages_list[0])
print("******************************************")
current_chapter = "Unknown"
CHAPTER_PATTERN = re.compile(r'\bchapter\s+\d+\b', re.IGNORECASE)
chunks_with_meta = []

for page_data in pages_list:
    page_number = page_data["page"]
    text = page_data["text"]

    # Detect chapter titles
    chapter_match = CHAPTER_PATTERN.search(text)
    if chapter_match:
        current_chapter = chapter_match.group(0).title()

    # Clean and chunk
    tokens = clean_text(text)
    token_chunks = chunk_tokens(tokens)

    for chunk_tokens_list in token_chunks:
        chunk_text = " ".join(chunk_tokens_list)
        chunks_with_meta.append({
            "chapter": current_chapter,
            "page": page_number,
            "text": chunk_text
        })

    # Get embeddings in batches
texts = [item["text"] for item in chunks_with_meta]
print(len(texts))
print(chunks_with_meta[0])

{'page': 32, 'text': 'Video List\nThe  accompanying  ebook  includes  embedded  videos,  found  in  their\nrespective  sections  as  listed  below.  Each  is  narrated  with  audio.\nDetails on how to access the ebook are found in the inside front cover.\n 3.8. VIDEO: Eyelid Laceration Repair\n 3.10. VIDEO: Canthotomy and Cantholysis\n 3.11. VIDEO: Relative Afferent Pupillary Defect\n 3.14. VIDEO: Cyanoacrylate Corneal Glue\n 4.11. VIDEO: Corneal Culture Procedure\n 10.5. VIDEO: Third Cranial Nerve Palsy\n 10.7. VIDEO: Fourth Cranial Nerve Palsy\n 10.8. VIDEO: Sixth Cranial Nerve Palsy\n 10.11. VIDEO: Ocular Myasthenia\n 10.13. VIDEO: Internuclear Ophthalmoplegia\n 11.3. VIDEO: B-scan Ultrasound Tutorial\n 11.13. VIDEO: B-scan Ultrasound Tutorial\n 11.27. VIDEO: B-scan Ultrasound Tutorial\n 14.8. VIDEO: B-scan Ultrasound Tutorial\n Appendix A.7. VIDEO: Probe and Irrigation\n Appendix A.11. VIDEO: Intravitreal injection\n Appendix A.11. VIDEO: Intravitreal Tap and Inject\n Appendix A.13

In [9]:
def get_embeddings(texts):
    bs = 128 # batch size during inference
    all_embs = []
    for i in tqdm(np.arange(0, len(texts), bs)):
        toks = tokenizer.batch_encode_plus(texts[i:i+bs], 
                                           padding="max_length", 
                                           max_length=25, 
                                           truncation=True,
                                           return_tensors="pt")
        toks_cuda = {}
        for k,v in toks.items():
            toks_cuda[k] = v.cuda()
        cls_rep = model(**toks_cuda)[0][:,0,:] # use CLS representation as the embedding
        all_embs.append(cls_rep.cpu().detach().numpy())
    
    return np.concatenate(all_embs, axis=0)
    

In [47]:
embeddings = get_embeddings(texts)

100%|██████████| 22/22 [00:02<00:00,  8.88it/s]


In [48]:
# Attach embeddings to metadata
for i in range(len(chunks_with_meta)):
    chunks_with_meta[i]["embedding"] = embeddings[i]

In [49]:
print(chunks_with_meta[2097]['chapter'])

Chapter 12


In [13]:
import json
class json_serialize(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return json.JSONEncoder.default(self, obj)
        
with open("pdf_chunks_with_embeddings.json", "w") as f:
    json.dump(chunks_with_meta, f, indent=2, cls=json_serialize)

## Tests with Sample Prompts

In [20]:
prompts = ["What are the conditions possible with eyelid twitch?", "What are the symptoms of ocular rosacea", "what is the differential diagnosis for iridocorneal endothelial syndrome"]

cleaned_prompts_chunks = [clean_text(prompts[i]) for i in range(len(prompts))]

cleaned_prompts = [" ".join(cleaned_prompts_chunks[i]) for i in range(len(cleaned_prompts_chunks))]


print(cleaned_prompts)

['conditions possible eyelid twitch', 'symptoms ocular rosacea', 'differential diagnosis iridocorneal endothelial syndrome']


In [50]:
prompt_embeddings = get_embeddings(cleaned_prompts)

print(prompt_embeddings)

100%|██████████| 1/1 [00:00<00:00, 98.44it/s]

[[ 0.19598271  0.18354261 -0.02298099 ... -0.04105059 -0.04858973
  -0.32010317]
 [ 0.21157573  0.21209382  0.13233308 ... -0.03627598  0.07309783
  -0.21771899]
 [ 0.2861817   0.20104831  0.17491114 ... -0.07079957  0.17971215
  -0.23838484]]





## Inference

In [22]:
def get_all_embeddings(chunks: List[Dict]) -> List[List[float]]:
    """Extract only the embedding vectors."""
    return [chunk["embedding"] for chunk in chunks if "embedding" in chunk]

def get_embeddings_by_chapter(chunks: List[Dict], chapter: str) -> List[List[float]]:
    """Filter embeddings by chapter."""
    return [
        chunk["embedding"]
        for chunk in chunks
        if chunk.get("chapter", "").lower() == chapter.lower()
    ]

def get_embeddings_by_page_range(chunks: List[Dict], start: int, end: int) -> List[List[float]]:
    """Filter embeddings by page number range (inclusive)."""
    return [
        chunk["embedding"]
        for chunk in chunks
        if start <= chunk.get("page", 0) <= end
    ]

In [23]:
with open("pdf_chunks_with_embeddings.json", "r") as f:
    chunks_with_meta = json.load(f)

In [65]:
COS_SIM_THRESHOLD = 0.985
prompt_answers_list = []
for i in range(len(prompt_embeddings)):
    prompt_answers = []
    for j in range(len(chunks_with_meta)):
        cos_sim = cosine_similarity([chunks_with_meta[j]["embedding"]], [prompt_embeddings[i]])
        if cos_sim >= COS_SIM_THRESHOLD:
            prompt_answers.append(chunks_with_meta[j])
    prompt_answers_list.append(prompt_answers)

In [66]:
for i in range(len(prompt_answers_list)):
    print(len(prompt_answers_list[i]))

0
1417
740


In [67]:
for i in range(len(prompt_answers_list[1])):
    print(prompt_answers_list[0][i]["chapter"])
    print(prompt_answers_list[0][i]["text"])
    print("\n\n")
    if i == 10:
        break

IndexError: list index out of range