In [74]:
# !pip install nltk fitz
# !pip install -U sentence-transformers
!pip install pdfminer.six

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [75]:
import pandas as pd
import numpy as np
from transformers import pipeline, AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
#from summarizer import Summarizer
from textblob import TextBlob
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

## Model

In [76]:
tokenizer = AutoTokenizer.from_pretrained("cambridgeltl/SapBERT-from-PubMedBERT-fulltext")  
model = AutoModel.from_pretrained("cambridgeltl/SapBERT-from-PubMedBERT-fulltext").cuda()

## The Wills Eye Manual Preprocessing


In [77]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [78]:
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from typing import List, Dict
import re

In [79]:
STOPWORDS = set(stopwords.words('english'))

def extract_text_by_page(pdf_path: str) -> List[Dict]:
    pages = []
    for i, page_layout in enumerate(extract_pages(pdf_path)):
        page_text = ""
        for element in page_layout:
            if isinstance(element, LTTextContainer):
                page_text += element.get_text()
        pages.append({"page": i + 1, "text": page_text})
    return pages

def clean_text(text: str) -> List[str]:
    """Cleans text by lowercasing, removing punctuation and stopwords, and tokenizing."""
    text = text.lower().translate(str.maketrans("", "", string.punctuation))
    words = word_tokenize(text)
    return [word for word in words if word.isalnum() and word not in STOPWORDS]

def chunk_tokens(tokens: List[str], chunk_size: int = 64) -> List[str]:
    """Chunks a list of tokens into segments with a max token size."""
    chunks = []
    for i in range(0, len(tokens), chunk_size):
        chunk = tokens[i:i + chunk_size]
        chunks.append(" ".join(chunk))
    return chunks

In [80]:
pdf_path = "/kaggle/input/willseyemanual/Kalla Gervasio Travis Peck - The Wills Eye Manual_ Office and Emergency Room Diagnosis and Treatment of Eye Disease (2021 LWW Wolters Kluwer) - libgen.li.pdf"
pages = extract_text_by_page(pdf_path)
pages_list = pages[31:1139] #Skipping the initial and final few pages since they only include possibly irrelevant stuff, such as contents, indices, et.


In [81]:
print(pages_list[0])
print("******************************************")
current_chapter = "Unknown"
CHAPTER_PATTERN = re.compile(r'\bchapter\s+\d+\b', re.IGNORECASE)
chunks_with_meta = []

for page_data in pages_list:
    page_number = page_data["page"]
    text = page_data["text"]

    # Detect chapter titles
    chapter_match = CHAPTER_PATTERN.search(text)
    if chapter_match:
        current_chapter = chapter_match.group(0).title()

    # Clean and chunk
    tokens = clean_text(text)
    token_chunks = chunk_tokens(tokens)

    for chunk_tokens_list in token_chunks:
        chunk_text = " ".join(chunk_tokens_list)
        chunks_with_meta.append({
            "chapter": current_chapter,
            "page": page_number,
            "text": chunk_text
        })

    # Get embeddings in batches
texts = [item["text"] for item in chunks_with_meta]
print(len(texts))
print(chunks_with_meta[0])

{'page': 32, 'text': 'Video List\nThe  accompanying  ebook  includes  embedded  videos,  found  in  their\nrespective  sections  as  listed  below.  Each  is  narrated  with  audio.\nDetails on how to access the ebook are found in the inside front cover.\n 3.8. VIDEO: Eyelid Laceration Repair\n 3.10. VIDEO: Canthotomy and Cantholysis\n 3.11. VIDEO: Relative Afferent Pupillary Defect\n 3.14. VIDEO: Cyanoacrylate Corneal Glue\n 4.11. VIDEO: Corneal Culture Procedure\n 10.5. VIDEO: Third Cranial Nerve Palsy\n 10.7. VIDEO: Fourth Cranial Nerve Palsy\n 10.8. VIDEO: Sixth Cranial Nerve Palsy\n 10.11. VIDEO: Ocular Myasthenia\n 10.13. VIDEO: Internuclear Ophthalmoplegia\n 11.3. VIDEO: B-scan Ultrasound Tutorial\n 11.13. VIDEO: B-scan Ultrasound Tutorial\n 11.27. VIDEO: B-scan Ultrasound Tutorial\n 14.8. VIDEO: B-scan Ultrasound Tutorial\n Appendix A.7. VIDEO: Probe and Irrigation\n Appendix A.11. VIDEO: Intravitreal injection\n Appendix A.11. VIDEO: Intravitreal Tap and Inject\n Appendix A.13

In [82]:
def get_embeddings(texts):
    bs = 128 # batch size during inference
    all_embs = []
    for i in tqdm(np.arange(0, len(texts), bs)):
        toks = tokenizer.batch_encode_plus(texts[i:i+bs], 
                                           padding="max_length", 
                                           max_length=25, 
                                           truncation=True,
                                           return_tensors="pt")
        toks_cuda = {}
        for k,v in toks.items():
            toks_cuda[k] = v.cuda()
        cls_rep = model(**toks_cuda)[0][:,0,:] # use CLS representation as the embedding
        all_embs.append(cls_rep.cpu().detach().numpy())
    
    return np.concatenate(all_embs, axis=0)
    

In [83]:
# Attach embeddings to metadata
for i in range(len(chunks_with_meta)):
    chunks_with_meta[i]["embedding"] = embeddings[i]

In [84]:
import json
class json_serialize(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return json.JSONEncoder.default(self, obj)
        
with open("pdf_chunks_with_embeddings.json", "w") as f:
    json.dump(chunks_with_meta, f, indent=2, cls=json_serialize)

## Tests with Sample Prompts

In [109]:
prompts = []

cleaned_prompts_chunks = [clean_text(prompts[i]) for i in range(len(prompts))]

cleaned_prompts = [" ".join(cleaned_prompts_chunks[i]) for i in range(len(cleaned_prompts_chunks))]


print(cleaned_prompts)

['conditions possible eyelid twitch', 'symptoms ocular rosacea', 'diagnosis iridocorneal endothelial syndrome']


In [110]:
prompt_embeddings = get_embeddings(cleaned_prompts)

print(prompt_embeddings)

100%|██████████| 1/1 [00:00<00:00, 60.53it/s]

[[-0.66155714 -0.01547795 -0.41041806 ... -0.641091    0.026896
   0.1764841 ]
 [ 0.23985203 -0.16884747  0.40169618 ... -0.06667233  0.53196263
  -0.10392955]
 [-0.80470437  0.15148884  0.02588806 ... -0.42807004  1.1037222
  -0.21052438]]





## Inference

In [111]:
def get_all_embeddings(chunks: List[Dict]) -> List[List[float]]:
    """Extract only the embedding vectors."""
    return [chunk["embedding"] for chunk in chunks if "embedding" in chunk]

def get_embeddings_by_chapter(chunks: List[Dict], chapter: str) -> List[List[float]]:
    """Filter embeddings by chapter."""
    return [
        chunk["embedding"]
        for chunk in chunks
        if chunk.get("chapter", "").lower() == chapter.lower()
    ]

def get_embeddings_by_page_range(chunks: List[Dict], start: int, end: int) -> List[List[float]]:
    """Filter embeddings by page number range (inclusive)."""
    return [
        chunk["embedding"]
        for chunk in chunks
        if start <= chunk.get("page", 0) <= end
    ]

In [87]:
with open("pdf_chunks_with_embeddings.json", "r") as f:
    chunks_with_meta = json.load(f)

In [118]:
COS_SIM_THRESHOLD = 0.35
prompt_answers_list = []
for i in range(len(prompt_embeddings)):
    prompt_answers = []
    for j in range(len(chunks_with_meta)):
        cos_sim = cosine_similarity([chunks_with_meta[j]["embedding"]], [prompt_embeddings[i]])
        if cos_sim >= COS_SIM_THRESHOLD:
            prompt_answers.append(chunks_with_meta[j])
    prompt_answers_list.append(prompt_answers)

In [119]:
for i in range(len(prompt_answers_list)):
    print(len(prompt_answers_list[i]))

284
2
620


In [120]:
for i in range(len(prompt_answers_list[1])):
    print(prompt_answers_list[0][i]["chapter"])
    print(prompt_answers_list[0][i]["text"])
    print("\n\n")

Unknown
v i d e o   l i s t   a c c o m p a n y i n g   e b o o k   i n c l u d e s   e m b e d d e d   v i d e o s   f o u n d   r e s p e c t i v e   s e c t i o n s   l i s t e d   n a r r a t e d   a u d i o   d e t a i l s   a c c e s s   e b o o k   f o u n d   i n s i d e   f r o n t   c o v e r   3 8   v i d e o   e y e l i d   l a c e r a t i o n   r e p a i r   3 1 0   v i d e o   c a n t h o t o m y   c a n t h o l y s i s   3 1 1   v i d e o   r e l a t i v e   a f f e r e n t   p u p i l l a r y   d e f e c t   3 1 4   v i d e o   c y a n o a c r y l a t e   c o r n e a l   g l u e   4 1 1   v i d e o   c o r n e a l   c u l t u r e   p r o c e d u r e   1 0 5   v i d e o   t h i r d   c r a n i a l   n e r v e   p a l s y   1 0 7   v i d e o   f o u r t h   c r a n i a l   n e r v e   p a l s y   1 0 8   v i d e o   s i x t h   c r a n i a l   n e r v e   p a l s y   1 0 1 1



Unknown
e y e l i d   d r o o p i n g   p t o s i s   s e e   6 1   p t o s i s   e y e l i d  