In [7]:
%pip install -U -q "google-generativeai>=0.8.3" chromadb PyPDF2

Note: you may need to restart the kernel to use updated packages.


In [8]:
import google.generativeai as genai
from IPython.display import Markdown

In [9]:
from kaggle_secrets import UserSecretsClient

# Fetch the secret named "GOOGLE_API_KEY" through kaggle_secrets so the key
# itself never has to be written into the notebook.
GOOGLE_API_KEY = UserSecretsClient().get_secret("GOOGLE_API_KEY")
# Hand the key to google.generativeai; the later genai.* calls rely on this.
genai.configure(api_key=GOOGLE_API_KEY)

In [10]:
# Lazily keep only the models that advertise the 'embedContent' method,
# then print each one's name.
embedding_models = (
    model_info
    for model_info in genai.list_models()
    if 'embedContent' in model_info.supported_generation_methods
)
for model_info in embedding_models:
    print(model_info.name)

models/embedding-001
models/text-embedding-004


In [11]:
import os
import kagglehub

def count_subfiles(folder_path):
    """Return how many files live under ``folder_path``, at any depth.

    Directories themselves are not counted. A path that cannot be walked
    (for example, one that does not exist) yields 0 because ``os.walk``
    produces nothing for it.
    """
    return sum(len(filenames) for _root, _subdirs, filenames in os.walk(folder_path))

# Download the 'classicpsy/med-enc' dataset with kagglehub; the returned path
# is reused by later cells as the folder to scan for PDFs.
dataset_path = kagglehub.dataset_download('classicpsy/med-enc')
# Count every file under the downloaded folder as a quick sanity check.
total_files = count_subfiles(dataset_path)
print(f"Total number of subfiles in the folder: {total_files}")


Total number of subfiles in the folder: 1


In [12]:
from PyPDF2 import PdfReader

def extract_text_from_pdf_directory(folder_path):
    """Extract the text of every PDF found under ``folder_path``.

    The directory tree is walked recursively. Each file whose name ends in
    ".pdf" (compared case-insensitively, so ".PDF" is accepted too) is opened
    with ``PdfReader`` and the text of its pages is concatenated in page order.

    Args:
        folder_path: Root directory to search.

    Returns:
        dict mapping the bare file name (not the full path) to the extracted
        text. Because the key is only the file name, two PDFs with the same
        name in different subdirectories overwrite each other (last one wins).
        A file that raises while being read is reported with ``print`` and
        left out of the result; the remaining files are still processed.
    """
    extracted_texts = {}

    for root, _dirs, files in os.walk(folder_path):
        for file in files:
            if not file.lower().endswith(".pdf"):
                continue
            pdf_path = os.path.join(root, file)
            try:
                reader = PdfReader(pdf_path)
                # Call extract_text() exactly once per page: it is the costly
                # step, and calling it a second time just to test for
                # emptiness doubles the work. `or ""` drops pages that yield
                # None or an empty string.
                page_texts = [page.extract_text() or "" for page in reader.pages]
                extracted_texts[file] = "".join(page_texts)
            except Exception as e:
                # Best effort: one unreadable PDF must not abort the whole scan.
                print(f"Error reading {file}: {e}")

    return extracted_texts

# Pull the text out of every PDF in the downloaded dataset folder.
pdf_texts = extract_text_from_pdf_directory(dataset_path)

# Preview each PDF: a header line with its name, then its first 500 characters.
for filename, text in pdf_texts.items():
    preview = text[:500]
    print(f"--- {filename} ---", preview, sep="\n")


--- Gale Encyclopedia of Medicine Vol. 4 (N-S).pdf ---
TheGALE
ENCYCLOPEDIA
ofMEDICINE
SECOND EDITIONTheGALE
ENCYCLOPEDIA
ofMEDICINE
SECOND EDITION
JACQUELINE L. LONGE, EDITOR
DEIRDRE S. BLANCHFIELD, ASSOCIATE EDITOR
VOLUME
N-S4STAFF
Jacqueline L. Longe, Project Editor
Deirdre S. Blanchfield, Associate Editor
Christine B. Jeryan, Managing Editor
Donna Olendorf, Senior Editor
Stacey Blachford, Associate Editor
Kate Kretschmann, Melissa C. McDade, Ryan
Thomason, Assistant Editors
Mark Springer, Technical Specialist
Andrea Lopeman, Programmer/Analyst
B


In [23]:
chunk_size = 1000

# Slice each PDF's text into consecutive, non-overlapping pieces of at most
# chunk_size characters; only the last piece of a text may be shorter.
documents = [
    text[start:start + chunk_size]
    for text in pdf_texts.values()
    for start in range(0, len(text), chunk_size)
]

In [24]:
from chromadb import Documents, Embeddings, EmbeddingFunction
from google.api_core import retry

class GeminiEmbeddingFunction(EmbeddingFunction):
    """Embedding function that delegates to ``genai.embed_content``.

    Requests always go to the "models/text-embedding-004" model; the task type
    sent with each request depends on ``document_mode``.
    """

    # True  -> requests use task type 'retrieval_document'
    # False -> requests use task type 'retrieval_query'
    document_mode = True

    def __call__(self, input:Documents)->Embeddings:
        """Embed ``input`` and return the 'embedding' entry of the API response."""
        embedding_task = 'retrieval_document' if self.document_mode else 'retrieval_query'

        response = genai.embed_content(
            model="models/text-embedding-004",
            content=input,
            task_type=embedding_task,
            # Retry the request when the failure matches if_transient_error.
            request_options={'retry': retry.Retry(predicate=retry.if_transient_error)},
        )
        return response['embedding']

In [25]:
import chromadb

DB_NAME = "medencdb"

# The chunks are being indexed here, so embed them in document mode.
embed_fn = GeminiEmbeddingFunction()
embed_fn.document_mode = True

chroma_client = chromadb.Client()
db = chroma_client.get_or_create_collection(
    name=DB_NAME,
    embedding_function=embed_fn,
)

# Each chunk's id is its position in `documents`, rendered as a string.
document_ids = list(map(str, range(len(documents))))
db.add(documents=documents, ids=document_ids)

In [26]:
# Sanity check: display how many entries the collection currently holds.
db.count()

3879

In [27]:
# Switch the shared embedding function to query mode so the search text is
# embedded with the 'retrieval_query' task type instead of 'retrieval_document'.
embed_fn.document_mode = False

query = "What is acne? Explain in short."

# Ask the collection for the single best match for the one query text.
result = db.query(query_texts=[query], n_results=1)
# The nested unpacking requires result['documents'] to be exactly one list
# containing exactly one passage; any other shape raises ValueError.
[[passage]] = result['documents']

Markdown(passage)

sect bites,chemical irritation, or certain viral infections, such asherpes.
• Pustule. A raised lesion filled with pus. A pustule is
usually the result of an infection, such as acne, impti-geo, or boils .
• Papule. A solid, raised lesion less than 0.4 in (1 cm)
across. A patch of closely grouped papules more than0.4 in (1 cm) across is called a plaque. Papules andplaques can be rough in texture and red, pink, or brownin color. Papules are associated with such conditions aswarts, syphilis , psoriasis, seborrheic and actinic ker-
atoses, lichen planus , and skin cancer .
GALE ENCYCLOPEDIA OF MEDICINE 2 3074Skin lesionsGEM -2931 to 3236 - S  10/22/03 6:17 PM  Page 3074• Nodule. A solid lesion that has distinct edges and that is
usually more deeply rooted than a papule. Doctors oftendescribe a nodule as “palpable,” meaning that, whenexamined by touch, it can be felt as a hard mass distinctfrom the tissue surrounding it. A nodule more than 0.8in (2 cm) in diameter is called a tumor. Nodules

In [28]:
# Replace newlines with spaces so the passage and the question each sit on a
# single line inside the prompt.
passage_oneline = passage.replace("\n", " ")
query_oneline = query.replace("\n", " ")

# Instructions for the model, followed by the question and the retrieved passage.
# NOTE(review): "converstional" in the prompt text is a typo for
# "conversational"; it is left untouched here because it is part of the string
# sent to the model.
prompt = f"""You are a helpful and informative bot that answers questions using text from the reference passage included below. 
Be sure to respond in a complete sentence, being comprehensive, including all relevant background information. 
However, you are talking to a non-technical audience, so be sure to break down complicated concepts and 
strike a friendly and converstional tone. If the passage is irrelevant to the answer, you may ignore it.

QUESTION: {query_oneline}
PASSAGE: {passage_oneline}
"""
print(prompt)

You are a helpful and informative bot that answers questions using text from the reference passage included below. 
Be sure to respond in a complete sentence, being comprehensive, including all relevant background information. 
However, you are talking to a non-technical audience, so be sure to break down complicated concepts and 
strike a friendly and converstional tone. If the passage is irrelevant to the answer, you may ignore it.

QUESTION: What is acne? Explain in short.
PASSAGE: sect bites,chemical irritation, or certain viral infections, such asherpes. • Pustule. A raised lesion filled with pus. A pustule is usually the result of an infection, such as acne, impti-geo, or boils . • Papule. A solid, raised lesion less than 0.4 in (1 cm) across. A patch of closely grouped papules more than0.4 in (1 cm) across is called a plaque. Papules andplaques can be rough in texture and red, pink, or brownin color. Papules are associated with such conditions aswarts, syphilis , psoriasis, sebo

In [29]:
# Send the assembled prompt to the "gemini-1.5-flash-latest" model and render
# the text of its reply as Markdown.
model = genai.GenerativeModel("gemini-1.5-flash-latest")
answer = model.generate_content(prompt)
Markdown(answer.text)

Acne is a skin condition that causes pimples, which are raised lesions filled with pus.  The passage explains that a pustule, a type of raised lesion containing pus, is often a result of an infection, such as acne.
