In [3]:
import io
import json
import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from groq import Groq
from langchain.prompts import PromptTemplate
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from PyPDF2 import PdfReader

The first cell below defines the bioRxiv scraping function, which you shouldn't have to run, so don't run the second cell (the one that calls it) either.

In [45]:
def scrape_biorxiv_pdf_urls(max_pages=19, timeout=30):
    """Collect full-text PDF URLs from bioRxiv's "recent" listing pages.

    Every article title link on each listing page is followed, and the
    article page is searched for its PDF download link.

    Args:
        max_pages: Number of listing pages to crawl; pages 1..max_pages are
            requested. The default of 19 matches the original range(1, 20).
        timeout: Value passed to requests as the per-request timeout (seconds).

    Returns:
        A list of absolute PDF URLs in discovery order. Pages that cannot be
        fetched (network error or non-200 status) are skipped, so a single
        failure no longer discards the URLs collected so far.
    """
    base_url = "https://www.biorxiv.org"

    def fetch_soup(url):
        """Return the parsed HTML at `url`, or None if it cannot be fetched."""
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as e:
            print(f"Failed to fetch {url}: {str(e)}")
            return None
        if response.status_code != 200:
            return None
        return BeautifulSoup(response.content, "html.parser")

    pdf_urls = []
    for page_number in range(1, max_pages + 1):
        listing_soup = fetch_soup(f"{base_url}/content/early/recent?page={page_number}")
        if listing_soup is None:
            continue

        for article_link in listing_soup.find_all("a", class_="highwire-cite-linked-title"):
            article_href = article_link.get("href")
            if not article_href:
                continue

            # urljoin copes with both site-relative and already-absolute hrefs.
            article_soup = fetch_soup(urljoin(base_url, article_href))
            if article_soup is None:
                continue

            pdf_link = article_soup.find("a", class_="article-dl-pdf-link")
            pdf_href = pdf_link.get("href") if pdf_link else None
            if pdf_href and pdf_href.endswith(".pdf"):
                pdf_urls.append(urljoin(base_url, pdf_href))

    return pdf_urls

In [46]:
# Run the bioRxiv crawl defined above. It issues one HTTP request per listing
# page plus one per article link found, so it can take a while. Nothing in this
# cell writes the resulting list to disk.
bio_urls = scrape_biorxiv_pdf_urls()

Instead, just run the cell below to load a saved set of the scraped URLs from `bio_urls.txt`.

In [60]:
# Load the saved bioRxiv PDF URLs, one per line. Blank lines are skipped so
# that no empty-string "URL" is handed to the PDF downloader later on.
with open("bio_urls.txt", "r", encoding="utf-8") as file:
    bio_urls = [line.strip() for line in file if line.strip()]

Look up Groq, sign up for an account, then create an API key from your account console.

In [41]:
# Prefer the GROQ_API_KEY environment variable so the real key never has to be
# saved inside the notebook; the placeholder below is only a fallback.
api_key = os.environ.get("GROQ_API_KEY", "Your api key")
groq_client = Groq(api_key=api_key)

def load_pdf_from_url(url, timeout=60):
    """Download a PDF and return the concatenated text of all its pages.

    Args:
        url: Address of the PDF to download.
        timeout: Value passed to requests as the request timeout (seconds).

    Returns:
        The extracted text, or "" if the download or the PDF parsing fails
        (a message is printed in either case).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Failed to download PDF from {url}: {str(e)}")
        return ""

    try:
        # Parse straight from memory instead of writing a shared "temp.pdf"
        # into the working directory.
        reader = PdfReader(io.BytesIO(response.content))
        # Guard against extract_text() returning None for a page (seen with
        # some PyPDF2 versions on pages without a text layer).
        return "".join(page.extract_text() or "" for page in reader.pages)
    except Exception as e:
        # Malformed PDFs surface as several unrelated exception types; one bad
        # file should be reported and skipped, not abort the whole ingestion.
        print(f"Failed to parse PDF from {url}: {str(e)}")
        return ""

def scrape_website(url, timeout=30):
    """Fetch a web page and return the text of its <p> elements.

    Args:
        url: Address of the page to fetch.
        timeout: Value passed to requests as the request timeout (seconds);
            without one, the request can hang indefinitely.

    Returns:
        The paragraph texts joined by newlines, or "" if the request fails
        (a message is printed in that case).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Failed to scrape website {url}: {str(e)}")
        return ""

    soup = BeautifulSoup(response.content, 'html.parser')
    return "\n".join(para.get_text() for para in soup.find_all('p'))

In [47]:
def _split_into_documents(text, source, text_splitter):
    """Split `text` into chunks and wrap each non-blank chunk in a Document tagged with `source`."""
    return [
        Document(page_content=chunk, metadata={"source": source})
        for chunk in text_splitter.split_text(text)
        if chunk.strip()
    ]


def load_data():
    """Download the source material, chunk it, embed it, and build the vector store.

    PDFs (the two fixed URLs below plus the module-level `bio_urls`) and the
    listed web pages are fetched, split into 500-character chunks with no
    overlap, and stored in a Chroma collection persisted under "chromadb".
    Sources that yield no text are skipped.

    Returns:
        The Chroma vector store built from all collected chunks.

    NOTE(review): the store is persisted to disk, so re-running this
    presumably adds the same chunks to the existing collection again —
    confirm, and clear "chromadb" first if duplicates matter.
    """
    pdf_urls = [
        "https://www.biorxiv.org/content/10.1101/2020.07.28.224253v1.full.pdf",
        "https://www.cartercenter.org/resources/pdfs/health/ephti/library/lecture_notes/health_extension_trainees/generalpathology.pdf"
    ]

    website_urls = [
        "https://en.wikipedia.org/wiki/Pathology",
        "https://www.mcgill.ca/pathology/about/definition#:~:text=Pathology%20is%20a%20branch%20of,the%20whole%20body%20(autopsy)."
    ]

    pdf_urls.extend(bio_urls)

    # One splitter serves every source; it was previously rebuilt per URL.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
    documents_with_metadata = []

    # PDFs first, then websites, matching the original insertion order.
    for fetch_text, urls in ((load_pdf_from_url, pdf_urls), (scrape_website, website_urls)):
        for url in urls:
            data = fetch_text(url)
            if data:
                documents_with_metadata.extend(
                    _split_into_documents(data, url, text_splitter)
                )

    embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

    os.makedirs("chromadb", exist_ok=True)

    vectorstore = Chroma.from_documents(
        documents=documents_with_metadata,
        embedding=embedding_model,
        persist_directory="chromadb"
    )

    return vectorstore


# Build the vector store once at module level; query_rag() further down reads
# this global `vectorstore` name directly rather than taking it as an argument.
vectorstore = load_data()

 impossible to decode XFormObject /Fm606
FloatObject (b'0.00-8976378') invalid; use 0.0 instead
FloatObject (b'0.000-8818898') invalid; use 0.0 instead


In [43]:
# Cancer-pathology questions used to exercise the RAG pipeline: the loop
# further down passes each one to query_rag() and saves the question/answer
# pairs to pathology_qna.json.
pathology_questions = [
    "What is the role of a pathologist in cancer diagnosis?",
    "What are the key markers used to diagnose breast cancer?",
    "How is a biopsy analyzed in pathology?",
    "What are the most common types of cancer identified through pathology?",
    "What is immunohistochemistry and how is it used in cancer diagnosis?",
    "What is the difference between malignant and benign tumors in pathology?",
    "How are tumor grades and stages determined in cancer pathology?",
    "What is the significance of HER2 status in breast cancer pathology?",
    "What is the process for conducting a fine-needle aspiration biopsy?",
    "How is genetic testing used alongside pathology in cancer diagnosis?",
    "What is the role of liquid biopsy in modern cancer diagnostics?",
    "What are the key features of a pathology report in oncology?",
    "What types of imaging techniques support pathological diagnoses?",
    "What is the importance of lymph node analysis in cancer staging?",
    "What is the role of molecular pathology in identifying cancer mutations?",
    "How is cytopathology used to detect cancer cells?",
    "What is the role of a frozen section in cancer surgeries?",
    "How do pathologists identify metastatic cancer?",
    "What are the challenges in diagnosing rare cancers in pathology?",
    "How does pathology differentiate between primary and secondary cancers?",
    "What are the most common histological types of lung cancer?",
    "How is HPV testing used in cervical cancer diagnosis?",
    "What is the significance of KRAS and EGFR mutations in cancer pathology?",
    "How is pathology used to evaluate the efficacy of cancer treatments?",
    "What is the role of tumor markers like CA-125 and PSA in cancer detection?",
    "How is pathology used to monitor cancer recurrence?",
    "What are the differences between Hodgkin and non-Hodgkin lymphoma in pathology?",
    "What are the most common staining techniques in cancer pathology?",
    "What is the role of pathology in diagnosing gastrointestinal cancers?",
    "How is pathology used to diagnose skin cancers such as melanoma?",
    "What is the role of PD-L1 testing in immunotherapy decisions?",
    "What are the common histological features of prostate cancer?",
    "How are brain tumors classified in cancer pathology?",
    "What is the importance of margins in pathology after tumor excision?",
    "How are sentinel lymph nodes analyzed in pathology?",
    "What are the key features of colorectal cancer in pathology?",
    "What are the differences between squamous cell carcinoma and adenocarcinoma?",
    "How is pathology used in diagnosing pediatric cancers?",
    "What is the role of pathology in diagnosing hematological malignancies?",
    "What is the significance of genetic translocations in cancer pathology?",
    "How is fluorescence in situ hybridization (FISH) used in cancer diagnostics?",
    "What is the importance of mitotic index in cancer pathology?",
    "How is pathology used to detect cancer-related infections such as EBV or H. pylori?",
    "What are the latest advancements in digital pathology for cancer diagnosis?",
    "What is the role of artificial intelligence in modern cancer pathology?",
    "How is pathology used in diagnosing soft tissue sarcomas?",
    "What is the importance of a multidisciplinary team in cancer diagnosis and treatment?",
    "What are the differences between ductal and lobular breast carcinoma in pathology?",
    "How are blood cancers like leukemia diagnosed in pathology?",
    "What are the common immunohistochemical markers for renal cell carcinoma?",
    "How is pathology used to assess tumor response to neoadjuvant therapy?",
    "What is the role of liquid biopsies in detecting circulating tumor DNA?",
    "How do pathologists identify cancer from exfoliative cytology samples?",
    "What are the ethical considerations in pathology related to cancer diagnostics?",
    "How is pathology evolving with personalized cancer medicine?"
]

In [48]:
def query_rag(question, model="llama3-8b-8192"):
    """Answer `question` from retrieved context using the Groq chat API.

    Chunks are retrieved from the module-level `vectorstore`, combined into a
    prompt together with their source URLs, and sent through the module-level
    `groq_client`.

    Args:
        question: The natural-language question to answer.
        model: Groq model identifier to use for the completion.

    Returns:
        The model's answer text, or a fixed "no relevant context" message when
        retrieval yields nothing usable.
    """
    template = """
    Use the following pieces of context to answer the question at the end.
    If you don't know the answer, just say that you don't know; don't try to make up an answer.
    Keep the answer as concise as possible, preferably in three sentences.
    Quote the url the info came from.
    Context:
    {context}
    Question: {question}
    Answer:
    """

    retriever = vectorstore.as_retriever()
    # invoke() is the current retriever entry point; get_relevant_documents()
    # is deprecated in LangChain.
    context_docs = retriever.invoke(question)

    if not context_docs:
        return "No relevant context found to answer this question."

    # The prompt asks the model to quote the source URL, so each chunk must
    # carry its URL; otherwise the model can only invent one.
    context_text = "\n\n".join(
        f"Source: {(doc.metadata or {}).get('source', 'unknown')}\n{doc.page_content}"
        for doc in context_docs
        if doc.page_content.strip()
    )

    if not context_text.strip():
        return "No relevant context found to answer this question."

    prompt = template.format(context=context_text, question=question)

    response = groq_client.chat.completions.create(
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        model=model
    )

    return response.choices[0].message.content

# Ask every question in turn; a failure is recorded as that question's answer
# so one bad call does not stop the run.
qa_pairs = []

for question in pathology_questions:
    try:
        answer = query_rag(question)
    except Exception as exc:
        answer = "Error processing question: {}".format(exc)

    qa_pairs.append(dict(question=question, answer=answer))
    print("Question: " + str(question))
    print("Answer: " + str(answer) + "\n")

# Write the collected question/answer pairs to disk as indented JSON.
with open("pathology_qna.json", "w") as json_out:
    json_out.write(json.dumps(qa_pairs, indent=4))

Question: What is the role of a pathologist in cancer diagnosis?
Answer: A pathologist plays a crucial role in cancer diagnosis, as they examine cellular patterns of tissue samples under a microscope to determine if a sample is cancerous or non-cancerous (benign) in order to help diagnose a wide range of diseases, including cancer (https://www.pathologyresidency.com/our-program/). Our Residency Program trains candidates in the diagnosis of disease, highlighting the importance of pathologists in this process.

Question: What are the key markers used to diagnose breast cancer?
Answer: I couldn't find any information in the provided context about the key markers used to diagnose breast cancer. The context only mentions tumor markers used for diagnosing cervical cancers, endometrial, bronchogenic carcinomas, bladder and prostatic tumors, and gastric carcinoma.

Source: None (since the information wasn't found in the provided context)

Question: How is a biopsy analyzed in pathology?
Answer