In [126]:
import os
import re
from dotenv import load_dotenv
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_groq import ChatGroq
from langchain_community.vectorstores import FAISS
import streamlit as st
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings

# Load variables from a local .env file into the process environment so the
# os.getenv(...) lookups for the API keys further down can find them.
# NOTE(review): presumably the .env file defines HUGGINGFACE_APIKEY,
# GROQ_API_KEY_EXTRCT and GROQ_API_KEY_SUMM — confirm.
load_dotenv()

True

In [42]:
# data = PyPDFLoader("/home/skynet/Documents/Gen_ai/med_scan/data/raw/WM17S.pdf").load()

In [127]:
def clean_text(text):
    """Collapse extracted text into one trimmed, single-spaced string.

    ASCII control characters (code points 0-31 and 127) and zero-width
    spaces (U+200B) are each turned into a space, every run of whitespace is
    then squeezed down to a single space, and leading/trailing whitespace is
    removed from the result.
    """
    # Both kinds of unwanted character receive the same replacement, so one
    # character class handles them in a single pass.
    spaced = re.sub(r"[\x00-\x1F\x7F\u200b]", " ", text)

    # Squeeze whitespace runs, then trim the ends.
    return re.sub(r"\s+", " ", spaced).strip()

In [44]:
# remove empty pages
# doc = [document for document in data if document.page_content]

In [128]:
# corpus = clean_text(text.split(" "))
from PyPDF2 import PdfFileReader, PdfFileWriter, PdfReader

# Open the source report.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
pdf_reader = PdfReader("/home/skynet/Documents/Gen_ai/med_scan/data/raw/WM17S.pdf")

# Join the per-page text with a newline so the last word of one page is not
# fused with the first word of the next (clean_text later collapses the
# newline to a single space). `or ""` keeps a page that yields no text from
# breaking the join, and a single join avoids repeated string re-allocation.
text = "\n".join(page.extract_text() or "" for page in pdf_reader.pages)

In [131]:
# Normalise the raw extracted text (control characters, zero-width spaces and
# whitespace runs) before splitting. This rebinds `text` in place, so the raw
# extraction is only recoverable by re-running the PDF-reading cell.
text = clean_text(text)

In [132]:
# Cut the cleaned text into chunks of at most 200 characters, with 40
# characters of overlap between neighbouring chunks.
splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=40)
corpus = splitter.split_text(text)

In [143]:
def _groq_chat(model_name, key_env_var):
    """Build a streaming ChatGroq client using the settings shared by both models.

    Args:
        model_name: Groq model identifier passed as ``model``.
        key_env_var: Name of the environment variable that holds the API key.

    Returns:
        A ChatGroq instance with max_tokens=7000, temperature=0.3 and top_p=0.5.
    """
    return ChatGroq(
        model=model_name,
        max_tokens=7000,
        temperature=0.3,
        streaming=True,
        # A fresh dict per client so the two instances never share state.
        model_kwargs={"top_p": 0.5},
        api_key=os.getenv(key_env_var),
    )


# Embedding client used to index the report chunks.
extrct_embeddings = HuggingFaceInferenceAPIEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    api_key=os.getenv("HUGGINGFACE_APIKEY"),
)

# Two chat models, each reading its own API key from the environment: one for
# question answering over retrieved chunks and one for the full-report summary.
extrct_model = _groq_chat("llama-3.1-8b-instant", "GROQ_API_KEY_EXTRCT")
summ_model = _groq_chat("llama-3.1-70b-versatile", "GROQ_API_KEY_SUMM")

In [144]:
# Embed every chunk and build the FAISS index over the resulting vectors.
extrct_db = FAISS.from_texts(texts=corpus, embedding=extrct_embeddings)

In [145]:
# Swap the vector store for a retriever over it. The name is rebound in place,
# so guard on the type: without the check, running this cell a second time
# would call .as_retriever() on the retriever itself and raise AttributeError.
if isinstance(extrct_db, FAISS):
    extrct_db = extrct_db.as_retriever()

In [146]:
# Instructions for the summarisation model. The only placeholder is
# {context}, which receives the medical report.
SUMM_TEMPLATE = """
    Task: Write a concise, detailed, and extensive summary of the following medical report.

Instructions:

1. **Understand the Report:** Carefully read the medical report to grasp its key points, findings, and recommendations.
2. **Summarize Key Points:** Concisely summarize the main findings, diagnoses, procedures, treatment recommendations, and outcomes.
3. **Provide Insights:** Offer insights into potential implications, related conditions, recommended follow-up actions, and alternative treatment options.
4. **Generate Questions:** Ask relevant questions that could help clarify the report's content, suggest further investigation, or address potential concerns.
5. **Use Plain Language:** Explain complex medical terms or concepts in a clear and understandable way.
6. **Avoid Medical Advice:** Refrain from providing medical advice or diagnoses. Always refer users to consult with a healthcare professional.
7. **Find Values:** Summarize relevant values, such as lab results, vital signs, and measurements.
8. **Include Patient Information:** Provide general patient information, including age and gender (without personally identifiable details).
9. **Address Patient Concerns:** Summarize any patient concerns or questions and the corresponding responses if mentioned.

Guidelines:

- **Focus on Relevance:** Ensure your responses are directly related to the content of the medical report.
- **Be Objective:** Avoid personal opinions or biases in your analysis.
- **Provide Clear Explanations:** Use simple language and avoid jargon.
- **Be Respectful:** Treat the medical information with sensitivity and respect.

The medical report is enclosed below:
{context}
"""

summ_prompt = ChatPromptTemplate.from_template(SUMM_TEMPLATE)

In [147]:
# Instructions for the question-answering model. Placeholders:
#   {summary} - the previously generated report summary
#   {context} - report content supplied to the chain
#   {input}   - the user's question
EXTRCT_TEMPLATE = """
Task: Answer the user's queries based on the provided summary and the original Medical Report.

Instructions:

1. **Understand the Summary:** Carefully read the summary to grasp its key points.
2. **Reference the PDF:** Use the original PDF to verify and supplement the information in the summary.
3. **Provide Comprehensive Answers:** Combine the information from both the summary and the PDF to provide accurate and informative responses.
4. **Be Objective:** Avoid personal opinions or biases in your responses.
5. **Be Accurate:** Verify the information in your responses using both sources.
6. **Be Professional:** Maintain a professional tone and approach in your responses.
7. **Be Respectful:** Treat the medical information with sensitivity and respect.
8. **Provide Context:** Your answers should contain the section of the report the information is derived from.
9. **Clarity and Detail:** Provide clear and detailed information to answer the query comprehensively.
10. **Use Subheadings:** Organize your answers with subheadings to enhance readability.
11. **Avoid Medical Advice:** Refrain from providing medical advice or diagnoses. Always refer users to consult with a healthcare professional.
12. **Stay on Topic:** Always stay relevant to the query and avoid out-of-context or out-of-scope answers.
13. **Greeting Style:** Start with a greeting, then provide the answer in a detailed manner.
14. Comments: Your answer should also contain final comments about the query.

Guidelines:

- **Focus on Relevance:** Ensure your responses are directly related to the query and the summary/Medical Report.
- **Be Clear and Detailed:** Explain your answers in an understandable manner with sufficient detail.
- **Provide References:** Always reference the original data to support your answers.
- **Respect Privacy:** Maintain the privacy of the user and the data at all times.

Your context is as follows:
<context>
Summary:
{summary}

Original PDF:
{context}
</context>

Question: {input}
"""

extrct_prompt = ChatPromptTemplate.from_template(EXTRCT_TEMPLATE)

In [148]:
from langchain_core.output_parsers import StrOutputParser

In [149]:
# Summarisation pipeline: fill the prompt, call the summary model, then reduce
# the model's message to a plain string.
summ_chain = summ_prompt.pipe(summ_model, StrOutputParser())

In [150]:
# Summarise the cleaned report text. Pass the full `text` string rather than
# `corpus`: `corpus` is a list of overlapping chunks, so formatting it into the
# prompt would insert a Python list repr (brackets, quotes and the duplicated
# 40-character overlaps) instead of the report itself.
result = summ_chain.invoke({"context": text})

In [154]:
# Save the summary with the Markdown asterisks stripped. The encoding is
# pinned to UTF-8 so non-ASCII characters in the model output cannot fail
# under a platform-default codec.
with open("summ.txt", "w", encoding="utf-8") as f:
    f.write(result.replace("*", ""))

In [155]:
# Chain that fills the question-answering prompt and sends it to the
# extraction model.
extrct_chain = create_stuff_documents_chain(llm=extrct_model, prompt=extrct_prompt)

In [157]:
# Combine the retriever with the question-answering chain.
retrieval_chain = create_retrieval_chain(
    retriever=extrct_db,
    combine_docs_chain=extrct_chain,
)

In [159]:
question = "What is the status of patient's lipid profile?"

# Ask the question, supplying the earlier summary for the prompt's {summary}
# placeholder, and keep only the "answer" entry of the chain's output.
answer = retrieval_chain.invoke({"input": question, "summary": result})["answer"]

# Save the answer with the Markdown asterisks stripped. The encoding is pinned
# to UTF-8 so non-ASCII characters in the model output cannot fail under a
# platform-default codec.
with open("answer.txt", "w", encoding="utf-8") as f:
    f.write(answer.replace("*", ""))