In [27]:
import os
from unstructured.partition.pdf import partition_pdf
import pytesseract
import uuid

from langchain.embeddings import OpenAIEmbeddings
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.schema.document import Document
from langchain.storage import InMemoryStore
from langchain_chroma import Chroma

import base64
from langchain.chat_models import ChatOpenAI
from langchain.schema.messages import HumanMessage, AIMessage
from dotenv import load_dotenv

from langchain.schema.runnable import RunnablePassthrough
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser

import base64
load_dotenv()

True

In [6]:
OUTPUT_PATH = os.path.join(os.getcwd(), "figures")
DATA_FOLDER = os.path.join(os.getcwd(), "data/1 TB Education VDH.pdf")

In [7]:
raw_pdf_elements = partition_pdf(
            filename=DATA_FOLDER,
            extract_images_in_pdf=True,
            infer_table_structure=True,
            chunking_strategy="by_title",
            max_characters=10000,
            new_after_n_chars=5000,
            combine_text_under_n_chars=2000,
            image_output_dir_path=OUTPUT_PATH,
        )

In [17]:
text_elements = []
table_elements = []
image_elements = []

def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')

for element in raw_pdf_elements:
    if 'CompositeElement' in str(type(element)):
        text_elements.append(element)
    elif 'Table' in str(type(element)):
        table_elements.append(element)

table_elements = [i.text for i in table_elements]
text_elements = [i.text for i in text_elements]

print("The length of table elements are :", len(table_elements))
print("The length of text elements are :", len(text_elements))

for image_file in os.listdir(OUTPUT_PATH):
    if image_file.endswith(('.png', '.jpg', '.jpeg')):
        image_path = os.path.join(OUTPUT_PATH, image_file)
        encoded_image = encode_image(image_path)
        image_elements.append(encoded_image)

print("The length of image elements are :",len(image_elements))

The length of table elements are : 0
The length of text elements are : 5
The length of image elements are : 22


In [18]:
from together import Together
client = Together(api_key=os.getenv('TOGETHER_API_KEY'))

In [19]:
def summarize_image(encoded_image):
    response = client.chat.completions.create(
        model="meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo",
        messages=[
            {
                "role": "user", 
                "content": [
                    {"type": "text", "text": "Describe the contents of this image in detail."},
                    {
                        "type": "image_url", 
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{encoded_image}"
                        }
                    }
                ]
            }
        ],
        max_tokens=None,
        temperature=0.1,
        top_p=0.7,
        top_k=50,
        repetition_penalty=1,
    )
    # for token in response:
    #     if hasattr(token, 'choices'):
    #         print(token.choices[0].delta.content, end='', flush=True)
    return response.choices[0].message.content

In [20]:
res = summarize_image(image_elements[8])

In [21]:
len(res)

915

In [22]:
image_summaries = [summarize_image(i) for i in image_elements]

In [23]:
# text-embedding-3-small: Minimum dimensions are 1, and maximum dimensions are 1536
# text-embedding-3-large: Minimum dimensions are 1, and maximum dimensions are 3072

In [33]:
len(text_elements)

5

In [25]:
from langchain_together import TogetherEmbeddings

embeddings = TogetherEmbeddings(
    model="togethercomputer/m2-bert-80M-8k-retrieval",
)

In [30]:
vectorstore = Chroma(collection_name="summaris", embedding_function=embeddings)
store = InMemoryStore()
id_key = "doc_id"
retriever = MultiVectorRetriever(vectorstore=vectorstore, docstore=store, id_key=id_key,top_k=1)

In [34]:
def add_documents_to_retriever(summaries, original_contents):
    doc_ids = [str(uuid.uuid4()) for _ in summaries]
    summary_docs = [
        Document(page_content=s, metadata={id_key: doc_ids[i]})
        for i, s in enumerate(summaries)
    ]
    retriever.vectorstore.add_documents(summary_docs)
    retriever.docstore.mset(list(zip(doc_ids, original_contents)))

add_documents_to_retriever(text_elements, text_elements)

# add_documents_to_retriever(table_elements, table_elements)

add_documents_to_retriever(image_summaries, image_elements) 

In [35]:
from langchain_together import ChatTogether

In [47]:
llm = ChatTogether(
    model="meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo",
    temperature=0.1,
)

In [48]:
llm.invoke("What is tb?")

AIMessage(content="TB, also known as tuberculosis, is a bacterial infection caused by Mycobacterium tuberculosis (M. tuberculosis). It's a serious and potentially life-threatening disease that primarily affects the lungs, but can also affect other parts of the body, such as the kidneys, spine, and brain.\n\nHere are some key facts about TB:\n\n**How is TB spread?**\n\nTB is usually spread through the air when an infected person coughs, sneezes, or talks, releasing droplets that contain the bacteria. People nearby can breathe in these droplets and become infected.\n\n**Symptoms of TB**\n\nThe symptoms of TB can vary, but common ones include:\n\n* Coughing, which may produce mucus or blood\n* Chest pain or discomfort\n* Fatigue\n* Weight loss\n* Fever\n* Chills\n* Night sweats\n\n**Types of TB**\n\nThere are two main types of TB:\n\n1. **Latent TB**: This is when the bacteria are present in the body, but the person is not showing symptoms and is not contagious.\n2. **Active TB**: This is

In [None]:
template = """Answer the question based only on the following context, which can include text, images and tables:
{context}
Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)
question= "Tell me about key points of tb?"
answer = chain.invoke(question)
print(answer)

TB is a type of tuberculosis that affects the lungs. Key points about TB include:

* TB is a bacterial infection caused by Mycobacterium tuberculosis.
* TB is a serious disease that can be fatal if left untreated.
* TB is contagious and can be spread through the air when an infected person coughs or sneezes.
* Symptoms of TB include a persistent cough, weight loss, fever, and night sweats.
* TB can be diagnosed with a chest X-ray, sputum test, or skin test.
* Treatment for TB typically involves a combination of antibiotics taken for at least 6 months.
* TB can be prevented with the BCG vaccine, which is commonly given to children in countries with high TB prevalence.
* TB is a major public health concern, particularly in developing countries, and requires prompt treatment to prevent complications and transmission.

It's worth noting that TB is a complex disease, and this is not an exhaustive list of key points.


In [69]:
ret = chain.first
contexts = ret.invoke("Summarize")
contexts

{'context': ['/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/2wBDAQkJCQwLDBgNDRgyIRwhMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjL/wAARCAHdBIoDASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwD5/r7f8Cf8k88Nf9gq1/8ARS18QV9v+BP+SeeGv+wVa/8AopaAOgooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKK

In [41]:
text_elements

['Annual Tuberculosis (TB) Education\n\nRiteChoice Hospice LivinRite Home Health Services RiteChoice Caregiving Solutions\n\nOverview\n\n• Tuberculosis Disease vs. Latent Tuberculosis Infection • Risk Assessment, Screening and Testing\n\nTB Transmission\n\nInfection Control\n\nTB Symptoms\n\nTB Testing\n\nLTBI Treatment\n\nTuberculosis\n\n• Tuberculosis (TB) is caused by a bacterium called Mycobacterium tuberculosis. • The bacteria usually attack the lungs, but TB bacteria can attack any part of the body. • Not everyone infected with TB bacteria becomes sick. • Latent TB infection (LTBI)\n\nTB disease\n\nLTBI vs. TB Disease\n\nLatent TB Infection Latent TB infection means TB germs are in the body, but not enough to cause sickness or spread germs to others.\n\nTB Disease If TB germs become active & multiply, latent TB infection can turn into TB disease.\n\nTB Screening, Testing and Treatment of U.S. Healthcare Personnel\n\nUpdated guidance released in May of 2019 to supplement the 2005 

In [55]:
image_summaries

['The image is a digital illustration of a banner with text and cartoon figures. The banner features a purple background with white text that reads, "Annual TB testing is not recommended for most health care personnel." \n\nOn the left side of the banner, there are two cartoon figures: one male and one female. The male figure is wearing blue scrubs and a blue cap, while the female figure is wearing blue scrubs and has her hands on her hips. On the right side of the banner, there are three additional cartoon figures: two females and one male. The first female figure is wearing a brown dress and headscarf, while the second female figure is wearing a white lab coat and green pants. The male figure is wearing a light blue shirt and white pants.\n\nIn the bottom-right corner of the image, there is a vertical strip of teal color with white text that reads, "T f p d w T." The overall design of the image suggests that it may be used as a public health awareness campaign or educational material

In [56]:
text_elements

['Annual Tuberculosis (TB) Education\n\nRiteChoice Hospice LivinRite Home Health Services RiteChoice Caregiving Solutions\n\nOverview\n\n• Tuberculosis Disease vs. Latent Tuberculosis Infection • Risk Assessment, Screening and Testing\n\nTB Transmission\n\nInfection Control\n\nTB Symptoms\n\nTB Testing\n\nLTBI Treatment\n\nTuberculosis\n\n• Tuberculosis (TB) is caused by a bacterium called Mycobacterium tuberculosis. • The bacteria usually attack the lungs, but TB bacteria can attack any part of the body. • Not everyone infected with TB bacteria becomes sick. • Latent TB infection (LTBI)\n\nTB disease\n\nLTBI vs. TB Disease\n\nLatent TB Infection Latent TB infection means TB germs are in the body, but not enough to cause sickness or spread germs to others.\n\nTB Disease If TB germs become active & multiply, latent TB infection can turn into TB disease.\n\nTB Screening, Testing and Treatment of U.S. Healthcare Personnel\n\nUpdated guidance released in May of 2019 to supplement the 2005 

In [75]:
doc = []
doc.extend(text_elements)
doc.extend(image_summaries)

In [71]:
retriever

MultiVectorRetriever(vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x7ff52edbf620>, docstore=<langchain_core.stores.InMemoryStore object at 0x7ff5465be990>, search_kwargs={})

In [72]:
len(doc)

1

In [77]:
template = """Summarize all the content into 10 slides without losing any information.
{context}
Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

chain = (
    {"context": RunnablePassthrough(), "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

doc_string = "\n".join(doc)  # Convert the list to a single string
question = "Slides content must be brief and must include all the points in a structured manner for presentation."

# Wrap the inputs in a dictionary
inputs = {"context": doc_string, "question": question}

# Invoke the chain with a single dictionary
answer = chain.invoke(inputs)
print(answer)

Here is a suggested outline for a 10-slide presentation based on the provided content:

**Slide 1: Introduction**

* Title: Annual Tuberculosis (TB) Education
* Subtitle: RiteChoice Hospice LivinRite Home Health Services RiteChoice Caregiving Solutions
* Image: A relevant image, such as a logo or a picture of a healthcare professional

**Slide 2: What is Tuberculosis?**

* Title: Tuberculosis (TB)
* Bullet points:
	+ Caused by a bacterium called Mycobacterium tuberculosis
	+ Usually attacks the lungs, but can attack any part of the body
	+ Not everyone infected with TB bacteria becomes sick
	+ Latent TB infection (LTBI)
* Image: A simple illustration of a lung or a bacterium

**Slide 3: TB Transmission**

* Title: TB Transmission
* Bullet points:
	+ Airborne
	+ M. tuberculosis is carried in airborne particles
	+ Transmission occurs when a person breathes in the bacteria from the air
* Image: An illustration of a person coughing or sneezing, with airborne particles

**Slide 4: Infection