# Lucido Prototype
This proof-of-concept runs locally for now


## 1. Data Ingestion Pipeline

In [1]:
from haystack import Pipeline
from haystack.components.writers import DocumentWriter
from haystack.components.converters import MarkdownToDocument, PyPDFToDocument, TextFileToDocument, PPTXToDocument
from haystack.components.preprocessors import DocumentSplitter, DocumentCleaner
from haystack.components.routers import FileTypeRouter
from haystack.components.joiners import DocumentJoiner
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.document_stores.in_memory import InMemoryDocumentStore

document_store = InMemoryDocumentStore()

# File routing and reading
text_file_converter = TextFileToDocument()
markdown_converter = MarkdownToDocument()
pdf_converter = PyPDFToDocument()
powerpoint_converter = PPTXToDocument()
file_type_router = FileTypeRouter(mime_types=[
    'text/plain', 
    'application/pdf', 
    'text/markdown', 
    'application/vnd.openxmlformats-officedocument.presentationml.presentation'
])

# Document joining and pre-processing
document_joiner = DocumentJoiner()
document_cleaner = DocumentCleaner()
document_splitter = DocumentSplitter(split_by='word', split_length=150, split_overlap=50)

# Document embedding and writing to
document_embedder = SentenceTransformersDocumentEmbedder(model='sentence-transformers/all-MiniLM-L6-v2')
document_writer = DocumentWriter(document_store=document_store)

# Initializing data ingestion pipeline
ingestion_pipeline = Pipeline()
ingestion_pipeline.add_component(instance=file_type_router, name="file_type_router")
ingestion_pipeline.add_component(instance=text_file_converter, name="text_file_converter")
ingestion_pipeline.add_component(instance=markdown_converter, name="markdown_converter")
ingestion_pipeline.add_component(instance=pdf_converter, name="pypdf_converter")
ingestion_pipeline.add_component(instance=powerpoint_converter, name='powerpoint_converter')
ingestion_pipeline.add_component(instance=document_joiner, name="document_joiner")
ingestion_pipeline.add_component(instance=document_cleaner, name="document_cleaner")
ingestion_pipeline.add_component(instance=document_splitter, name="document_splitter")
ingestion_pipeline.add_component(instance=document_embedder, name="document_embedder")
ingestion_pipeline.add_component(instance=document_writer, name="document_writer")

# Connecting components
ingestion_pipeline.connect("file_type_router.text/plain", "text_file_converter.sources")
ingestion_pipeline.connect("file_type_router.application/pdf", "pypdf_converter.sources")
ingestion_pipeline.connect("file_type_router.text/markdown", "markdown_converter.sources")
ingestion_pipeline.connect('file_type_router.application/vnd.openxmlformats-officedocument.presentationml.presentation', 'powerpoint_converter.sources')
ingestion_pipeline.connect("text_file_converter", "document_joiner")
ingestion_pipeline.connect("pypdf_converter", "document_joiner")
ingestion_pipeline.connect("markdown_converter", "document_joiner")
ingestion_pipeline.connect('powerpoint_converter', 'document_joiner')
ingestion_pipeline.connect("document_joiner", "document_cleaner")
ingestion_pipeline.connect("document_cleaner", "document_splitter")
ingestion_pipeline.connect("document_splitter", "document_embedder")
ingestion_pipeline.connect("document_embedder", "document_writer")

ingestion_pipeline.draw('drawings/ingestion_pipeline.png')


In [2]:
from pathlib import Path

# Testing the ingestion pipeline
content_dir = 'bien210'
ingestion_pipeline.run({'file_type_router': {'sources': list(Path(content_dir).glob("**/*"))}})

Ignoring wrong pointing object 134 0 (offset 0)
Ignoring wrong pointing object 140 0 (offset 0)
Ignoring wrong pointing object 146 0 (offset 0)
Ignoring wrong pointing object 809 0 (offset 0)
Ignoring wrong pointing object 811 0 (offset 0)
Ignoring wrong pointing object 820 0 (offset 0)
Ignoring wrong pointing object 825 0 (offset 0)
Ignoring wrong pointing object 925 0 (offset 0)
Ignoring wrong pointing object 931 0 (offset 0)
Ignoring wrong pointing object 937 0 (offset 0)
Ignoring wrong pointing object 21 0 (offset 0)
Ignoring wrong pointing object 27 0 (offset 0)
Ignoring wrong pointing object 33 0 (offset 0)
Ignoring wrong pointing object 65 0 (offset 0)
Ignoring wrong pointing object 263 0 (offset 0)
Ignoring wrong pointing object 269 0 (offset 0)
Ignoring wrong pointing object 319 0 (offset 0)


Batches:   0%|          | 0/9 [00:00<?, ?it/s]

{'document_writer': {'documents_written': 266}}

## 2. RAG Pipeline


### 2.1 Augmenting ambiguous queries

In [79]:
from typing import List
from haystack.components.generators.openai import OpenAIGenerator
from haystack.components.generators.hugging_face_api import HuggingFaceAPIGenerator
from haystack.dataclasses import ChatMessage
from haystack import component

disambiguator_llm = HuggingFaceAPIGenerator(
  api_type="serverless_inference_api",
  api_params={"model": "microsoft/Phi-3.5-mini-instruct"},
  generation_kwargs={
    'max_new_tokens': 25,
    'temperature': 100,
    'top_k': 1,
    'top_p': 0.1
  }
)

@component
class QueryDisambiguator:
  """
  A component that disambiguates the query using the conversation history.
  """
  @component.output_types(disambiguated_query=str)
  def run(self, query: str, messages: List[ChatMessage]):
    chat = " ".join(list(map(lambda m: m.content, messages)))
    prompt = f"""
    You are a helpful AI assistant. You identify and name the topic that the current query refers to.
    Based on the context, name the topic that the current query is referring to. Be brief
    
    Context:
    {chat}

    Current Query:
    {query}

    Topic:
    """
    result = disambiguator_llm.run(prompt=prompt)
    disambiguated_query = result['replies'][0]
    print('disambiguated_query: ', disambiguated_query)

    return {'disambiguated_query': disambiguated_query}

In [80]:
example_messages = [
  ChatMessage.from_system("You are Lucido, a helpful teaching assistant. Answer questions based on hints provided to you by the system. If the hints are insufficient, inform the user that you do not have the knowledge to answer that question."),
  ChatMessage.from_user('How do you build a lateral flow assay, and more specifically, what are its components?'),
  ChatMessage.from_assistant("""To build a lateral flow assay, you need to assemble the following components:
  Sample Pad: This pad is typically made of chromatographic materials such as cellulose or glass fiber. It enables the flow of the sample towards the other parts of the assay by capillary action. It also helps in removing unwanted substances from the sample, such as salt or other impurities that may interfere with the assay's performance.
  Conjugate Pad: This pad is placed immediately after the sample pad, and it holds the conjugate solution, which is a mixture of a labeling molecule (such as gold nanoparticles or fluorescent dye) and the antibody which will bind to the analyte. Capillary action then draws the conjugate solution onto the test area through the sample pad.
  Test Line: This line contains the second antibody immobilized on the membrane. This antibody will bind only to the analyte-conjugate complex, specifically preventing non-specific binding. The secondary antibody is labeled with a colored molecule, such as gold nanoparticles or a red dye. This line will turn pink or red when the analyte is present, as these labeled molecules show up under a magnifying glass or a microscope.
  Control Line: This line is comprised of a single antibody that binds to the initial antibody in the conjugate pad, not the analyte. This line ensures that the entire assay still works for control purposes, as the labeled molecules will still get released in the absence of the analyte.
  Moisture Absorption Pad: Also known as the absorbent pad, it is placed at the end of the membrane to absorb any excess solution, specifically, the conjugate or analyte solutions, to prevent false results.
  The lateral flow assay works by drawing the sample through the membrane, and the antibody-analyte bond causes a visible line at the test line. A control line confirms that the test is working correctly. Compared to other tests that require the use of a reader or other instruments, lateral flow assays are more straightforward and can provide instant results, making them suitable for use in developing countries and in resource-poor settings where laboratory equipment is not easily available.
  """)
]

query = 'What are its limitations?'
disambiguator = QueryDisambiguator()
disambiguator.run(query=query, messages=example_messages)

disambiguated_query:   Limitations of Lateral Flow Assays




{'disambiguated_query': ' Limitations of Lateral Flow Assays\n\n'}

### 2.2 Building the RAG pipeline

In [24]:
import os
from getpass import getpass
from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever, InMemoryBM25Retriever
from haystack.components.rankers import TransformersSimilarityRanker
from haystack.components.builders import PromptBuilder
from haystack.components.generators.hugging_face_api import HuggingFaceAPIGenerator

template = """
Answer the question based on the given context. Be brief.

Context:
{% for document in documents %}
    {{ document.content }}
{% endfor %}

Question: {{ question }}
Answer:
"""

# Initialize disambiguator the embedder 
disambiguator = QueryDisambiguator()
text_embedder = SentenceTransformersTextEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")

# Initialize store retrievers
embedding_retriever = InMemoryEmbeddingRetriever(document_store=document_store)
bm25_retriever = InMemoryBM25Retriever(document_store=document_store)

# Initialize document joiner and ranker
document_joiner = DocumentJoiner()
ranker = TransformersSimilarityRanker(model='BAAI/bge-reranker-base')

# Initialize LLM generation components
prompt_builder = PromptBuilder(template=template)
llm = HuggingFaceAPIGenerator(
  api_type="serverless_inference_api",
  api_params={"model": "HuggingFaceH4/zephyr-7b-beta"},
  generation_kwargs={'max_new_tokens': 150}
)

# Initialize the RAG pipeline
rag_pipeline = Pipeline()
rag_pipeline.add_component('disambiguator', disambiguator)
rag_pipeline.add_component("text_embedder", text_embedder)
rag_pipeline.add_component("embedding_retriever", embedding_retriever)
rag_pipeline.add_component('bm25_retriever', bm25_retriever)
rag_pipeline.add_component('document_joiner', document_joiner)
rag_pipeline.add_component('ranker', ranker)
rag_pipeline.add_component("prompt_builder", prompt_builder)
rag_pipeline.add_component('llm', llm)

# Connecting the components
rag_pipeline.connect('disambiguator.disambiguated_query', 'text_embedder.text')
rag_pipeline.connect('disambiguator.disambiguated_query', 'bm25_retriever.query')
rag_pipeline.connect('disambiguator.disambiguated_query', 'ranker.query')
rag_pipeline.connect('disambiguator.disambiguated_query', 'prompt_builder.question')
rag_pipeline.connect('text_embedder', 'embedding_retriever')
rag_pipeline.connect('embedding_retriever', 'document_joiner')
rag_pipeline.connect('bm25_retriever', 'document_joiner')
rag_pipeline.connect('document_joiner', 'ranker')
rag_pipeline.connect("ranker.documents", "prompt_builder.documents")
rag_pipeline.connect('prompt_builder', 'llm')

rag_pipeline.draw('drawings/rag_pipeline.png')

## 3. Gradio Chatbot Interface

In [25]:
from haystack.components.generators.chat import HuggingFaceAPIChatGenerator
from haystack.dataclasses import ChatMessage

if "HF_API_TOKEN" not in os.environ:
    os.environ["HF_API_TOKEN"] = getpass("Enter HuggingFace API token:")

messages = [ChatMessage.from_system("""
You are Lucido, a helpful teaching assistant. Answer questions based on hints provided to you by the system. 
If the hints are insufficient, inform the user that you do not have the knowledge to answer that question.
""")]

chat_generator = HuggingFaceAPIChatGenerator(
  api_type="serverless_inference_api",
  api_params={"model": "HuggingFaceH4/zephyr-7b-beta"},
  # generation_kwargs={'max_tokens': 150}
)

def chatbot(query, _):
  # Generate the RAG response 
  rag_response = rag_pipeline.run({
    'disambiguator': {
       'query': query, 
       'messages': messages
      },
  })
  
  rag_reply = rag_response['llm']['replies'][0]
  messages.append(ChatMessage.from_system(rag_reply))
  messages.append(ChatMessage.from_user(query))
  result = chat_generator.run(messages=messages)
  reply = result['replies'][0].content

  return reply

In [26]:
import gradio as gr

def upload_file(filepath):
  print(filepath)
  ingestion_pipeline.run({'file_type_router': {'sources': [Path(filepath)]}})

with gr.Blocks() as demo: 
  gr.ChatInterface(
    fn=chatbot, 
    title="Lucido — A BIEN 210 Application",
    examples=[
      'How does myelination of the axons help increase signal speeds in neurons?',
      'How does bioluminescence differ from chemiluminescence?',
      'Why can the cell membrane be compared to a PN-junction?',
      'How could artificial leaves help society and how do they compare to photosynthesis?',
      'What are the two main superhydrophobic regimes and how can they be applied?',
      'How do you build a lateral flow assay and, more specifically, what are its components?'
    ]
  )
  
  upload_button = gr.UploadButton('Upload a file', file_count='single', variant='primary')
  upload_button.upload(upload_file, upload_button)

demo.launch()



* Running on local URL:  http://127.0.0.1:7864

To create a public link, set `share=True` in `launch()`.






disambiguated_query:  1. How could artificial leaves benefit society?
    2. How do artificial leaves compare to natural photosynthesis?

Support: 1


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

disambiguated_query:  
Why can the cell membrane be compared to a PN junction in terms of their functions or properties?


## Response: How


Batches:   0%|          | 0/1 [00:00<?, ?it/s]