# Lucido Prototype
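This proof-of-concept currently runs locally. It ingests course documents into an in-memory document store, answers questions with a retrieval-augmented generation (RAG) pipeline, and exposes the result through a Gradio chat interface.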


## 1. Data Ingestion Pipeline

In [1]:
from haystack import Pipeline
from haystack.components.writers import DocumentWriter
from haystack.components.converters import MarkdownToDocument, PyPDFToDocument, TextFileToDocument, PPTXToDocument
from haystack.components.preprocessors import DocumentSplitter, DocumentCleaner
from haystack.components.routers import FileTypeRouter
from haystack.components.joiners import DocumentJoiner
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.document_stores.types import DuplicatePolicy

# Shared in-memory store: the ingestion pipeline writes into it and the RAG
# retrievers defined later in this notebook read from it.
document_store = InMemoryDocumentStore()

# File routing and reading: one converter per MIME type handled by the router.
text_file_converter = TextFileToDocument()
markdown_converter = MarkdownToDocument()
pdf_converter = PyPDFToDocument()
powerpoint_converter = PPTXToDocument()
file_type_router = FileTypeRouter(mime_types=[
    'text/plain', 
    'application/pdf', 
    'text/markdown', 
    'application/vnd.openxmlformats-officedocument.presentationml.presentation'
])

# Document joining and pre-processing: merge converter outputs, clean them,
# then split into overlapping 150-word chunks (50 words of overlap).
document_joiner = DocumentJoiner()
document_cleaner = DocumentCleaner()
document_splitter = DocumentSplitter(split_by='word', split_length=150, split_overlap=50)

# Document embedding and writing to the document store.
# Keep this model in sync with the text embedder used at query time.
document_embedder = SentenceTransformersDocumentEmbedder(model='sentence-transformers/all-MiniLM-L6-v2')
# OVERWRITE keeps ingestion idempotent. With the writer's default policy the
# in-memory store rejects documents whose IDs are already stored, so re-running
# the ingestion cell or uploading the same file twice would raise
# DuplicateDocumentError.
document_writer = DocumentWriter(document_store=document_store, policy=DuplicatePolicy.OVERWRITE)

# Build the ingestion pipeline: register each component under the name that
# the connection table below refers to.
ingestion_pipeline = Pipeline()
for component_name, component_instance in (
    ("file_type_router", file_type_router),
    ("text_file_converter", text_file_converter),
    ("markdown_converter", markdown_converter),
    ("pypdf_converter", pdf_converter),
    ('powerpoint_converter', powerpoint_converter),
    ("document_joiner", document_joiner),
    ("document_cleaner", document_cleaner),
    ("document_splitter", document_splitter),
    ("document_embedder", document_embedder),
    ("document_writer", document_writer),
):
    ingestion_pipeline.add_component(instance=component_instance, name=component_name)

# Wiring, in order: router output per MIME type -> matching converter,
# every converter -> joiner, then a linear chain from the joiner to the writer.
for sender, receiver in (
    ("file_type_router.text/plain", "text_file_converter.sources"),
    ("file_type_router.application/pdf", "pypdf_converter.sources"),
    ("file_type_router.text/markdown", "markdown_converter.sources"),
    ('file_type_router.application/vnd.openxmlformats-officedocument.presentationml.presentation', 'powerpoint_converter.sources'),
    ("text_file_converter", "document_joiner"),
    ("pypdf_converter", "document_joiner"),
    ("markdown_converter", "document_joiner"),
    ('powerpoint_converter', 'document_joiner'),
    ("document_joiner", "document_cleaner"),
    ("document_cleaner", "document_splitter"),
    ("document_splitter", "document_embedder"),
    ("document_embedder", "document_writer"),
):
    ingestion_pipeline.connect(sender, receiver)

# Save a diagram of the assembled pipeline.
ingestion_pipeline.draw('drawings/ingestion_pipeline.png')


In [2]:
from pathlib import Path

# Testing the ingestion pipeline
content_dir = 'bien210'

# `glob("**/*")` also yields directories, so keep regular files only.
# Sorting gives the same ingestion order on every run.
source_files = sorted(path for path in Path(content_dir).glob("**/*") if path.is_file())
if not source_files:
    # Fail loudly rather than silently building an empty knowledge base.
    raise FileNotFoundError(f"No files found under '{content_dir}'; nothing to ingest.")

# Last expression of the cell: the run summary is displayed as the cell output.
ingestion_pipeline.run({'file_type_router': {'sources': source_files}})

Ignoring wrong pointing object 134 0 (offset 0)
Ignoring wrong pointing object 140 0 (offset 0)
Ignoring wrong pointing object 146 0 (offset 0)
Ignoring wrong pointing object 809 0 (offset 0)
Ignoring wrong pointing object 811 0 (offset 0)
Ignoring wrong pointing object 820 0 (offset 0)
Ignoring wrong pointing object 825 0 (offset 0)
Ignoring wrong pointing object 925 0 (offset 0)
Ignoring wrong pointing object 931 0 (offset 0)
Ignoring wrong pointing object 937 0 (offset 0)
Ignoring wrong pointing object 21 0 (offset 0)
Ignoring wrong pointing object 27 0 (offset 0)
Ignoring wrong pointing object 33 0 (offset 0)
Ignoring wrong pointing object 65 0 (offset 0)
Ignoring wrong pointing object 263 0 (offset 0)
Ignoring wrong pointing object 269 0 (offset 0)
Ignoring wrong pointing object 319 0 (offset 0)


Batches:   0%|          | 0/9 [00:00<?, ?it/s]

{'document_writer': {'documents_written': 266}}

## 2. RAG Pipeline


In [29]:
import os
from getpass import getpass
from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever, InMemoryBM25Retriever
from haystack.components.rankers import TransformersSimilarityRanker
from haystack.components.builders import PromptBuilder
from haystack.components.generators.hugging_face_api import HuggingFaceAPIGenerator

# Jinja prompt template consumed by the PromptBuilder below.
# It expects two variables:
#   - `documents`: an iterable whose items expose `.content` (rendered as context)
#   - `question`:  the user question to answer
template = """
Answer the question based on the given context. Be brief.

Context:
{% for document in documents %}
    {{ document.content }}
{% endfor %}

Question: {{ question }}
Answer:
"""

# Initialize embedder and store retrievers.
# The query embedder uses the same model as the document embedder so that query
# and document vectors are comparable.
text_embedder = SentenceTransformersTextEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")
embedding_retriever = InMemoryEmbeddingRetriever(document_store=document_store)
bm25_retriever = InMemoryBM25Retriever(document_store=document_store)

# Initialize document joiner and ranker.
# NOTE: this rebinds the notebook-level name `document_joiner`; the ingestion
# pipeline keeps its own, separately created joiner instance.
document_joiner = DocumentJoiner()
ranker = TransformersSimilarityRanker(
  model='BAAI/bge-reranker-base'
)

# The generator below calls the HuggingFace serverless API, and this pipeline is
# run before the chat cell asks for a token. Prompt for it here so a fresh
# kernel (Restart & Run All) does not hit the API unauthenticated.
if "HF_API_TOKEN" not in os.environ:
    os.environ["HF_API_TOKEN"] = getpass("Enter HuggingFace API token:")

# Initialize LLM generation components
prompt_builder = PromptBuilder(template=template)
llm = HuggingFaceAPIGenerator(
  api_type="serverless_inference_api",
  api_params={"model": "HuggingFaceH4/zephyr-7b-beta"},
  generation_kwargs={'max_new_tokens': 150}
)

# Build the RAG pipeline: register each component under the name used by the
# connection table below and by the `run()` inputs.
rag_pipeline = Pipeline()
for component_name, component_instance in (
    ("text_embedder", text_embedder),
    ("embedding_retriever", embedding_retriever),
    ('bm25_retriever', bm25_retriever),
    ('document_joiner', document_joiner),
    ('ranker', ranker),
    ("prompt_builder", prompt_builder),
    ('llm', llm),
):
    rag_pipeline.add_component(component_name, component_instance)

# Wiring: both retrievers feed the joiner, the joined documents are re-ranked,
# and the ranked documents fill the prompt that is sent to the LLM.
for sender, receiver in (
    ('text_embedder', 'embedding_retriever'),
    ('embedding_retriever', 'document_joiner'),
    ('bm25_retriever', 'document_joiner'),
    ('document_joiner', 'ranker'),
    ("ranker.documents", "prompt_builder.documents"),
    ('prompt_builder', 'llm'),
):
    rag_pipeline.connect(sender, receiver)

# Save a diagram of the assembled pipeline.
rag_pipeline.draw('drawings/rag_pipeline.png')

In [4]:
# Smoke-test the RAG pipeline with a single question.
question = 'How does myelination of the axons help increase signal speeds in neurons?'

# Each query-consuming component receives the same question under its own
# input name.
result = rag_pipeline.run({
  component_name: {input_name: question}
  for component_name, input_name in (
    ('text_embedder', 'text'),
    ('bm25_retriever', 'query'),
    ('ranker', 'query'),
    ('prompt_builder', 'question'),
  )
})

Batches:   0%|          | 0/1 [00:00<?, ?it/s]



In [5]:
# Display the first reply produced by the `llm` component in the RAG test run.
result['llm']['replies'][0]

' Myelinated neurons transmit information at least one order of magnitude faster than unmyelinated neurons due to saltatory conduction, which allows for an action potential to move in a very targeted way, from node to node, as a result of myelin being added at just the right locations along the axon to allow sodium channels to open at specific nodes called Nodes of Ranvier. This allows for an action potential to move quickly and efficiently along the axon, as opposed to unmyelinated neurons, where the action potential must travel more slowly due to the lack of myelin and the need for the signal to regenerate at each point along the axon.'

## 3. Gradio Chatbot Interface

In [30]:
from haystack.components.generators.chat import HuggingFaceAPIChatGenerator
from haystack.dataclasses import ChatMessage

# Ask for the HuggingFace token interactively unless the environment already
# provides one.
if os.environ.get("HF_API_TOKEN") is None:
    os.environ["HF_API_TOKEN"] = getpass("Enter HuggingFace API token:")

# Conversation seed: a single system message defining the assistant's persona
# and how it should treat the hints it is given.
messages = [
  ChatMessage.from_system("""
You are Lucido, a helpful teaching assistant. Answer questions based on hints provided to you by the system. 
If the hints are insufficient, inform the user that you do not have the knowledge to answer that question.
""")
]

# Chat model served through the HuggingFace serverless inference API.
# No generation_kwargs are passed; a 'max_tokens' cap of 150 is left disabled.
chat_generator = HuggingFaceAPIChatGenerator(
  api_params={"model": "HuggingFaceH4/zephyr-7b-beta"},
  api_type="serverless_inference_api",
)

def _history_to_messages(history):
  """Convert Gradio chat history into a list of Haystack ChatMessage objects.

  Two history layouts are accepted: a list of `(user, assistant)` pairs, and a
  list of `{"role": ..., "content": ...}` dicts. Entries with any other layout,
  and turns whose content is not a non-empty string, are skipped.
  """
  converted = []
  for turn in history or []:
    if isinstance(turn, dict):
      pairs = [(turn.get('role'), turn.get('content'))]
    elif isinstance(turn, (list, tuple)) and len(turn) == 2:
      pairs = [('user', turn[0]), ('assistant', turn[1])]
    else:
      continue

    for role, content in pairs:
      if not isinstance(content, str) or not content:
        continue
      if role == 'user':
        converted.append(ChatMessage.from_user(content))
      elif role == 'assistant':
        converted.append(ChatMessage.from_assistant(content))
  return converted


def chatbot(query, _):
  """Answer a chat message using a RAG-generated hint and the chat history.

  The RAG pipeline is run on `query`; its first reply is given to the chat
  model as a system "hint", followed by the user's query.

  Args:
    query: The user's latest message.
    _: The chat history that Gradio passes as the second positional argument.
       The parameter name is kept as-is for compatibility.

  Returns:
    The content of the chat model's first reply.
  """
  history = _

  # Generate the RAG response
  rag_response = rag_pipeline.run({
    'text_embedder': {'text': query},
    'bm25_retriever': {'query': query},
    'ranker': {'query': query},
    'prompt_builder': {'question': query}
  })
  rag_replies = rag_response['llm']['replies']

  # Build the conversation per call instead of appending to the module-level
  # `messages` list. That list is shared by every Gradio session and would
  # otherwise grow without bound and mix different users' conversations.
  conversation = list(messages)
  conversation.extend(_history_to_messages(history))
  if rag_replies:
    conversation.append(ChatMessage.from_system(rag_replies[0]))
  conversation.append(ChatMessage.from_user(query))

  result = chat_generator.run(messages=conversation)
  reply = result['replies'][0].content

  return reply

In [31]:
import gradio as gr

def upload_file(filepath):
  """Ingest a single uploaded file into the document store.

  Prints the received path, then runs the ingestion pipeline with that path as
  the only source for the file type router.

  Args:
    filepath: Location of the uploaded file, as passed by the upload button.
  """
  print(filepath)
  router_inputs = {'sources': [Path(filepath)]}
  ingestion_pipeline.run({'file_type_router': router_inputs})

# Assemble the UI: a chat interface backed by `chatbot`, plus an upload button
# that feeds new files into the ingestion pipeline.
with gr.Blocks() as demo:
  gr.ChatInterface(
    fn=chatbot, 
    # Title fixed: the em dash had been mis-decoded into mojibake.
    title="Lucido — A BIEN 210 Application",
    examples=[
      'How does myelination of the axons help increase signal speeds in neurons?',
      'How does bioluminescence differ from chemiluminescence?',
      'Why can the cell membrane be compared to a PN-junction?',
      'How could artificial leaves help society and how do they compare to photosynthesis?',
      'What are the two main superhydrophobic regimes and how can they be applied?',
      'What is a lateral flow assay, how is it built, and what are its uses?'
    ]
  )
  
  # The button is its own event input; `upload_file` has no outputs to update.
  upload_button = gr.UploadButton('Upload a file', file_count='single', variant='primary')
  upload_button.upload(upload_file, upload_button)

# Start the app server.
demo.launch()



* Running on local URL:  http://127.0.0.1:7878

To create a public link, set `share=True` in `launch()`.


