# Lucido Prototype
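This proof-of-concept runs entirely on a local machine for now: documents are indexed into an in-memory document store, and the chatbot is served through a local Gradio interface.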


## 1. Data Ingestion Pipeline

In [33]:
from haystack import Pipeline
from haystack.components.writers import DocumentWriter
from haystack.components.converters import MarkdownToDocument, PyPDFToDocument, TextFileToDocument, PPTXToDocument
from haystack.components.preprocessors import DocumentSplitter, DocumentCleaner
from haystack.components.routers import FileTypeRouter
from haystack.components.joiners import DocumentJoiner
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.document_stores.in_memory import InMemoryDocumentStore

# In-memory store shared by this ingestion pipeline (writer) and the RAG pipeline (retriever)
document_store = InMemoryDocumentStore()

# Router that dispatches each source by MIME type, plus one converter per supported type
file_type_router = FileTypeRouter(
    mime_types=[
        'text/plain',
        'application/pdf',
        'text/markdown',
        'application/vnd.openxmlformats-officedocument.presentationml.presentation',
    ]
)
text_file_converter = TextFileToDocument()
markdown_converter = MarkdownToDocument()
pdf_converter = PyPDFToDocument()
powerpoint_converter = PPTXToDocument()

# Merge the converter outputs, clean them, then cut them into 150-word chunks overlapping by 50 words
document_joiner = DocumentJoiner()
document_cleaner = DocumentCleaner()
document_splitter = DocumentSplitter(split_by='word', split_length=150, split_overlap=50)

# Embed every chunk and persist it in the document store
document_embedder = SentenceTransformersDocumentEmbedder(model='sentence-transformers/all-MiniLM-L6-v2')
document_writer = DocumentWriter(document_store=document_store)

# Assemble the pipeline: register the components in order under their pipeline names
ingestion_pipeline = Pipeline()
for component_name, component in [
    ("file_type_router", file_type_router),
    ("text_file_converter", text_file_converter),
    ("markdown_converter", markdown_converter),
    ("pypdf_converter", pdf_converter),
    ('powerpoint_converter', powerpoint_converter),
    ("document_joiner", document_joiner),
    ("document_cleaner", document_cleaner),
    ("document_splitter", document_splitter),
    ("document_embedder", document_embedder),
    ("document_writer", document_writer),
]:
    ingestion_pipeline.add_component(instance=component, name=component_name)

# Wire the graph: router -> converters -> joiner -> cleaner -> splitter -> embedder -> writer
for sender, receiver in [
    ("file_type_router.text/plain", "text_file_converter.sources"),
    ("file_type_router.application/pdf", "pypdf_converter.sources"),
    ("file_type_router.text/markdown", "markdown_converter.sources"),
    ('file_type_router.application/vnd.openxmlformats-officedocument.presentationml.presentation', 'powerpoint_converter.sources'),
    ("text_file_converter", "document_joiner"),
    ("pypdf_converter", "document_joiner"),
    ("markdown_converter", "document_joiner"),
    ('powerpoint_converter', 'document_joiner'),
    ("document_joiner", "document_cleaner"),
    ("document_cleaner", "document_splitter"),
    ("document_splitter", "document_embedder"),
    ("document_embedder", "document_writer"),
]:
    ingestion_pipeline.connect(sender, receiver)

# Save a diagram of the assembled pipeline
ingestion_pipeline.draw('drawings/ingestion_pipeline.png')


In [34]:
from pathlib import Path

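# Testing the ingestion pipeline: uncomment the two lines below to index every file under `content_dir`.
# Until this has run, `document_store` is empty and the retriever in section 2 has nothing to return.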
# content_dir = 'bien210'
# ingestion_pipeline.run({'file_type_router': {'sources': list(Path(content_dir).glob("**/*"))}})

Ignoring wrong pointing object 134 0 (offset 0)
Ignoring wrong pointing object 140 0 (offset 0)
Ignoring wrong pointing object 146 0 (offset 0)
Ignoring wrong pointing object 809 0 (offset 0)
Ignoring wrong pointing object 811 0 (offset 0)
Ignoring wrong pointing object 820 0 (offset 0)
Ignoring wrong pointing object 825 0 (offset 0)
Ignoring wrong pointing object 925 0 (offset 0)
Ignoring wrong pointing object 931 0 (offset 0)
Ignoring wrong pointing object 937 0 (offset 0)
Ignoring wrong pointing object 21 0 (offset 0)
Ignoring wrong pointing object 27 0 (offset 0)
Ignoring wrong pointing object 33 0 (offset 0)
Ignoring wrong pointing object 65 0 (offset 0)
Ignoring wrong pointing object 263 0 (offset 0)
Ignoring wrong pointing object 269 0 (offset 0)
Ignoring wrong pointing object 319 0 (offset 0)


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

{'document_writer': {'documents_written': 42}}

## 2. RAG Chat Pipeline


In [81]:
import os
from getpass import getpass
from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
from haystack.components.generators.chat import OpenAIChatGenerator
from haystack.components.builders import ChatPromptBuilder
from haystack.dataclasses import ChatMessage

# Single system message rendered by the prompt builder: the retrieved documents are
# listed under "Context:", followed by the user's question.
template = [
    ChatMessage.from_system("""
Answer the questions based on the given context.

Context:
{% for document in documents %}
    {{ document.content }}
{% endfor %}
Question: {{ question }}
Answer:
""")
]

# RAG pipeline: embed the query, retrieve the top 5 documents from the shared store,
# and render them into the prompt. No generator is attached to this pipeline.
rag_pipeline = Pipeline()
rag_pipeline.add_component(
    name="embedder",
    instance=SentenceTransformersTextEmbedder(model="sentence-transformers/all-MiniLM-L6-v2"),
)
rag_pipeline.add_component(
    name="retriever",
    instance=InMemoryEmbeddingRetriever(document_store=document_store, top_k=5),
)
rag_pipeline.add_component(
    name="prompt_builder",
    instance=ChatPromptBuilder(template=template),
)

# Query embedding -> retriever -> documents slot of the prompt template
rag_pipeline.connect(sender="embedder.embedding", receiver="retriever.query_embedding")
rag_pipeline.connect(sender="retriever", receiver="prompt_builder.documents")

# Save a diagram of the assembled pipeline
rag_pipeline.draw('drawings/rag_pipeline.png')

In [82]:
# Smoke-test the RAG pipeline: the same query is embedded for retrieval and
# substituted into the prompt template as the question.
query = 'What is e-skin?'
pipeline_inputs = {
  'embedder': {'text': query},
  'prompt_builder': {'question': query},
}
result = rag_pipeline.run(pipeline_inputs)

# Display the rendered text of the first prompt message
result['prompt_builder']['prompt'][0].content

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

'\nAnswer the questions based on the given context.\n\nContext:\n\n    E-SKIN AND AN APPLICATION in\x0bROBOTIC SURGERY\n(Sarwar, 2023)\x0cINTRODUCTION\x0cWHAT IS E-SKIN?\nE-skin aims to mimic human skin\nMechanical properties: Adherence to irregular surfaces, stretchability, mechanical toughness, and self-healing properties Tactile sensing: Pressure, strain, temperature, humidity, shear force\nInsights: Shapes of objects, surfaces, texture, hardness, etc.\nApplications are found in skin-attachable electronics for healthcare, robotics and prosthetics (Yang et al., 2019)\n(Chen et al., 2021; Yang et al., 2019).\x0cTELEOPERATED SURGERY\nRobots used to perform surgery since the 1980s\nStatic positioning of tools, automated trajectories in preoperative planning, teleoperation and rehabilitation\nMaster-slave systems consist of a robotic manipulator and a controller (e.g., Da Vinci)\nAccurate positioning, repeatability, better posture for the surgeon, teleperesence, minimally invasive proced

## 3. Gradio Chatbot Interface

In [85]:
import gradio as gr

# Prompt for the OpenAI key interactively only when the environment does not already provide one
if os.environ.get("OPENAI_API_KEY") is None:
    os.environ["OPENAI_API_KEY"] = getpass("Enter OpenAI API token:")

# Chat model used to produce the replies
chat_generator = OpenAIChatGenerator(model='gpt-3.5-turbo')

# Conversation seed: the system instruction that opens every exchange
messages = [ChatMessage.from_system("You are a virtual teaching assistant. Answer questions based on the given context")]

def _history_to_chat_messages(history) -> list:
  """Convert the chat history handed over by Gradio into Haystack `ChatMessage` objects.

  Two layouts are accepted: `[user, assistant]` pairs and
  `{'role': ..., 'content': ...}` dicts. Entries without usable text, or with an
  unrecognised shape or role, are skipped rather than raising.
  """
  converted = []
  for turn in history or []:
    if isinstance(turn, dict):
      entries = [(turn.get('role'), turn.get('content'))]
    elif isinstance(turn, (list, tuple)) and len(turn) == 2:
      entries = [('user', turn[0]), ('assistant', turn[1])]
    else:
      continue

    for role, content in entries:
      # NOTE(review): some Gradio versions may deliver content as a list of
      # {'type': 'text', 'text': ...} parts - confirm against the installed version.
      if isinstance(content, list):
        content = '\n'.join(
          part['text'] for part in content
          if isinstance(part, dict) and isinstance(part.get('text'), str)
        )
      if not isinstance(content, str) or not content:
        continue
      if role == 'user':
        converted.append(ChatMessage.from_user(content))
      elif role == 'assistant':
        converted.append(ChatMessage.from_assistant(content))
  return converted


def chatbot(query:str, history) -> str:
  """Answer `query` using retrieval-augmented generation.

  `rag_pipeline` embeds the query, retrieves context and renders the prompt; that
  prompt is then sent to `chat_generator` as the latest user message.

  The conversation is rebuilt on every call from the module-level `messages` seed
  plus `history`, and `messages` itself is left untouched. Appending to the shared
  list made every chat session use the same transcript and made each request grow
  by one full retrieved context per turn.

  Args:
    query: The user's latest message.
    history: Earlier turns of this chat as supplied by `gr.ChatInterface`
      (pairs or role/content dicts); may be empty or None.

  Returns:
    The content of the first reply returned by the generator.
  """
  # Generate the RAG prompt
  rag_response = rag_pipeline.run({'embedder': {'text': query}, 'prompt_builder': {'question': query}})
  rag_prompt = rag_response['prompt_builder']['prompt'][0].content

  # Per-call conversation: seed message(s), earlier turns, then the context-augmented question
  conversation = list(messages)
  conversation.extend(_history_to_chat_messages(history))
  conversation.append(ChatMessage.from_user(rag_prompt))

  # Generate the LLM response
  llm_response = chat_generator.run(messages=conversation)
  reply = llm_response['replies'][0]

  return reply.content

In [86]:
# Test the chatbot function
# chatbot("What is electric skin?", _)

In [87]:
# Wrap the chatbot function in Gradio's chat UI and start the app
demo = gr.ChatInterface(
    fn=chatbot,
    title="Lucido — A BIEN 210 Application",
)
demo.launch()



* Running on local URL:  http://127.0.0.1:7869

To create a public link, set `share=True` in `launch()`.




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]