In [20]:
pip install cohere hnswlib unstructured -q

In [21]:
pip install pypdf



In [22]:
pip install --upgrade --quiet  langchain langchain-community langchainhub langchain-openai chromadb bs4

In [87]:
pip install --upgrade --quiet  docx2txt

In [91]:
pip install python-pptx

Collecting python-pptx
  Downloading python_pptx-0.6.23-py3-none-any.whl (471 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/471.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━[0m [32m276.5/471.6 kB[0m [31m8.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
Collecting XlsxWriter>=0.5.7 (from python-pptx)
  Downloading XlsxWriter-3.2.0-py3-none-any.whl (159 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m159.9/159.9 kB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: XlsxWriter, python-pptx
Successfully installed XlsxWriter-3.2.0 python-pptx-0.6.23


In [88]:
import cohere
import os
import hnswlib
import json
import uuid
from typing import List, Dict
from unstructured.partition.html import partition_html
from unstructured.chunking.title import chunk_by_title
from google.colab import userdata


# LangChain document loaders and text splitter used by the Documents class
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_community.document_loaders import Docx2txtLoader
from langchain_community.document_loaders import UnstructuredPowerPointLoader

co = cohere.Client(userdata.get("COHERE_API_KEY"))

In [85]:
# Example local sources: each entry pairs a display title with the file it is read from.
sources = [
    {"title": title, "path": path}
    for title, path in (
        ("Capstone Project Managers", "/content/local_data/CAPSTONEPM.pdf"),
        ("YWCC", "/content/local_data/YWCC.pdf"),
        ("Schedule", "/content/local_data/LIVESCHEDULE.csv"),
        ("Scope", "/content/local_data/ScopeDocument.docx"),
        ("CISCO Track", "/content/local_data/CISCO.pptx"),
    )
]

In [83]:
#document class
class Documents:
    """
    Loads local files, splits them into chunks, embeds the chunks with Cohere,
    and indexes the embeddings in hnswlib for retrieval plus reranking.

    Constructing an instance immediately runs load(), embed(), and index().

    Parameters:
    sources (List[Dict[str, str]]): Source descriptors, each with "title" and "path" keys.
    """

    def __init__(self, sources: List[Dict[str, str]]):
        self.sources = sources  # all sources, not chunked
        self.docs = []  # chunk dicts with "title", "text", and "path" keys
        self.docs_embs = []  # one embedding per entry in self.docs, same order
        self.retrieve_top_k = 10  # neighbours requested from the vector index
        self.rerank_top_k = 3  # documents kept after reranking
        self.load()
        self.embed()
        self.index()

    def loadGeneral(self, source):
        """
        Loads a PDF, DOCX, or PPTX source, splits it into overlapping chunks,
        and appends one chunk dict per split to self.docs.

        Parameters:
        source (Dict[str, str]): A descriptor with "title" and "path" keys.

        Raises:
        ValueError: If the path does not end in .pdf, .docx, or .pptx.
        """
        path = source["path"]
        lowered = path.lower()  # match extensions case-insensitively (e.g. "REPORT.PDF")
        if lowered.endswith(".pdf"):
            loader = PyPDFLoader(path)
        elif lowered.endswith(".docx"):
            loader = Docx2txtLoader(path)
        elif lowered.endswith(".pptx"):
            loader = UnstructuredPowerPointLoader(path)
        else:
            # Without this branch `loader` would be unbound and the failure would be
            # an UnboundLocalError that says nothing about the offending file.
            raise ValueError(f"Unsupported file type for loadGeneral: {path!r}")

        docs = loader.load()
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100, add_start_index=True)
        all_splits = text_splitter.split_documents(docs)
        for split in all_splits:
            self.docs.append(
                {
                    "title": source["title"],
                    "text": split.page_content,
                    "path": source["path"],
                }
            )

    def loadCSV(self, source):
        """
        Loads a CSV source with CSVLoader and appends one dict to self.docs
        for each document the loader returns.

        Parameters:
        source (Dict[str, str]): A descriptor with "title" and "path" keys.
        """
        loader = CSVLoader(
            file_path=source["path"],
            csv_args={
                "delimiter": ",",
                "quotechar": '"',
            },
        )

        data = loader.load()
        for doc in data:
            self.docs.append(
                {
                    "title": source["title"],
                    "text": doc.page_content,
                    "path": source["path"],
                }
            )

    def load(self):
        """
        Loads every source into self.docs, dispatching on the file extension.

        PDF, DOCX, and PPTX paths (matched case-insensitively) go through
        loadGeneral; every other path is handed to loadCSV.
        """
        print("loading docs")
        for source in self.sources:
            if source["path"].lower().endswith((".pdf", ".docx", ".pptx")):
                self.loadGeneral(source)
            else:
                self.loadCSV(source)

    def embed(self):
        """
        Embeds the text of every chunk in self.docs in batches and appends the
        resulting vectors to self.docs_embs. Also records the chunk count in
        self.docs_len.
        """
        print("embedding docs")
        batch_size = 90
        self.docs_len = len(self.docs)
        print(self.docs_len)
        for i in range(0, self.docs_len, batch_size):
            batch = self.docs[i : i + batch_size]  # slicing clamps at the end of the list
            texts = [item["text"] for item in batch]
            docs_embs_batch = co.embed(
                texts=texts,
                model="embed-english-v3.0",
                input_type="search_document",
            ).embeddings
            self.docs_embs.extend(docs_embs_batch)

    def index(self):
        """
        Builds an inner-product hnswlib index over self.docs_embs, labelling
        each vector with its position in self.docs.

        NOTE: the index object is stored in self.index, which replaces this
        method on the instance; calling index() a second time on the same
        instance is therefore not possible.

        Raises:
        ValueError: If there are no embeddings to index.
        """
        print("Indexing documents...")

        if not self.docs_embs:
            # Fail early with a clear message instead of passing an empty list to hnswlib.
            raise ValueError("No document embeddings to index; the sources produced no text.")

        # Size the index from the embeddings themselves rather than the
        # previously hard-coded 1024 dimensions and the separate docs_len counter.
        n_items = len(self.docs_embs)
        self.index = hnswlib.Index(space="ip", dim=len(self.docs_embs[0]))
        self.index.init_index(max_elements=n_items, ef_construction=512, M=64)
        self.index.add_items(self.docs_embs, list(range(n_items)))

        print(f"Indexing complete with {self.index.get_current_count()} documents.")

    def retrieve(self, query: str) -> List[Dict[str, str]]:
        """
        Retrieves documents based on the given query.

        The query is embedded, the nearest chunks are fetched from the vector
        index, and those candidates are reranked; only the top reranked chunks
        are returned.

        Parameters:
        query (str): The query to retrieve documents for.

        Returns:
        List[Dict[str, str]]: A list of dictionaries representing the retrieved documents, with 'title', 'text', and 'path' keys.
        """
        query_emb = co.embed(
            texts=[query],
            model="embed-english-v3.0",
            input_type="search_query",
        ).embeddings

        # Never ask for more neighbours than the index holds; a small corpus
        # would otherwise make the k-NN query fail.
        top_k = min(self.retrieve_top_k, self.index.get_current_count())
        doc_ids = self.index.knn_query(query_emb, k=top_k)[0][0]
        docs_to_rerank = [self.docs[doc_id]["text"] for doc_id in doc_ids]

        rerank_results = co.rerank(
            query=query,
            documents=docs_to_rerank,
            top_n=self.rerank_top_k,
            model="rerank-english-v2.0",
        )

        # result.index points into docs_to_rerank, which is aligned with doc_ids.
        doc_ids_reranked = [doc_ids[result.index] for result in rerank_results]

        docs_retrieved = []
        for doc_id in doc_ids_reranked:
            docs_retrieved.append(
                {
                    "title": self.docs[doc_id]["title"],
                    "text": self.docs[doc_id]["text"],
                    "path": self.docs[doc_id]["path"],
                }
            )

        return docs_retrieved


In [78]:
# Build the document store: the constructor loads, embeds, and indexes every entry in `sources`.
docs = Documents(sources)

loading dc
read a csv Schedule
embedding docs
92
Indexing documents...
Indexing complete with 92 documents.


In [79]:
class Chatbot:
    """
    Conversational wrapper around the Cohere chat endpoint that grounds its
    answers in documents retrieved from a Documents store.

    Parameters:
    docs (Documents): The document store used for retrieval.
    """

    def __init__(self, docs: "Documents"):
        self.docs = docs
        # One conversation id per Chatbot instance, passed to every streamed chat call.
        self.conversation_id = str(uuid.uuid4())

    def generate_response(self, message: str):
        """
        Generates a response to the user's message.

        This is a generator. It first asks the model for search queries; if
        any are produced, documents are retrieved and passed to the chat call,
        otherwise the model answers directly.

        Parameters:
        message (str): The user's message.

        Yields:
        Event: Each streamed response event. When documents were retrieved, the
        streaming response object itself is yielded last, after its events, so
        the consumer can read its `documents` attribute.
        """
        # Generate search queries (if any)
        response = co.chat(message=message, search_queries_only=True)

        if response.search_queries:
            # There are search queries: retrieve documents and respond with them
            print("Retrieving information...")

            documents = self.retrieve_docs(response)

            response = co.chat(
                message=message,
                documents=documents,
                conversation_id=self.conversation_id,
                stream=True,
            )
            for event in response:
                yield event
            # Hand over the exhausted stream object as the final item.
            yield response
        else:
            # No search query: respond directly
            response = co.chat(
                message=message,
                conversation_id=self.conversation_id,
                stream=True,
            )
            for event in response:
                yield event

    def retrieve_docs(self, response) -> List[Dict[str, str]]:
        """
        Retrieves documents based on the search queries in the response.

        Results for all queries are merged in query order. A document that is
        returned for more than one query is kept only once, at its first
        position, so the model is not given the same chunk repeatedly.

        Parameters:
        response: The response object containing search queries.

        Returns:
        List[Dict[str, str]]: A list of dictionaries representing the retrieved documents.
        """
        # Get the query(s)
        queries = [search_query["text"] for search_query in response.search_queries]

        # Retrieve documents for each query, skipping exact duplicates
        seen = set()
        retrieved_docs = []
        for query in queries:
            for doc in self.docs.retrieve(query):
                key = tuple(sorted(doc.items()))
                if key in seen:
                    continue
                seen.add(key)
                retrieved_docs.append(doc)

        return retrieved_docs



In [101]:
class App:
    """
    Console front end that reads user messages and prints the chatbot's
    streamed reply, its citations, and the cited source files.
    """

    def __init__(self, chatbot: "Chatbot"):
        """
        Initializes an instance of the App class.

        Parameters:
        chatbot (Chatbot): An instance of the Chatbot class.

        """
        self.chatbot = chatbot

    def run(self):
        """
        Runs the chatbot application.

        Loops until the user types "quit" (case-insensitive, surrounding
        whitespace ignored). Every other message is echoed and answered.
        """
        while True:
            # Get the user message
            message = input("User: ")

            # Typing "quit" ends the conversation
            if message.strip().lower() == "quit":
                print("Ending chat.")
                break

            print(f"User: {message}")
            # Get the chatbot response
            response = self.chatbot.generate_response(message)

            # Print the chatbot response
            print("Chatbot:")
            self._print_response(response)
            print(f"\n{'-'*100}\n")

    def _print_response(self, response):
        """
        Prints one streamed response: text as it arrives, then citations, then
        one "title: path" line per distinct document title.

        Events are told apart by their class name. The documents section is
        only produced when at least one citation event was seen.

        Parameters:
        response: An iterable of response events, as yielded by Chatbot.generate_response.
        """
        citations_flag = False
        docnames = {}  # title -> path, first path seen for each title wins
        for event in response:
            stream_type = type(event).__name__

            if stream_type == "StreamTextGeneration":
                # Text
                print(event.text, end="")
            elif stream_type == "StreamCitationGeneration":
                # Citations
                if not citations_flag:
                    print("\n\nCITATIONS:")
                    citations_flag = True
                # Print every citation the event carries, not just the first one.
                for citation in event.citations:
                    print(citation)
            elif stream_type == "StreamingChat" and citations_flag:
                # Documents
                print("\n\nDOCUMENTS:")
                for doc in event.documents:
                    docnames.setdefault(doc["title"], doc["path"])

        for docname, path in docnames.items():
            print(docname + ": " + path)


In [None]:
# Build the document store and wrap it in a chatbot.
# NOTE(review): an earlier cell already runs `docs = Documents(sources)`; constructing
# the store again here repeats loading, embedding, and indexing — consider reusing one instance.
documents = Documents(sources)
chatbot = Chatbot(documents)

# Start the interactive console loop; typing "quit" at the prompt ends the chat.
app = App(chatbot)

app.run()


loading dc
read a csv Schedule
embedding docs
94
Indexing documents...
Indexing complete with 94 documents.
User: when is the final presentation due? what is the final grade breakdown
User: when is the final presentation due? what is the final grade breakdown
Chatbot:
Retrieving information...
The final presentation is due on April 26, 2024. This presentation will be worth 10% towards your final grade for the semester. The final grade breakdown is as follows:

- Presentation: 10%
- Prototype demonstrations: 40%
- Written report: 10%
- Individual project: 15%
- Team project: 15%

CITATIONS:
{'start': 33, 'end': 48, 'text': 'April 26, 2024.', 'document_ids': ['doc_0']}
{'start': 81, 'end': 84, 'text': '10%', 'document_ids': ['doc_0']}


DOCUMENTS:
Schedule: /content/local_data/LIVESCHEDULE.csv

----------------------------------------------------------------------------------------------------

User: write a paragraph on the scope of the cisco project
User: write a paragraph on the scope