In [1]:
# Read key/value pairs from a local .env file into the process environment.
# NOTE(review): this runs before the project imports in the next cell, presumably so
# that API credentials (the captured output shows OpenAI requests) are already set
# when those modules are imported — confirm.
from dotenv import load_dotenv
load_dotenv()

# Rich-display helpers; Markdown is used further down to render the chain's answer.
from IPython.display import display, HTML, Markdown

In [17]:
import logging
import os

from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from propositional_retrieval.constants import DOCSTORE_ID_KEY
from propositional_retrieval.proposal_chain import proposition_chain
from propositional_retrieval.ingest import create_index
from propositional_retrieval.chain import chain

def get_logger():
    """Return this notebook's logger, configured to emit INFO-and-above records.

    The logger is named after ``__name__`` and its level is set to INFO on every
    call. A ``StreamHandler`` using the
    ``asctime - name - levelname - message`` format is attached only when the
    logger does not already carry a plain ``StreamHandler``, so re-running the
    cell does not stack handlers and print every record multiple times.

    Returns:
        logging.Logger: the configured logger (the same object on every call).
    """
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    # logging.getLogger hands back the same object for the same name, so a handler
    # attached by an earlier run of this cell is still there; only add ours once.
    already_attached = any(
        type(handler) is logging.StreamHandler for handler in logger.handlers
    )
    if not already_attached:
        # Formatter & StreamHandler
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        stream_handler = logging.StreamHandler()
        stream_handler.setFormatter(formatter)
        # Add handlers
        logger.addHandler(stream_handler)
    return logger
    
# Notebook-wide logger used by the cells below to report progress.
logger = get_logger()

## Making the vectors

### Parse PDFs

In [3]:
# Directory holding the source PDFs. Set the PDF_DIR environment variable to point
# somewhere else (it can live in the .env file loaded in the first cell); otherwise
# a `pdfs` folder relative to the kernel's working directory is used, so the
# notebook no longer depends on one developer's absolute home-directory path.
pdf_path = os.environ.get('PDF_DIR', 'pdfs')
if not os.path.isdir(pdf_path):
    # Fail loudly here: a wrong path would otherwise presumably yield an empty
    # document list and only surface later as an empty index — TODO confirm.
    raise FileNotFoundError(
        f"PDF directory not found: {os.path.abspath(pdf_path)!r}. "
        "Set PDF_DIR to the folder containing the PDFs."
    )

# Load every PDF found in the directory into LangChain documents.
pdf_loader = PyPDFDirectoryLoader(pdf_path)
pdf_docs = pdf_loader.load()

In [4]:
# Disabled experiment: merge consecutive entries of `pdf_docs` that share the same
# metadata['source'] into a single Document per source, concatenating page_content
# in load order and collecting the results in `pdf_docs_new`.
# NOTE(review): `Document` is not imported in the visible cells; import it before
# re-enabling this code.
# NOTE(review): the final log line reports len(pdf_docs), not len(pdf_docs_new) —
# confirm which count is intended.
# current_pdf = ""
# current_pdf_text = ""
# pdf_docs_new = []
# for doc in pdf_docs:
#     if current_pdf == "":
#         current_pdf = doc.metadata['source']
#         current_pdf_text += doc.page_content
#     elif current_pdf == doc.metadata['source']:
#         current_pdf_text += doc.page_content
#     else:
#         pdf_docs_new.append(
#             Document(
#                 page_content=current_pdf_text,
#                 metadata={
#                     'source': current_pdf
#                 }
#             )
#         )
#         current_pdf = doc.metadata['source']
#         current_pdf_text = doc.page_content

# pdf_docs_new.append(
#     Document(
#         page_content=current_pdf_text,
#         metadata={
#             'source': current_pdf
#         }
#     )
# )

# logger.info(f"Loaded {len(pdf_docs)} documents")


In [5]:
# Chunking parameters: large chunks with no overlap between neighbouring chunks.
CHUNK_SIZE = 8000
CHUNK_OVERLAP = 0

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
all_splits = text_splitter.split_documents(pdf_docs)
logger.info(f"Split into {len(all_splits)} documents")

# Build the retriever from the chunks. The captured output below shows this step
# issuing OpenAI chat-completion requests, so expect it to be slow and billable.
retriever_multi_vector_img = create_index(all_splits, proposition_chain, DOCSTORE_ID_KEY)

2023-12-25 11:44:27,927 - __main__ - INFO - Split into 24 documents
INFO:__main__:Split into 24 documents
INFO:propositional_retrieval.ingest:Creating multi-vector retriever
INFO:chromadb.telemetry.product.posthog:Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https:

In [13]:
# Smoke-test the freshly built retriever with a sample question.
retrieval_query = 'Can LLM be used to speed up chemistry research?'
docs = retriever_multi_vector_img.get_relevant_documents(retrieval_query)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


## Using the chain

In [23]:
# Run the imported chain on a question and render its answer as Markdown.
chain_question = 'what is chemputer'
res = chain.invoke(chain_question)

Markdown(res)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


The term "chemputer" is not explicitly defined in the provided document, but based on the context of the article, it seems to refer to a modular robotic system that is used for the synthesis of organic compounds. This system is driven by a chemical programming language and is designed to autonomously compile and execute standardized methods for chemical synthesis, as described in the work by Steiner et al. The chemputer likely integrates conventional laboratory equipment, such as round-bottom flasks, separatory funnels, and rotary evaporators, to perform syntheses that are comparable to manual processes. The goal of such a system would be to automate the process of chemical synthesis, making it more efficient and reproducible, and to enable the translation of written instructions from the chemistry literature into executable protocols that the robotic system can perform.