In [71]:
# import all the necessary modules
from langchain_openai import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.messages import HumanMessage
from langchain_core.runnables import RunnablePassthrough
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_history_aware_retriever
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters.character import RecursiveCharacterTextSplitter
import dotenv
import textwrap
import os

In [28]:
# define a function to print the response in wrapped format
def print_response(response, width=70):
    # wrap the response to fit within the specified width
    wrapper = textwrap.TextWrapper(width=width) 
    # wrap the response
    wrapped_string = wrapper.fill(response)
    # print the wrapped response
    print(wrapped_string)

In [29]:
# load OpenAI API key from secret file
dotenv.load_dotenv()
# language model to use
llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0) # could also use Ollama

In [73]:
# load pdf document
loader = PyPDFLoader("./documents_db/Matteucci_2024.pdf")
splitter = RecursiveCharacterTextSplitter(chunk_size = 5000, chunk_overlap = 500) # important!
paper_chunks = loader.load_and_split(text_splitter=splitter)
# inspect the metadata of the loaded pages
print(f"Number of documents:", len(paper_chunks))
[paper_chunk.metadata for paper_chunk in paper_chunks[0:3]]

Number of documents: 20


[{'source': './documents_db/Matteucci_2024.pdf', 'page': 0},
 {'source': './documents_db/Matteucci_2024.pdf', 'page': 1},
 {'source': './documents_db/Matteucci_2024.pdf', 'page': 2}]

In [74]:
# define the directory where the database should be located
persist_directory = "./chroma_db"
# check if the directory exists
if os.path.exists(persist_directory):
    # load persistent vectorstore from disk if it exists
    vectorstore = Chroma(persist_directory=persist_directory, embedding_function=OpenAIEmbeddings())
else:
    # create persistent vectorstore if the directory does not exist
    vectorstore = Chroma.from_documents(
        paper_chunks,
        embedding=OpenAIEmbeddings(),
        persist_directory=persist_directory,       
    )
# define the retriever
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 10}) # could also use "mmr"

In [75]:
# define contextualization subsystem prompt for history-aware retriever chain ---------
contextualize_q_system_prompt = """
Given a chat history and the latest user question \
which might reference context in the chat history, formulate a standalone question \
which can be understood without the chat history. Do NOT answer the question, \
just reformulate it if needed and otherwise return it as is."""
contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
# create history-aware retriever chain
history_aware_retriever = create_history_aware_retriever(
    llm, retriever, contextualize_q_prompt
)
# define the system prompt for the question-answering chain --------------------------
qa_system_prompt = """
You are an assistant to Dr. Giulio Matteucci, a neuroscientist.
Your role is to engage with visitors on his personal website, providing answers to their inquiries
 regarding Giulio's review paper you are given access to. Your responses should be scientifically precise,
 and accuratily reflect the content and message of the papers.
 It is of paramount importance not to make up any information, when in doubt,
   just say that this question goes beyond the scope of the paper. 
   It is really important to ensure factual accuracy and avoid inventing concepts references and attributions.
Strive to be clear and accessible for all users, be both professional and approachable,
 ensuring your explanations are succinct and schematic without sacrificing essential details.
When answering questions, rely solely on the context provided by Dr. Matteucci's papers. 
Never mention anything that is not present in the papers.

{context}"""
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", qa_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
# create question-answering chain
question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)
# create the final chain ----------------------------------------------------------
rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

In [None]:
# reset chat history
chat_history = []

In [76]:
# first conversational cycle
question = "What Giulio talks about in his review paper?"
ai_msg_1 = rag_chain.invoke({"input": question, "chat_history": chat_history})
chat_history.extend([HumanMessage(content=question), ai_msg_1["answer"]])
print_response(ai_msg_1["answer"])

In Dr. Giulio Matteucci's review paper, he discusses the confluence
between trends in neuroscience and machine learning, focusing on
unsupervised learning. Specifically, the paper explores how sensory
processing systems learn to exploit the statistical structure of their
inputs without explicit training targets or rewards. The review
highlights the influence of sensory experience on neural self-
organization and synaptic bases, emphasizing the role of unsupervised
learning in the development of neuronal tuning from middle- to high-
order areas along cortical sensory processing hierarchies. The paper
also discusses novel algorithms for unsupervised and self-supervised
learning, their implications for theories of the brain, particularly
in intermediate visual cortical areas, and their application in real-
world learning machines.


In [77]:
# second conversational cycle
second_question = "Explain the concepts of UTL and USL in layman's terms."
ai_msg_2 = rag_chain.invoke({"input": second_question, "chat_history": chat_history})
chat_history.extend([HumanMessage(content=question), ai_msg_2["answer"]])
print_response(ai_msg_2["answer"])

In layman's terms, Unsupervised Temporal Learning (UTL) and
Unsupervised Spatial Learning (USL) are ways in which our brains learn
from the world around us without explicit instructions or rewards.  -
UTL: This type of learning focuses on how our brains use the natural
flow of time in visual experiences to understand and remember things.
For example, when we see different views of the same object close
together in time, our brain learns to recognize that object regardless
of its position or orientation.  - USL: On the other hand, USL is
about how our brains pick up on patterns and features in what we see
without being told what to look for. It helps us develop a sense of
what's important in our visual environment, like recognizing shapes or
textures, even if we're not consciously trying to learn them.  Both
UTL and USL are essential for our brains to adapt and understand the
world around us, forming the basis for how we perceive and interpret
visual information.


In [80]:
# third conversational cycle
third_question = "Define UTL in one sentence"
ai_msg_3 = rag_chain.invoke({"input": third_question, "chat_history": chat_history})
chat_history.extend([HumanMessage(content=question), ai_msg_3["answer"]])
print_response(ai_msg_3["answer"])

Unsupervised Temporal Learning (UTL) is a learning mechanism that
exploits the temporal continuity of visual experiences to factor out
feature identity from other visual attributes.


In [None]:
# ---- EXAMPLE OF CONTEXTUAL COMPRESSION RETRIEVER ---- 
# from langchain.retrievers import ContextualCompressionRetriever
# from langchain.retrievers.document_compressors import LLMChainExtractor
# compressor = LLMChainExtractor.from_llm(llm)
# compression_retriever = ContextualCompressionRetriever(
#     base_compressor=compressor,
#     base_retriever=vectordb.as_retriever()
# )
# question = "what did they say about matlab?"
# compressed_docs = compression_retriever.get_relevant_documents(question)
# pretty_print_docs(compressed_docs)