In [18]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append('../')
import dotenv 

dotenv.load_dotenv()

True

## Step 1. Load
Specify a DocumentLoader to load in your unstructured data as Documents.

A Document is a `dict` with text (page_content) and metadata.

In [2]:
# Load documents
from langchain.document_loaders import PyPDFLoader

file_path = '../data/raw/Personalization_strategies_in_digital_mental_healt.pdf'

loader = PyPDFLoader(file_path)
data = loader.load()

## Step 2. Split
Split the Document into chunks for embedding and vector storage.

In [3]:
# Split documents
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap = 0)
splits = text_splitter.split_documents(loader.load())

## Step 3. Store
To be able to look up our document splits, we first need to store them where we can later look them up.

The most common way to do this is to embed the contents of each document split.

We store the embedding and splits in a vectorstore.



In [4]:
# Embed and store splits
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings

vectorstore = Chroma.from_documents(documents=splits,embedding=OpenAIEmbeddings())
retriever = vectorstore.as_retriever()

## Step 4. Retrieve
Retrieve relevant splits for any question using similarity search.

This is simply "top K" retrieval where we select documents based on embedding similarity to the query.

In [5]:
# Prompt 
# https://smith.langchain.com/hub/rlm/rag-prompt
from langchain import hub

rag_prompt = hub.pull("rlm/rag-prompt")
rag_prompt

ChatPromptTemplate(input_variables=['question', 'context'], messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['question', 'context'], template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"))])

In [15]:
# LLM
from langchain.chat_models import ChatOpenAI

llm = ChatOpenAI(model_name="gpt-4", temperature=0)

In [16]:
# RAG chain 

from langchain.schema.runnable import RunnablePassthrough

rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()} 
    | rag_prompt 
    | llm 
)

In [17]:
rag_chain.invoke("Who is Silvan?")

AIMessage(content='Silvan, full name Silvan Hornstein, is a correspondent for a research paper on personalization strategies in digital mental health interventions. He is also currently employed as a Data Scientist by Elona Health, a digital mental health start-up.')

In [18]:
rag_chain.invoke("What is the paper about?")

AIMessage(content='The specific content of the paper is not clearly stated in the provided context. However, it appears to involve a review and analysis of certain interventions, possibly related to digital health or mental health, given the source file name. The authors also seem to have contacted other authors for clarification on certain points, suggesting a comprehensive research process.')

In [19]:
rag_chain.invoke("Which personalization strategies are used?")

AIMessage(content='The personalization strategies used include rule-based personalization for communication and guidance, and user choice-based personalization for content. Rule-based personalization might involve reminders for inactivity or non-completion, or increased guidance or clinician contact for symptom changes. User choice-based personalization allows optional content to be selected by the patient.')

In [20]:
rag_chain.invoke("What problem is the paper trying to address?")

AIMessage(content='The context provided does not clearly state the problem that the paper is trying to address.')

In [21]:
rag_chain.invoke("What problem is the problem with personalization strategies in digital mental health interventions?")

AIMessage(content='The problem with personalization strategies in digital mental health interventions is that there are major questions that remain open. These include understanding what personalization is, how prevalent it is in practice, and what benefits it truly has. There is also a need to define personalization in terms of intervention content, order, guidance, or communication.')