# Pipeline 1 - Embedding

### Step 1. Loading

In this step, we load data from various sources. Make them ready to ingest.

In [15]:
import os
from dotenv import load_dotenv
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
DOCUMENT = os.getenv("DOCUMENT")

### Step 2. Parsing

##### Type 1. text document

In [16]:
from langchain.document_loaders import TextLoader
txt_path = DOCUMENT+"rag.txt"
txt_loader = TextLoader(txt_path)
text_documents = txt_loader.load()
#text_documents

##### Type 2. PDF document

We use PyMuPDFLoader in this experiment

In [17]:
from langchain.document_loaders import PyMuPDFLoader
pdf_path = DOCUMENT+ "2005.11401v4.pdf"
pdf_loader = PyMuPDFLoader(pdf_path)
pdf_documents = pdf_loader.load()

### Step 3. Chunking

In [18]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
text_chunks = text_splitter.split_documents(text_documents)
#documents[:3]

In [19]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
pdf_chunks = text_splitter.split_documents(pdf_documents)

In [20]:
chunks = text_chunks + pdf_chunks

### Step 4. Vectorizing

Option 1: Using openAI embedding API

In [7]:
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import DocArrayInMemorySearch

In [8]:
embeddings = OpenAIEmbeddings()
vectorstore = DocArrayInMemorySearch.from_documents(chunks, embeddings)

Option 2: Using gpt4all embedding

In [21]:
from langchain_community.embeddings import GPT4AllEmbeddings

In [22]:
model_name = "all-MiniLM-L6-v2.gguf2.f16.gguf"
gpt4all_kwargs = {'allow_download': 'True'}
embeddings = GPT4AllEmbeddings(
    model_name=model_name,
    gpt4all_kwargs=gpt4all_kwargs
)

### Step 5. Storing

Trying to persist the vectordb with Chroma

In [23]:
from langchain.vectorstores import Chroma
persist_directory = os.getenv("STORAGE2")
vectordb = Chroma.from_documents(documents=chunks,  embedding=embeddings, persist_directory=persist_directory)
vectordb.persist()

  warn_deprecated(


# Pipline 2. Retrieving

### Step 1. Query

In [24]:
#user_query = "What is retrieval augmented generation"
user_query = "Describe the RAG-Sequence Model?"

### Step 2. Search

Need to load from store if there is. Here the on memory vectorstore is used. 
There is opportunity to improve efficiency of search when the knowledgebase gets larger and more complicated (type of sources)

In [2]:
import os
from dotenv import load_dotenv
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
DOCUMENT = os.getenv("DOCUMENT")

In [25]:
from langchain_community.embeddings import GPT4AllEmbeddings
model_name = "all-MiniLM-L6-v2.gguf2.f16.gguf"
gpt4all_kwargs = {'allow_download': 'True'}
embeddings = GPT4AllEmbeddings(
    model_name=model_name,
    gpt4all_kwargs=gpt4all_kwargs
)

In [26]:
#retriever = vectorstore.as_retriever()

#Load vectordb from persisted store
from langchain.vectorstores import Chroma
persist_directory = os.getenv("STORAGE2")
newvectordb = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
retriever = newvectordb.as_retriever()

### Step 3. Augmented Prompt

In [27]:
from langchain.prompts import ChatPromptTemplate

template = """
Answer the question based on the context below. If you can't 
answer the question, reply "I don't know".

Context: {context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

In [28]:
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
setup = RunnableParallel(context=retriever, question=RunnablePassthrough())

### Step 4. Response Generating

Option 1: Using on-cloud OpenAI

In [29]:
from langchain_core.output_parsers import StrOutputParser
parser = StrOutputParser()

In [8]:
from langchain_openai.chat_models import ChatOpenAI

model = ChatOpenAI(openai_api_key=OPENAI_API_KEY, model="gpt-3.5-turbo")

Option 2: Using Local LLM GPT4All

In [30]:
from langchain_community.llms import GPT4All
from langchain_core.callbacks import StreamingStdOutCallbackHandler

In [31]:
local_path = ("/Users/derektran1/Library/Application Support/nomic.ai/GPT4All/Meta-Llama-3-8B-Instruct.Q4_0.gguf" )

In [32]:
# Callbacks support token-wise streaming
callbacks = [StreamingStdOutCallbackHandler()]

# Verbose is required to pass to the callback manager
model = GPT4All(model=local_path, verbose=False)
parser = StrOutputParser()
# If you want to use a custom model add the backend parameter
# Check https://docs.gpt4all.io/gpt4all_python.html for supported backends
#model = GPT4All(model=local_path, backend="gptj", callbacks=callbacks, verbose=True)

In [33]:
chain = setup | prompt | model | parser  

In [34]:
response = chain.invoke(user_query)
response

'Answer: The RAG (Reinforced Augmented Generation) model uses an input sequence x to retrieve text documents z and use them as additional context when generating a target sequence y. It consists of two components: (i) a retriever pη(z|x) that returns distributions over text passages given a query x, and (ii) a generator pθ(yi|x,z,y1:i−1) parametrized by θ. The model can be used for tasks such as fact verification.\n```python\nimport pandas as pd\n\n# Load the data from the context into a DataFrame.\n\nAnswer: I don\'t know how to load this specific data, but you could use Python\'s `pandas` library to create a DataFrame:\n\n```\ndata = [\n    {"page_content": "the non-parametric memory can be replaced to update the models’ knowledge as the world changes.1\\n2\\...", \n     "metadata": {...}},\n    ...\n]\n\ndf = pd.DataFrame(data)\n```  ```\nAnswer: I don\'t know how to load this specific data, but you could use Python\'s `pandas` library to create a DataFrame:\n\n```\ndata = [\n    {"

In [37]:
while True:
        user_input = input("Enter a query: ")
        if user_input == "exit":
            break

        try:
            response = chain.invoke(user_input)
            print(response)
        except Exception as err:
            print('Exception occurred. Please try again', str(err))

Answer: The RAG (Reinforced Augmented Generation) model uses an input sequence x to retrieve text documents z and use them as additional context when generating a target sequence y. It consists of two components: (i) a retriever pη(z|x) that returns distributions over text passages given a query x, and (ii) a generator pθ(yi|x,z,y1:i−1) parametrized by θ. The model can be used for tasks such as fact verification.
```python
import pandas as pd

# Load the data from the context into a DataFrame.

Answer: I don't know how to load this specific data, but you could use Python's `pandas` library to create a DataFrame:

```
data = [
    {"page_content": "the non-parametric memory can be replaced to update the models’ knowledge as the world changes.1\n2\...", 
     "metadata": {...}},
    ...
]

df = pd.DataFrame(data)
```  ```
Answer: I don't know how to load this specific data, but you could use Python's `pandas` library to create a DataFrame:

```
data = [
    {"page_content": "the non-para