# Retrieval-augmented generation (RAG)

<img src="./figures/RAG-process.png" >

# Part 1 : Data Preprocessing & Vector Store

In [1]:
import os

# Select which GPU this kernel may see. setdefault() is used so that a value
# already exported in the shell (e.g. on a single-GPU machine where index 1
# does not exist) is respected instead of being overwritten.
# NOTE: this has to run before the cell that imports torch.
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "1")

# Site-specific HTTP(S) proxy used for downloading models. Only applied when
# the environment does not already define a proxy, so the notebook still
# works on networks where this address is unreachable.
os.environ.setdefault('http_proxy', 'http://192.41.170.23:3128')
os.environ.setdefault('https_proxy', 'http://192.41.170.23:3128')

In [1]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import TextLoader
from langchain.document_loaders import WebBaseLoader

import torch
from tqdm.auto import tqdm
import os

In [3]:
##STEP1 Document loaders
folder_path = './docs/pdf/'

# Collect every PDF in the folder.
# - sorted(): os.listdir() returns entries in arbitrary order, which would
#   make the order of documents (and therefore the index) vary between runs.
# - os.path.join(): does not depend on folder_path ending with a slash.
# - lower(): also picks up files named *.PDF.
pdf_list = [
    os.path.join(folder_path, filename)
    for filename in sorted(os.listdir(folder_path))
    if filename.lower().endswith('.pdf')
]

# Fail early with a clear message instead of an obscure error later on when
# the vector store is built from an empty document list.
if not pdf_list:
    raise FileNotFoundError(f"No PDF files found in {folder_path!r}")

# One loader per file; load() output is appended to a single flat list.
documents = []
pdf_loaders = [PyPDFLoader(pdf) for pdf in pdf_list]
for loader in pdf_loaders:
    documents.extend(loader.load())

##STEP2 Document transformers
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Break the loaded documents into overlapping chunks; the overlap keeps some
# shared text between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=100, chunk_size=700)
docs = text_splitter.split_documents(documents)

##STEP3 Text embedding models
from langchain.embeddings import HuggingFaceInstructEmbeddings

# Run the embedding model on the GPU when CUDA is available, else on the CPU.
embedding_model = HuggingFaceInstructEmbeddings(
    model_kwargs = {
        'device': torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu'),
    },
    model_name = 'hkunlp/instructor-base',
)

##STEP4 Vector stores
vector_path = 'vectordb_path'
db_file_name = 'ml_andrew_full_course'

# Embed the chunks into a FAISS index, then persist it under
# <vector_path>/<db_file_name>.
vectordb = FAISS.from_documents(docs, embedding_model)
vectordb.save_local(os.path.join(vector_path, db_file_name))

##STEP5 Retrievers
# Read the index back from disk (same folder it was saved to) and wrap it in
# a retriever for the chain.
# NOTE(review): newer LangChain releases reportedly require an explicit
# opt-in flag for load_local because the docstore is pickled -- confirm
# against the installed version before upgrading.
vectordb = FAISS.load_local(os.path.join(vector_path, db_file_name), embedding_model)
retriever = vectordb.as_retriever()

load INSTRUCTOR_Transformer
'NoneType' object has no attribute 'cadam32bit_grad_fp32'


  warn("The installed version of bitsandbytes was compiled without GPU support. "


max_seq_length  512


In [4]:
# Sanity check: fetch the two chunks closest to a sample question and show
# the page number plus the first 300 characters of each.
hits = vectordb.similarity_search("What is Linear Regression", k=2)
for hit in hits:
    print(f'{hit.metadata["page"]}:', hit.page_content[:300])

2: needing to do that.  
So if – well, suppose you want to evaluate  your hypothesis H at a certain point with a 
certain query point low K is X. Okay? A nd let’s say you want to know what’s the 
predicted value of Y at this position of X, right? So for lin ear regression, what we were 
doing was we wo
2: regression problem like this. What I want to do today is talk about a class of algorithms 
called non-parametric learning algorithms that will help to alleviate the need somewhat 
for you to choose features very carefully. Okay ? And this leads us in to our discussion of 
locally weighted regression


# Part 2 : Prompt & Model

## Prompt

In [5]:
from langchain.prompts import PromptTemplate

# Minimal example: a template with two placeholders, filled in by format().
prompt_template = PromptTemplate.from_template("Tell me a {adjective} joke about {content}.")
prompt_template.format(content="chickens", adjective="funny")

'Tell me a funny joke about chickens.'

In [6]:
from langchain.prompts import ChatPromptTemplate

# A chat prompt is a list of (role, text) turns; placeholders in any turn
# are filled by format_messages().
chat_template = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful AI bot. Your name is {name}."),
    ("human", "Hello, how are you doing?"),
    ("ai", "I'm doing well, thanks!"),
    ("human", "{user_input}"),
])

messages = chat_template.format_messages(user_input="What is your name?", name="Bob")
messages

[SystemMessage(content='You are a helpful AI bot. Your name is Bob.'),
 HumanMessage(content='Hello, how are you doing?'),
 AIMessage(content="I'm doing well, thanks!"),
 HumanMessage(content='What is your name?')]

In [8]:
from langchain.chat_models import ChatOpenAI
from langchain.prompts import HumanMessagePromptTemplate
from langchain.schema.messages import SystemMessage

# Same idea as the tuple form, but built from message objects: a fixed
# system message followed by a templated human message.
upbeat_system_message = SystemMessage(
    content="You are a helpful assistant that re-writes the user's text to sound more upbeat."
)
chat_template = ChatPromptTemplate.from_messages(
    [upbeat_system_message, HumanMessagePromptTemplate.from_template("{text}")]
)

# Optional, left disabled (it calls the OpenAI API):
# llm = ChatOpenAI()
# llm(chat_template.format_messages(text="i dont like eating tasty things."))

In [9]:
from langchain.prompts import PromptTemplate

# Prompt for the answer-generation step. {context} receives the retrieved
# chunks and {question} the user's question.
prompt_template = """
    You are the chatbot and the face of Asian Institute of Technology (AIT). Your job is to give answers to prospective and current students about the school.
    Your job is to answer questions only and only related to the AIT. Anything unrelated should be responded with the fact that your main job is solely to provide assistance regarding AIT.
    MUST only use the following pieces of context to answer the question at the end. If the answers are not in the context or you are not sure of the answer, just say that you don't know, don't try to make up an answer.
    {context}
    Question: {question}
    When encountering abusive, offensive, or harmful language, such as fuck, bitch, etc,  just politely ask the users to maintain appropriate behaviours.
    Always make sure to elaborate your response and use vibrant, positive tone to represent good branding of the school.
    Never answer with any unfinished response
    Answer:
    """

PROMPT = PromptTemplate(input_variables=["context", "question"], template=prompt_template)

## Model

In [16]:
import torch
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
)
# from langchain import HuggingFacePipeline
from langchain.llms.huggingface_pipeline import HuggingFacePipeline

# 4-bit NF4 quantization settings (only applied when a CUDA GPU is present,
# see below).
bitsandbyte_config = BitsAndBytesConfig(
            load_in_4bit = True,
            bnb_4bit_quant_type = "nf4",
            bnb_4bit_compute_dtype = torch.float16,
            bnb_4bit_use_double_quant = True
        )

model_id = "lmsys/fastchat-t5-3b-v1.0"
# model_id = "distilgpt2"
# For those who don't have a GPU, you can try 'distilgpt2' instead.

tokenizer = AutoTokenizer.from_pretrained(
    model_id,
)
tokenizer.pad_token_id = tokenizer.eos_token_id

# FastChat-T5 is an encoder-decoder (T5) model, which AutoModelForCausalLM
# cannot load; distilgpt2 is decoder-only. Choose the model class and the
# matching pipeline task from the checkpoint's own config so that either
# model_id above works.
is_seq2seq = AutoConfig.from_pretrained(model_id).is_encoder_decoder
model_class = AutoModelForSeq2SeqLM if is_seq2seq else AutoModelForCausalLM
pipeline_task = "text2text-generation" if is_seq2seq else "text-generation"

# Quantization is configured through quantization_config only: also passing
# load_in_8bit=True conflicts with it (and contradicts the 4-bit settings).
# bitsandbytes quantization needs a CUDA device, so it is skipped on CPU.
model_load_kwargs = {"device_map": "auto"}
if torch.cuda.is_available():
    model_load_kwargs["quantization_config"] = bitsandbyte_config

model = model_class.from_pretrained(model_id, **model_load_kwargs)

# Generation settings are passed directly to pipeline(). `model_kwargs` is
# only consulted when pipeline() has to load the model itself, so settings
# placed there were silently ignored for an already-instantiated model.
# do_sample=False requests greedy decoding (what temperature=0 was meant to
# express).
pipe = pipeline(
    task=pipeline_task,
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=256,
    do_sample=False,
    repetition_penalty=1.5,
)

llm = HuggingFacePipeline(pipeline=pipe)

# Part 3 : Conversational Retrieval Chain

The ConversationalRetrievalQA chain builds on RetrievalQAChain to provide a chat history component.
- It first combines the `chat history` (either explicitly passed in or retrieved from the provided memory) and `the question into a standalone question`,
- then looks up `relevant documents` from the retriever,
- and finally `passes those documents` and the question to a question-answering chain to return a response.

This chain takes in chat history (a list of messages) and new questions,
and then returns an answer to that question.

The algorithm for this chain consists of three parts:

1. Use the chat history and the new question to create a "standalone question".

This is done so that this question can be passed into the retrieval step to fetch
relevant documents. If only the new question was passed in, then relevant context
may be lacking. If the whole conversation was passed into retrieval, there may
be unnecessary information there that would distract from retrieval.

2. This new question is passed to the retriever and relevant documents are
returned.

3. The retrieved documents are passed to an LLM along with either the new question
(default behavior) or the original question and chat history to generate a final
response.


In [10]:
from langchain.chains import LLMChain
from langchain.chains.conversational_retrieval.prompts import CONDENSE_QUESTION_PROMPT

from langchain.chains.question_answering import load_qa_chain

from langchain.memory import ConversationBufferWindowMemory

# Answering step: a "stuff" QA chain driven by the custom PROMPT.
doc_chain = load_qa_chain(llm, chain_type="stuff", prompt=PROMPT)

# Rewriting step: turns chat history + follow-up into a standalone question.
question_generator = LLMChain(prompt=CONDENSE_QUESTION_PROMPT, llm=llm)

# Windowed conversation memory (k = 3), stored under "chat_history" as
# message objects; output_key names which chain output is recorded.
memory = ConversationBufferWindowMemory(
    output_key = 'answer',
    return_messages = True,
    memory_key = "chat_history",
    k = 3,
)

### Class ConversationalRetrievalChain

`retriever` : Retriever to use to fetch documents.

`combine_docs_chain` : The chain used to combine any retrieved documents.

`question_generator`: The chain used to generate a new question for the sake of retrieval. This chain will take in the current question (with variable question) and any chat history (with variable chat_history) and will produce a new standalone question to be used later on.

`return_source_documents` : Return the retrieved source documents as part of the final result.

`get_chat_history` : An optional function to get a string of the chat history. If None is provided, will use a default.

`return_generated_question` : Return the generated question as part of the final result.

`response_if_no_docs_found` : If specified, the chain will return a fixed response if no docs are found for the question.


In [11]:
from langchain.chains import ConversationalRetrievalChain

# Assemble the conversational RAG chain from the retriever, the
# question-rewriting chain, the answering chain and the memory.
#
# get_chat_history is intentionally left at its default. The memory is
# created with return_messages=True, so the history arrives as a list of
# message objects; an identity function would insert that list's Python
# repr into the condense-question prompt, whereas the chain's default
# formatter renders message objects as readable dialogue turns.
chain = ConversationalRetrievalChain(
    retriever = retriever,
    question_generator = question_generator,
    combine_docs_chain = doc_chain,
    return_source_documents = True,
    memory = memory,
)

In [12]:
# Inspection only: display the Config class attached to the chain's retriever.
chain.retriever.Config

langchain.schema.vectorstore.VectorStoreRetriever.Config

## QA-Bot

`pip3 install --upgrade --no-deps --force-reinstall --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu`

In [13]:
# First turn of the conversation.
prompt_question = 'Who are you by the way?'
answer = chain(dict(question=prompt_question))

In [14]:
# Show the question, the generated answer and the retrieved source chunks.
tuple(answer[field] for field in ('question', 'answer', 'source_documents'))

('Who are you by the way?',
 'I am the chatbot for the Asian Institute of Technology (AIT). My main job is to provide assistance and answer questions related to AIT. How can I assist you today?',
 [Document(page_content="and if they're other questions, I'll take them after.  \n[End of Audio]  \nDuration: 79 minutes", metadata={'source': './docs/pdf/MachineLearning-Lecture02.pdf', 'page': 17}),
  Document(page_content="touches many areas of science and industrie s, and I'm just kind of curious. How many \npeople here are computer science majors, are in the computer science department? Okay. \nAbout half of you. How many people are from  EE? Oh, okay, maybe about a fifth. How", metadata={'source': './docs/pdf/MachineLearning-Lecture01.pdf', 'page': 0}),
  Document(page_content="there. Actually, [inaudible] I think this room just got a new projector that — someone \nsent you an excited email — was it just on Frid ay? — saying we just got a new projector \nand they said 4,000-to-1 somethin

In [15]:
# Second turn: a new topic.
prompt_question = 'What is NLP?'
answer = chain(dict(question=prompt_question))

In [16]:
# Show the question, the generated answer and the retrieved source chunks.
tuple(answer[field] for field in ('question', 'answer', 'source_documents'))

('What is NLP?',
 'NLP stands for Natural Language Processing. It is a field of study in artificial intelligence that focuses on the interaction between computers and human language. It involves tasks such as text analysis, machine translation, sentiment analysis, and speech recognition. If you have any specific questions about NLP at AIT, feel free to ask!',
 [Document(page_content="these medical records, which started to be digitized only about maybe 15 years, applying \nlearning algorithms to them can turn raw medi cal records into what I might loosely call \nmedical knowledge in which we start to detect trends in medical practice and even start to \nalter medical practice as a result of me dical knowledge that's derived by applying \nlearning algorithms to the sorts of medical r ecords that hospitals have just been building \nover the last 15, 20 years in an electronic format.  \nTurns out that most of you probably use learning algorithms — I don't know — I think \nhalf a dozen tim

In [17]:
# Third turn: a follow-up that only makes sense together with the chat
# history ("it" refers to the previous question's topic).
prompt_question = 'How it is different comparing with NLU?'
answer = chain(dict(question=prompt_question))

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for gpt-3.5-turbo in organization org-Ds6D2AMs3kLcrGzbiTbp1Ifs on requests per min (RPM): Limit 3, Used 3, Requested 1. Please try again in 20s. Visit https://platform.openai.com/account/rate-limits to learn more. You can increase your rate limit by adding a payment method to your account at https://platform.openai.com/account/billing..
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for gpt-3.5-turbo in organization org-Ds6D2AMs3kLcrGzbiTbp1Ifs on requests per min (RPM): Limit 3, Used 3, Requested 1. Please try again in 20s. Visit https://platform.openai.com/account/rate-limits to learn more. You can increase your rate limit by adding a payment method to your account at https://platform.openai.com/account/

In [18]:
# Show the question, the generated answer and the retrieved source chunks.
tuple(answer[field] for field in ('question', 'answer', 'source_documents'))

('How it is different comparing with NLU?',
 'As the chatbot for Asian Institute of Technology (AIT), I am here to provide assistance regarding AIT only. I do not have information about NLP or NLU. If you have any questions specifically related to AIT, I would be happy to help you with those.',
 [Document(page_content='than least squares regression and logistic regr ession, and, in particular, because it outputs \nonly values are either zero or one it turns out  it’s very difficult to  endow this algorithm \nwith probabilistic semantics. And this is , again, even though – oh, excuse me. Right \nthere. Okay. And even though this learning ru le looks, again, looks  cosmetically very \nsimilar to what we have in l ogistics regression this is actual ly a very different type of \nlearning rule than the others that were seen in this class. So because this is such a simple \nlearning algorithm, right? It ju st computes theta transpose X and then you threshold and', metadata={'source': './docs

## GitHub Recommendations
- https://github.com/curiousily/Get-Things-Done-with-Prompt-Engineering-and-LangChain
- https://github.com/ksm26/LangChain-Chat-with-Your-Data