In [1]:
import openai
import pinecone
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
# from langchain.vectorstores import Pinecone
# from langchain_pinecone import PinceconeVectorStore
from langchain.llms import OpenAI
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings

  from tqdm.autonotebook import tqdm


<font color="green">
The code sets environment variables for accessing openAI and Higging Face Hub API using respective API keys<font>

In [2]:
import os
os.environ["OPENAI_API_KEY"] = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"

### Load documents

In [3]:
def load_docs(directory):
    loader = PyPDFDirectoryLoader(directory)
    documents = loader.load()
    return documents

In [4]:
directory = "rag_docs/"
documents = load_docs(directory)

In [5]:
len(documents)

88

### Transformer Documents

Split the documents into smaller chunks

In [6]:
def split_docs(documents, chunk_size=1000, chunk_overlap=20):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    docs = text_splitter.split_documents(documents)
    return docs

In [7]:
docs = split_docs(documents)
print(len(docs))

201


### Generate Text Embeddings

In [8]:
embeddings = OpenAIEmbeddings()

  warn_deprecated(


In [9]:
query_result = embeddings.embed_query("Hello buddy")
len(query_result)

1536

In [10]:
query_result

[-0.010970983843216485,
 -0.00543742962233356,
 0.0039475034670926655,
 -0.02783758799993352,
 -0.032323387076860666,
 0.009234338226517173,
 -0.012130882799942204,
 -0.012887060864199329,
 -0.00999692379019801,
 -0.003249000348229677,
 0.02128832265147151,
 -0.01981441710743508,
 -0.006139136956569701,
 -0.009606018827899432,
 0.01752025105432367,
 -0.021006357286569723,
 0.042012714573139445,
 -0.01860965820416265,
 0.016033528648794634,
 0.002306982448942717,
 -0.014123859127235502,
 -0.00760663406985983,
 0.0011639046916034534,
 -0.005117015535599134,
 -0.016430842041839516,
 -0.013495847815259256,
 0.0035373732125551787,
 -0.01591817875800636,
 0.009003640307585808,
 -0.02783758799993352,
 0.030349636973128865,
 0.016648723099278274,
 -0.015008203012450917,
 -0.0019192812363559954,
 -0.028965447596895493,
 -0.02781195613959349,
 -0.0029846586335415164,
 -0.015751563283892846,
 0.024184866293573953,
 -0.020903824257274054,
 0.02865784850900849,
 -0.00352455658389322,
 0.00266744852

### Vector Store - PINECONE

In [11]:
from pinecone import Pinecone, ServerlessSpec
pc = Pinecone(api_key="xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx")
# pinecone.init(
#     api_key = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
#     environment="us-west-2"
# )
# index_name = "rag_app_demo"
# index = Pinecone.from_documents(docs, embeddings, index_name=index_name)

In [12]:
index_name = "ragappdemo"
existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

In [56]:
# pc.create_index(
#         name=index_name,
#         dimension=1536,
#         metric="cosine",
#         spec=ServerlessSpec(cloud="aws", region="us-east-1")
#     )

In [13]:
from langchain_pinecone import PineconeVectorStore

In [14]:
import os
os.environ["PINECONE_API_KEY"] = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"

In [15]:
vectorstore_from_docs = PineconeVectorStore.from_documents(
    docs,
    index_name = index_name,
    embedding = embeddings
)

In [62]:
# if index_name not in existing_indexes:
#     pc.create_index(
#         name=index_name,
#         dimension=1536,
#         metric="cosine",
#         spec=ServerlessSpec(cloud="aws", region="us-east-1"),
#     )
#     while not pc.describe_index(index_name).status["ready"]:
#         time.sleep(1)

# index = pc.Index(index_name)

### Retrieve Answers

In [22]:
def get_similar_docs(query, k=2):
    similar_docs = vectorstore_from_docs.similarity_search(query, k=k)
    return similar_docs

In [23]:
from langchain.chains.question_answering import load_qa_chain
from langchain import HuggingFaceHub

In [24]:
llm = OpenAI()

In [25]:
chain = load_qa_chain(llm, chain_type="stuff")

In [26]:
def get_answer(query):
    relevant_docs = get_similar_docs(query)
    print(relevant_docs)
    response = chain.run(input_documents= relevant_docs, question=query)
    return response
    

In [27]:
our_query_1 = "What is corrective RAG?"
answer_1 = get_answer(our_query_1)
print(answer_1)

[Document(metadata={'page': 4.0, 'source': 'rag_docs\\RAG_101_1714383713.pdf'}, page_content='CORRECTIVE RAG\nCorrective RAG (CRAG) is a method that improves the\naccuracy of language models by intelligently re-\nincorporating information from retreived documents.\nIt uses an evaluator to assess the quality of documents\nobtained for a query. Then, it decides whether to use,\nignore, or request more data from these documents.ADVANCED RAG METHODS'), Document(metadata={'page': 4.0, 'source': 'rag_docs\\RAG_101_1714383713.pdf'}, page_content='CORRECTIVE RAG\nCorrective RAG (CRAG) is a method that improves the\naccuracy of language models by intelligently re-\nincorporating information from retreived documents.\nIt uses an evaluator to assess the quality of documents\nobtained for a query. Then, it decides whether to use,\nignore, or request more data from these documents.ADVANCED RAG METHODS')]


  warn_deprecated(



Corrective RAG (CRAG) is a method for improving the accuracy of language models by using an evaluator to assess the quality of retrieved documents and deciding whether to incorporate, ignore, or request more data from these documents.


In [29]:
our_query_2 = "What is the email address of Cornellius Yudha?"
answer_2 = get_answer(our_query_2)
print(answer_2)

[Document(metadata={'page': 0.0, 'source': 'rag_docs\\MLOps_1709250343.pdf'}, page_content='LinkedIn : Cornellius Yudha Wijaya\nMedium: @cornelliusyudhawijaya'), Document(metadata={'page': 0.0, 'source': 'rag_docs\\MLOps_1709250343.pdf'}, page_content='LinkedIn : Cornellius Yudha Wijaya\nMedium: @cornelliusyudhawijaya')]
I'm sorry, there is no email address given in the context provided.


In [30]:
our_query_3 = "Summarize the key research findings of 2023 Data Pipeline Survey."
answer_3 = get_answer(our_query_3)
print(answer_3)

[Document(metadata={'page': 2.0, 'source': 'rag_docs\\Download_the_report_1715179718.pdf'}, page_content='PAGE 3\nTREND REPORT   |   DATA PIPELINES © 2023 DZONEORIGINAL RESEARCHData analytics is no longer just about using data to drive business decisions; we are entering a new era where cloud-based \nsystems and tools are at the heart of data processing and analytics. Data-centric tools and techniques like warehouses and \nlakes, ETL/ELT, observability, real-time analytics, and so much more — these technologies are democratizing the data we \ncollect. The proliferation of and growing emphasis on data democratization results in increased and nuanced ways in which \ndata platforms can be used — and by extension, also empowers business users to make data-driven decisions with confidence.\nIn August 2023, DZone surveyed software developers, architects, and other IT professionals in order to understand the state of \ndata pipelines.\nMajor research targets were:\n• ETL/ELT methods and solut

### Structure the output

In [31]:
import re
import json


In [32]:
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage
from langchain.prompts import PromptTemplate, ChatPromptTemplate, HumanMessagePromptTemplate
from langchain.output_parsers import StructuredOutputParser, ResponseSchema

In [33]:
response_schemas = [
    ResponseSchema(name="question", description="Question generated from provided input text data."),
    ResponseSchema(name="choices", description="Available options for a multiple choice question is comma separated"),
    ResponseSchema(name="answer",description="Correct answer for the asked question")
]
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
output_parser

StructuredOutputParser(response_schemas=[ResponseSchema(name='question', description='Question generated from provided input text data.', type='string'), ResponseSchema(name='choices', description='Available options for a multiple choice question is comma separated', type='string'), ResponseSchema(name='answer', description='Correct answer for the asked question', type='string')])

In [34]:
format_instructions = output_parser.get_format_instructions()
print(format_instructions)

The output should be a markdown code snippet formatted in the following schema, including the leading and trailing "```json" and "```":

```json
{
	"question": string  // Question generated from provided input text data.
	"choices": string  // Available options for a multiple choice question is comma separated
	"answer": string  // Correct answer for the asked question
}
```


In [35]:
chat_model = ChatOpenAI()

  warn_deprecated(


In [53]:
from langchain_openai import ChatOpenAI
chat_model = ChatOpenAI(temperature=0.9)

In [39]:
chat_model.model_name

'gpt-3.5-turbo'

In [42]:
prompt = ChatPromptTemplate(
    messages=[
        HumanMessagePromptTemplate.from_template("""when a text input is given by the user, please generate multiple choice question from it along with the answer.
                                                 \n {format_instructions}\n{user_prompt}""")
    ],
    input_variables=["user_prompt"],
    partial_variables={"format_instructions":format_instructions}
)

In [58]:
final_query = prompt.format_prompt(user_prompt = our_query_3)
print(final_query)

messages=[HumanMessage(content='when a text input is given by the user, please generate multiple choice question from it along with the answer.\n                                                 \n The output should be a markdown code snippet formatted in the following schema, including the leading and trailing "```json" and "```":\n\n```json\n{\n\t"question": string  // Question generated from provided input text data.\n\t"choices": string  // Available options for a multiple choice question is comma separated\n\t"answer": string  // Correct answer for the asked question\n}\n```\nSummarize the key research findings of 2023 Data Pipeline Survey.')]


In [59]:
final_query.to_messages()

[HumanMessage(content='when a text input is given by the user, please generate multiple choice question from it along with the answer.\n                                                 \n The output should be a markdown code snippet formatted in the following schema, including the leading and trailing "```json" and "```":\n\n```json\n{\n\t"question": string  // Question generated from provided input text data.\n\t"choices": string  // Available options for a multiple choice question is comma separated\n\t"answer": string  // Correct answer for the asked question\n}\n```\nSummarize the key research findings of 2023 Data Pipeline Survey.')]

In [61]:
final_query_output = chat_model(final_query.to_messages())
print(final_query_output.content)

```json
{
	"question": "What are some key research findings of the 2023 Data Pipeline Survey?",
	"choices": "A. Increased adoption of cloud-based data pipelines, B. Challenges in data quality and governance, C. Use of AI and machine learning in data pipelines, D. All of the above",
	"answer": "D. All of the above"
}
```


In [62]:
markdown_text = final_query_output.content
json_string = re.search(r'{(.*?)}', markdown_text, re.DOTALL).group(1)

In [63]:
print(json_string)


	"question": "What are some key research findings of the 2023 Data Pipeline Survey?",
	"choices": "A. Increased adoption of cloud-based data pipelines, B. Challenges in data quality and governance, C. Use of AI and machine learning in data pipelines, D. All of the above",
	"answer": "D. All of the above"

