# Project: Question-Answering on Private Documents

In [1]:
import os
from google.colab import userdata
openai_key = userdata.get('OPENAI')
pinecone_api = userdata.get('PINECONE')
os.environ['PINECONE_API_KEY'] = pinecone_api
os.environ['OPENAI_API_KEY'] = openai_key

In [2]:
!pip install pypdf -q langchain-community docx2txt wikipedia tiktoken pinecone langchain-openai langchain-pinecone

In [3]:
# Document extraction from custom files
def load_document(file):
  import os
  name, extension = os.path.splitext(file)

  if extension == '.pdf':
    from langchain.document_loaders import PyPDFLoader
    print(f'Loading {file}')
    loader = PyPDFLoader(file)
    data = loader.load()
  elif extension == '.docx':
    from langchain.document_loaders import Docx2txtLoader
    print(f' Loading {file}')
    loader = Docx2txtLoader(file)
    data = loader.load()
  else:
    print('Document format is not supported!')
    return None

  return data


# Wikipedia searching
def load_from_wikipedia(query, lang='en', load_max_docs=2):
  from langchain.document_loaders import WikipediaLoader
  loader = WikipediaLoader(query=query, lang=lang, load_max_docs=load_max_docs)
  data = loader.load()
  return data

# Chunking for long documents
def chunk_data(data):
  from langchain.text_splitter import RecursiveCharacterTextSplitter
  text_splitter = RecursiveCharacterTextSplitter(
      chunk_size=256,
      chunk_overlap=0
  )
  chunks = text_splitter.split_documents(data) # .create_documents when it's not already splitted in pages
  return chunks

# Calculate embedding cost
def print_embedding_cost(texts):
  import tiktoken
  enc = tiktoken.encoding_for_model('text-embedding-ada-002') # Openai's embedding model
  total_tokens = sum([len(enc.encode(page.page_content)) for page in texts])
  print(f"Total Tokens: {total_tokens}")
  print(f"Embedding cost in USD: {total_tokens / 1000 * 0.0004:.6f}")

### Embedding and Uploading to a Vector Database (Pinecone)

In [4]:
def insert_or_fetch_embeddings(index_name, chunks, pinecone_api_key):
  import pinecone
  from langchain_community.vectorstores import Pinecone
  from langchain_pinecone import PineconeVectorStore
  from langchain_openai import OpenAIEmbeddings
  from pinecone import ServerlessSpec

  pc = pinecone.Pinecone(api_key=pinecone_api_key)
  embeddings = OpenAIEmbeddings(api_key=openai_key, model='text-embedding-3-small', dimensions=1536)

  if index_name in pc.list_indexes().names():
    print(f"Index {index_name} already exists. Loading embeddings ...", end='')
    vector_store = Pinecone.from_existing_index(index_name, embeddings)
    print('Done!')
  else:
    print(f"Creating index {index_name} and embeddings ...", end='')
    pc.create_index(
        name=index_name,
        dimension=1536,
        metric='cosine',
        spec=ServerlessSpec(
          cloud="aws",
          region="us-east-1")
    )
    index = pc.Index(index_name)
    vector_store=PineconeVectorStore.from_documents(documents = chunks, embedding = embeddings, index_name=index_name )
    print('Done!')
    return vector_store

def delete_pinecone_index(index_name='all'):
  import pinecone
  pc = pinecone.Pinecone(api_key=pinecone_api)
  if index_name == 'all':
    indexes = pc.list_indexes().names()
    print('Deleting all indexes ...')
    for index in indexes:
      pc.delete_index(index)
    print('Done!')
  else:
    print(f"Deleting index {index_name} ...", end='')
    pc.delete_index(index_name)
    print('Done!')

## Asking and Getting Answers

In [5]:
def ask_and_get_answer(vector_store, q):
  from langchain.chains import RetrievalQA
  from langchain_openai import ChatOpenAI
  from IPython.display import Markdown, display

  llm = ChatOpenAI(api_key=openai_key, model='gpt-4o-mini', temperature=0.5)

  retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': 3})

  chain = RetrievalQA.from_chain_type(llm, chain_type='stuff', retriever=retriever)

  answer = chain.invoke(q)
  return display(Markdown(answer['result']))

## Running Code

In [6]:
data = load_document('us_constitution.pdf')
print(data[1].page_content)
print(data[10].metadata)
print(f"You have {len(data)} pages in your data")
print(f"There are {len(data[20].page_content)} characters in the page")

Loading us_constitution.pdf
The House of Representatives shall be composed of Members chosen 
 every second Y ear by the People of the several States, and the 
 Electors in each State shall have the Qualifications requisite for 
 Electors of the most numerous Branch of the State Legislature. 
 No Person shall be a Representative who shall not have attained to the 
 Age of twenty five Y ears, and been seven Y ears a Citizen of the United 
 States, and who shall not, when elected, be an Inhabitant of that State 
 in which he shall be chosen. 
 Representatives and direct T axes shall be apportioned among the 
 several States which may be included within this Union, according to 
 their respective Numbers, which shall be determined by adding to the 
 whole Number of free Persons, including those bound to Service for a 
 T erm of Y ears, and excluding Indians not taxed, three fifths of all other 
 Persons. The actual Enumeration shall be made within three Y ears 
 after the first Meeting of

In [7]:
chunks = chunk_data(data)
print(len(chunks))
print(chunks[10].page_content)

224
Maryland six, V irginia ten, North Carolina five, South Carolina five, and 
 Georgia three. 
 When vacancies happen in the Representation from any State, the 
 Executive Authority thereof shall issue W rits of Election to fill such 
 V acancies.


In [8]:
data = load_document('the_great_gatsby.docx')
print(data[0].page_content)

[1;30;43mΗ έξοδος ροής περικόπηκε στις τελευταίες 5000 γραμμές.[0m

“But there’s a garage right here,” objected Jordan. “I don’t want to

get stalled in this baking heat.”



Tom threw on both brakes impatiently, and we slid to an abrupt dusty

stop under Wilson’s sign. After a moment the proprietor emerged from

the interior of his establishment and gazed hollow-eyed at the car.



“Let’s have some gas!” cried Tom roughly. “What do you think we

stopped for—to admire the view?”



“I’m sick,” said Wilson without moving. “Been sick all day.”



“What’s the matter?”



“I’m all run down.”



“Well, shall I help myself?” Tom demanded. “You sounded well enough on

the phone.”



With an effort Wilson left the shade and support of the doorway and,

breathing hard, unscrewed the cap of the tank. In the sunlight his

face was green.



“I didn’t mean to interrupt your lunch,” he said. “But I need money

pretty bad, and I was wondering what you were going to do with your

old car.”



“How 

In [9]:
data = load_from_wikipedia('GPT-4', 'el')
print(data[0].page_content)

Το GPT-4 (Generative Pre-trained Transformer 4) είναι η τέταρτη γενιά ενός μεγάλου πολυτροπικού μοντέλου γλώσσας που δημιουργήθηκε από την OpenAI. Κυκλοφόρησε στις 14 Μαρτίου 2023 και είναι διαθέσιμο μέσω API και για χρήστες ChatGPT Plus. Η Microsoft επιβεβαίωσε ότι οι εκδόσεις του Bing που χρησιμοποιούν GPT στην πραγματικότητα χρησιμοποιούσαν το GPT-4 πριν από την επίσημη κυκλοφορία του. Ως μετασχηματιστής, το GPT-4 ήταν προεκπαιδευμένο για την πρόβλεψη του επόμενου διακριτικού (χρησιμοποιώντας δημόσια δεδομένα και «δεδομένα με άδεια από τρίτους παρόχους») και στη συνέχεια βελτιστοποιήθηκε με ενισχυτική μάθηση από την ανάδραση ανθρώπου και τεχνητής νοημοσύνης για ανθρώπινη ευθυγράμμιση και πολιτική συμμόρφωση.
Οι New York Times έγραψαν ότι το GPT-4 έδειξε μεγάλες βελτιώσεις στην ακρίβεια σε σύγκριση με το GPT-3.5, είχε αποκτήσει τη δυνατότητα να συνοψίζει και να σχολιάζει εικόνες, ήταν σε θέση να συνοψίζει περίπλοκα κείμενα, πέρασε σε εξετάσεις bar και πολλά τυποποιημένα τεστ, αλλά πα

In [10]:
print_embedding_cost(chunks)

Total Tokens: 9842
Embedding cost in USD: 0.003937


In [11]:
# Deleting the indexes already exists in pinecone account
delete_pinecone_index()

Deleting all indexes ...
Done!


In [12]:
index_name = 'askadocument'
vector_store = insert_or_fetch_embeddings(index_name, chunks, pinecone_api)

Creating index askadocument and embeddings ...Done!


In [13]:
q = 'What is the whole document about?'
answer = ask_and_get_answer(vector_store, q)
answer

I'm sorry, but I don't have access to the document you're referring to. If you can provide more context or details about the document, I may be able to help you better.

In [14]:
import time
i = 1
print('Write Quit or Exit to quit.')
while True:
  q = input (f'Question #{i}: ')
  i += 1

  if q.lower() in ['quit', 'exit']:
    print('Quitting... bye bye')
    time.sleep(2)
    break

  answer = ask_and_get_answer(vector_store, q)
  print(f"\n {'-' * 50} \n")

Write Quit or Exit to quit.
Question #1: quit
Quitting... bye bye


In [15]:
data = load_from_wikipedia('ChatGPT', 'en')
chunks = chunk_data(data)
index_name = 'chatgpt'
vector_store = insert_or_fetch_embeddings(index_name, chunks, pinecone_api)

Creating index chatgpt and embeddings ...Done!


In [16]:
q = 'What is ChatGPT?'
answer = ask_and_get_answer(vector_store, q)
answer

ChatGPT is an AI language model developed by OpenAI. It is designed to understand and generate human-like text based on the input it receives. ChatGPT can engage in conversations, answer questions, provide explanations, and assist with a variety of tasks involving natural language processing.

## Using Chroma as a Vector DB

In [17]:
!pip install -q chromadb langchain-chroma

In [18]:
def create_embeddings_chroma(chunks, persist_directory = './chroma_db'):
  from langchain_chroma import Chroma
  from langchain_openai import OpenAIEmbeddings

  embeddings = OpenAIEmbeddings(api_key=openai_key, model='text-embedding-3-small', dimensions=1536)

  vector_store = Chroma.from_documents(chunks, embeddings, persist_directory=persist_directory)
  return vector_store

def load_embeddings_chroma(persist_directory='./chroma_db'):
  from langchain_chroma import Chroma
  from langchain_openai import OpenAIEmbeddings

  embeddings = OpenAIEmbeddings(api_key=openai_key, model='text-embedding-3-small', dimensions=1536)

  vector_store = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
  return vector_store

In [19]:
data = load_document('rag_powered_by_google_search.pdf')
chunks = chunk_data(data)
vector_store = create_embeddings_chroma(chunks)

Loading rag_powered_by_google_search.pdf


In [20]:
q = 'What is Vertex AI search'
answer = ask_and_get_answer(vector_store, q)

Vertex AI Search is a feature within Google's Vertex AI platform that provides advanced search capabilities utilizing generative AI. It offers customizable answers, search tuning, vector search, and grounding, making it suitable for enterprise applications. The platform is designed to enhance search experiences by incorporating AI and machine learning technologies.

In [21]:
db = load_embeddings_chroma()
q = 'How many paris of questions and answers had the stackoverflow dataset?'
answer = ask_and_get_answer(vector_store, q)
answer


The Stack Overflow dataset had 8 million pairs of questions and answers.

In [22]:
q = 'Multiply that number by 2.'
answer = ask_and_get_answer(vector_store, q)
answer

I don't know what number you are referring to. Please provide the number you would like to multiply by 2.

## Adding Memory (Chat History)

In [49]:
from langchain_openai import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory

llm = ChatOpenAI(api_key=openai_key, model='gpt-4o-mini', temperature=0)
retriever = vector_store.as_retriever(serch_type='similarity', search_kwargs={'k': 5})
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

crc = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    chain_type='stuff',
    verbose=True
)

In [50]:
def ask_question(q, chain):
  result = chain.invoke({'question': q})
  return result

In [51]:
data = load_document('rag_powered_by_google_search.pdf')
chunks = chunk_data(data)
vector_store = create_embeddings_chroma(chunks)

Loading rag_powered_by_google_search.pdf


In [57]:
q = 'How many pairs of question and answers had the StackOverFlow dataset'
result = ask_question(q, crc)
print(result['answer'])



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mGiven the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.

Chat History:

Human: How many pairs of question and answers had the StackOverFlow dataset
Assistant: The Stack Overflow dataset had 8 million pairs of questions and answers.
Human: Multiply that number by 10 in english
Assistant: 8 million multiplied by 10 is 80 million.
Human: Multiply that number by 10
Assistant: 80 million multiplied by 10 is 800 million.
Human: How many pairs of question and answers had the StackOverFlow dataset
Assistant: I don't know.
Human: How many pairs of question and answers had the StackOverFlow dataset
Assistant: I don't know.
Follow Up Input: How many pairs of question and answers had the StackOverFlow dataset
Standalone question:[0m

[1m> Finished chain.[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering

In [58]:
q = 'Multiply that number by 10'
result = ask_question(q, crc)
print(result['answer'])



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mGiven the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.

Chat History:

Human: How many pairs of question and answers had the StackOverFlow dataset
Assistant: The Stack Overflow dataset had 8 million pairs of questions and answers.
Human: Multiply that number by 10 in english
Assistant: 8 million multiplied by 10 is 80 million.
Human: Multiply that number by 10
Assistant: 80 million multiplied by 10 is 800 million.
Human: How many pairs of question and answers had the StackOverFlow dataset
Assistant: I don't know.
Human: How many pairs of question and answers had the StackOverFlow dataset
Assistant: I don't know.
Human: How many pairs of question and answers had the StackOverFlow dataset
Assistant: The Stack Overflow dataset had 8 million pairs of questions and answers.
Follow Up Input: Multiply that number by 10
Stan

In [59]:
for item in result['chat_history']:
  print(item)

content='How many pairs of question and answers had the StackOverFlow dataset' additional_kwargs={} response_metadata={}
content='The Stack Overflow dataset had 8 million pairs of questions and answers.' additional_kwargs={} response_metadata={}
content='Multiply that number by 10 in english' additional_kwargs={} response_metadata={}
content='8 million multiplied by 10 is 80 million.' additional_kwargs={} response_metadata={}
content='Multiply that number by 10' additional_kwargs={} response_metadata={}
content='80 million multiplied by 10 is 800 million.' additional_kwargs={} response_metadata={}
content='How many pairs of question and answers had the StackOverFlow dataset' additional_kwargs={} response_metadata={}
content="I don't know." additional_kwargs={} response_metadata={}
content='How many pairs of question and answers had the StackOverFlow dataset' additional_kwargs={} response_metadata={}
content="I don't know." additional_kwargs={} response_metadata={}
content='How many pai

## Using a Custom Prompt

In [68]:
from langchain_openai import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory

from langchain.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate

llm = ChatOpenAI(api_key=openai_key, model='gpt-4o-mini', temperature=0)
retriever = vector_store.as_retriever(serch_type='similarity', search_kwargs={'k': 5})
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

system_template = r"""
Use the following pieces of context to answer the user's  in the same language.
If you don't find the answer in the provided context then just answer 'I dont know'
---------------
Content: ```{context}```
"""

user_template = """
Question: ```{question}```
Chat History: ```{chat_history}```
"""

messages = [
    SystemMessagePromptTemplate.from_template(system_template),
    HumanMessagePromptTemplate.from_template(user_template)
]

qa_prompt = ChatPromptTemplate.from_messages(messages)


crc = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    chain_type='stuff',
    combine_docs_chain_kwargs={'prompt': qa_prompt},
    verbose=True
)

In [62]:
print(qa_prompt)

input_variables=['chat_history', 'context', 'question'] input_types={} partial_variables={} messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, template="\nUse the following pieces of context to answer the user's question.\n---------------\nContent: ```{context}```\n"), additional_kwargs={}), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['chat_history', 'question'], input_types={}, partial_variables={}, template='\nQuestion: ```{question}```\nChat History: ```{chat_history}```\n'), additional_kwargs={})]


In [66]:
db = load_embeddings_chroma()
q = 'How many pairs of questions and answers had the StackOverFlow dataset'
result = ask_question(q, crc)
print(result['answer'])



[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: 
Use the following pieces of context to answer the user's  in the same language.
---------------
Content: ```simple similarity search was highly e ective because the dataset had 8
million pairs of questions and answers. However, datasets do not
usually contain pre-existing question-and-answer or query-and-

simple similarity search was highly e ective because the dataset had 8
million pairs of questions and answers. However, datasets do not
usually contain pre-existing question-and-answer or query-and-

simple similarity search was highly e ective because the dataset had 8
million pairs of questions and answers. However, datasets do not
usually contain pre-existing question-and-answer or query-and-

simple similarity search was highly e ective because the dataset had 8
million pairs of questions and answers. However, datasets do not
usually cont

In [69]:
db = load_embeddings_chroma()
q = 'When was Elon Mush born'
result = ask_question(q, crc)
print(result['answer'])



[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: 
Use the following pieces of context to answer the user's  in the same language.
If you don't find the answer in the provided context then just answer 'I dont know'
---------------
Content: ```or has no contextual knowledge of a topic, it is more likely to hallucinate
and provide inaccurate or false responses. Developers are increasingly
excited about generative AI and Retrieval Augmented Generation (RAG)

or has no contextual knowledge of a topic, it is more likely to hallucinate
and provide inaccurate or false responses. Developers are increasingly
excited about generative AI and Retrieval Augmented Generation (RAG)

or has no contextual knowledge of a topic, it is more likely to hallucinate
and provide inaccurate or false responses. Developers are increasingly
excited about generative AI and Retrieval Augmented Generation (RAG)

or has no con