# Project: Question-Answering on Private Documents

## Here is all the package needed

In [10]:
!pip install pypdf -q
#Once install you don't need to explicitly import pypdf after installing it, because it's already there ready to use

In [11]:
!pip install docx2txt -q
#Once install you don't need to explicitly import pypdf after installing it, because it's already there ready to use

In [15]:
import os
from dotenv import load_dotenv, find_dotenv

load_dotenv(find_dotenv(), override=True)

# You can add other package here if needed
from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders import PyPDFLoader

### Loading Documents

In [39]:
# The goal is to loading PDF, DOCX and TXT files as LangChain Documents
def load_document(file):

    name, extension = os.path.splitext(file) # check splitext()

    if extension == '.pdf':
        # Find how to load PDF document with https://python.langchain.com/docs/modules/data_connection/document_loaders/
        print(f'Loading {file}')
        loader = PyPDFLoader(f"{name}{extension}")
        #pages = loader.load_and_split()
    elif extension == '.docx':
        # Find how to load DOCX document with https://python.langchain.com/docs/modules/data_connection/document_loaders/
        print(f'Loading {file}')
        loader = TextLoader(f"{name}{extension}")
    elif extension == '.txt':
        # Find how to load TXT document with https://python.langchain.com/docs/modules/data_connection/document_loaders/
        loader = TextLoader(f"{name}{extension}")
    else:
        print('Document format is not supported!')
        return None

    data = loader.load()

    # Verification
    if len(data) != 0:
        return data
    else:
        print('Document is empty')
        return None
  

#### Verification 

In [40]:
pdf_file="./documents/churchill_speech.pdf"
docx_file="./documents/churchill_speech.docx"
text_file="./documents/churchill_speech.txt"

print(load_document(pdf_file))
print(load_document(docx_file))
print(load_document(text_file))

Loading ./documents/churchill_speech.pdf
[Document(page_content="Winston Churchill Speech - We Shall Fight on the Beaches We Shall Fight on the Beaches June 4, 1940 House of Commons From the moment that the French defenses at Sedan and on the Meuse were broken at the end of the second week of May, only a rapid retreat to Amiens and the south could have saved the BriGsh and French Armies who had entered Belgium at the appeal of the Belgian King; but this strategic fact was not immediately realized. The French High Command hoped they would be able to close the gap, and the Armies of the north were under their orders. Moreover, a reGrement of this kind would have involved almost certainly the destrucGon of the ﬁne Belgian Army of over 20 divisions and the abandonment of the whole of Belgium. Therefore, when the force and scope of the German penetraGon were realized and when a new French Generalissimo, General Weygand, assumed command in place of General Gamelin, an eﬀort was made by the F

RuntimeError: Error loading ./documents/churchill_speech.docx

### Chunking Data

In [41]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

def chunk_data(data, chunk_size=256):
    # Find how to load a built-in document transformers that make it easy to split with https://python.langchain.com/docs/modules/data_connection/document_transformers/
    text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False,
)
    # chunks = split the data into chunks
    chunks = text_splitter.create_documents([data])
    # Verification
    if len(chunks) != 0:
        return chunks
    else:
        print('Document is not split')
        return None
    

#### Verification

In [42]:
data = load_document(text_file)
chunks = chunk_data(data)
print(len(chunks))

TypeError: expected string or bytes-like object

### Embedding and Uploading to a Vector Database (Pinecone)

In [None]:
def insert_or_fetch_embeddings(index_name, chunks):
    from pinecone import Pinecone, PodSpec
    from langchain.vectorstores import Pinecone as Pinecone_langchain
    from langchain.embeddings import CohereEmbeddings

    # Find how to create embeddings instance 
    # embeddings = ?

    # Initialize Pinecone  with API key 
    # pinecone = ?
    
    
    if index_name in pinecone.list_indexes().names():
        print(f'Index {index_name} already exists. Loading embeddings ... ', end='')
        vector_store = Pinecone.from_existing_index(index_name, embeddings)
        print('Ok')
    else:
        print(f'Creating index {index_name} and embeddings ...', end='')
        # Create index
        # TO DO
        # Index the text chunks into Pinecone
        # TO DO - use chunks/embeddings/index_name
       
        print('Ok')
        
    return vector_store
    

#### Verification

In [None]:
index_name="your_index_name"

vectorstore = insert_or_fetch_embeddings(index_name, chunks)
if index_name in pinecone.list_indexes().names():
        print(f'Index {index_name} already exists. Loading embeddings ... ', end='')
        vector_store = Pinecone.from_existing_index(index_name, embeddings)
        print('Verification complete')
else:
    print('Verification incomplete')


In [None]:
# can be helpful
def delete_pinecone_index(index_name='all'):
    
    if index_name == 'all':
        indexes = pinecone.list_indexes().names()
        print('Deleting all indexes ... ')
        for index in indexes:
            pinecone.delete_index(index)
        print('Ok')
    else:
        print(f'Deleting index {index_name} ...', end='')
        pinecone.delete_index(index_name)
        print('Ok')
    

### Asking and Getting Answers

In [None]:
def ask_and_get_answer(vector_store, q):
    from langchain.chains import RetrievalQA
    from langchain.chat_models import ChatCohere

    # llm = ?

    # retriever = ?

    # chain = ?
    
    answer = chain.invoke(q)
    return answer

### Running Code in order to complete this project

In [None]:
data = load_document('path_document')

print(f'You have {len(data)} pages in your data')
print(f'There are {len(data[20].page_content)} characters in the page')

In [None]:
chunks = chunk_data(data)
print(len(chunks))
print(chunks[10].page_content)

In [None]:
delete_pinecone_index()

In [None]:
index_name = 'name_index'
vector_store = insert_or_fetch_embeddings(index_name)

In [None]:
q = 'What is the whole document about?'
answer = ask_and_get_answer(vector_store, q)
print(answer)

In [None]:
import time
i = 1
print('Write Quit or Exit to quit.')
while True:
    q = input(f'Question #{i}: ')
    i = i + 1
    if q.lower() in ['quit', 'exit']:
        print('Quitting ... bye bye!')
        time.sleep(2)
        break
    
    answer = ask_and_get_answer(vector_store, q)
    print(f'\nAnswer: {answer}')
    print(f'\n {"-" * 50} \n')

    

In [None]:
delete_pinecone_index()