In [1]:
import os
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(), override=True)

True

In [6]:
!pip install pypdf
!pip install docx2txt
!pip install wikipedia

You should consider upgrading via the '/Users/christoschristodoulou/projects/langchain-gdpr/venv/bin/python3 -m pip install --upgrade pip' command.[0m
Collecting docx2txt
  Downloading docx2txt-0.8.tar.gz (2.8 kB)
Using legacy 'setup.py install' for docx2txt, since package 'wheel' is not installed.
Installing collected packages: docx2txt
    Running setup.py install for docx2txt ... [?25ldone
[?25hSuccessfully installed docx2txt-0.8
You should consider upgrading via the '/Users/christoschristodoulou/projects/langchain-gdpr/venv/bin/python3 -m pip install --upgrade pip' command.[0m


In [47]:
def load_document(file):
    """Load a PDF or DOCX file with the matching LangChain loader.

    The loader is selected from the file extension, compared
    case-insensitively so that names such as ``report.PDF`` are accepted.

    Args:
        file: Path of the document to load.

    Returns:
        The result of the selected loader's ``load()`` call, or ``None`` when
        the extension is not supported.
    """
    import os
    name, extension = os.path.splitext(file)
    # Compare on the lower-cased extension; messages keep the original spelling.
    kind = extension.lower()

    if kind == '.pdf':
        from langchain.document_loaders import PyPDFLoader
        loader_cls = PyPDFLoader
    elif kind == '.docx':
        from langchain.document_loaders import Docx2txtLoader
        loader_cls = Docx2txtLoader
    else:
        print(f'Document format {extension} is not supported')
        return None

    print(f'Loading {file} of type {extension}')
    loader = loader_cls(file)
    data = loader.load()
    return data

In [67]:
# Wikipedia-backed document loading
def load_from_wikipedia(query, lang='en', load_max_docs=2):
    """Fetch documents from Wikipedia through LangChain's WikipediaLoader.

    Args:
        query: Search query handed to the loader.
        lang: Language code handed to the loader (default ``'en'``).
        load_max_docs: Upper bound on documents handed to the loader (default 2).

    Returns:
        The result of the loader's ``load()`` call.
    """
    from langchain.document_loaders import WikipediaLoader
    print(f'Loading from wikipedia documents regarding the query: {query}')
    loader_options = {'query': query, 'lang': lang, 'load_max_docs': load_max_docs}
    return WikipediaLoader(**loader_options).load()

In [48]:
def chunk_data(data, chunk_size=256, chunk_overlap=0):
    """Split loaded documents into smaller chunks.

    Args:
        data: Documents to split, as returned by one of the loaders.
        chunk_size: Value passed to the splitter as ``chunk_size`` (default 256).
        chunk_overlap: Value passed to the splitter as ``chunk_overlap``.
            Defaults to 0, which matches the previously hard-coded setting.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents(data)``.
    """
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap)
    chunks = text_splitter.split_documents(data)
    return chunks

### Running Code

In [49]:
# Load the sample PDF, then display the element at index 1 of the loaded
# result as a quick sanity check that loading worked.
data = load_document('./files/attention.pdf')
data[1]

Loading ./files/attention.pdf of type .pdf


Document(page_content='1 Introduction\nRecurrent neural networks, long short-term memory [ 13] and gated recurrent [ 7] neural networks\nin particular, have been firmly established as state of the art approaches in sequence modeling and\ntransduction problems such as language modeling and machine translation [ 35,2,5]. Numerous\nefforts have since continued to push the boundaries of recurrent language models and encoder-decoder\narchitectures [38, 24, 15].\nRecurrent models typically factor computation along the symbol positions of the input and output\nsequences. Aligning the positions to steps in computation time, they generate a sequence of hidden\nstates ht, as a function of the previous hidden state ht−1and the input for position t. This inherently\nsequential nature precludes parallelization within training examples, which becomes critical at longer\nsequence lengths, as memory constraints limit batching across examples. Recent work has achieved\nsignificant improvements in compu

In [50]:
# Split the loaded document with chunk_data's default settings, report how
# many chunks were produced, and print the text of the chunk at index 10.
chunks = chunk_data(data)
print(len(chunks))
print(chunks[10].page_content)

192
tensor2tensor. Llion also experimented with novel model variants, was responsible for our initial codebase, and
efficient inference and visualizations. Lukasz and Aidan spent countless long days designing various parts of and


### Calculate the cost of chunks

In [51]:
def print_embedding_cost(texts, price_per_1k_tokens=0.0004):
    """Print the token count and estimated embedding cost for ``texts``.

    Tokens are counted with the tiktoken encoding for
    'text-embedding-ada-002'.

    Args:
        texts: Iterable of objects exposing a ``page_content`` string.
        price_per_1k_tokens: Price in USD per 1000 tokens. Defaults to 0.0004,
            the value that was previously hard-coded; pass the current price
            of the model you actually use.

    Returns:
        None. The figures are printed.
    """
    import tiktoken
    enc = tiktoken.encoding_for_model('text-embedding-ada-002')
    # A generator avoids building a throwaway list just to sum it.
    total_tokens = sum(len(enc.encode(page.page_content)) for page in texts)
    print(f'Total Tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {total_tokens / 1000 * price_per_1k_tokens:.6f}')
    
# Estimate the embedding cost of the chunks produced above before uploading them.
print_embedding_cost(chunks)

Total Tokens: 10045
Embedding Cost in USD: 0.004018


### Embed and upload to the vector database (Pinecone)

In [52]:
def insert_or_fetch_embeddings(index_name, documents=None):
    """Return a Pinecone vector store for ``index_name``, creating it if needed.

    If the index already exists it is opened as-is. Otherwise the index is
    created and ``documents`` are embedded and uploaded into it.

    Args:
        index_name: Name of the Pinecone index to open or create.
        documents: Chunks to embed when the index has to be created. When
            omitted, the notebook-level ``chunks`` variable is used, which is
            what this function previously read implicitly.

    Returns:
        The LangChain ``Pinecone`` vector store bound to ``index_name``.

    Raises:
        NameError: If the index must be created, ``documents`` is omitted and
            no global ``chunks`` exists. This is raised before any index is
            created, so no empty index is left behind.
    """
    import os
    import pinecone
    from langchain.vectorstores import Pinecone
    from langchain.embeddings.openai import OpenAIEmbeddings

    embeddings = OpenAIEmbeddings()
    pinecone.init(api_key=os.environ.get('PINECONE_API_KEY'), environment=os.environ.get('PINECONE_ENV'))

    if index_name in pinecone.list_indexes():
        print(f'Index {index_name} already exists')
        print(f'Loading embeddings')
        return Pinecone.from_existing_index(index_name, embeddings)

    if documents is None:
        # Backward compatibility: fall back to the notebook-level `chunks`.
        # Resolved before create_index so a missing variable fails early.
        documents = chunks

    # The trailing separator keeps this message from running into the next one.
    print(f'Creating {index_name} and embeddings ... ', end='')
    # NOTE(review): dimension=1536 is assumed to match the vector size of the
    # default OpenAIEmbeddings model - confirm if the embedding model changes.
    pinecone.create_index(index_name, dimension=1536, metric='cosine')
    vector_store = Pinecone.from_documents(documents, embeddings, index_name=index_name)
    print(f'Index {index_name} created.')

    return vector_store

In [53]:
def delete_pinecone_index(index_name='all'):
    """Delete one Pinecone index, or every index when ``index_name`` is 'all'.

    Args:
        index_name: Name of the index to delete. The default ``'all'`` deletes
            every index returned by ``pinecone.list_indexes()``.

    Returns:
        None. Progress is printed.
    """
    import os
    import pinecone
    pinecone.init(api_key=os.environ.get('PINECONE_API_KEY'), environment=os.environ.get('PINECONE_ENV'))

    if index_name == 'all':
        indexes = pinecone.list_indexes()
        print(f'Delete all indexes')

        for index in indexes:
            pinecone.delete_index(index)
            print(f'Ok..')
    else:
        print(f'Deleting {index_name} index')
        # This branch previously only printed the message and deleted nothing.
        pinecone.delete_index(index_name)
        print(f'Ok..')
        
        
        

In [54]:
# Called without arguments, index_name defaults to 'all', so every index
# returned by pinecone.list_indexes() is deleted. This is destructive.
delete_pinecone_index()

Delete all indexes
Ok..


### Create and upload embeddings to the vector database

In [55]:
# Open the index if it already exists; otherwise it is created and the
# notebook-level `chunks` are embedded and uploaded into it.
index_name = 'askdocument'
vector_store = insert_or_fetch_embeddings(index_name)

Creating askdocument and embeddingsIndex askdocument created.


### Ask and get answer about a document

In [56]:
def ask_and_get_answer(vector_store, query, k=3, temperature=1):
    """Answer ``query`` with a RetrievalQA chain over ``vector_store``.

    Args:
        vector_store: Vector store exposing ``as_retriever``.
        query: Question to ask.
        k: Value passed to the similarity retriever as ``search_kwargs['k']``.
            Defaults to 3, the previously hard-coded value.
        temperature: Sampling temperature passed to the chat model. Defaults
            to 1, the previously hard-coded value; lower it if you want less
            varied answers.

    Returns:
        The result of ``chain.run(query)``.
    """
    from langchain.chains import RetrievalQA
    from langchain.chat_models import ChatOpenAI

    llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=temperature)
    retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': k})
    # "stuff" is the chain type handed to RetrievalQA.from_chain_type.
    chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)
    answer = chain.run(query)
    return answer

def ask_with_memory(vector_store, question, chat_history=None):
    """Ask ``question`` through a ConversationalRetrievalChain, keeping history.

    Args:
        vector_store: Vector store exposing ``as_retriever``.
        question: Question to ask.
        chat_history: List of ``(question, answer)`` tuples from earlier turns.
            When omitted, a new empty history is started for this call. A list
            passed in is extended in place.

    Returns:
        A ``(result, chat_history)`` tuple: the chain's result mapping and the
        history including the current turn.
    """
    from langchain.chains import ConversationalRetrievalChain
    from langchain.chat_models import ChatOpenAI

    # A mutable default ([]) would be shared by every call that omits
    # chat_history, leaking one conversation into the next.
    if chat_history is None:
        chat_history = []

    llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=1)
    retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': 3})

    crc = ConversationalRetrievalChain.from_llm(llm, retriever)
    result = crc({'question': question, 'chat_history': chat_history})
    # list.append takes exactly one argument; store the turn as a single tuple.
    chat_history.append((question, result['answer']))

    return result, chat_history

In [61]:
# One-off question against the vector store. The bare `answer` on the last
# line displays the returned text as the cell output.
q = 'Give me the context of the paper, and what is it about?'
answer = ask_and_get_answer(vector_store, q)
answer

'The paper is titled "Attention Is All You Need" and is authored by Ashish Vaswani from Google Brain. The paper is published in the Proceedings of the 21st International Conference on Computational Linguistics and 44th Annual Meeting of the ACL in July 2006. \n\nUnfortunately, the provided context does not include a summary or abstract of the paper, so it is unclear what the paper is specifically about.'

In [1]:
import time

# Interactive Q&A loop: keeps prompting until the user types quit/exit
# (case-insensitive). Needs `vector_store` and `ask_and_get_answer` to be
# defined already in the current kernel session.
print('Write Quit or Exit to quit.')
i = 1
while True:
    q = input(f'Question #{i}')
    i += 1
    if q.lower() not in ['quit', 'exit']:
        answer = ask_and_get_answer(vector_store, q)
        print(f'\nAnswer: {answer}')
        print(f'\n {"=" * 50} \n')
        continue
    # Exit word received: say goodbye, pause briefly, then leave the loop.
    print('Quitting ... bye!')
    time.sleep(2)
    break

Write Quit or Exit to quit.


Question #1 test


NameError: name 'ask_and_get_answer' is not defined

## Using the Wikipedia loader

### Instructions

1. Delete the index holding any previous embeddings.
2. Fetch the Wikipedia pages we want, based on the query and language.
3. Ask questions in the same manner as before.

In [None]:
# Called without arguments, index_name defaults to 'all', so every existing
# index is deleted before the Wikipedia embeddings are created. Destructive.
delete_pinecone_index()

In [None]:
# Fetch the pages for this query in English ('en' is passed as `lang`;
# load_max_docs keeps its default) and display the loaded result.
data = load_from_wikipedia('Transformer_(machine_learning_model)', 'en')
data

In [None]:
# Load the Wikipedia pages, chunk them, and embed them into their own index.
data = load_from_wikipedia('Transformer_(machine_learning_model)', 'en')
# `chunks` must be rebound here: insert_or_fetch_embeddings reads the
# notebook-level `chunks` when it has to create the index.
chunks = chunk_data(data)
# The new name used to be assigned to `index` while the call below still
# passed the stale `index_name` ('askdocument'); bind the name actually used.
index_name = 'attention'
vector_store = insert_or_fetch_embeddings(index_name)

In [None]:
# Ask a question against the Wikipedia-backed vector store; the bare `answer`
# on the last line displays the returned text as the cell output.
q = 'What is transformers'
answer = ask_and_get_answer(vector_store, q)
answer