### Installing Packages

In [None]:
%pip install python-dotenv -q

In [None]:
%pip install docx2txt -q

In [None]:
%pip install pypdf -q

In [None]:
%pip install wikipedia -q

### Load Environment Variables

In [None]:
# `os` is used by the Pinecone helper functions below to read API settings
# from the environment.
import os
from dotenv import load_dotenv, find_dotenv
# Locate a .env file and load it; override=True lets values from the file
# replace variables that are already set in the process environment.
load_dotenv(find_dotenv(), override=True)

### Defining Functions

In [None]:
def load_document(file):
  """Load a PDF or DOCX file with the matching LangChain document loader.

  Args:
    file: Path of the document. The loader is chosen from the file
      extension, which is compared case-insensitively.

  Returns:
    The result of the selected loader's ``load()`` call, or ``None`` (after
    printing a message) when the extension is not ``.pdf`` or ``.docx``.
  """
  import os

  _, extension = os.path.splitext(file)
  extension = extension.lower()  # accept '.PDF' / '.Docx' as well

  # Each loader is imported only in the branch that actually uses it.
  if extension == '.pdf':
    from langchain.document_loaders import PyPDFLoader
    loader = PyPDFLoader(file)
  elif extension == '.docx':
    from langchain.document_loaders import Docx2txtLoader
    loader = Docx2txtLoader(file)
  else:
    print('Document format is not supported!')
    return None

  return loader.load()

In [None]:
def load_from_wikipedia(query, lang='en', load_max_docs=2):
  """Load Wikipedia content for a query through LangChain's WikipediaLoader.

  Args:
    query: Search text handed to the loader.
    lang: Language code handed to the loader (default ``'en'``).
    load_max_docs: Value forwarded as the loader's ``load_max_docs`` option.

  Returns:
    Whatever ``WikipediaLoader.load()`` returns.
  """
  from langchain.document_loaders import WikipediaLoader

  return WikipediaLoader(
    query=query,
    lang=lang,
    load_max_docs=load_max_docs,
  ).load()

In [None]:
def chunk_data(data, chunk_size=256, chunk_overlap=100):
  """Split loaded documents into overlapping chunks.

  Args:
    data: Documents to split, as accepted by
      ``RecursiveCharacterTextSplitter.split_documents``.
    chunk_size: Chunk size forwarded to the splitter. Previously this
      argument was ignored and 512 was always used.
    chunk_overlap: Overlap between consecutive chunks forwarded to the
      splitter (default 100, the value that used to be hard-coded).

  Returns:
    The list of chunks produced by the splitter.
  """
  from langchain.text_splitter import RecursiveCharacterTextSplitter

  text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
  )
  return text_splitter.split_documents(data)

### Calculating Cost

In [None]:
def print_embedding_cost(texts):
  """Print the token count and estimated embedding cost of ``texts``.

  Tokens are counted with tiktoken's encoding for
  'text-embedding-ada-002'; the cost is computed at 0.0004 USD per
  1000 tokens.

  Args:
    texts: Iterable of objects exposing a ``page_content`` string.
  """
  import tiktoken

  encoder = tiktoken.encoding_for_model('text-embedding-ada-002')

  # Accumulate the token count document by document.
  total_tokens = 0
  for doc in texts:
    total_tokens += len(encoder.encode(doc.page_content))

  print(f'Total Tokens: {total_tokens}')
  print(f'Embedding Cost in USD {total_tokens / 1000 * 0.0004:.6f}')

### Embedding and Uploading to a Vector Database (Pinecone)

In [None]:
def insert_or_fetch_embeddings(index_name, chunks=None):
  """Return a Pinecone vector store for ``index_name``, creating it if needed.

  If the index already exists it is opened as-is. Otherwise a new index
  (dimension 1536, cosine metric) is created and ``chunks`` are embedded
  with ``OpenAIEmbeddings`` and uploaded into it.

  Args:
    index_name: Name of the Pinecone index to load or create.
    chunks: Documents to embed when the index has to be created. When
      omitted, the notebook-level variable ``chunks`` is used, which keeps
      existing one-argument calls working.

  Returns:
    The LangChain ``Pinecone`` vector store bound to ``index_name``.

  Raises:
    ValueError: If the index does not exist and no chunks are available.
  """
  import os

  import pinecone
  from langchain.vectorstores import Pinecone
  from langchain.embeddings.openai import OpenAIEmbeddings

  embeddings = OpenAIEmbeddings()

  pinecone.init(api_key=os.environ.get('PINECONE_API_KEY'), environment=os.environ.get('PINECONE_ENVIRONMENT'))

  if index_name in pinecone.list_indexes():
    print(f'Index {index_name} already exists. Loading embeddings ...', end='')
    vector_store = Pinecone.from_existing_index(index_name, embeddings)
    print('Ok')
    return vector_store

  # The index does not exist: resolve the documents BEFORE creating it so a
  # missing input cannot leave an empty index behind.
  if chunks is None:
    chunks = globals().get('chunks')
  if chunks is None:
    raise ValueError(
      f'Index {index_name} does not exist and no chunks were provided to create it.'
    )

  print(f'Creating index {index_name} and embeddings ...', end='')
  pinecone.create_index(index_name, dimension=1536, metric='cosine')
  vector_store = Pinecone.from_documents(chunks, embeddings, index_name=index_name)
  print('Ok')

  return vector_store

In [None]:
def delete_pinecone_index(index_name='all'):
  """Delete one Pinecone index, or every index when ``index_name`` is 'all'.

  Args:
    index_name: Name of the index to delete. The default ``'all'`` deletes
      every index returned by ``pinecone.list_indexes()``.
  """
  import os

  import pinecone

  pinecone.init(api_key=os.environ.get('PINECONE_API_KEY'), environment=os.environ.get('PINECONE_ENVIRONMENT'))

  if index_name == 'all':
    # Announce the bulk deletion once instead of once per index.
    print('Deleting all indexes ... ', end='')
    for index in pinecone.list_indexes():
      pinecone.delete_index(index)
    print('Ok')
  else:
    print(f'Deleting index {index_name} ...', end='')
    pinecone.delete_index(index_name)
    print('Ok')

### Asking and Getting Answers

In [None]:
def ask_and_get_answer(vector_store, q):
  """Answer a single question from the vector store, without chat history.

  Args:
    vector_store: Object exposing ``as_retriever``; used to fetch the
      5 most similar entries for the question.
    q: The question text.

  Returns:
    The value returned by the RetrievalQA chain's ``run`` call.
  """
  from langchain.chains import RetrievalQA  # stateless: no memory, no chat history
  from langchain.chat_models import ChatOpenAI

  chat_model = ChatOpenAI(temperature=1)
  top_k_retriever = vector_store.as_retriever(
    search_type='similarity',
    search_kwargs={"k": 5},
  )

  # chain_type="stuff" (the default) uses ALL of the retrieved text at once.
  qa_chain = RetrievalQA.from_chain_type(
    llm=chat_model,
    chain_type="stuff",
    retriever=top_k_retriever,
  )
  return qa_chain.run(q)

In [None]:
def ask_with_memory(vector_store, q, chat_history=None):
  """Answer a question while taking the previous conversation into account.

  Args:
    vector_store: Object exposing ``as_retriever``; used to fetch the
      5 most similar entries for the question.
    q: The question text.
    chat_history: List of ``(question, answer)`` tuples from earlier turns.
      A fresh list is created when omitted; a list passed in is extended
      in place with the new turn.

  Returns:
    A ``(result, chat_history)`` tuple: the chain's result mapping (the
    answer is under ``result['answer']``) and the updated history.
  """
  from langchain.chains import ConversationalRetrievalChain  # consumes chat history
  from langchain.chat_models import ChatOpenAI

  # A mutable default would be shared between unrelated calls.
  if chat_history is None:
    chat_history = []

  llm = ChatOpenAI(temperature=1)

  retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={"k":5})

  crc = ConversationalRetrievalChain.from_llm(llm, retriever)
  result = crc({'question': q, 'chat_history': chat_history})
  # Record the question that was actually asked (the parameter), not a
  # notebook-level variable that happens to be named `question`.
  chat_history.append((q, result['answer']))

  return result, chat_history

### Running Code

In [None]:
# Load the PDF and report how many entries the loader returned.
data = load_document('us_constitution.pdf')

print(f'You have {len(data)} pages in your data')
# Index 20 is hard-coded, so this line needs at least 21 entries in `data`.
print(f'There are {len(data[20].page_content)} chars in one page.')

In [None]:
# Load a .docx file (this replaces the previous `data`) and print the text
# of its first entry.
data = load_document('the_great_gatsby.docx')
print(data[0].page_content)

In [None]:
# Load Wikipedia content for 'GPT-4' with lang='de' (this replaces the
# previous `data`) and print the text of the first entry.
data = load_from_wikipedia('GPT-4', 'de')
print(data[0].page_content)

In [None]:
# Split whatever `data` currently holds into chunks, show how many chunks
# were produced, and print the estimated embedding cost.
chunks = chunk_data(data)
print(len(chunks))
# print(chunks[10].page_content)
print_embedding_cost(chunks)

In [None]:
# Called without arguments, index_name defaults to 'all', so this deletes
# every index in the Pinecone account.
delete_pinecone_index()

In [None]:
# Create (or open) the index that holds the embeddings of the current chunks.
# The name previously contained a typo ('us-consitution').
index_name = 'us-constitution'
vector_store = insert_or_fetch_embeddings(index_name)

In [None]:
# Ask one stand-alone question against the current vector store and print
# the answer.
q = 'What is the whole document about?'
answer = ask_and_get_answer(vector_store, q)
print(answer)

In [None]:
import time

# Interactive loop: keep asking questions against the current vector store
# until the user types "quit" or "exit" (case-insensitive).
i = 1
print('Write Quit or Exit to quit.')
while True:
  q = input(f'\nQuestion #{i}: ')
  i = i + 1
  # Strip surrounding whitespace so inputs like "quit " still end the loop.
  if q.strip().lower() in ['quit', 'exit']:
    print('Quitting... Bye Bye! \n')
    time.sleep(2)
    break

  answer = ask_and_get_answer(vector_store, q)
  # '\n' replaces the invalid escape sequence '\A' that was here before.
  print(f'\nAnswer: {answer}')
  print(f'\n {"-" * 50} \n')

In [None]:
# Load Wikipedia content for 'ChatGPT' with lang='ro', chunk it, and build
# (or open) a separate index for it.
data = load_from_wikipedia('ChatGPT', 'ro')
# insert_or_fetch_embeddings reads the notebook-level `chunks`, so it must be
# assigned before the index is created.
chunks = chunk_data(data)
index_name = 'chatgpt'
vector_store = insert_or_fetch_embeddings(index_name)

In [None]:
# Ask a question in Romanian ("What is ChatGPT?") against the current
# vector store and print the answer.
q = 'Ce este ChatGPT?'
answer = ask_and_get_answer(vector_store, q)
print(answer)

In [None]:
# Ask with memory: start from an empty history and keep the history that
# ask_with_memory returns so follow-up questions can refer to this turn.
# NOTE(review): `vector_store` is whatever the most recently executed cell
# assigned - confirm it points at the index that matches this question.
chat_history = []
question = "How many amendments are in the U.S. Constitution?"
result, chat_history = ask_with_memory(vector_store, question, chat_history)
print(result['answer'])
print(chat_history)

In [None]:
# Follow-up question that depends on the previous answer; the accumulated
# chat_history is passed back in. The prompt was missing the word "by".
question = 'Multiply that number by 2'
result, chat_history = ask_with_memory(vector_store, question, chat_history)
print(result['answer'])
print(chat_history)