In [2]:
import os
import wikipedia
from langchain.document_loaders \
    import PyPDFLoader, Docx2txtLoader, TextLoader, WikipediaLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pinecone
from langchain.vectorstores import Pinecone as PineconeLangChain
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chains import ConversationalRetrievalChain, RetrievalQA
from langchain.chat_models import ChatOpenAI
from dotenv import load_dotenv, find_dotenv

# Load settings from a .env file located by find_dotenv() into the process
# environment. The Pinecone helpers below read PINECONE_API_KEY and PINECONE_ENV
# from os.environ. NOTE(review): override=False presumably keeps variables that
# are already set in the environment — confirm against python-dotenv.
load_dotenv(find_dotenv(), override=False) 

def load_document(file):
    """Load a local document with the loader that matches its file extension.

    Supported extensions (matched case-insensitively): ``.pdf``, ``.docx``,
    ``.txt`` and ``.py``.

    Args:
        file: Path of the document to load.

    Returns:
        Whatever the selected loader's ``load()`` returns, or ``None`` when the
        extension is not supported (a message is printed in that case).
    """
    _, extension = os.path.splitext(file)
    # Normalise the case so that "REPORT.PDF" is handled like "report.pdf".
    extension = extension.lower()

    if extension == '.pdf':
        loader_cls = PyPDFLoader
    elif extension == '.docx':
        loader_cls = Docx2txtLoader
    elif extension in ('.txt', '.py'):
        loader_cls = TextLoader
    else:
        print('Document format is not supported!')
        return None

    # Same progress message for every supported format.
    print(f'Loading {file}')
    return loader_cls(file).load()

  from tqdm.autonotebook import tqdm


In [3]:
def load_from_wikipedia(query, lang='en', load_max_docs=2):
    """Fetch documents for a Wikipedia query, echo them, and return them.

    Args:
        query: Search text handed to ``WikipediaLoader``.
        lang: Language code handed to ``WikipediaLoader`` (default ``'en'``).
        load_max_docs: Value forwarded as ``load_max_docs`` (default 2).

    Returns:
        The object returned by ``WikipediaLoader(...).load()``.
    """
    documents = WikipediaLoader(
        query=query,
        lang=lang,
        load_max_docs=load_max_docs,
    ).load()
    # The loaded documents are printed in full before being returned.
    print(documents)
    return documents

In [4]:
def chunk_data(data, chunk_size=256, chunk_overlap=0):
    """Split loaded documents into chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        data: Documents to split, as returned by a loader's ``load()``.
        chunk_size: Forwarded to the splitter as ``chunk_size`` (default 256).
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``. Defaults
            to 0, which was previously the hard-coded value.

    Returns:
        The result of the splitter's ``split_documents(data)``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(data)

In [5]:
def insert_or_fetch_embeddings(index_name, chunks=None):
    """Return a Pinecone-backed vector store, creating the index when missing.

    If ``index_name`` already exists, the store is opened on the existing index.
    Otherwise a 1536-dimensional cosine index is created and populated from
    ``chunks``.

    Args:
        index_name: Name of the Pinecone index to open or create.
        chunks: Documents to embed when the index has to be created. When
            omitted, the module-level ``chunks`` variable is used, which keeps
            existing one-argument calls working.

    Returns:
        The ``PineconeLangChain`` vector store for ``index_name``.

    Raises:
        ValueError: If the index must be created but no chunks are available.
    """
    embeddings = OpenAIEmbeddings()
    api_key = os.environ.get("PINECONE_API_KEY")
    environment = os.environ.get("PINECONE_ENV")
    # Security: the API key is a secret and must never be printed. Notebook
    # output is saved with the file and would leak it. Only the environment
    # name is shown.
    print(f"PINECONE_API_ENV={environment}")
    pinecone.init(api_key=api_key, environment=environment)

    if index_name in pinecone.list_indexes():
        print(f'Index {index_name} already exists. Loading embeddings... ', end='')
        vector_store = PineconeLangChain.from_existing_index(index_name, embeddings)
        print('Ok')
    else:
        if chunks is None:
            # Backward compatibility: earlier callers set a global `chunks`.
            chunks = globals().get('chunks')
        if chunks is None:
            # Fail before creating the index so no empty index is left behind.
            raise ValueError(
                f'Index {index_name} does not exist and no chunks were provided to build it.'
            )
        print(f'Creating index {index_name} and embeddings...', end='')
        # NOTE(review): dimension=1536 presumably matches the embedding size of
        # the OpenAI model in use — confirm if the embedding model changes.
        pinecone.create_index(index_name, dimension=1536, metric='cosine')
        vector_store = PineconeLangChain.from_documents(chunks, embeddings, index_name=index_name)
        print('Ok')
    return vector_store

In [6]:
def delete_pinecone_index(index_name='all'):
    """Delete one Pinecone index, or every index when ``index_name`` is 'all'.

    Args:
        index_name: Name of the index to delete. The default ``'all'`` deletes
            every index returned by ``pinecone.list_indexes()``. This is
            destructive, so pass an explicit name when other indexes must survive.
    """
    api_key = os.environ.get("PINECONE_API_KEY")
    environment = os.environ.get("PINECONE_ENV")
    # Security: the API key is a secret and must never be printed. Notebook
    # output is saved with the file and would leak it. Only the environment
    # name is shown.
    print(f"PINECONE_API_ENV={environment}")
    pinecone.init(api_key=api_key, environment=environment)

    if index_name == 'all':
        indexes = pinecone.list_indexes()
        print('Deleting all indexes... ')
        for index in indexes:
            pinecone.delete_index(index)
        print('Ok')
    else:
        print(f'Deleting index {index_name}...', end='')
        pinecone.delete_index(index_name)
        print('Ok')

In [7]:
def ask_and_get_answer(vector_store, q):
    """Answer a single question using documents retrieved from the vector store.

    Builds a ``RetrievalQA`` chain of type "stuff" on top of a
    ``gpt-3.5-turbo`` chat model (temperature 1) and a similarity retriever
    configured with ``k=3``, then runs the question through it.

    Args:
        vector_store: Object exposing ``as_retriever(search_type, search_kwargs)``.
        q: The question text.

    Returns:
        Whatever ``chain.run(q)`` returns.
    """
    qa_chain = RetrievalQA.from_chain_type(
        llm=ChatOpenAI(model='gpt-3.5-turbo', temperature=1),
        chain_type="stuff",
        retriever=vector_store.as_retriever(
            search_type='similarity',
            search_kwargs={'k': 3},
        ),
    )
    return qa_chain.run(q)

In [40]:
from langchain.memory import ConversationBufferMemory
from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, MessagesPlaceholder
from langchain.schema import SystemMessage

# Disabled draft: everything between the triple quotes below is a bare string
# literal, so none of it is executed. It sketches a memory-backed variant of
# ask_with_memory built on ConversationBufferMemory and a custom prompt.
# NOTE(review): the draft's prompt declares the input variable "chat_history"
# but its MessagesPlaceholder uses "chatbot_history" — reconcile the names
# before re-enabling, or delete the draft if it is no longer needed.
"""
chat_memory = ConversationBufferMemory(
    memory_key='chat_history',
    return_messages=True
)

chatbot_prompt = ChatPromptTemplate(
    input_variables=["content", "chat_history"],
    messages=[
        SystemMessage(content="You are a chatbot having a conversation with a human."),
        MessagesPlaceholder(variable_name="chatbot_history"),
        HumanMessagePromptTemplate.from_template("{content}")
    ]
)

llm = ChatOpenAI(temperature=1)
retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': 3})
crc = ConversationalRetrievalChain.from_llm(llm, retriever, memory=chat_memory)

def ask_with_memory(vector_store, question, chat_history, crc):
    result = crc({'question': question, 'chat_history': chat_history})
    chat_history.append((question, result['answer']))
    return result, chat_history
"""

def ask_with_memory(vector_store, question, chat_history=None):
    """Ask a question through a ConversationalRetrievalChain and record the turn.

    A new chain (chat model with temperature 1, similarity retriever with
    ``k=3``) is built on every call and invoked with the question and the
    supplied history. The ``(question, answer)`` pair is then appended to the
    history.

    Args:
        vector_store: Object exposing ``as_retriever(search_type, search_kwargs)``.
        question: The user's question.
        chat_history: List of ``(question, answer)`` tuples from earlier turns.
            A passed-in list is appended to in place. When omitted, a fresh
            list is created for this call.

    Returns:
        A ``(result, chat_history)`` tuple: the chain's result mapping (the
        answer is under ``'answer'``) and the updated history list.
    """
    # A `[]` default would be created once and shared by every call that omits
    # the argument, leaking turns between unrelated conversations.
    if chat_history is None:
        chat_history = []

    # ChatOpenAI and ConversationalRetrievalChain come from the notebook's
    # top-level import cell; the duplicate function-local imports were removed.
    llm = ChatOpenAI(temperature=1)
    retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': 3})

    crc = ConversationalRetrievalChain.from_llm(llm, retriever)
    result = crc({'question': question, 'chat_history': chat_history})
    chat_history.append((question, result['answer']))

    return result, chat_history

In [None]:
# Interactive chatbot loop.
#   "initial"  - ask for a source (local document or Wikipedia), load it and
#                index it in Pinecone; on failure, stay here and ask again.
#   "question" - answer questions about the indexed source until the user quits.
i = 1
chat_history = []
chatbot_mode = "initial"
print("Write Quit or Exit to quit")
while True:
    if chatbot_mode == "initial":
        q = input("Write 1 for a document or 2 for Wikipedia")
        if q == "1":
            q = input("Share the path of the document")
            source_loader = load_document
        elif q == "2":
            q = input("Share the Wikipedia query")
            source_loader = load_from_wikipedia
        elif q.lower() in ["quit", "exit"]:
            print("See you soon!")
            break
        else:
            continue

        # Honour quit/exit at the path/query prompt too, before loading anything.
        if q.lower() in ["quit", "exit"]:
            print("See you soon!")
            break

        data = source_loader(q)
        if not data:
            # Unsupported format or empty result. Previously the mode was left
            # at "process", which no branch handles, so the loop spun forever
            # without prompting again.
            print("No content could be loaded. Please try again.")
            continue

        # Deletes every index in the account before rebuilding.
        delete_pinecone_index()
        # `chunks` stays a module-level name because
        # insert_or_fetch_embeddings(index_name) reads it when building the index.
        chunks = chunk_data(data)
        index_name = 'chatgpt'
        vector_store = insert_or_fetch_embeddings(index_name)
        chatbot_mode = "question"
        chat_history = []
        i = 1

    q = input(f"Question #{i}")
    i = i + 1
    if q.lower() in ["quit", "exit"]:
        print("See you soon!")
        break

    result, chat_history = ask_with_memory(vector_store, q, chat_history)
    print(result['answer'])
    print("----------------------------------------------------------------------")

Write Quit or Exit to quit


Write 1 for a document or 2 for Wikipedia 1
Share the path of the document C:\Users\claud\Downloads\PythonWebDeveloper.pdf


Loading C:\Users\claud\Downloads\PythonWebDeveloper.pdf
PINECONE_API_KEY=<redacted>
PINECONE_API_ENV=gcp-starter
Deleting all indexes... 
Ok
PINECONE_API_KEY=<redacted>
PINECONE_API_ENV=gcp-starter
Creating index chatgpt and embeddings...Ok


Question #1 What are the main requirements of this job description?


The main requirements for this job description include:
- Advanced skills in Python programming language.
- Experience in developing and maintaining web applications.
- Proficiency in implementing microservices and applying domain-driven design principles.
- Familiarity with RESTful API architecture pattern.
- Strong command of Python and web development frameworks.
----------------------------------------------------------------------


Question #2 What was my last question?


Desculpe, mas eu não consigo acessar o histórico de perguntas anteriores.
----------------------------------------------------------------------


Question #3 print(chat_history)


Desculpe, mas não tenho acesso ao histórico das perguntas anteriores. Portanto, não consigo responder à sua pergunta sobre qual foi a sua última pergunta.
----------------------------------------------------------------------
