# Setup

In [1]:
#model/parser
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

#embedding/vector store
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter

from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain.document_loaders import PyPDFLoader

#system
import os

# API key used by every OpenAI client created in this notebook.
# Read from the environment so the secret never has to be typed into (and
# committed with) the notebook; falls back to the previous empty default.
OPENAI_KEY = os.environ.get("OPENAI_API_KEY", "")

# Process data

In [2]:
# Scan the inbox folder for PDFs that have not been indexed yet, fold them into
# the persisted FAISS index, then move them to the processed folder.
# Leaves `vector` (the FAISS store) and `embeddings` defined for later cells.
UNPROCESSED_DIR = './data/unprocessed_data/'
PROCESSED_DIR = './data/processed_data/'
INDEX_DIR = "./data/faiss_index"

# Only regular .pdf files are handed to PyPDFLoader; anything else sitting in
# the folder (placeholders, OS metadata, sub-folders) is left untouched.
files = [
    x for x in os.listdir(UNPROCESSED_DIR)
    if x.lower().endswith('.pdf') and os.path.isfile(os.path.join(UNPROCESSED_DIR, x))
]
embeddings = OpenAIEmbeddings(openai_api_key = OPENAI_KEY)

# save_local() with the default index name writes "index.faiss" (plus
# "index.pkl") into INDEX_DIR, so its presence tells us whether a previous
# index is available to merge with.
index_exists = os.path.exists(os.path.join(INDEX_DIR, 'index.faiss'))

if len(files)>0:
    print(f'{len(files)} new files found, processing and merging')

    #Load files
    loaders = [PyPDFLoader(os.path.join(UNPROCESSED_DIR, x)) for x in files]

    docs = []
    for loader in loaders:
        docs.extend(loader.load())

    #Process
    # Each loaded document is rewritten by the LLM into cleaner text before it
    # is chunked and embedded (one API call per document returned by the loader).
    llm = ChatOpenAI(openai_api_key = OPENAI_KEY, model_name = 'gpt-3.5-turbo-16k', temperature=0.5)
    splits_prompt = ChatPromptTemplate.from_messages([
        ("system", "Formate o texto para que fique entendível para usar de contexto em uma LLM. me dê apenas o output como resposta:"),
        ("user", "{input}")
    ])
    splits_parser = StrOutputParser()
    splits_chain = splits_prompt | llm | splits_parser
    for doc in docs:
        doc.page_content = splits_chain.invoke({"input": doc.page_content})

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)
    splits = text_splitter.split_documents(docs)

    #Vector merging
    vector = FAISS.from_documents(splits, embeddings)
    if index_exists:
        # NOTE(security): allow_dangerous_deserialization unpickles the stored
        # docstore; only load index folders produced by this notebook.
        old_vector = FAISS.load_local(INDEX_DIR, embeddings, allow_dangerous_deserialization=True)
        vector.merge_from(old_vector)
    # On the very first run there is nothing to merge: the new index is saved as-is.

    vector.save_local(INDEX_DIR)

    # Move the sources out of the inbox only after the index has been saved,
    # so a failure above leaves them queued for the next run.
    os.makedirs(PROCESSED_DIR, exist_ok=True)
    for file in files:
        os.replace(os.path.join(UNPROCESSED_DIR, file), os.path.join(PROCESSED_DIR, file))

else:
    print('0 new files found, loading old vector')
    # NOTE(security): see the note on allow_dangerous_deserialization above.
    vector = FAISS.load_local(INDEX_DIR, embeddings, allow_dangerous_deserialization=True)

1 new files found, processing and merging


# Model

In [3]:
#Chain creation

# Running conversation log; it is passed to the prompt as {history} on each call.
# Re-running this cell resets the conversation.
chat_hist = []

llm = ChatOpenAI(openai_api_key = OPENAI_KEY, model_name = 'gpt-3.5-turbo-16k', temperature=0.5)

# Prompt template: {context} is filled with the retrieved documents, {history}
# and {input} come from the dict passed to retrieval_chain.invoke().
# (The template previously began with a stray '"' caused by a fourth opening quote.)
prompt = ChatPromptTemplate.from_template(
    """Você deve memorizar os editais do IFRS canoas para responder questões sobre ele.

        Cursos de Licenciatura, Bacharel e Tecnólogo são considerados cursos de nível superior.

        Dê informações completas.

        <context>
        {context}
        </context>

        Historico de mensagens: {history}

        Pergunta: {input}""")

#retrieval
document_chain = create_stuff_documents_chain(llm, prompt)

# Plain top-k similarity search over the FAISS index.
# The misspelled 'score_treshold' entry that used to be here was never read by
# the search code, so dropping it keeps retrieval results unchanged.
# NOTE(review): if a relevance cut-off is wanted, use
# search_type="similarity_score_threshold" with a 'score_threshold' key and
# tune the value — confirm against the as_retriever documentation.
retriever = vector.as_retriever(search_kwargs={'k': 10},
                                search_type="similarity")
retrieval_chain = create_retrieval_chain(retriever, document_chain)

# Use

In [5]:
# Question to put to the retrieval chain.
msg = "Qual o salario pago pra quem é monitor?"

# Send the question together with the conversation so far.
chain_inputs = {"input": msg, "history": chat_hist}
response = retrieval_chain.invoke(chain_inputs)

print(response["answer"])

# Record both sides of this turn so the next question carries the history.
chat_hist.extend([
    {'user': msg},
    {'agent': response["answer"]},
])

O valor mensal das bolsas de monitoria está definido da seguinte forma:
- Para uma carga horária de 4 horas semanais, o valor é de R$ 175,00/mês.
- Para uma carga horária de 8 horas semanais, o valor é de R$ 350,00/mês.
