# -------------------------------------------------------------------------------------
# Chatting with Court of Accounts Documents
##### Author: Felipe Pedroso
##### version 1.0
##### last revision on 10/12/2023
# -------------------------------------------------------------------------------------


# Importing Libraries and dependencies

In [17]:
!pip -q install langchain openai tiktoken PyPDF2 faiss-cpu
import os
import openai
from PyPDF2 import PdfReader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS 
# Never store the API key in the notebook. Use the value already present in the
# environment; if there is none, ask for it once without echoing it.
# NOTE(review): the key that used to be written on this line has been exposed in the
# notebook and should be revoked.
import getpass
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

# Importing PDFs

In [27]:
from PyPDF2 import PdfReader

def read_SAI_PDF(pdf_dir, page_separator=''):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        pdf_dir: Path of the PDF file, passed straight to ``PdfReader``.
            Despite the name, it identifies a single file, not a directory.
        page_separator: String placed between the texts of consecutive pages.
            The default ``''`` glues pages together exactly as before; pass a
            newline to keep the last line of one page from fusing with the
            first line of the next.

    Returns:
        The texts of all pages for which ``extract_text()`` returned something
        non-empty, joined with ``page_separator``. An empty string if no page
        yielded text.
    """
    doc_reader = PdfReader(pdf_dir)
    # Pages whose extraction yields None or '' are skipped.
    page_texts = (page.extract_text() for page in doc_reader.pages)
    return page_separator.join(text for text in page_texts if text)

# Load the sample report; `raw_text` is what the next cell splits into chunks.
raw_text = read_SAI_PDF('TCE_PIAUI/relcon_sample.pdf')

# Splitting the large PDFs into smaller chunks

In [28]:
# Split on line breaks into chunks of up to 1000 characters (length measured with
# `len`), each chunk overlapping the previous one by 200 characters.
text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=200, length_function=len)

texts = text_splitter.split_text(raw_text)

# Report how many chunks were produced, then display the first one as the cell output.
print('PDF dividido em:\t', len(texts), '\tchunks')
texts[0]

PDF dividido em:	 18 	chunks


'Estado do Piauí  \nTribunal de Contas  Processo 006867/2018  \nPendente de  \njulgamento  \n \n  \n    \n1 DIRETORIA DE FISCALIZAÇÃO DA ADMINISTRAÇÃO MUNICIPAL  \nProcesso  ............  006867/2018                                      Relatório No 3/2017 -Contraditório  \nAssunto  .............  Prestação de Contas Anual – Contas de Governo do Exercício de 2017  \nInteressado  ........  Município de Acauã  Pop: 6749 hab. Coef. 0.6  \nPREFEITO  ..........  Reginaldo Raimundo Rodrigues  \nRelator  ..............  Delano Carneiro da Cunha Câmara  \nProcurador  .........  José Araújo Pinheiro Júnior  \n \n1. RELATÓRIO  \n \nTrata-se da Prestação de Contas Anual – Contas de Governo - do Município Acauã , referente \nao exercício financeiro de 2017 . \n \nA Diretoria de Fiscalização da Administração Municipal – DFAM deste Tribunal, após análise dos \ndocumentos que integram o processo de  prestação de contas do Ente municipal, demonstrou'

# Embedding and storing the chunks

In [46]:
# Embed every chunk in `texts` with OpenAI embeddings and collect the vectors in a
# FAISS index; `docsearch` is the object later cells turn into a retriever.
embeddings = OpenAIEmbeddings()
docsearch = FAISS.from_texts(texts, embeddings)

In [47]:
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI
# "stuff" chain: every supplied document is placed into a single prompt at once.
chain = load_qa_chain(OpenAI(), chain_type="stuff")

# Display the prompt template the chain will send to the model.
chain.llm_chain.prompt.template

"Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\n{context}\n\nQuestion: {question}\nHelpful Answer:"

In [None]:
from langchain.chains import RetrievalQA
# NOTE(review): this cell carries no execution count and is repeated verbatim by a
# later cell that does; one of the two copies is redundant.
# Build a similarity retriever over the FAISS index (k=4 in search_kwargs) and wrap it
# in a "stuff" RetrievalQA chain that also returns the source documents.
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":4})
rqa = RetrievalQA.from_chain_type(llm=OpenAI(), 
                                  chain_type="stuff", 
                                  retriever=retriever, 
                                  return_source_documents=True)

In [49]:
#query = "What's the the text about?"
#docs = docsearch.similarity_search(query)
#chain.run(input_documents=docs, question=query)

In [52]:
from langchain.chains import RetrievalQA
# Similarity retriever over the FAISS index, configured with k=4.
retriever = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 4},
)

# Question-answering chain of type "stuff" on top of the retriever; the source
# documents are returned together with the answer.
rqa = RetrievalQA.from_chain_type(
    llm=OpenAI(),
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)

In [None]:
# Ask the retrieval-QA chain who the mayor is. The question previously misspelled
# "prefeito" as "prefito", which hurts the similarity match against the document.
query = "Quem é o prefeito?"
resposta = rqa(query)

In [51]:
#-----------------------------------------------------------

In [33]:
import os
import openai
# Take the key from the environment (prompting once, without echo, if it is unset)
# instead of hardcoding it in the notebook.
# NOTE(review): the key that used to be written on this line has been exposed and
# should be revoked.
import getpass
openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass.getpass("OpenAI API key: ")

# Instruction appended after the document text in the user message.
query = "Extract name entities (NE) from the text. Return only the name of the county"

# NOTE(review): `detected_text` is not assigned in any visible cell of this notebook;
# it must already exist in the kernel for this cell to run — confirm its origin.
user_msg = "\n\n".join((detected_text, query))
system_msg = "You are an accounting expert."

# One chat completion: the system message sets the persona, the user message carries
# the text followed by the instruction.
response = openai.ChatCompletion.create(
    messages=[
        {"role": "system", "content": system_msg},
        {"role": "user", "content": user_msg},
    ],
    model="gpt-3.5-turbo",
)

In [34]:
# Show only the reply text of the first choice in the response.
print(response.choices[0].message.content)

Buriti dos Lopes


In [24]:
# Dump the whole response object (id, model, choices, token usage) for inspection.
print(response)

{
  "id": "chatcmpl-88KDcAmdEsty8TQyRjHuoTl1x6LYl",
  "object": "chat.completion",
  "created": 1696995036,
  "model": "gpt-3.5-turbo-0613",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "content": "O munic\u00edpio avaliado no texto \u00e9 Buriti dos Lopes."
      },
      "finish_reason": "stop"
    }
  ],
  "usage": {
    "prompt_tokens": 287,
    "completion_tokens": 16,
    "total_tokens": 303
  }
}


# Processing All PDFs

In [43]:
# Folder holding the VOTREL PDFs, given relative to the notebook in the same way as
# the sample PDF loaded earlier.
# NOTE(review): this replaces an absolute path into one user's OneDrive folder; it
# assumes the notebook is run from the folder that contains TCE_PIAUI/ — confirm.
pdf_dir = 'TCE_PIAUI/VOTREL/'

# Sort the listing so the selection below is reproducible (os.listdir returns names in
# arbitrary order) and match on the file extension instead of '.pdf' appearing
# anywhere in the name.
pdf_docs = sorted(name for name in os.listdir(pdf_dir) if name.lower().endswith('.pdf'))
if not pdf_docs:
    # Without this guard the cells below would silently reuse text from an earlier cell.
    raise FileNotFoundError('No PDF files found in ' + pdf_dir)

# Only the first PDF is processed for now.
# NOTE: the index is built after the loop, so if this slice is widened only the last
# PDF read ends up indexed.
for pdf_doc in pdf_docs[0:1]:
    print(pdf_doc)
    raw_text = read_SAI_PDF(os.path.join(pdf_dir, pdf_doc))
    # Re-split the text that was just read. Without this step the index below is built
    # from the chunks of whichever document was split earlier, not from this PDF.
    texts = text_splitter.split_text(raw_text)

# Embed the chunks of the PDF read above and index them.
embeddings = OpenAIEmbeddings()
docsearch = FAISS.from_texts(texts, embeddings)

from langchain.chains import RetrievalQA
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":4})
rqa = RetrievalQA.from_chain_type(llm=OpenAI(),
                                  chain_type="stuff",
                                  retriever=retriever,
                                  return_source_documents=True)

# Alternative question kept from the original cell:
# "Sumarize os principais pontos do trabalho do auditor?"
query = "Extract name entities (NE) from the text. Return only the name of the county"
resposta = rqa(query)
resposta['result']

 


006867_2018_VOTREL - 11342019 - 31102019 - AUD. DELANO C. DA CUNHA CAMARA.pdf
Estado do Piauí
Tribunal de Contas
Gab. do Cons. S 



In [None]:

# NOTE(review): `detected_text` is not assigned in any visible cell of this notebook,
# and `query` is whatever an earlier cell last set — confirm both before running.
user_msg = "\n\n".join((detected_text, query))
system_msg = "You are an accounting expert."

# One chat completion: the system message sets the persona, the user message carries
# the text followed by the instruction.
response = openai.ChatCompletion.create(
    messages=[
        {"role": "system", "content": system_msg},
        {"role": "user", "content": user_msg},
    ],
    model="gpt-3.5-turbo",
)