In [1]:
import re
import requests
import pandas as pd
import numpy as np
import json
import base64
import hashlib
from langchain.chains.combine_documents.base import BaseCombineDocumentsChain
from langchain.chains.question_answering import load_qa_chain
from langchain.chat_models import ChatOpenAI
from langchain.schema import Document
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings
import psycopg2
from pgvector.psycopg2 import register_vector
from google.oauth2.service_account import Credentials
from google.cloud import bigquery
from psycopg2.extras import execute_values

In [31]:
class DatabaseService:
    """Small psycopg2 helper for the pgvector-enabled Postgres database.

    Every public method opens its own connection, registers the pgvector
    adapter on it, runs one statement and closes the connection again.
    Database errors are reported with ``print`` instead of being raised.
    """

    def __init__(self, *, db_user='', db_password='', db_database='', db_port='5432', db_host=''):
        """Build the libpq connection string.

        All settings are optional keyword arguments; their defaults are the
        values that used to be hard-coded here, so ``DatabaseService()``
        behaves as before.
        """
        self.connection_str = f"host='{db_host}' port='{db_port}' dbname='{db_database}' user='{db_user}' password='{db_password}'"

    def _get_database_connection(self):
        """Open a new psycopg2 connection from ``self.connection_str``."""
        return psycopg2.connect(self.connection_str)

    def _open_connection(self):
        """Return a connection with pgvector registered, or ``None`` on failure.

        The failure is printed. If the connection was opened but
        ``register_vector`` failed, the connection is closed before returning.
        """
        conn = None
        try:
            conn = self._get_database_connection()
            register_vector(conn)
        except Exception as e:
            print(f'Connecting to database failed. {e}')
            if conn is not None:
                conn.close()
            return None
        return conn

    def run_select_statement(self, select_statement: str, vars=None):
        """Run a SELECT and return the rows as a list of dicts.

        Args:
            select_statement: SQL text passed to ``cursor.execute``.
            vars: Optional query parameters forwarded to ``cursor.execute``.

        Returns:
            One ``{column_name: value}`` dict per row; ``[]`` when the
            connection or the query fails (the error is printed).
        """
        conn = self._open_connection()
        if conn is None:
            return []
        try:
            cursor = conn.cursor()
            try:
                cursor.execute(select_statement, vars=vars)
                fields = [field_md[0] for field_md in cursor.description]
                result = [dict(zip(fields, row)) for row in cursor.fetchall()]
            finally:
                cursor.close()
        except Exception as e:
            print(f'Fetching resuls from database failed. {e}\nSelect statement: {select_statement}')
            conn.rollback()
            result = []
        finally:
            # The connection is per-call, so always release it.
            conn.close()

        return result

    # Delete all records from a table
    def run_dml_delete_statement(self, table: str):
        """Delete every row of ``table`` and commit.

        ``table`` is interpolated into the SQL text as-is, so it must be a
        trusted identifier. Returns ``[]`` when the connection fails (kept
        from the previous behavior) and ``None`` otherwise; a failing DELETE
        is printed and rolled back.
        """
        conn = self._open_connection()
        if conn is None:
            return []

        try:
            # SQL query to execute
            query = f'delete from {table}'
            cursor = conn.cursor()
            try:
                cursor.execute(query)
                conn.commit()
            except Exception as error:
                print("Error: %s" % error)
                conn.rollback()
            finally:
                cursor.close()
        finally:
            conn.close()

    def run_dml_statement(self, df: "pd.DataFrame", table: str, vars=None):
        """Bulk-insert every row of ``df`` into ``table``.

        Args:
            df: DataFrame whose column names are used as the target column
                list and whose rows become the inserted tuples.
            table: Target table name, interpolated into the SQL text as-is.
            vars: Unused; kept so existing callers keep working.

        Returns:
            ``[]`` when the connection fails (kept from the previous
            behavior) and ``None`` otherwise; a failing insert is printed
            and rolled back.
        """
        conn = self._open_connection()
        if conn is None:
            return []

        try:
            tuples = [tuple(x) for x in df.to_numpy()]
            cols = ','.join(list(df.columns))
            # SQL query to execute; the remaining %s is filled by execute_values.
            query = "INSERT INTO %s(%s) VALUES %%s" % (table, cols)
            cursor = conn.cursor()
            try:
                # `execute_values` is imported directly from psycopg2.extras;
                # the name `extras` is not in scope in this file.
                execute_values(cursor, query, tuples)
                conn.commit()
            except Exception as error:
                print("Error: %s" % error)
                conn.rollback()
            finally:
                cursor.close()
        finally:
            conn.close()

    def run_insert_statement(self, table_name: str, data_list: list):
        """Insert ticket-chunk rows into ``table_name``.

        Args:
            table_name: Target table name, interpolated into the SQL text as-is.
            data_list: Sequence of 10-tuples ordered as the column list in the
                INSERT below.

        Rows that hit a conflict are skipped (``ON CONFLICT DO NOTHING``).
        Returns ``None``; failures are printed and rolled back.
        """
        conn = self._open_connection()
        if conn is None:
            return
        try:
            cursor = conn.cursor()
            try:
                execute_values(cursor, f"""INSERT INTO {table_name} (ticket_id, ticket_comment, ticket_chunk_hash, module, product,
                                   ticket_status, sentence_embedding, created_at, updated_at, chunk) VALUES %s ON CONFLICT DO NOTHING""", data_list)
                conn.commit()
            finally:
                cursor.close()
            print('Sentences added to dabatase successfully.')
        except Exception as e:
            print(f'Inserting {len(data_list)} rows into database failed. {e}')
            conn.rollback()
        finally:
            conn.close()

In [3]:
class LoadData:
    """Namespace for BigQuery read helpers."""

    @staticmethod
    def run_select_bigquery(select_statement: str, google_credential: str = '', project_id: str = 'labs-poc'):
        """Run a query on BigQuery and return the finished job's result.

        Declared as a static method so it can be called on the class (as the
        notebook does) or on an instance.

        Args:
            select_statement: SQL text passed to ``client.query``.
            google_credential: Base64-encoded service-account JSON. Defaults
                to the empty string that used to be hard-coded here; pass the
                real value from a secret store or environment variable
                instead of committing it.
            project_id: Project used to build the client (default ``'labs-poc'``).

        Returns:
            Whatever ``query_job.result()`` returns.

        Raises:
            Any exception raised while decoding the credential or building
            the client propagates unchanged. Exceptions from the query are
            printed and re-raised.
        """
        credentials = Credentials.from_service_account_info(json.loads(base64.b64decode(google_credential)))
        client = bigquery.Client(credentials=credentials, project=project_id)

        try:
            # Perform a query.
            query_job = client.query(select_statement)  # API request
            result = query_job.result()
        except Exception as e:
            print(f'Fetching resuls from database failed. {e}\nSelect statement: {select_statement}')
            raise

        return result

In [4]:
# BigQuery query that builds one row per ticket:
#   * tickets_all   - groups `labs-poc`.custom_data.tickets by ticket_id, joining all
#                     ticket_comment values with STRING_AGG and taking MAX() of the other
#                     fields; summary, ticket_sentence_hash and sentence_source are
#                     empty-string placeholders and sentence_embedding is the literal '[1,2,3]'.
#   * first_contact - keeps, per ticket_id, the comment with the earliest comment_created_at.
# The final SELECT joins both on ticket_id.
# Note: this is a normal (non-raw) Python string, so '\u2561' is turned into the
# corresponding character by Python before the text is sent to BigQuery.
select_statement = ("""
                    WITH tickets_all AS (
                        SELECT
                            ticket_id,
                            STRING_AGG(ticket_comment, '\u2561') AS ticket_comment,
                            MAX(subject) AS subject,
                            '' AS summary,
                            '' ticket_sentence_hash,
                            MAX(module_name) AS module,
                            MAX(product_name) AS product,
                            '' AS sentence_source,
                            MAX(ticket_status) AS ticket_status,
                            '[1,2,3]' AS sentence_embedding,
                            MAX(created_at) AS created_at,
                            MAX(updated_at) AS updated_at,
                        FROM
                            `labs-poc`.custom_data.tickets tr
                        GROUP BY
                            ticket_id
                        ),

                        first_contact AS (
                        SELECT
                            ts.ticket_comment AS first_comment,
                            ts.ticket_id
                        FROM
                            `labs-poc`.custom_data.tickets ts
                        INNER JOIN
                            tickets_all tr
                        ON
                            ts.ticket_id = tr.ticket_id
                        QUALIFY ROW_NUMBER() OVER(PARTITION BY ts.ticket_id ORDER BY ts.comment_created_at) = 1
                        )

                        SELECT
                            ta.*,
                            tr.first_comment
                        FROM
                            tickets_all ta
                        INNER JOIN
                            first_contact tr
                            ON tr.ticket_id = ta.ticket_id

                    """)
# Read the data from BigQuery and convert the query result to a DataFrame.
df = LoadData.run_select_bigquery(select_statement).to_dataframe()     

In [7]:
# (rows, columns) of the frame loaded from BigQuery.
df.shape

(174, 13)

In [8]:
# Column names returned by the query.
df.columns

Index(['ticket_id', 'ticket_comment', 'subject', 'summary',
       'ticket_sentence_hash', 'module', 'product', 'sentence_source',
       'ticket_status', 'sentence_embedding', 'created_at', 'updated_at',
       'first_comment'],
      dtype='object')

In [9]:
# Work on a copy so `df` keeps the data exactly as it was loaded.
dff = df.copy()

In [13]:
def split_document(document, chunk_size=512, chunk_overlap=0):
    """Split one document into a list of text chunks.

    Args:
        document: Value to split. It is converted with ``str()`` first, so a
            non-string value (for example ``None``) is split as its string form.
        chunk_size: ``chunk_size`` given to the tiktoken-based splitter.
            Defaults to 512, the value that used to be hard-coded.
        chunk_overlap: ``chunk_overlap`` given to the splitter. Defaults to 0,
            the value that used to be hard-coded.

    Returns:
        list[str]: The chunk texts, in the order the splitter produced them.
    """
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    # split_text yields the same strings that split_documents would place in
    # each Document.page_content, without building throwaway Document objects.
    return text_splitter.split_text(str(document))

In [14]:
# Store, for every row, the list of chunks split_document builds from ticket_comment.
dff['chunk'] = dff['ticket_comment'].apply(split_document)

In [19]:
# Shape after adding the `chunk` column.
dff.shape

(174, 14)

In [18]:
# Total number of chunks over all rows (each cell of `chunk` holds a list).
sum(len(chunks) for chunks in dff['chunk'])

634

In [20]:
# Turn each list in `chunk` into one row per chunk; ignore_index=True renumbers the rows from 0.
dff = dff.explode('chunk', ignore_index=True)

In [21]:
# Shape after the explode (one row per chunk).
dff.shape

(634, 14)

In [22]:
# Show 5 random rows.
# NOTE(review): no random_state is passed, so a re-run displays different rows; the
# rendered rows contain raw ticket text, so review the output before sharing the notebook.
dff.sample(5)

Unnamed: 0,ticket_id,ticket_comment,subject,summary,ticket_sentence_hash,module,product,sentence_source,ticket_status,sentence_embedding,created_at,updated_at,first_comment,chunk
590,19392712,(16:45:09) *** NILZETE MENDES GARCIA entrou no...,Chat - Parada Sefaz SP - Rejeição 656 Consumo ...,,,Documentos Fiscais Eletrônicos (DFE),Datasul,,closed,"[1,2,3]",2024-02-20 16:45:12,2024-03-01 10:07:44,Conversa com NILZETE MENDES GARCIA URL: https...,https://centraldeatendimento.totvs.com/hc/pt-b...
216,10053657,"Olá Alexandre, bom dia. No patch do pacote 12...",Erro nas rotinas CD0704 e CD0401 após atualiza...,,,Audit Trail (MAU),Datasul,,closed,"[1,2,3]",2020-10-20 22:29:40,2020-11-04 16:08:49,"Prezado Suporte,\r \r A IBF está utilizando a ...","basta responder ao e-mail deste ticket, para d..."
355,19195245,Utilizamos o módulo de contratos para lançamen...,IRRF desconto simplificado em lançamento de al...,,,TOTVS Gestão Financeira,RM,,closed,"[1,2,3]",2024-01-31 11:28:21,2024-02-15 17:12:37,Utilizamos o módulo de contratos para lançamen...,http://www.planalto.gov.br/ccivil_03/_ato2023-...
253,19441444,"Dúvidas, estou a disposição. **Larissa ...",Nota com origem indevida,,,Faturamento (MFT),TOTVS Backoffice (Linha Datasul),,solved,"[1,2,3]",2024-02-26 15:15:19,2024-03-06 10:18:35,O agente LETICIA JORDENS MARQUES realizou um a...,Pelo que verifiquei na listagem da nota seu it...
63,19540596,"Olá Daiana ,boa tarde As informações são gera...",Registro Bloco H020,,,Fiscal,Logix,,closed,"[1,2,3]",2024-03-06 16:48:35,2024-03-15 18:08:04,Como gerar o registro H020 no bloco H do SPED ...,**![](https://lh3.googleusercontent.com/proxy/...


In [23]:
# Embedding client for the `text-embedding-ada-002` model, using the base URL below.
# NOTE(review): openai_api_key is an empty string literal here; supply the real key at
# run time (environment variable or secret store) rather than committing it.
model = OpenAIEmbeddings(
        model="text-embedding-ada-002",
        openai_api_base='https://proxy.dta.totvs.ai/',
        openai_api_key=""
    )

In [24]:
# Embed every distinct chunk once and keep a chunk -> vector lookup, so duplicated
# chunks do not trigger duplicate embedding requests.
# .tolist() hands embed_documents a plain list of str instead of the array that
# Series.unique() returns.
sentences_pending = dff["chunk"].unique().tolist()
print(f'Criando os embeddings para {len(sentences_pending)} novas sentenças.')
embeddings_pending = model.embed_documents(sentences_pending)
sentence_to_embedding = dict(zip(sentences_pending, embeddings_pending))

Criando os embeddings para 608 novas sentenças.


In [25]:
# Attach each row's vector by looking its chunk up in sentence_to_embedding
# (a chunk missing from the dict raises KeyError).
dff['chunk_embedding'] = dff["chunk"].apply(sentence_to_embedding.__getitem__)

In [29]:
# Overwrite `sentence_embedding` (selected as the literal '[1,2,3]' in the query)
# with the computed chunk embeddings.
dff['sentence_embedding'] = dff['chunk_embedding']

In [26]:
def get_sentence_hash(ticket_id, sentence):
    """Return the MD5 hex digest identifying a (ticket, sentence) pair.

    ``ticket_id`` is converted with ``str()`` and ``sentence`` is appended
    directly after it (no separator); the combined text is UTF-8 encoded
    before hashing.
    """
    payload = "".join((str(ticket_id), sentence)).encode('utf-8')
    return hashlib.md5(payload).hexdigest()

In [27]:
# One hash per (ticket_id, chunk) pair. Zipping the two columns avoids building a
# Series for every row, which DataFrame.apply(axis=1) does, and also works when
# the frame is empty.
dff['ticket_chunk_hash'] = [
    get_sentence_hash(ticket_id, chunk)
    for ticket_id, chunk in zip(dff['ticket_id'], dff['chunk'])
]

In [30]:
# Rows to insert, as plain tuples. The column order must match the column list of the
# INSERT statement in DatabaseService.run_insert_statement.
insert_columns = [
    'ticket_id',
    'ticket_comment',
    'ticket_chunk_hash',
    'module',
    'product',
    'ticket_status',
    'sentence_embedding',
    'created_at',
    'updated_at',
    'chunk',
]
# itertuples(index=False, name=None) yields one plain tuple per row without the
# per-row Series construction that iterrows() performs.
data_list = list(dff[insert_columns].itertuples(index=False, name=None))

In [32]:
# Write the chunk rows to the `tickets_embeddings_chunks` table; the INSERT in
# run_insert_statement uses ON CONFLICT DO NOTHING.
DatabaseService().run_insert_statement('tickets_embeddings_chunks', data_list)

Sentences added to dabatase successfully.
