In [62]:
import re
import requests
import pandas as pd
import numpy as np
import json
import base64
import hashlib
from langchain.chains.combine_documents.base import BaseCombineDocumentsChain
from langchain.chains.question_answering import load_qa_chain
from langchain.chat_models import ChatOpenAI
from langchain.schema import Document
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings
import psycopg2
from pgvector.psycopg2 import register_vector
from google.oauth2.service_account import Credentials
from google.cloud import bigquery
from psycopg2.extras import execute_values

In [68]:
class DatabaseService:

    def __init__(self):
        db_user=''
        db_password=''
        db_database=''
        db_port='5432'
        db_host=''
        self.connection_str = f"host='{db_host}' port='{db_port}' dbname='{db_database}' user='{db_user}' password='{db_password}'"

    def _get_database_connection(self):
        return psycopg2.connect(self.connection_str)

    def run_select_statement(self, select_statement: str, vars=None):
        
        try:
            conn = self._get_database_connection()
            register_vector(conn)
        except Exception as e:
            print(f'Connecting to database failed. {e}')
            return []
        try:
            cursor = conn.cursor()
            cursor.execute(select_statement, vars=vars)
            fields = [field_md[0] for field_md in cursor.description]
            result = cursor.fetchall()
            result = [dict(zip(fields, row)) for row in result]
        except Exception as e:
            print(f'Fetching resuls from database failed. {e}\nSelect statement: {select_statement}')
            conn.rollback()
            result = []

        return result
    
    # Delete all records from a table
    def run_dml_delete_statement(self, table: str):
        
        try:
            conn = self._get_database_connection()
            register_vector(conn)
        except Exception as e:
            print(f'Connecting to database failed. {e}')
            return []
        
        # SQL query to execute 
        query = f'delete from {table}' 
        cursor = conn.cursor() 
        try: 
            cursor.execute(query)
            conn.commit() 
        except (Exception, psycopg2.DatabaseError) as error: 
            print("Error: %s" % error) 
            conn.rollback()  
            
        cursor.close()  
    
    def run_dml_statement(self, df: str, table: str, vars=None): 
        
        try:
            conn = self._get_database_connection()
            register_vector(conn)
        except Exception as e:
            print(f'Connecting to database failed. {e}')
            return []
        
        tuples = [tuple(x) for x in df.to_numpy()] 
  
        cols = ','.join(list(df.columns)) 
        # SQL query to execute 
        query = "INSERT INTO %s(%s) VALUES %%s" % (table, cols) 
        cursor = conn.cursor() 
        try: 
            extras.execute_values(cursor, query, tuples) 
            conn.commit() 
        except (Exception, psycopg2.DatabaseError) as error: 
            print("Error: %s" % error) 
            conn.rollback() 

        cursor.close()  
        
    def run_insert_statement(self, table_name: str, data_list: list):
        try:
            conn = self._get_database_connection()
            register_vector(conn)
        except Exception as e:
            print(f'Connecting to database failed. {e}')
            return
        try:
            cursor = conn.cursor()
            execute_values(cursor, f"""INSERT INTO {table_name} (ticket_id, ticket_comment, ticket_sentence_hash, module, product,
                                   sentence_source, ticket_status, sentence_embedding, created_at, updated_at, sentence) VALUES %s ON CONFLICT DO NOTHING""", data_list)
            conn.commit()
            print(f'Sentences added to dabatase successfully.')
        except Exception as e:
            print(f'Inserting {len(data_list)} rows into database failed. {e}')
            conn.rollback()

In [None]:
class LoadData:
    
    def run_select_bigquery(select_statement: str):
        GOOGLE_CREDENTIAL=''
        credentials = Credentials.from_service_account_info(json.loads(base64.b64decode(GOOGLE_CREDENTIAL)))

        project_id = 'labs-poc'
        client = bigquery.Client(credentials= credentials,project=project_id)

        try:
            # Perform a query.
            query_job = client.query(select_statement)  # API request
            result = query_job.result()
        except Exception as e:
            print(f'Fetching resuls from database failed. {e}\nSelect statement: {select_statement}')
            raise
    
        return result

In [24]:
def extract_topic_between_asterisks(text):
    pattern = r'\*\*(.*?)\*\*'
    match = re.search(pattern, text)
    if match:
        return match.group(1)
    return None

In [40]:
def get_ticket_summary(ticket: str):
    docs_list = [Document(page_content=str(ticket))]
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=512, chunk_overlap=0
    )
    doc_splits = text_splitter.split_documents(docs_list)
    prompt_template = (
        "A seguinte passagem representa uma conversa entre um cliente e um agente de suporte técnico:"
        "---------------------\n"
        "{context}\n"
        "---------------------\n"
        "Dada a conversa forneça um tópico central e único do problema que foi discutido"
        "Desconsidere nomes, URLs, anexos, datas.\n"
    )
    template = PromptTemplate(input_variables=["context"], template=prompt_template)
    llm = ChatOpenAI(
        temperature=0,
        model="gpt-4o-2024-05-13",
        openai_api_base='https://proxy.dta.totvs.ai/',
        openai_api_key=""
    )
    qa_chain = load_qa_chain(llm, prompt=template)
    context = {'input_documents': doc_splits}
    output = qa_chain(context)
    output = output.get('output_text')
    out = extract_topic_between_asterisks(output)
    return out if out else output

In [26]:
def clean_output(text):
    cleaned_text = re.sub(r'\d+\.\s*', '', text)
    return ' '.join(cleaned_text.split('\n'))

In [27]:
def get_ticket_keywords(ticket: str):
    docs_list = [Document(page_content=str(ticket))]
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=512, chunk_overlap=0
    )
    doc_splits = text_splitter.split_documents(docs_list)
    prompt_template = prompt_template = (
        "A seguinte passagem representa uma conversa entre um cliente e um agente de suporte técnico:"
        "---------------------\n"
        "{context}\n"
        "---------------------\n"
        "Dada a conversa forneça 8 palavras chaves principais que descrevam o problema que foi discutido"
        "Desconsidere nomes, URLs, anexos, datas.\n"
    )
    template = PromptTemplate(input_variables=["context"], template=prompt_template)
    llm = ChatOpenAI(
        temperature=0,
        model="gpt-4o-2024-05-13",
        openai_api_base='https://proxy.dta.totvs.ai/',
        openai_api_key=""
    )
    qa_chain = load_qa_chain(llm, prompt=template)
    context = {'input_documents': doc_splits}
    output = qa_chain(context)
    output = output.get('output_text')
    return clean_output(output)

In [11]:
select_statement = ("""
                    WITH tickets_all AS (
                        SELECT
                            ticket_id,
                            STRING_AGG(ticket_comment, '\u2561') AS ticket_comment,
                            MAX(subject) AS subject,
                            '' AS summary,
                            '' ticket_sentence_hash,
                            MAX(module_name) AS module,
                            MAX(product_name) AS product,
                            '' AS sentence_source,
                            MAX(ticket_status) AS ticket_status,
                            '[1,2,3]' AS sentence_embedding,
                            MAX(created_at) AS created_at,
                            MAX(updated_at) AS updated_at,
                        FROM
                            `labs-poc`.custom_data.tickets tr
                        GROUP BY
                            ticket_id
                        ),

                        first_contact AS (
                        SELECT
                            ts.ticket_comment AS first_comment,
                            ts.ticket_id
                        FROM
                            `labs-poc`.custom_data.tickets ts
                        INNER JOIN
                            tickets_all tr
                        ON
                            ts.ticket_id = tr.ticket_id
                        QUALIFY ROW_NUMBER() OVER(PARTITION BY ts.ticket_id ORDER BY ts.comment_created_at) = 1
                        )

                        SELECT
                            ta.*,
                            tr.first_comment
                        FROM
                            tickets_all ta
                        INNER JOIN
                            first_contact tr
                            ON tr.ticket_id = ta.ticket_id

                    """)
# Faz a leitura dos daods no BQ
df = LoadData.run_select_bigquery(select_statement).to_dataframe()     

In [12]:
df.shape

(174, 13)

In [13]:
df.columns

Index(['ticket_id', 'ticket_comment', 'subject', 'summary',
       'ticket_sentence_hash', 'module', 'product', 'sentence_source',
       'ticket_status', 'sentence_embedding', 'created_at', 'updated_at',
       'first_comment'],
      dtype='object')

In [16]:
df_summary = df.copy()
df_keywords = df.copy()

In [41]:
df_summary['sentence'] = df_summary['ticket_comment'].apply(get_ticket_summary)
df_summary['sentence_source'] = 'summary'

In [29]:
df_keywords['sentence'] = df_keywords['ticket_comment'].apply(get_ticket_keywords)
df_keywords['sentence_source'] = 'keywords'

In [49]:
df_subject = df.copy()

In [50]:
df_subject['sentence'] = df_subject['subject']
df_subject['sentence_source'] = 'subject'

In [52]:
dff = pd.concat([df_summary, df_keywords, df_subject])
dff.shape

(522, 14)

In [53]:
dff.sample(5)

Unnamed: 0,ticket_id,ticket_comment,subject,summary,ticket_sentence_hash,module,product,sentence_source,ticket_status,sentence_embedding,created_at,updated_at,first_comment,sentence
40,19536126,(13:43:10) *** MARIA ANALICE DE OLIVEIRA entro...,* Instalação Biblioteca RM,,,Configuração,Framework (Linha RM),summary,closed,"[1,2,3]",2024-03-06 13:43:13,2024-03-15 16:07:41,Conversa com MARIA ANALICE DE OLIVEIRA URL: h...,O tópico central e único do problema discutido...
164,19004780,Segue demanda para analise. Bom trabalho!╡Equi...,Erro ao integrar evento,,,Segurança e Saúde Ocupacional (SSO),TOTVS RH (Linha RM),subject,closed,"[1,2,3]",2024-01-11 10:05:47,2024-01-23 18:10:50,Bom dia !\r \r Ao realizar a integração de um ...,Erro ao integrar evento
45,19371366,(11:58:51) *** Joabson de Brito Cardoso entrou...,Formula Visual Criar Coluna na Visão de dados,,,RM Integração,TOTVS Backoffice (Linha RM),subject,closed,"[1,2,3]",2024-02-19 11:58:55,2024-02-28 16:09:38,Conversa com Joabson de Brito Cardoso URL: ht...,Formula Visual Criar Coluna na Visão de dados
117,17817875,"Boa noite! Conforme conversamos via chat, a f...",Acessar espelho de ponto 2018 - mensagem-> não...,,,Automação de Ponto (CHRONUS),TOTVS RH (Linha RM),summary,closed,"[1,2,3]",2023-08-15 17:55:31,2023-08-25 15:10:56,Conversa com Mylena Tito Barbosa URL: https:/...,O tópico central e único do problema discutido...
150,17940848,na hora de validar o ambiente mobile está dand...,Configuração do APPMNTNG,,,Manutenção de ativos (SIGAMNT),TOTVS Manufatura (Linha Protheus),keywords,closed,"[1,2,3]",2023-08-31 09:58:31,2023-09-12 11:10:59,na hora de validar o ambiente mobile está dand...,Licença MNTNG Código 3033 License Server Virtu...


In [56]:
model = OpenAIEmbeddings(
        model="text-embedding-ada-002",
        openai_api_base='https://proxy.dta.totvs.ai/',
        openai_api_key=""
    )

In [57]:
sentences_pending = dff["sentence"].unique()
print(f'Criando os embeddings para {len(sentences_pending)} novas sentenças.')
embeddings_pending = model.embed_documents(sentences_pending)
sentence_to_embedding = dict(zip(sentences_pending, embeddings_pending))

Criando os embeddings para 509 novas sentenças.


In [58]:
dff['sentence_embedding'] = dff["sentence"].apply(lambda x: sentence_to_embedding[x])

In [60]:
def get_sentence_hash(ticket_id, sentence):
    hash_concat = str(ticket_id) + sentence
    return hashlib.md5(hash_concat.encode('utf-8')).hexdigest()

In [63]:
dff['ticket_sentence_hash'] = dff.apply(lambda row: get_sentence_hash(row['ticket_id'], row['sentence']), axis=1)

In [66]:
data_list = [(row['ticket_id'], 
              row['ticket_comment'], 
              row['ticket_sentence_hash'],
              row['module'], 
              row['product'],
              row['sentence_source'],
              row['ticket_status'],
              row['sentence_embedding'], 
              row['created_at'], 
              row['updated_at'], 
              row['sentence']) for _, row in dff.iterrows()]

In [69]:
DatabaseService().run_insert_statement('tickets_embeddings_summary', data_list)

Sentences added to dabatase successfully.
