In [18]:
import copy
import requests
import numpy as np
import pandas as pd
import db_dtypes
import ftfy
import re
import six
import tiktoken
from unidecode import unidecode
from langchain.chains.combine_documents.base import BaseCombineDocumentsChain
from langchain.chains.question_answering import load_qa_chain
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.schema import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores import DocArrayInMemorySearch
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
import psycopg2
from pgvector.psycopg2 import register_vector
import json
import os
from google.oauth2 import service_account
from google.cloud import bigquery
#from asyncio.log import logger
from util import transform_sentence
from logger import get_logger
import psycopg2.extras as extras 

In [19]:
!pip install --upgrade langchain
!pip install --upgrade openai



In [20]:
class LoadData:

    def run_select_statement(select_statement: str):
        credentials = service_account.Credentials.from_service_account_file('/Users/rodrigomoraes/Library/CloudStorage/GoogleDrive-rg.moraes@totvs.com.br/My Drive/TOTVS LABS/key_SA_GCP/labs-poc-09feb4e7688e.json')

        project_id = 'labs-poc'
        client = bigquery.Client(credentials= credentials,project=project_id)

        try:
            # Perform a query.
            query_job = client.query(select_statement)  # API request
            result = query_job.result()
            #result_df = result.to_dataframe()
        except Exception as e:
            print(f'Fetching resuls from database failed. {e}\nSelect statement: {select_statement}')
            raise
    
        return result

In [21]:
# Faz a leitura dos daods no BQ
select_statement = ("""
                SELECT
                    ticket_id,
                    ticket_comment as sentence,
                    TO_HEX(MD5(LOWER(CONCAT(
                        'tickets{',
                        '"ticket_id":"', ticket_id, '",',
                        '"ticket_comment":"', ticket_comment, '",',
                        '}'
                    )))) AS ticket_sentence_hash,
                    module_name as module,
                    product_name as product,
                    '' as sentence_source,
                    ticket_status,
                    '[1,2,3]' as sentence_embedding,
                    created_at,
                    updated_at,
                    '' as sanitized_solution, 
                    ''as solution, 
                    ''as summary
                FROM
                    `labs-poc`.custom_data.tickets
                WHERE
                    product_name IN ('Datasul')
                    --AND mdmCreated > SAFE.DATETIME(DATE_SUB(CURRENT_DATE(), INTERVAL 1 MONTH))
                    AND module_name IS NOT NULL 
                    AND ticket_id in (19421012, 19430975, 19580862, 19212381)
                """)
result_df = LoadData.run_select_statement(select_statement)

result = LoadData.run_select_statement(select_statement)
result_df = result.to_dataframe()

# Cria os embeddings para inserir os registros.
for i, row in result_df.iterrows():
    #sentence = result_df['sentence'].values
    sentence = result_df.at[i,'sentence']

    query_vec =  OpenAIEmbeddings(
            openai_api_base="https://proxy.dta.totvs.ai/",
            openai_api_key="sk-axyZ_tPhqNPbbywhdhhhKQ",
            model="text-embedding-ada-002"
        ).embed_query(sentence)
    query_vec = np.array(query_vec)
    
    result_df.at[i,'sentence_embedding'] = query_vec
    
result_df.head()


AttributeError: module 'openai' has no attribute 'OpenAI'

In [173]:
class DatabaseService:

    def __init__(self):
        db_user='tecsupport'
        db_password='?Hi((<={}F{nI=jp'
        db_database='tecsupport'
        db_port='5432'
        db_host='34.123.172.21'
        self.connection_str = f"host='{db_host}' port='{db_port}' dbname='{db_database}' user='{db_user}' password='{db_password}'"

    def _get_database_connection(self):
        return psycopg2.connect(self.connection_str)

    def run_select_statement(self, select_statement: str, vars=None):
        logger = get_logger(__name__)
        
        try:
            conn = self._get_database_connection()
            register_vector(conn)
        except Exception as e:
            logger.error(f'Connecting to database failed. {e}')
            return []
        try:
            cursor = conn.cursor()
            cursor.execute(select_statement, vars=vars)
            fields = [field_md[0] for field_md in cursor.description]
            result = cursor.fetchall()
            result = [dict(zip(fields, row)) for row in result]
        except Exception as e:
            logger.error(f'Fetching resuls from database failed. {e}\nSelect statement: {select_statement}')
            conn.rollback()
            result = []
    
    def run_dml_statement(self, df: str, table: str, vars=None): 
        logger = get_logger(__name__)
        
        try:
            conn = self._get_database_connection()
            register_vector(conn)
        except Exception as e:
            logger.error(f'Connecting to database failed. {e}')
            return []
        
        tuples = [tuple(x) for x in df.to_numpy()] 
  
        cols = ','.join(list(df.columns)) 
        # SQL query to execute 
        query = "INSERT INTO %s(%s) VALUES %%s" % (table, cols) 
        cursor = conn.cursor() 
        try: 
            extras.execute_values(cursor, query, tuples) 
            conn.commit() 
        except (Exception, psycopg2.DatabaseError) as error: 
            print("Error: %s" % error) 
            conn.rollback() 
            cursor.close() 
            return 1
        print("the dataframe is inserted") 
        cursor.close()  

        return result

In [215]:
# Grava o resultado dos dados coletados no BQ e faz insert do Dataframe diretamente no Banco Vetorizado
result = DatabaseService().run_dml_statement(result_df, 'tickets_embeddings')

#DocumentSearchService().find_documents_for_query(query, product, module, k=3)
print(result)

the dataframe is inserted
<google.cloud.bigquery.table.RowIterator object at 0x7ff60aba56c0>


In [232]:
logger = get_logger(__name__)

EMBEDDINGS_SELECT_COLUMNS = ['ticket_id', 'sentence', 'module', 'product', 'sanitized_solution', 
                            'sentence_source', 'solution', 'summary', 'score']

class DocumentSearchService:

    def __init__(self):
        self.database_service = DatabaseService()
        self.threshold = 60

    def keyword_search_on_database(self, query: str, product: str, module: str):

        table_name = 'tickets_embeddings'

        select_statement = f'''
            select * from (
            select *,
            (array_length(intersec, 1) * 0.9 / array_length(query_tokens, 1)) as query_in_sentence_score,
            (array_length(intersec, 1) * 0.9 / array_length(sentence_array, 1)) as sentence_in_query_score from (
            SELECT *, ARRAY
                (
                    SELECT UNNEST(sentence_array)
                    INTERSECT
                    SELECT UNNEST(query_tokens)
                ) as intersec
            FROM (
                    SELECT ticket_id, sentence, module, product, 
                            sentence_source, solution, summary, 
                            STRING_TO_ARRAY(sentence, ' ') as sentence_array,
                            STRING_TO_ARRAY('{query}', ' ') as query_tokens
                    from {table_name}
                    where product = '{product}'
                    and module = '{module}'
                ) as subquery_product_module
            ) as subquery_intersec
            ) as final_query
            where query_in_sentence_score >= 0.45 or sentence_in_query_score >= 0.45;'''
        result = self.database_service.run_select_statement(select_statement)
        #result = LoadData.run_select_statement(select_statement)
        df = pd.DataFrame(result)
        if df.empty:
            return pd.DataFrame(columns=EMBEDDINGS_SELECT_COLUMNS)
        df['score'] = df[['query_in_sentence_score', 'sentence_in_query_score']].max(axis=1)
        df = df.drop(columns=['query_in_sentence_score', 'sentence_in_query_score'])
        return df

    def embeddings_search_on_database(self, query_vec: np.array, product: str, module: str,
                                      threshold: int):
        table_name = 'tickets_embeddings'

        select_statement = f'''SELECT * FROM
                            (SELECT *, 1 - (sentence_embedding <-> %s) as score FROM public.{table_name}
                            WHERE product = '{product}' AND module = '{module}') as filtered_kb
                            WHERE score > {threshold/100};'''
        # select_statement = f'''SELECT {', '.join(EMBEDDINGS_SELECT_COLUMNS)} FROM
        #                     (SELECT *, 1 - (sentence_embedding <-> %s) as score FROM public.{table_name}
        #                     WHERE product = '{product}' AND module = '{module}') as filtered_kb
        #                     WHERE score > {threshold/100};'''
        result = self.database_service.run_select_statement(select_statement, (query_vec,))
        print(select_statement)

        return pd.DataFrame(result)

    def find_documents_for_query(self, query: str, product: str, module: str,
                                 k: int):

        # Searching documents using similarity of OpenAPI embeddings
        query_vec =  OpenAIEmbeddings(
            openai_api_base="https://proxy.dta.totvs.ai/",
            openai_api_key="sk-axyZ_tPhqNPbbywhdhhhKQ",
            model="text-embedding-ada-002"
        ).embed_query(query)
        query_vec = np.array(query_vec)

        results = self.embeddings_search_on_database(query_vec, product, module, self.threshold)

        # Getting only results with score higher than threshold
        results = results[results["score"] >= self.threshold / 100].copy()

        # Ordering results by score
        results.sort_values(by="score", ascending=False, inplace=True)

        # Keeping only the highest rank per ticket'
        results.drop_duplicates(subset=['ticket_id'], keep="first", inplace=True)
        results = results.head(k)

        return results

In [175]:
llm = ChatOpenAI(
            openai_api_base="https://proxy.dta.totvs.ai/",
            openai_api_key="sk-axyZ_tPhqNPbbywhdhhhKQ",
            temperature=0,
            model="gpt-4-1106-preview",
        )

In [176]:
llm

ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x7ff60adf9450>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x7ff60acc0280>, model_name='gpt-4-1106-preview', temperature=0.0, openai_api_key=SecretStr('**********'), openai_api_base='https://proxy.dta.totvs.ai/', openai_proxy='')

In [177]:
def _get_tokens_length(text: str) -> int:
    encondig = tiktoken.encoding_for_model("text-embedding-ada-002")
    return len(encondig.encode(text))

def _get_sentence_by_tokens_number(sentence: str) -> str | list[str]:
    if _get_tokens_length(sentence) > 3000:
        text_spliter = CharacterTextSplitter(chunk_size=400, chunk_overlap=0, separator=".")
        return text_spliter.split_text(sentence)
    return sentence

def generate_chain_docs(query: str, sentence: str) -> list[Document]:
    sentence = _get_sentence_by_tokens_number(sentence)

    if isinstance(sentence, str):
        docs = [Document(page_content=sentence)]

    if isinstance(sentence, list):
        solution_docs = [Document(page_content=text) for text in sentence]
        embeddings = OpenAIEmbeddings(
            openai_api_base="https://proxy.dta.totvs.ai/",
            openai_api_key="sk-axyZ_tPhqNPbbywhdhhhKQ",
            model="text-embedding-ada-002"
        )
        db = DocArrayInMemorySearch.from_documents(
            solution_docs,
            embeddings
        )
        docs = db.similarity_search(query)
    return docs

In [148]:
def get_transform_response_prompt_template() -> PromptTemplate:
    transform_response_template = (
        "You are a helpful support assistant that needs to help the user to have your query answered."
        "The context related with the query is provided below. \n"
        "---------------------\n"
        "{context}"
        "\n---------------------\n"
        "Given the documentation information and not prior knowledge,"
        "answer the query and give instructions summing up the document, using only the context: {query}\n"
        "If you are not certain that the documentation answers the query mention that it is not a certainty"
        "Your answer should be in {language} and with limit of 24 lines\n"
        "You should encourage the user to click in the each of these links {links} for more informations.\n"
        "Put the link between spaces and not modificate anything of the link."
    )
    return PromptTemplate(input_variables=["context", "query", "links", "language"], template=transform_response_template)

In [178]:
def get_transform_response_prompt_template() -> PromptTemplate:
    transform_response_template = (
       "The following are passages related to query {query}\n"
        "---------------------\n"
        "{context}"
        "\n---------------------\n"
        "Given the passages information and not prior knowledge,"
        "answer the query and give instructions summing up the passages\n"
        "If you are not certain that the passages answer the query mention that it is not a certainty.\n"
        "Your answer should be in {language} and with limit of 24 lines\n"
        "After each passage you should encourage the user to click on their corresponding link from these links {links}\n"
        "Put the link between spaces and not modificate anything of the link."
    )
    return PromptTemplate(input_variables=["context", "query", "links", "language"], template=transform_response_template)

In [179]:
chain = load_qa_chain(llm, prompt=get_transform_response_prompt_template())

In [180]:
def transform_sentence(sentence):
    # Ensure the parameter type as string
    mproc0 = str(sentence)

    # Set all messages to a standard encoding
    mproc1 = ftfy.fix_encoding(mproc0)

    # Replaces accentuation from chars. Ex.: "férias" becomes "ferias"
    mproc2 = unidecode(mproc1)

    # Removes special chars from the sentence. Ex.:
    #  - before: "MP - SIGAEST - MATA330/MATA331 - HELP CTGNOCAD"
    #  - after:  "MP   SIGAEST   MATA330 MATA331   HELP CTGNOCAD"
    mproc3 = re.sub('[^0-9a-zA-Z]', " ", mproc2)

    with open('custom_stopwords.txt', 'r') as file:
        custom_stopwords = file.read().splitlines()

    # Sets capital to lower case maintaining full upper case tokens and remove portuguese stop words.
    #  - before: "MP   MEU RH   Horario ou Data registrado errado em solicitacoes do MEU RH"
    #  - after:  "MP MEU RH horario data registrado errado solicitacoes MEU RH"
    mproc4 = " ".join([t.lower() for t in mproc3.split() if t not in custom_stopwords])

    return mproc4

In [244]:
EMBEDDINGS_SELECT_COLUMNS = ['ticket_id', 'sentence', 'ticket_sentence_hash', 'module', 'product',
                             'sentence_source', 'ticket_status', 'created_at', 'updated_at', 'score']


class DocumentSearchService:

    def __init__(self):
        self.database_service = DatabaseService()
        self.threshold = 60

    def embeddings_search_on_database(self, query_vec: np.array, product: str, module: str,
                                      threshold: int):
        table_name = 'tickets_embeddings'
        select_statement = f'''SELECT * FROM
                            (SELECT *, 1 - (sentence_embedding <-> %s) as score FROM public.{table_name}
                            WHERE product = '{product}' AND module = '{module}') as filtered_kb
                            WHERE score > {threshold/100};'''
        result = self.database_service.run_select_statement(select_statement, (query_vec,))
        return pd.DataFrame(result)

    def find_tickets_for_query(self, query: str, product: str, module: str, k: int):

        # Searching tickets using similarity of OpenAPI embeddings
        query_vec =  OpenAIEmbeddings(
            openai_api_base="https://proxy.dta.totvs.ai/",
            openai_api_key="sk-axyZ_tPhqNPbbywhdhhhKQ",
            model="text-embedding-ada-002"
        ).embed_query(query)
        query_vec = np.array(query_vec)
        results = self.embeddings_search_on_database(query_vec, product, module, self.threshold)

        # Getting only results with score higher than threshold
        results = results[results["score"] >= self.threshold / 100].copy()

        # Ordering results by score
        results.sort_values(by="score", ascending=False, inplace=True)

        # Keeping only the highest rank per ticket'
        results.drop_duplicates(subset=['ticket_id'], keep="first", inplace=True)
        results = results.head(k)

        return results

In [245]:
# 360058709693
query = 'Podemos transferir para consultoria paga, para que seja feito um acesso em conjunto para analisarmos e solucionarmos o problema?'
product = 'Datasul'
module = 'Banco de Dados'

In [246]:
print(query)
documents_df = DocumentSearchServiceTest().find_documents_for_querytest(query, product, module, k=3)

Podemos transferir para consultoria paga, para que seja feito um acesso em conjunto para analisarmos e solucionarmos o problema?


AttributeError: 'DocumentSearchServiceTest' object has no attribute 'find_documents_for_querytest'

In [220]:
documents_df

Unnamed: 0,ticket_id,sentence,module,product,sanitized_solution,sentence_source,solution,summary,score,type_of_search


In [128]:
query = transform_sentence(query)
print(query)

query_vec =  OpenAIEmbeddings(
            openai_api_base="https://proxy.dta.totvs.ai/",
            openai_api_key="sk-axyZ_tPhqNPbbywhdhhhKQ",
            model="text-embedding-ada-002"
        ).embed_query(query)
print(query_vec)

podemos transferir consultoria paga feito acesso conjunto analisarmos solucionarmos problema
[0.013569739968009416, 0.006605161044435864, 0.01886607564883217, -0.029106109159843484, 0.0030669240074709223, 0.013251010641796014, -0.016831631238811116, -0.0197069798598482, -0.007961456852121938, -0.017631845965580255, 0.004045152666683037, 0.009372004529368358, 0.03143894001659969, 0.003889178483489377, -0.010741863653690493, -0.0014859916760778201, 0.020276623326078592, -0.012552518694321484, 0.0016826545286110902, -0.008069960591242632, -0.020778453468757774, -0.0001249275694565647, -0.0242776969599243, -0.017374150865737178, -0.010138311663039296, -0.02902473205399491, 0.008876956277837855, -0.00312456658977496, -0.014851440095334298, -0.016180610666732147, 0.019964674959691276, -0.006825559002590293, -0.010199344958087024, -0.018404934990399874, -0.03442278958278968, -0.010762207464491337, 0.011596329784358683, 0.008158119588239884, 0.010619797063595037, -0.009263500790247665, 0.01580

In [127]:
input_documents = []
links = []
for i, result in documents_df.iterrows():
    url = result.get("html_url", "")
    solution = result.get("sanitized_solution", "")
    input_documents.extend(generate_chain_docs(query, solution))
    links.append(url)

In [128]:
input_documents

[Document(page_content='Quando o módulo de Agroindústria estiver marcado no programa CD0101 - Cadastros Gerais Parâmetro Global, duas situações ocorrem no programa CD0204 - Implantação Item:É habilitado o campo Classificação Item na aba Complementos;É realizado uma validação referente ao tamanho do código do item.Esta validação é para que não gere a mensagem 17006 - Código do Item com formato inválido, quando utilizado um código com letras e/ou mais de 8 caracteres para Classificação Item do tipo Produto e Insumo.Porém a validação do tamanho do código não agrega valor para alguns clientes, sendo assim, foi criada a função desabilita-checkitem-agro, que permitirá definir se a validação citada acima deve ser realizada ou não.Se a função desabilita-checkitem-agro estiver criada via programa CD7070 - Função Liberação Especial, o programa CD0204 não irá executar a validação mesmo que o módulo de Agroindústria esteja implantado.Nesse caso, o campo Classificação Item aparecerá na tela do CD02

In [129]:
links

['https://centraldeatendimento.totvs.com/hc/pt-br/articles/360047494593-Manufatura-Linha-Datasul-MEN-Cadastro-de-itens-utilizando-letras-ou-mais-de-oito-caracteres-com-o-m%C3%B3dulo-de-Agroind%C3%BAstria-habilitado',
 'https://centraldeatendimento.totvs.com/hc/pt-br/articles/8139941336087-Manufatura-Linha-Datasul-MEN-C%C3%B3pia-da-Narrativa-do-Item',
 'https://centraldeatendimento.totvs.com/hc/pt-br/articles/20829443966359-Manufatura-Linha-Datasul-MEN-Programas-para-cadastro-e-altera%C3%A7%C3%A3o-de-itens']

In [130]:
context =  {
    "query": query,
    "input_documents": input_documents,
    "links": links,
    "language": "Brazilian Portuguese"
}
context

{'query': 'como realizar o cadastro do item?',
 'input_documents': [Document(page_content='Quando o módulo de Agroindústria estiver marcado no programa CD0101 - Cadastros Gerais Parâmetro Global, duas situações ocorrem no programa CD0204 - Implantação Item:É habilitado o campo Classificação Item na aba Complementos;É realizado uma validação referente ao tamanho do código do item.Esta validação é para que não gere a mensagem 17006 - Código do Item com formato inválido, quando utilizado um código com letras e/ou mais de 8 caracteres para Classificação Item do tipo Produto e Insumo.Porém a validação do tamanho do código não agrega valor para alguns clientes, sendo assim, foi criada a função desabilita-checkitem-agro, que permitirá definir se a validação citada acima deve ser realizada ou não.Se a função desabilita-checkitem-agro estiver criada via programa CD7070 - Função Liberação Especial, o programa CD0204 não irá executar a validação mesmo que o módulo de Agroindústria esteja implanta

In [131]:
a = chain(context)

In [132]:
a.get('output_text')

'Para realizar o cadastro de um item, siga as instruções abaixo:\n\n1. Acesse o programa CD0204 - Implantação Item. Se o módulo de Agroindústria estiver marcado, o campo Classificação Item será habilitado na aba Complementos. Além disso, haverá uma validação do tamanho do código do item, que não deve ultrapassar 8 caracteres para itens classificados como Produto ou Insumo. Caso não deseje essa validação, é possível desabilitá-la criando a função desabilita-checkitem-agro via programa CD7070 - Função Liberação Especial. Para mais detalhes sobre o cadastro de itens com o módulo de Agroindústria, acesse o link: \n https://centraldeatendimento.totvs.com/hc/pt-br/articles/360047494593-Manufatura-Linha-Datasul-MEN-Cadastro-de-itens-utilizando-letras-ou-mais-de-oito-caracteres-com-o-m%C3%B3dulo-de-Agroind%C3%BAstria-habilitado \n\n2. Ao copiar um item no programa CD0204, a Narrativa é gravada automaticamente com as características técnicas do item. Se desejar manter a narrativa original, marq

In [8]:
def find_similar_tickets(subject, summary, product, threshold=60, k=20):
    query_vec = OpenAIEmbeddings(
            openai_api_base=get_proxy_host(),
            openai_api_key=get_proxy_secret_key()
        ).embed_query(subject)
    query_vec = np.array(query_vec)
    select_statement = f'''SELECT {', '.join(EMBEDDINGS_SELECT_COLUMNS)} FROM
                            (SELECT *, 1 - (sentence_embedding <-> %s) as score FROM public.{table_name}
                            WHERE product = '{product}') as filtered_kb
                            WHERE score > {threshold/100};'''
    DatabaseService().run_select_stament(statement)
    
    #TODO: criar o resumo dos comentários
    summary = 
    query_vec = OpenAIEmbeddings(
            openai_api_base=get_proxy_host(),
            openai_api_key=get_proxy_secret_key()
        ).embed_query(summary)
    query_vec = np.array(query_vec)
    select_statement = f'''SELECT {', '.join(EMBEDDINGS_SELECT_COLUMNS)} FROM
                            (SELECT *, 1 - (sentence_embedding <-> %s) as score FROM public.{table_name}
                            WHERE product = '{product}') as filtered_kb
                            WHERE score > {threshold/100};'''
    DatabaseService().run_select_stament(statement)

SyntaxError: invalid syntax (3017404267.py, line 14)

In [9]:
result_df = LoadData.run_select_statement(select_statement)

for i, row in result_df.iterrows():
    ticket_id = row['id']
    expected_ids = row['expected_id']
    expected_ids = [int(id.strip()) for id in expected_ids.split(',')]
    product = row['product']
    module = row['module']
    subject = row['subject']
    summary = row['summary']
    ids = find_similar_tickets(subject, summary, product, module, k=20) # METODO BASEADO NO QUE É FEITO PRA CAROLINA
    found_ids = 0
    for id in expected_ids:
        if id in expected_ids:
            found_ids += 1
            print('Encontrou o id {id} esperado')
    print(f'Encontramos {found_ids} ids dos {len(expected_ids)} esperados')

KeyError: 'id'