In [115]:
import xml.etree.ElementTree as ET
import re
import pandas as pd
from dotenv import load_dotenv
from typing import List, Dict, Any
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_ollama.llms import OllamaLLM
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnableSequence

In [103]:
# Load variables from a .env file into the process environment.
# NOTE(review): presumably this supplies the API key for the LLM client created later — confirm.
load_dotenv()

True

In [2]:
def parse_ktr_file(file_path: str) -> ET.Element:
    """Parse the XML document at ``file_path`` and return its root element.

    Args:
        file_path: Location of the XML file to read.

    Returns:
        The root ``Element`` of the parsed document.
    """
    return ET.parse(file_path).getroot()

In [11]:
def extract_execution_sequence(root: ET.Element) -> List[str]:
    """Return the names of all ``<step>`` elements found under ``root``.

    The names are returned in document order (the order ``findall`` yields
    them). Only ``<step>``/``<name>`` elements are read, so the result does
    not take any hop/ordering information into account.

    Args:
        root: Root element of a parsed transformation file.

    Returns:
        The non-empty step names. Steps with a missing or empty ``<name>``
        element are skipped instead of raising ``AttributeError`` or putting
        ``None`` in the list.
    """
    step_names = (step.findtext("name") for step in root.findall(".//step"))
    # findtext gives None for a missing <name> and "" for an empty one;
    # both are dropped so the result really is a list of strings.
    return [name for name in step_names if name]

In [59]:
def extract_sql_queries(
    root: ET.Element,
    step_types: tuple = ("TableInput", "DBJoin"),
) -> List[Dict[str, str]]:
    """Collect the SQL text of the SQL-bearing steps found under ``root``.

    Args:
        root: Root element of a parsed transformation file.
        step_types: Values of the step's ``<type>`` element to consider.
            Defaults to ``("TableInput", "DBJoin")``; pass other type names
            to include further step kinds.

    Returns:
        One dict per matching step, in document order, with the keys
        ``"step_name"``, ``"step_type"`` and ``"sql"``. Steps with no
        ``<sql>`` element, or whose SQL is empty or whitespace-only, are
        skipped. A step with no ``<name>`` element (or an empty one) is
        reported with an empty ``"step_name"`` rather than raising.
    """
    queries = []
    for step in root.findall(".//step"):
        # findtext returns None when <type> is absent, which never matches.
        step_type = step.findtext("type")
        if step_type not in step_types:
            continue

        sql = step.findtext("sql")
        if not sql or not sql.strip():
            continue

        queries.append({
            "step_name": step.findtext("name", default=""),
            "step_type": step_type,
            "sql": sql,
        })
    return queries

In [63]:
def extract_tables_from_sql(sql_queries: List[Dict[str, str]]) -> List[str]:
    """List the distinct table names that follow ``FROM`` or ``JOIN`` in the queries.

    Args:
        sql_queries: Query records; the SQL text is read from each record's
            ``'sql'`` key.

    Returns:
        The distinct table names, optionally schema-qualified
        (``schema.table``), sorted so the result is the same on every run.
        Names are compared case-sensitively.

    Notes:
        This is a regex heuristic, not a SQL parser: only the identifier
        directly after ``FROM``/``JOIN`` is captured, so further tables in a
        comma-separated ``FROM`` list are not reported, and a ``FROM``
        followed by a parenthesised subquery yields nothing for that keyword.
    """
    # Optionally schema-qualified identifier right after FROM or JOIN.
    table_pattern = re.compile(r'\b(?:FROM|JOIN)\s+((?:\w+\.)?\w+)', re.IGNORECASE)

    tables = set()
    for query_info in sql_queries:
        # With a single capturing group, findall yields the table names directly.
        tables.update(table_pattern.findall(query_info['sql']))
    return sorted(tables)

In [119]:
def get_response_text(response: Any) -> str:
    """Extract the text of a response, whatever kind of LLM produced it.

    A plain string is returned unchanged. Otherwise the ``content`` attribute
    is used when present, then the ``text`` attribute; when the object has
    neither, an empty string is returned.
    """
    if isinstance(response, str):
        return response

    # Attribute names are probed in priority order.
    for attribute in ('content', 'text'):
        if hasattr(response, attribute):
            return getattr(response, attribute)

    return ""

In [117]:
def _parse_table_column_pairs(response_text: str) -> List[Dict[str, str]]:
    """Turn raw ``table,column`` lines into records, ignoring malformed lines."""
    records = []
    for line in response_text.strip().splitlines():
        parts = [part.strip() for part in line.split(',')]
        # The model output is free text: keep only lines with exactly two
        # non-empty fields so stray prose or extra commas cannot raise
        # ValueError on unpacking or produce half-empty rows.
        if len(parts) == 2 and all(parts):
            records.append({'nome_tabela': parts[0], 'nome_coluna': parts[1]})
    return records


def extract_table_column_relations_llm(llm: Any, queries: List[Dict[str, str]]) -> pd.DataFrame:
    """Ask an LLM for the table/column pairs used by each SQL query.

    The prompt is chained to ``llm`` and invoked once per query. Each response
    is expected to contain one ``table,column`` pair per line.

    Args:
        llm: Object that can be chained after the prompt template; its
            responses are converted to text with ``get_response_text``.
        queries: Query records; the SQL text is read from each record's
            ``'sql'`` key.

    Returns:
        A DataFrame with the columns ``nome_tabela`` and ``nome_coluna`` and
        duplicate rows removed. Both columns are present even when no pair
        could be extracted, so callers can always select or sort on them.
        Response lines that are not exactly two non-empty comma-separated
        fields are ignored.
    """
    # Prompt text is kept in Portuguese, exactly as sent to the model.
    prompt_template = PromptTemplate(

        input_variables=["sql"],
        template="""
        Analise a seguinte query SQL e extraia todas as relações tabela/coluna presentes:
        {sql}
        Retorne apenas uma lista de pares tabela/coluna no formato:
        tabela1,coluna1
        tabela1,coluna2
        tabela2,coluna1
        ...

        Não inclua nenhum texto adicional na sua resposta, apenas a lista de pares.
        """
    )

    # Prompt -> model pipeline.
    chain = RunnableSequence(
        prompt_template,
        llm
    )

    results = []
    for query in queries:
        response = chain.invoke({"sql": query['sql']})
        results.extend(_parse_table_column_pairs(get_response_text(response)))

    # Explicit columns keep the schema stable when `results` is empty.
    df = pd.DataFrame(results, columns=['nome_tabela', 'nome_coluna'])
    return df.drop_duplicates()

In [75]:
def process_all_queries_llm(llm: Any, queries: List[Dict[str, str]]) -> pd.DataFrame:
    """Run the LLM-based table/column extraction over every query.

    Thin wrapper that forwards both arguments unchanged to
    ``extract_table_column_relations_llm`` and returns its result.

    Args:
        llm: The model object to use. It is typed ``Any`` to match the
            function it delegates to, which does not require a specific
            LLM class.
        queries: Query records passed through as-is.

    Returns:
        Whatever ``extract_table_column_relations_llm`` returns for the
        given arguments.
    """
    return extract_table_column_relations_llm(llm, queries)

In [41]:
# Relative path of the transformation (.ktr) file analysed in this notebook.
pentaho_file_path = '../data/benef_transf.ktr'

In [42]:
# Parse the transformation XML and keep its root element for the extraction steps.
root = parse_ktr_file(pentaho_file_path)

In [43]:
# Collect the name of every <step> element in the transformation.
execution_sequence = extract_execution_sequence(root)

In [44]:
# Show the collected step names.
execution_sequence

['BENEFICIÁRIO',
 'BKP BENEFICIÁRIO ',
 'BUSCA MICROSIGA',
 'Blocking Step',
 'Blocking Step 2',
 'CONTRATO',
 'Database join - BENEFICIÁRIO',
 'Dummy (do nothing)',
 'Dummy (do nothing) 2',
 'FAMILIA PESSOA RESPONSÁVEL',
 'FAMILIA TITULAR RESPONSÁVEL',
 'Filter rows',
 'HANDLE_BENEFICIARIO',
 'Insert / Update - BN_BENEFICIARIO',
 'Insert / Update - BN_RESP_FINANCEIRO',
 'LOTAÇÃO',
 'QTD_INCATU_BN_BENEFICIARIO',
 'QTD_INCATU_BN_RESP_FINANCEIRO',
 'Remover colunas',
 'SAM_FAMILIA_TETO_PF',
 'SEM SETOR',
 'Set Variables',
 'Set Variables 2',
 'Switch / Case LOCAL FATURAMENTO',
 'Switch / Case PESSOA RESPONSÁVEL',
 'Switch / Case TITULAR RESPONSÁVEL']

In [64]:
# Gather the SQL text carried by the TableInput and DBJoin steps.
sql_queries = extract_sql_queries(root)

In [65]:
# Inspect the extracted query records.
sql_queries

[{'step_name': 'BENEFICIÁRIO',
  'step_type': 'DBJoin',
  'sql': "SELECT \n       BEN.HANDLE                                                                         AS ID_BENEFICIARIO\n      ,FAM.HANDLE                                                                         AS ID_FAMILIA\n      ,CON.HANDLE                                                                         AS ID_CONTRATANTE\n      ,NVL(LOT.HANDLE,0)                                                                  AS ID_CONTRATANTE_LOT\n      ,PLA.HANDLE                                                                         AS ID_PLANO\n      ,(SELECT MAX(BEN_TIT.HANDLE) \n        FROM   SAM_BENEFICIARIO BEN_TIT \n        WHERE  BEN_TIT.FAMILIA   = BEN.FAMILIA \n        AND    BEN_TIT.EHTITULAR = 'S')                                                   AS ID_BENEFICIARIO_RESP\n      ,CASE\n         WHEN CON.LOCALFATURAMENTO = 'C' THEN CPES.HANDLE --CPES\n         WHEN CON.LOCALFATURAMENTO = 'L' THEN LPES.HANDLE --LPE

In [90]:
# NOTE(review): this binds `tables` to the list of query dicts, while a later cell
# rebinds it to the table names. The execution count is out of order, so this looks
# like a leftover scratch cell — confirm and consider removing it.
tables = sql_queries

In [91]:
# Display whatever `tables` is currently bound to.
tables

[{'step_name': 'BENEFICIÁRIO',
  'step_type': 'DBJoin',
  'sql': "SELECT \n       BEN.HANDLE                                                                         AS ID_BENEFICIARIO\n      ,FAM.HANDLE                                                                         AS ID_FAMILIA\n      ,CON.HANDLE                                                                         AS ID_CONTRATANTE\n      ,NVL(LOT.HANDLE,0)                                                                  AS ID_CONTRATANTE_LOT\n      ,PLA.HANDLE                                                                         AS ID_PLANO\n      ,(SELECT MAX(BEN_TIT.HANDLE) \n        FROM   SAM_BENEFICIARIO BEN_TIT \n        WHERE  BEN_TIT.FAMILIA   = BEN.FAMILIA \n        AND    BEN_TIT.EHTITULAR = 'S')                                                   AS ID_BENEFICIARIO_RESP\n      ,CASE\n         WHEN CON.LOCALFATURAMENTO = 'C' THEN CPES.HANDLE --CPES\n         WHEN CON.LOCALFATURAMENTO = 'L' THEN LPES.HANDLE --LPE

In [66]:
# Derive the distinct table names that follow FROM/JOIN in the extracted queries.
tables = extract_tables_from_sql(sql_queries)

In [67]:
# Show the distinct table names.
tables

['SAM_FAMILIA',
 'SAM_RAMOATIVIDADE',
 'K_SAM_BENEFICIARIO_APOIOSAUDE',
 'AWE_BIOMETRIA',
 'SAM_ESTADOCIVIL',
 'SAM_CONTRATO',
 'SAM_FAMILIA_ANOTADM',
 'SAM_CAMARACOMPENSACAO',
 'SFN_CONTAFIN',
 'SAM_MATRICULA',
 'SAM_TIPODEPENDENTE',
 'SFN_PESSOA',
 'SAM_PLANO',
 'SAM_BENEFICIARIO',
 'SAM_BENEFICIARIO_EVENTO',
 'SFN_AGENCIA',
 'SAM_CONTRATO_TIPOCARTAO',
 'SFN_FATURA',
 'SAM_MOTIVOCANCELAMENTO',
 'SAM_CAMARACOMPENSACAO_PREST',
 'SAM_BENEFICIARIO_HISTORICO',
 'SAM_CONTRATO_TPDEP',
 'SFN_CONTAFIN_COMPLEMENTO',
 'SAM_BENEFICIARIO_MOD_REPASSE',
 'SAM_UNIMED',
 'SAM_POLITICA',
 'SAM_BENEFICIARIO_MOD',
 'ESTADOS',
 'SAM_ENDERECO',
 'PAISES',
 'SAM_TIPOCARTAO',
 'Z_GRUPOUSUARIOS',
 'SAM_ORIGEMCARENCIA',
 'SFN_BANCO',
 'SAM_CONVENIO',
 'MUNICIPIOS',
 'SIGA.VW_SRA010',
 'SFN_CONTAFIN_TIPODOCUMENTO',
 'SAM_CONTRATO_ORIGEMCARENCIA',
 'SFN_TIPODOCUMENTO',
 'SAM_FAMILIA_TETO_PF',
 'SIS_TIPOFATURAMENTO',
 'SAM_CONTRATO_LOTACAO',
 'SAM_ANOTACAOADMINISTRATIVA',
 'SAM_CBO',
 'SAM_BENEFICIARIO_CARTAOIDE

In [110]:
# NOTE(review): displays sql_queries again (it is already shown right after extraction);
# consider removing this cell or truncating its output.
sql_queries

[{'step_name': 'BENEFICIÁRIO',
  'step_type': 'DBJoin',
  'sql': "SELECT \n       BEN.HANDLE                                                                         AS ID_BENEFICIARIO\n      ,FAM.HANDLE                                                                         AS ID_FAMILIA\n      ,CON.HANDLE                                                                         AS ID_CONTRATANTE\n      ,NVL(LOT.HANDLE,0)                                                                  AS ID_CONTRATANTE_LOT\n      ,PLA.HANDLE                                                                         AS ID_PLANO\n      ,(SELECT MAX(BEN_TIT.HANDLE) \n        FROM   SAM_BENEFICIARIO BEN_TIT \n        WHERE  BEN_TIT.FAMILIA   = BEN.FAMILIA \n        AND    BEN_TIT.EHTITULAR = 'S')                                                   AS ID_BENEFICIARIO_RESP\n      ,CASE\n         WHEN CON.LOCALFATURAMENTO = 'C' THEN CPES.HANDLE --CPES\n         WHEN CON.LOCALFATURAMENTO = 'L' THEN LPES.HANDLE --LPE

In [105]:
# Alternative local backend (disabled); uncomment to use Ollama instead of Gemini:
# llm = OllamaLLM(model='gemma2:2b', temperature=0.1)

In [106]:
# Chat model used for the table/column extraction, created with temperature=0.
# NOTE(review): the model id looks like an experimental build ("exp") — confirm it is
# still available before re-running.
llm = ChatGoogleGenerativeAI(
    model='gemini-1.5-pro-exp-0801',
    temperature=0
)

In [107]:
# Send a throwaway prompt to the model.
# NOTE(review): looks like a connectivity smoke test — consider removing it from the final notebook.
llm.invoke('Tell me a joke.')

AIMessage(content="Why don't scientists trust atoms? \n\nBecause they make up everything!\n", response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'safety_ratings': [{'category': 'HARM_CATEGORY_SEXUALLY_EXPLICIT', 'probability': 'NEGLIGIBLE', 'blocked': False}, {'category': 'HARM_CATEGORY_HATE_SPEECH', 'probability': 'NEGLIGIBLE', 'blocked': False}, {'category': 'HARM_CATEGORY_HARASSMENT', 'probability': 'NEGLIGIBLE', 'blocked': False}, {'category': 'HARM_CATEGORY_DANGEROUS_CONTENT', 'probability': 'NEGLIGIBLE', 'blocked': False}]}, id='run-ccaffd5e-1e33-404a-a9f7-72ac7c429ac1-0', usage_metadata={'input_tokens': 6, 'output_tokens': 16, 'total_tokens': 22})

In [120]:
# Ask the LLM for the table/column pairs of every extracted query (one model call per query).
table_column_relation = process_all_queries_llm(llm,sql_queries)

In [123]:
# Sort by table name and write the result to teste2.csv (relative path).
# to_csv is called with its defaults, so the DataFrame index is written as well.
table_column_relation.sort_values('nome_tabela').to_csv('teste2.csv')