In [147]:
print("Um cuspidor esta nascendo!")

Um cuspidor esta nascendo!


In [148]:
!pip install llama-index-readers-wikipedia -q

In [149]:
!pip install gradio duckdb duckdb-engine pandas unidecode --quiet


In [150]:
!pip install llama-index llama-index-experimental llama-index-llms-groq -q

# Como Obter e Usar a API Key da Groq

## 1. Criar Conta ou Fazer Login
- Acesse [Groq Console](https://console.groq.com/login) e crie uma conta ou fa√ßa login se j√° possuir.

## 2. Navegar at√© a Se√ß√£o de API Keys
- No painel da sua conta, v√° at√© a p√°gina de [API Keys](https://console.groq.com/keys).

## 3. Criar uma Nova API Key
- Clique em **"Create API Key"**.
- D√™ um nome descritivo √† chave (por exemplo: `Colab Integration`) e confirme a cria√ß√£o.

## 4. Copiar a API Key
- Ap√≥s a cria√ß√£o, copie a API key exibida.
- **Aten√ß√£o:** Essa ser√° a √∫nica vez que a chave ser√° mostrada. Guarde-a em um local seguro.

## 5. Configurar a Chave no Colab
- **Boa pr√°tica:** N√£o coloque a chave diretamente no c√≥digo. Em vez disso, use **Colab Secrets** ou vari√°veis de ambiente.

Exemplo usando vari√°vel de ambiente:

```python
import os

# Defina a vari√°vel de ambiente
os.environ["GROQ_API_KEY"] = "sua-api-key-aqui"

#ou ustilizando secret
from google.colab import userdata
os.environ["GROQ_API_KEY"] = userdata.get('GROQ_API_KEY')


In [151]:
import os
from google.colab import userdata
os.environ["GROQ_API_KEY"] = userdata.get('GROQ_API_KEY')

## SpitterDucklakeEngines

In [152]:

from sqlalchemy import create_engine

class SpitterDucklakeEngines():
  @classmethod
  def get_session_duckdb(self, ducklake_db, path):
    engine = create_engine("duckdb:///:memory:")

    with engine.connect() as con:
      con.exec_driver_sql("INSTALL 'ducklake';")
      con.exec_driver_sql("LOAD 'ducklake';")
      #con.exec_driver_sql(f"DETACH DATABASE IF EXISTS {ducklake_db};")
      con.exec_driver_sql(f"""ATTACH 'ducklake:metadata.ducklake' AS {ducklake_db} (DATA_PATH '{path}');""")

      result = con.exec_driver_sql(f"""SHOW DATABASES;""").fetchall()
      print("Databases anexados:", [i[0] for i in result])
      con.exec_driver_sql(f"USE {ducklake_db};")
      con.commit()

    return engine


  @classmethod
  def get_tables(self, engine, ducklake_db, layer=False):
    cond = f"""database_name= '{ducklake_db}'
                AND schema_name = '{layer}'""" if layer else f"database_name= '{ducklake_db}'"
    with engine.connect() as con:
      result = con.exec_driver_sql(f"""
          SELECT table_name
          FROM duckdb_tables
          WHERE {cond}
          ORDER BY table_name;
          ;""").fetchall()
    tables = [i[0] for i in result]

    print("Tabelas identificadas:", tables)
    return tables

  @classmethod
  def dispose_connection(self, engine):
    if engine:
      engine.dispose()


## SpitterDucklakeAnalystOperator

In [185]:



from llama_index.llms.groq import Groq
from llama_index.core import SQLDatabase
from llama_index.core.query_engine import NLSQLTableQueryEngine
from llama_index.core import Settings
from llama_index.core import PromptTemplate

class SpitterDucklakeAnalystOperator():
  def __init__(self,engine, ducklake_db, path, groq_model='llama-3.3-70b-versatile'):
    self.llm = Groq(model=groq_model, api_key=self.get_groq_environment())
    self.ducklake_db = ducklake_db
    self.path = path
    self.engine = engine
    self.query_engine = None
    Settings.llm = Groq(model=groq_model, api_key=self.get_groq_environment())


  def get_groq_environment(self):
    if not os.environ.get("GROQ_API_KEY"):
        raise ValueError("A chave GROQ_API_KEY n√£o foi fornecida nem encontrada no ambiente.")
    return os.environ.get("GROQ_API_KEY")

  def get_query_engine(self, layer='silver', tables=[]):
      tables = tables if tables else  SpitterDucklakeEngines.get_tables(self.engine, self.ducklake_db, layer)
      with self.engine.connect() as con:
        con.exec_driver_sql(f"USE {self.ducklake_db}.{layer};")
        con.commit()
      sql_database = SQLDatabase(self.engine, include_tables=tables)

      return NLSQLTableQueryEngine(sql_database, embed_model=Settings.llm)


In [154]:
# tables = ['tb_bronze_escolas_gramame_geisel']
# ducklake = "spitter_ducklake"
# path = "/tmp/dklq_spitter_ducklake"
# layer = "bronze"
# op = SpitterDucklakeAnalystOperator()
# query_engine = op.get_query_engine(ducklake, path, layer, tables)
# query_engine.query("Qual bairro possui mais escolas?")


In [155]:
#op.close()

## SpitterDucklakeEngenieertOperator

In [156]:
from llama_index.core import Settings
from llama_index.core import PromptTemplate

class SpitterDucklakeEngenieertOperator():
  def __init__(self, engine, ducklake_db, path, groq_model='llama-3.3-70b-versatile'):

    self.ducklake_db = ducklake_db
    self.path = path
    self.query_engine = None
    self.engine = engine
    Settings.llm = Groq(model=groq_model, api_key=self.get_groq_environment())


  def get_groq_environment(self):
    if not os.environ.get("GROQ_API_KEY"):
        raise ValueError("A chave GROQ_API_KEY n√£o foi fornecida nem encontrada no ambiente.")
    return os.environ.get("GROQ_API_KEY")

  def get_table_statistcs(self, table):
      with self.engine.connect() as con:
        desc = con.exec_driver_sql(f"""DESCRIBE FROM {table}""").fetchall()
        cols = ",".join([f'"{row[0]}"' for row in desc])
        ref_query = f"SELECT {cols} FROM {table}"

        desc_str = "\n".join([f"{col}: {dtype}" for col, dtype, *_ in desc])
        sample_df = con.exec_driver_sql(ref_query).fetchall()
        sample_df = pd.DataFrame(sample_df, columns=[row[0] for row in desc]).head(10)
        sample_df = sample_df.apply(lambda col: col.astype(str).str.slice(0, 50) if col.dtype == "object" else col)
        sample_text = sample_df.to_string(index=False)

      return desc_str, sample_text, ref_query


  def generate_sql(self, query, table, layer='bronze', last_query=False):
      print("Insturacao recebida:", query)
      print("Query recebida:", last_query)
      llm = Settings.llm
      schema_table, sample_table, ref_query = self.get_table_statistcs(table)
      ref_query = last_query if last_query else ref_query
      prompt = (
          "Voc√™ est√° trabalhando com uma tabela chamada {table} SQL presente no duckdb.\n"
          "Descri√ß√£o:\n"
          "{schema_table}\n\n"
          "Algumas linhas da tabela\n"
          "{sample_table}\n\n"

          "Query atual:\n"
          "{last_query}\n\n"

          "Nova instru√ß√£o:\n"
          "{query}\n\n"

          "A nova query deve utiilizar a atual como referencia, exceto pelas altera√ß√µes explicitamente pedidas na nova instru√ß√£o.\n"
          "Modifique somente o que for mencionado.\n"
          "Mantenha estrutura, filtros, joins, aliases e ordena√ß√µes originais se n√£o forem citados.\n"
          "Se for solicitado, voc√™ pode remover ou desfazer altera√ß√µes anteriores.\n"
          "Somente se explicitamente indicado, ignore a query atual e gere uma nova query do zero.\n"
          "A query final deve ser execut√°vel em SQL ANSI.\n"
          "Retorne somente a query, sem aspas, coment√°rios ou texto adicional.\n"
          )


      sql_prompt = PromptTemplate(prompt).partial_format(
          schema_table=schema_table,
          table=table,
          sample_table=sample_table,
          query=query,
          last_query = last_query )

      sql_query = llm.predict(sql_prompt)
      print("Query gerada:", sql_query)
      return sql_query

# table = "tb_bronze_escolas_gramame_geisel"
# layer="bronze"
# query_gen = False
# op = SpitterDucklakeEngenieertOperator()

# ducklake = "spitter_ducklake"
# path = "/tmp/dklq_spitter_ducklake"
# layer = "bronze"


In [157]:
# query = "Inclua a coluna escola na query"
# query_gen = op.generate_sql(query, table, layer, ducklake, path, last_query=query_gen)
# print("\n--- QUERY SQL GERADA ---")
# print(query_gen)

In [158]:
#op.close()

## SpitterDucklakeOperator

In [191]:
from unidecode import unidecode
import traceback
import os

import re

class SpitterDucklakeOperator():
    def __init__(self, engine, ducklake_db, path=None):

      self.ducklake_db = ducklake_db
      self.engine = engine

    def table_exist(self, table):
      try:
        with self.engine.connect() as con:
          con.exec_driver_sql(f"USE {self.ducklake_db};")
          con.exec_driver_sql(f"SELECT * FROM {table} LIMIT 0")

        return True
      except Exception as e:
        return False

    def normalize_name(self, col):
      col = unidecode(col).lower().strip()
      col = re.sub(r'[^a-z0-9_]', '_', col)
      col = re.sub(r'_+', '_', col)
      return col.strip('_').encode("utf-8", errors="ignore").decode("utf-8")

    def create_or_replace_table(self, target_table, insert_q, primary_key=False):
      try:
        layer = ".".join(target_table.split(".")[:2])
        with self.engine.connect() as con:
          con.exec_driver_sql(f"USE {self.ducklake_db};")
          instruction_sql = ','.join(["{0} {1} {2}".format(
                                                            self.normalize_name(row[0]),
                                                            row[1],
                                                            "" if row[0] != primary_key else 'PRIMARY KEY'
                                                          ) for row in con.exec_driver_sql
                                                           (f"DESCRIBE {insert_q}").fetchall()
                                    ]
                                      )

          con.exec_driver_sql(f"CREATE SCHEMA IF NOT EXISTS {layer}")
          con.exec_driver_sql(f"""CREATE OR REPLACE TABLE {target_table} ({instruction_sql})""")
          con.commit()
        return True, "success"
      except Exception as e:
        return False, f"{e} -> {traceback.format_exc()}"

    def insert_table(self, source_table, target_table, op_mode="append", primary_key=False, custom_query=False):
      try:
        insert_q = custom_query if custom_query else f"SELECT * FROM {source_table}"
        with self.engine.connect() as con:
          con.exec_driver_sql(f"USE {self.ducklake_db};")
          if not self.table_exist(target_table) or op_mode == "overwrite":
              self.create_or_replace_table(target_table,insert_q, primary_key)
          if primary_key:
            con.exec_driver_sql(f"""
                                    INSERT OR REPLACE INTO {target_table}
                                    {insert_q}
                                """)
            con.commit()
          else:
            con.exec_driver_sql(f"""
                                    INSERT INTO {target_table}
                                    SELECT * FROM {source_table}
                                """)
            con.commit()
        return True, "success"
      except Exception as e:
        return False, f"{e} -> {traceback.format_exc()}"


## SpitterOperator

In [160]:
#csv operations
import duckdb
import pandas as pd
import traceback

class SpitterOperator():
    def __init__(self, content=False, ducklake_db="spitter_ducklake", path=None):
        self.df = pd.DataFrame()
        self.primary_key = ""
        self.cols = []
        self.ducklake_db = ducklake_db
        self.path_prefix = os.environ.get("TMP").replace("\\","/") if os.name == "nt" else "/tmp"
        self.path = path if path else f"{self.path_prefix}/dklq_{self.ducklake_db}"
        self.engine =  SpitterDucklakeEngines.get_session_duckdb(self.ducklake_db, self.path)
        self.engenieer = SpitterDucklakeEngenieertOperator(self.engine, self.ducklake_db, self.path)
        self.analyst = SpitterDucklakeAnalystOperator(self.engine, self.ducklake_db, self.path)

        self.dklq = SpitterDucklakeOperator(self.engine, self.ducklake_db)
        self.tables = {}
        self.op_mode = ""
        if content:
          self.load_bronze(content)

    def preview_table(self, table):
        with self.engine.connect() as con:
          norm_cols = ",".join([f'"{col}" AS {self.dklq.normalize_name(col)}' for col in self.cols])
          preview_df = con.exec_driver_sql(f"""
            SELECT {norm_cols} FROM {table} LIMIT 10
          """)
          preview_df = pd.DataFrame(preview_df.fetchall(), columns=preview_df.keys())
          return preview_df

    def run_query(self, query):
        with self.engine.connect() as con:
          preview_df = con.exec_driver_sql(query)
          preview_df = pd.DataFrame(preview_df.fetchall(), columns=preview_df.keys())

        return preview_df

    def set_primary_key(self, column_name):
        if column_name in self.df.columns:
            self.primary_key = column_name

    def set_table_op_mode(self, op_mode):
        self.op_mode = op_mode

    def persist_table_on_dklq(self, source_layer, target_layer, custom_query=False):
        source_table = f"{self.ducklake_db}.{source_layer}.{self.tables[source_layer]}"
        target_table = f"{self.ducklake_db}.{target_layer}.{self.tables[target_layer]}"
        source_table = self.tables[source_layer] if source_layer == 'tmp' else source_table

        result = self.dklq.insert_table(source_table,
                                        target_table,
                                        self.op_mode,
                                        self.primary_key,
                                        custom_query)

        if not result[0]:
          raise result[1]

    def set_list_tables(self, subject):
        for layer in ["tmp","bronze", "silver", "gold"]:
          prefix = "" if layer == "tmp" else "tb_"
          self.tables[layer]= f"{prefix}{layer}_{subject}"


    def load_bronze(self, file, layer='bronze'):

        try:

          table= self.dklq.normalize_name(file.name.replace("\\", "/").split("/")[-1].split(".")[0])
          self.set_list_tables(table)
          with self.engine.connect() as con:
            con.exec_driver_sql(f"""
                CREATE TEMPORARY TABLE {self.tables['tmp']} AS
                SELECT * FROM read_csv_auto('{file.name}', header=True)
            """)

            self.cols = [row[0] for row in con.exec_driver_sql(f"DESCRIBE {self.tables['tmp']}").fetchall()]
            con.commit()
            return True, "success"
        except Exception as e:
          raise (f"{e} -> {traceback.format_exc()}")


## Inteface

#### Functions

In [198]:
import gradio as gr

def change_tab_bronze():
    return 'bronze', 'tmp'

def change_tab_silver():
    return 'silver', 'bronze',

def change_tab_gold():
    return 'gold',  'silver'

def set_table_dropdown(spitter_operator, table, table_ref):
  table = table if table else table_ref
  subject = "_".join(table.split("_")[2:])
  spitter_operator.set_list_tables(subject)
  return spitter_operator

def load_tables(spitter_operator, layer):
    if not spitter_operator:
      spitter_operator = SpitterOperator()

    choices = SpitterDucklakeEngines.get_tables(spitter_operator.engine,
                                                spitter_operator.ducklake_db,
                                                layer)
    return (
              gr.update(choices=["", *choices],interactive=True,visible=True),
              spitter_operator
             )


def change_tab_queries(spitter_operator):
  if not spitter_operator:
    spitter_operator = SpitterOperator()
  return spitter_operator


def init_by_file(file):
    spitter_operator = SpitterOperator(content=file)
    return spitter_operator, gr.update(interactive=True), spitter_operator.tables['bronze']

def preview_table(spitter_operator, source_layer):
    return spitter_operator.preview_table(spitter_operator.tables[source_layer])

def run_query(spitter_operator, query):
    return spitter_operator.run_query(query), True

def run_query_layer(spitter_operator, query):
    choices = ["overwrite","upsert"]
    result = spitter_operator.run_query(query)
    result_cols = result.columns.tolist()
    return (
        result,
        gr.update(interactive=True, visible=True, choices= choices),
        result_cols
        )

gr.update(interactive=True)
def set_pk_column(spitter_operator, pk_column):
    if spitter_operator is not None:
        spitter_operator.set_primary_key(pk_column)
    return spitter_operator

#button options
def set_pk_chk_visibility(spitter_operator, checkbox_value, cols=[]):
    cols = cols if cols else spitter_operator.cols
    if checkbox_value:
        return gr.update(interactive=True, visible=True,choices=cols)
    else:
        return gr.update(interactive=False, visible=False)

def set_op_chk_visibility(spitter_operator, layer):
    choices = ["append", "overwrite"] if layer == "bronze" else ["overwrite","upsert"]
    return gr.update(interactive=True, visible=True, choices= choices)

def set_table_op_mode_btn(spitter_operator, op_mode, layer):
  spitter_operator.set_table_op_mode(op_mode)
  return f"Commit - table -> {spitter_operator.tables[layer]} | op_mode -> {op_mode}", True, gr.update(interactive=True)

def publish_table(spitter_operator, commit_state, source_layer, target_layer, custom_query=False):
  if commit_state:
    result = spitter_operator.persist_table_on_dklq(source_layer, target_layer, custom_query)
    if target_layer == "bronze":
      return (
          "sucess",
          None,
          pd.DataFrame(),
          gr.update(
                    choices=[spitter_operator.tables[target_layer]],
                    interactive=False,
                    visible=False,
                    value=spitter_operator.tables[target_layer]
                    ),
          spitter_operator.tables[target_layer]
          )
    elif target_layer == "silver":
        return (
          "sucess",
          pd.DataFrame(),
          gr.update(
                    choices=[spitter_operator.tables[target_layer]],
                    interactive=False,
                    visible=False,
                    value=spitter_operator.tables[target_layer]
                    ),
          spitter_operator.tables[target_layer],
          "",
          ""
          )

  else:
    return "Error. Please check your settings!"


def generate_insight(spitter_operator, source_layer, query):
  table = spitter_operator.tables[source_layer]
  #table = f"{spitter_operator.ducklake_db}.{source_layer}.{table}"
  analyst = spitter_operator.analyst.get_query_engine(tables=[table])

  result = analyst.query(query)
  query = result.metadata["sql_query"]
  answer = result.response
  df_result = spitter_operator.run_query(query)

  return_q = f"Resposta:\n {answer} \n\n Query Utilizada:\n {query}"

  return return_q, query, df_result

def generate_query(spitter_operator, query,  source_layer, target_layer, last_query):
  query_gen = spitter_operator.engenieer.generate_sql(
                                                      query,
                                                      f"{spitter_operator.ducklake_db}.{source_layer}.{spitter_operator.tables[source_layer]}",
                                                      target_layer,
                                                      last_query=last_query
                                                      )
  return query_gen

def update_table_list(spitter_operator):
    query = """
        SELECT database_name, schema_name, table_name
        FROM duckdb_tables
        WHERE database_name not like '__ducklake_metadata%'
          AND database_name != 'main'
        ORDER BY database_name, schema_name, table_name
    """
    with spitter_operator.engine.connect() as con:
      result = con.exec_driver_sql(query).fetchall()

    if not result:
        return "Nenhuma tabela encontrada."

    struct = {}
    for row in result:
        db = row[0]
        schema = row[1]
        tabela = row[2]
        struct.setdefault(db, {}).setdefault(schema, []).append(tabela)

    md = ""
    for db, schemas in struct.items():
        md += f"### üóÑ {db}\n"
        for schema, tabelas in schemas.items():
            md += f"- **{schema}**\n"
            for t in tabelas:
                md += f"  - {t}\n"
    return md

#### Bronze

In [162]:
def get_ui_bronze():
  with gr.TabItem("Bronze Layer", id=0) as b_tab:
      b_tab.select(
          change_tab_bronze,
          outputs=[st_layer, st_source_layer]
      )
      gr.Markdown("### Upload dos dados brutos")
      st_choices = gr.State([])
      st_commit_bronze = gr.State(False)
      csv_bronze = gr.File(label="Selecione um CSV",file_count="single", type="filepath")

      bronze_btn = gr.Button("Preview Bronze", interactive=False)
      bronze_table = gr.DataFrame(label="Bronze Data")

      op_b_dropdown = gr.Dropdown(label="Selecione o modo de opera√ß√£o da tabela (obrigat√≥rio)", choices=[], interactive=False, visible=False)



      with gr.Row():
        output_b = gr.Markdown("")
        alerta_b_component = gr.Markdown("")
        publish_bronze_btn = gr.Button(value=f"Confirmar", visible=True, interactive=False)

      #actions
      csv_bronze.upload(init_by_file,
                        inputs=csv_bronze,
                        outputs=[st_spitter_operator,bronze_btn, st_s_table_ref]
                        )

      bronze_btn.click(preview_table,
                       inputs=[st_spitter_operator, st_source_layer],
                       outputs=bronze_table
                       )
      st_spitter_operator.change(set_op_chk_visibility,
                                 inputs=[st_spitter_operator, st_layer],
                                 outputs=[op_b_dropdown]
                                 )

      op_b_dropdown.change(set_table_op_mode_btn,
                    inputs=[st_spitter_operator, op_b_dropdown, st_layer],
                    outputs=[output_b, st_commit_bronze, publish_bronze_btn])

      publish_bronze_btn.click(publish_table,
                                 inputs=[st_spitter_operator, st_commit_bronze,
                                         st_source_layer, st_layer],
                                 outputs=[alerta_b_component, csv_bronze,
                                          bronze_table, op_b_dropdown,
                                          st_s_table_ref
                                          ])

  return b_tab

#### Silver

In [163]:
def get_ui_silver():
  with gr.TabItem("Camada Silver", id=1) as s_tab:
      s_tab.select(
            change_tab_silver,
            outputs=[st_layer, st_source_layer]
        )
      st_commit_silver =  gr.State(False)
      output_s_cols =  gr.State([])

      input_s_dropdown = gr.Dropdown(label="Tabela referencia",
                            choices=[],
                            visible=False,
                            interactive=False
                            )

      load_s = gr.Button("Ler tabelas consulta")

      input_s = gr.Textbox(label="Por favor, insira as transforma√ß√µes necess√°rias",
                            lines=2,
                            max_lines=10,
                            placeholder="Escreva aqui...",
                            interactive=True
                          )
      with gr.Row():
        submit_gen_q = gr.Button("Gerar consulta")
        test_q = gr.Button("Testar consulta")

      output_s_query = gr.Textbox(label="Query gerada",
                                  lines=2,
                                  max_lines=10,
                                  placeholder="Sua query sera gerada aqui, voce pode alterar e testar sua consulta...",
                                  interactive=True
                                )

      test_table = gr.DataFrame(label="Saida")

      pk_checkbox = gr.Checkbox(label="A tabela possui uma chave prim√°ria?", value=False)
      pk_dropdown = gr.Dropdown(label="Selecione a chave", choices=[], interactive=False, visible=False)
      op_s_dropdown = gr.Dropdown(label="Selecione o modo de opera√ß√£o da tabela (obrigat√≥rio)",
                                  choices=[], interactive=False, visible=False)


      with gr.Row():
        output_s = gr.Markdown("")
        alerta_s_component = gr.Markdown("")
        publish_silver_btn = gr.Button(value=f"Confirmar", visible=True, interactive=False)


      #actions
      # st_s_table_ref.change(lambda x: gr.update(value=x),
      #                       inputs=st_s_table_ref,
      #                       outputs=input_s_dropdown )

      load_s.click(load_tables,
                 inputs=[st_spitter_operator, st_source_layer],
                 outputs=[input_s_dropdown, st_spitter_operator]
                 )
      input_s_dropdown.change(set_table_dropdown,
                            inputs=[st_spitter_operator,input_s_dropdown,
                                   st_s_table_ref],
                             outputs=st_s_table_ref)
      submit_gen_q.click(generate_query,
                         inputs=[st_spitter_operator, input_s,
                                 st_source_layer, st_layer,
                                output_s_query],
                         outputs=output_s_query
                         )
      test_q.click(run_query_layer,
                   inputs=[st_spitter_operator, output_s_query ],
                   outputs=[test_table, op_s_dropdown, output_s_cols]
                   )

      pk_checkbox.change(set_pk_chk_visibility,
                         inputs=[st_spitter_operator, pk_checkbox,output_s_cols],
                         outputs=pk_dropdown
                         )

      pk_dropdown.change(set_pk_column,
                         inputs=[st_spitter_operator, pk_dropdown],
                         outputs=st_spitter_operator
                         )

      op_s_dropdown.change(set_table_op_mode_btn,
                          inputs=[st_spitter_operator,
                                  op_s_dropdown, st_layer],
                          outputs=[output_s, st_commit_silver,
                                   publish_silver_btn]
                         )
      publish_silver_btn.click(publish_table,
                                 inputs=[st_spitter_operator,
                                         st_commit_silver,
                                         st_source_layer, st_layer,
                                         output_s_query],
                                 outputs=[alerta_s_component,
                                          test_table,
                                          op_s_dropdown,
                                          st_s_table_ref,
                                          input_s,
                                          output_s_query])




  return s_tab

#### Gold

In [199]:
def get_ui_gold():
  with gr.TabItem("Camada Gold", id=2) as g_tab:
      g_tab.select(
            change_tab_gold,
            outputs=[st_layer, st_source_layer]
        )
      st_commit_gold =  gr.State(False)
      output_g_query =  gr.State("")

      input_g_dropdown = gr.Dropdown(label="Tabela referencia",
                            choices=[],
                            visible=False,
                            interactive=False
                            )

      load_g = gr.Button("Ler tabelas consulta")

      input_g = gr.Textbox(label="Por favor, Qual o insight desejado?",
                            lines=2,
                            max_lines=10,
                            placeholder="Escreva aqui...",
                            interactive=True
                          )
      with gr.Row():
        submit_ins_q = gr.Button("Obter resposta")

      output_g_insight = gr.Textbox(label="Resposta gerada",
                                  lines=2,
                                  max_lines=10,
                                  placeholder="Sua resposta sera gerada aqui, abaixo voce podera visualizar os dados",
                                  interactive=True
                                )

      insight_table = gr.DataFrame(label="Saida")

      gold_table = gr.Textbox(label="Informe um nome para a tabela gold que sera criada, sera aplicado um prefixo 'tb_gold_basename_'")

      with gr.Row():
        output_g = gr.Markdown("")
        alerta_g_component = gr.Markdown("")
        publish_gold_btn = gr.Button(value=f"Confirmar", visible=True, interactive=False)


      #actions
      # st_g_table_ref.change(lambda x: gr.update(value=x),
      #                       inputs=st_g_table_ref,
      #                       outputs=input_g_dropdown )

      load_g.click(load_tables,
                 inputs=[st_spitter_operator, st_source_layer],
                 outputs=[input_g_dropdown, st_spitter_operator]
                 )
      input_g_dropdown.change(set_table_dropdown,
                            inputs=[st_spitter_operator, input_g_dropdown,
                                   st_g_table_ref],
                             outputs=st_g_table_ref)

      submit_ins_q.click(generate_insight,
                         inputs=[st_spitter_operator, st_source_layer, input_g],
                         outputs=[output_g_insight,insight_table,output_g_query]
                         #, output_g_query, insight_table]
                         )
      # publish_gold_btn.click(publish_table,
      #                            inputs=[st_spitter_operator,
      #                                    st_commit_gold,
      #                                    st_source_layer,
      #                                    st_layer,
      #                                    output_g_query],
      #                            outputs=[alerta_g_component,
      #                                     st_g_table_ref,
      #                                     input_g,
      #                                     output_g_query])




  return g_tab

#### Explorer

In [165]:
def get_explorer_ui():
    with gr.TabItem("Explorer", id=4) as ex_tab:
      ex_tab.select(
                change_tab_queries,
                inputs=[st_spitter_operator],
                outputs=[st_spitter_operator]
            )
      st_result = gr.State(False)

      with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("Databases e Tabelas")
            update = gr.Button("Atualizar")
            output = gr.Markdown()
            update.click(update_table_list, inputs=st_spitter_operator,  outputs=output)

        with gr.Column(scale=2):
            gr.Markdown("Interface de Queries SQL")
            ex_query = gr.Textbox(
                            label="Escreva sua Query SQL",
                            placeholder="Ex: SELECT * FROM clientes WHERE db.schema.idade > 25",
                            lines=5,
                            elem_id="query-box"
                          )
            run = gr.Button("Executar Query")
            output = gr.DataFrame(label="Resultado", interactive=False)
            run.click(run_query, inputs=[st_spitter_operator, ex_query],
                      outputs=[output,st_result])
    return ex_tab

#### UI

In [None]:


with gr.Blocks(title="# Arquitetura Medalh√£o - Data Spitter - CSV - multi Reads") as spitter_demo:
  gr.Markdown("# Arquitetura Medalh√£o (Bronze / Silver / Gold)")
  gr.Markdown("## Spitter Data Lakehouse Explorer")
  gr.Markdown("## Leia seus dados, configure as operacoes, e maos a obra. Apresentamos suporte a cargas incrementais e de sobrescrita")
  gr.Markdown("As tabelas criadas utilizarao o nome do arquivo como referencia")

  st_spitter_operator = gr.State(None)
  st_layer = gr.State("bronze")
  st_source_layer = gr.State("tmp")
  st_profile = gr.State("")
  st_s_table_ref = gr.State("")
  st_g_table_ref = gr.State("")
  st_s_table_choices = gr.State([])


  with gr.Tabs() as tabs:
    #BronzeLayer
    get_ui_bronze()


    #SilverLayer
    get_ui_silver()

    #GoldLayer
    get_ui_gold()

    #Explorer
    get_explorer_ui()


if __name__ == "__main__":
    spitter_demo.launch(debug=True)
#spitter_demo.launch(debug=True)
#spitter_demo.queue(show_error=True)
#spitter_demo.launch(share=True)



It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://42d91f207fe36d7503.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Databases anexados: ['memory', 'spitter_ducklake']
Tabelas identificadas: ['tb_silver_escolas_gramame_geisel', 'tb_silver_sales', 'tb_silver_vendas']


Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/gradio/queueing.py", line 759, in process_events
    response = await route_utils.call_process_api(
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/gradio/route_utils.py", line 354, in call_process_api
    output = await app.get_blocks().process_api(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/gradio/blocks.py", line 2127, in process_api
    data = await self.postprocess_data(block_fn, result["prediction"], state)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/gradio/blocks.py", line 1904, in postprocess_data
    prediction_value = block.postprocess(prediction_value)
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/gradio/components/dataframe.py", line 453, in postpro

In [186]:
spitter_operator = SpitterOperator()
engine = spitter_operator.analyst.get_query_engine(tables=["tb_silver_escolas_gramame_geisel"])

Databases anexados: ['memory', 'spitter_ducklake']




In [196]:
engine.query("Qual bairro possui mais escolas").metadata["sql_query"]

'SELECT tb_silver_escolas_gramame_geisel.bairro, COUNT(tb_silver_escolas_gramame_geisel.escola) AS num_escolas FROM tb_silver_escolas_gramame_geisel GROUP BY tb_silver_escolas_gramame_geisel.bairro ORDER BY num_escolas DESC'