# example of Packative Inference

this should mirror the deployed API

it requires:

1. packative data spreadsheet (exported from google sheets)
2. deployed weaviate, postgres with packative data pre-loaded
3. envfile with postgres password, aws & azure openai credentials

In [1]:
import sys
# add ../src to the module search path before importing `text2sql`
sys.path.append("../src")
from text2sql import hello
# smoke test: printing the message confirms the import worked
print(hello.message)

hello, world!


In [2]:
import json
import os

import numpy as np
import pandas as pd

from dotenv import load_dotenv
load_dotenv()

True

## load data

### create postgres dataset

In [3]:
from text2sql.data import PostgresDataset

In [4]:
# dataset wrapper for the postgres instance at localhost:5432;
# the password is read from the POSTGRES_PASSWORD environment variable
# NOTE(review): the third positional argument looks like a user or database name — confirm against PostgresDataset
packative_dataset = PostgresDataset(
    "localhost",
    5432,
    "genapostgre",
    os.getenv("POSTGRES_PASSWORD"),
)

In [5]:
# run a query on the database to test: count the distinct `bank` values in `bank_account`
packative_dataset.query_database("genapostgre", "SELECT COUNT(DISTINCT bank) FROM bank_account;")

[{'count': 10}]

### load csv data from spreadsheet

In [6]:
# CSV files exported from the packative spreadsheet: training examples and the evaluation set
packative_train_file = "./data/packative_nl2sql_ko_train_revised_20241025.csv"
packative_test_file = "./data/packative_nl2sql_eval_20241029(final).csv"

In [7]:
# read each CSV into a dataframe, then convert it to a list with one dict per row
train_frame = pd.read_csv(packative_train_file)
packative_train_data = train_frame.to_dict(orient="records")
test_frame = pd.read_csv(packative_test_file)
packative_test_data = test_frame.to_dict(orient="records")

In [8]:
# display the first training record to check the column layout
packative_train_data[0]

{'no_sql_template': 1,
 'sql_template_type': 'sql_syntax',
 'sql_template': 'SELECT * FROM {table1} FULL OUTER JOIN {table2} ON {table1}.{column3:uuid} = {table2}.{column4:uuid};',
 'sql_query': 'SELECT * FROM customer FULL OUTER JOIN address ON customer.id = address.customer_id;',
 'nl_ko_query': '고객 테이블과 주소 테이블을 조인하여 모든 세부 정보를 제공해 주세요.'}

In [9]:
# display the first test record to check the column layout
packative_test_data[0]

{'no_sql_template': 1,
 'sql_template_type': 'sql_syntax',
 'sql_query': "SELECT custom_field.field_name, custom_field.field_type, custom_field_value.value FROM custom_field INNER JOIN custom_field_value ON custom_field.id = custom_field_value.custom_field_id WHERE field_name like '%email%';",
 'nl_ko_query': "사용자 정의 필드 이름에 'email'이 들어가는 필드 이름과 필드 타입, 필드 값을 보여주세요",
 'nl_en_query': "Please show the field names, types, and values for custom fields that contain 'email' in their name.",
 'execution_result': '[{"field_name": "email_53", "field_type": "string", "value": "Value 2"}, {"field_name": "email_83", "field_type": "string", "value": "Value 10"}, {"field_name": "email_3", "field_type": "string", "value": "Value 4_4"}, {"field_name": "email_63", "field_type": "string", "value": "Value 10_10"}, {"field_name": "email_13", "field_type": "string", "value": "Value 1_21"}, {"field_name": "email", "field_type": "string", "value": "Value 6_26"}, {"field_name": "email_73", "field_type": "string

## embed the training data for retrieval

API v2 is using bedrock cohere "multilingual v3" in "clustering" mode

In [10]:
from text2sql.engine.embeddings import BedrockCohereEmbedder

  from tqdm.autonotebook import tqdm, trange


In [11]:
# create embedder: cohere multilingual v3 on bedrock, using the "clustering" input type
embedder = BedrockCohereEmbedder(
    region_name="us-east-1",
    model="cohere.embed-multilingual-v3",
    input_type="clustering",
    batch_size=8,
)

In [12]:
# embed the korean training queries, using a .npy file on disk as a cache:
# reuse the file when it exists, otherwise compute the embeddings and write it
packative_train_embeddings_file = "./data/packative_query_cohere_embeddings.npy"
train_queries = [record["nl_ko_query"] for record in packative_train_data]
if os.path.isfile(packative_train_embeddings_file):
    print(f"loading train embeddings from existing file '{packative_train_embeddings_file}'")
    train_embeddings = np.load(packative_train_embeddings_file)
else:
    print(f"generating train embeddings and saving to '{packative_train_embeddings_file}'")
    train_embeddings = embedder.embed(train_queries, verbose=True)
    np.save(packative_train_embeddings_file, train_embeddings)

generating train embeddings and saving to './data/packative_query_cohere_embeddings.npy'


  datetime_now = datetime.datetime.utcnow()
100%|██████████| 463/463 [03:40<00:00,  2.10it/s]


## create retriever

the cells below use the weaviate retriever (the deployed weaviate instance listed in the requirements above); a local retriever can be used instead for ease of setup

In [13]:
from text2sql.engine.retrieval import WeaviateRetriever

In [14]:
# retriever for the weaviate instance on localhost (port 8081, grpc port 50051),
# reading from / writing to the "PackativeQueriesCohereClustering" collection
retriever = WeaviateRetriever(
    host="localhost",
    port=8081,
    grpc_port=50051,
    collection_name="PackativeQueriesCohereClustering",
)

In [15]:
# fill the collection with the training embeddings and their matching training records
retriever.populate_collection(
    embeddings=train_embeddings,
    data=packative_train_data,
)

100%|██████████| 3702/3702 [00:00<00:00, 5142.56it/s]


{'collection_name': 'PackativeQueriesCohereClustering',
 'properties': {'invertedIndexConfig': {'bm25': {'b': 0.75, 'k1': 1.2},
   'cleanupIntervalSeconds': 60,
   'indexNullState': False,
   'indexPropertyLength': False,
   'indexTimestamps': False,
   'stopwords': {'preset': 'en'}},
  'multiTenancyConfig': {'enabled': False,
   'autoTenantCreation': False,
   'autoTenantActivation': False},
  'properties': [{'name': 'no_sql_template',
    'dataType': ['int'],
    'indexFilterable': True,
    'indexSearchable': False,
    'indexRangeFilters': False,
    'tokenization': None,
    'moduleConfig': {'none': {}}},
   {'name': 'sql_template_type',
    'dataType': ['text'],
    'indexFilterable': True,
    'indexSearchable': True,
    'indexRangeFilters': False,
    'tokenization': 'word',
    'moduleConfig': {'none': {}}},
   {'name': 'sql_template',
    'dataType': ['text'],
    'indexFilterable': True,
    'indexSearchable': True,
    'indexRangeFilters': False,
    'tokenization': 'word'

In [16]:
# sanity test: look up a query that is already in the collection;
# the closest hit should be that same query
search_query = packative_train_data[0]["nl_ko_query"]
search_embedding = embedder.embed(search_query)
search_results = retriever.query(search_embedding, top_k=3)
print(json.dumps(search_results, indent=2))

[
  {
    "id": "58277bd0-f83a-572a-81d8-367a58785015",
    "distance": 3.737211227416992e-05,
    "data": {
      "sql_template_type": "sql_syntax",
      "sql_template": "SELECT * FROM {table1} FULL OUTER JOIN {table2} ON {table1}.{column3:uuid} = {table2}.{column4:uuid};",
      "nl_ko_query": "\uace0\uac1d \ud14c\uc774\ube14\uacfc \uc8fc\uc18c \ud14c\uc774\ube14\uc744 \uc870\uc778\ud558\uc5ec \ubaa8\ub4e0 \uc138\ubd80 \uc815\ubcf4\ub97c \uc81c\uacf5\ud574 \uc8fc\uc138\uc694.",
      "sql_query": "SELECT * FROM customer FULL OUTER JOIN address ON customer.id = address.customer_id;",
      "no_sql_template": 1
    }
  },
  {
    "id": "32af7e18-fdfd-5e1e-a760-e22260bf1e02",
    "distance": 0.18406933546066284,
    "data": {
      "no_sql_template": 1,
      "sql_template_type": "sql_syntax",
      "nl_ko_query": "\uc8fc\ubb38 \ud14c\uc774\ube14\uacfc \uacb0\uc81c \ud14c\uc774\ube14\uc744 \uc870\uc778\ud558\uc5ec \ubaa8\ub4e0 \uc815\ubcf4\ub97c \ubcf4\uc5ec\uc8fc\uc138\uc694.",
      

## Database schema, prompt formatter

by default, the API uses "basic" formatting in DAIL-SQL type format

In [17]:
from text2sql.engine.prompts import LegacyFewShotPromptFormatter

In [18]:
# get formatted database schema description from the dataset, using the "basic" mode
schema_description = packative_dataset.describe_database_schema("genapostgre", mode="basic")
# preview only the first 1000 characters of the description
print(schema_description[:1000])

table 'email' with columns: id , created_at , updated_at , deleted_at , metadata , to , cc , bcc , subject , language , body , is_opened , first_opened_at , last_opened_at , entity_type , entity_id , tenant_id , creator_id , modifier_id
table 'api_key' with columns: id , created_at , updated_at , deleted_at , name , value , tenant_id , user_id , metadata
table 'business_registration' with columns: id , created_at , updated_at , deleted_at , registration_number , company_name , representative_name , date_of_issuance , date_of_commencement , business_type , business_item , email , address , tenant_id , document_id , business_registration_type , metadata
table 'design_comment_thread' with columns: id , created_at , updated_at , deleted_at , page , tenant_id , order_item_design_id , user_id , customer_id , position_x , position_y , metadata
table 'customer' with columns: id , created_at , updated_at , deleted_at , number , name , website , company_size , industry , reference , converted_da

In [19]:
# create a prompt formatter to generate few-shot learning prompts;
# each few-shot example contributes its `nl_ko_query` as the query and its `sql_query` as the target
formatter = LegacyFewShotPromptFormatter(
    database_type="postgres",
    few_shot_query_key="nl_ko_query",
    few_shot_target_key="sql_query",
)

In [20]:
# build chat messages for a placeholder query, reusing the hits from the
# sanity-test search (`search_results`) as the few-shot examples
messages = formatter.generate_messages(
    schema_description=schema_description,
    query="how much wood could a woodchuck chuck if a woodchuck could chuck wood?",
    few_shot_examples=search_results,
)

In [21]:
# print(json.dumps(messages, indent=2))

## LLM inference

the `LegacyFewShotPromptFormatter` automatically requests that the output is in a markdown code block.

so, we can pass the postprocessing function `extract_first_code_block` as `post_func` to remove the code block parts.

In [22]:
from text2sql.engine.generation import AzureGenerator, BedrockGenerator
from text2sql.engine.generation.postprocessing import extract_first_code_block

In [23]:
# model = "meta.llama3-1-8b-instruct-v1:0"
# generator = BedrockGenerator(
#     region_name="us-west-2",
#     model=model,
#     post_func=extract_first_code_block,
# )

In [24]:
# build the Azure OpenAI generator from settings in the environment;
# `extract_first_code_block` post-processes each generation
model = os.getenv("AZURE_OPENAI_GEN_MODEL")
generator = AzureGenerator(
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    azure_endpoint=os.getenv("AZURE_OPENAI_API_ENDPOINT"),
    api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
    model=model,
    post_func=extract_first_code_block,
)

print(f"using '{model}'")

using 'gena-4o'


### test inference on a training sample

first, load the sample, then do retrieval and generation.

then, try running the query on the database.

In [25]:
# take the first training example and show its korean query followed by its reference SQL
training_sample = packative_train_data[0]
sample_query, sample_sql = training_sample["nl_ko_query"], training_sample["sql_query"]
print(sample_query, sample_sql, sep="\n")

고객 테이블과 주소 테이블을 조인하여 모든 세부 정보를 제공해 주세요.
SELECT * FROM customer FULL OUTER JOIN address ON customer.id = address.customer_id;


In [26]:
# retrieve few-shot examples for the sample query. The sample comes from the
# training set, so request one extra hit and drop the first one (the query
# itself) so that its target SQL does not leak into the prompt.
few_shot_examples = retriever.query(embedder.embed(sample_query), top_k=4)[1:]  # it's training so remove the real query

# create chat messages from the schema, the query and the examples retrieved above
# (previously this passed the stale `search_results` from the sanity-test cell,
# which still contained the sample itself)
messages = formatter.generate_messages(
    schema_description=schema_description,
    query=sample_query,
    few_shot_examples=few_shot_examples,
)

# this is the final user prompt
print(messages[-1]["content"])

today's date: Tuesday, November 12, 2024
text query: 고객 테이블과 주소 테이블을 조인하여 모든 세부 정보를 제공해 주세요.
please give me a postgres SQL query as markdown code block.


In [27]:
# do LLM inference
prediction = generator.generate(messages)  # need better cleaning later
# show the reference SQL and the model output one after the other for comparison
print(f"target:\n{sample_sql}\n\n")
print(f"prediction:\n{prediction}")

target:
SELECT * FROM customer FULL OUTER JOIN address ON customer.id = address.customer_id;


prediction:
SELECT * FROM customer JOIN address ON customer.id = address.customer_id;


In [28]:
# run the predicted SQL against the database and report the outcome
results = packative_dataset.validate_query("genapostgre", prediction)
print(f"validated: {results.get('validated', False)}")
print(f"message: {results.get('message', 'none')}")
print("first execution result:")
# fall back to a placeholder when the result is missing, None or empty,
# so that taking the first row cannot fail on a query that returned nothing
execution_rows = results.get("execution_result") or ["none"]
print(execution_rows[0])

validated: True
message: ok
first execution result:
{'id': UUID('ef02ee58-44d9-4a8c-a125-dfd1b511f8a1'), 'created_at': datetime.datetime(2023, 10, 10, 12, 3), 'updated_at': datetime.datetime(2023, 10, 10, 12, 3), 'deleted_at': None, 'number': '4', 'name': 'Riverside Bungalow', 'website': 'www.globalfinance.com_73', 'company_size': 'Large', 'industry': 'Finance', 'reference': None, 'converted_date': datetime.datetime(2023, 9, 25, 10, 45), 'secret_key': 'sk-54321_73', 'tenant_id': UUID('0063f7f8-eeac-4014-905f-d8086a588762'), 'creator_id': UUID('d2c3aefd-478e-4809-8afa-19a4f957e130'), 'modifier_id': UUID('d2c3aefd-478e-4809-8afa-19a4f957e130'), 'assignee_id': UUID('d2c3aefd-478e-4809-8afa-19a4f957e130'), 'business_registration_id': UUID('c63e5259-0f9e-48e8-8633-144ab84312da'), 'converted_from_id': UUID('bfb76548-5b32-43d9-a6a2-3c0d86669387'), 'type': 'Partner', 'metadata': {}, 'line1': '321 Bayou St', 'line2': None, 'city': 'Houston', 'state': 'TX', 'zip_code': '77001', 'primary': False,

## WIP: predict (dev set example)

todo: make some sort of `Experiment` or `Runner` class to do this

todo: improve throughput with batched and/or threaded inference

In [29]:
# inference all test samples: retrieve few-shot examples, build the prompt,
# generate SQL, validate it against the database, and collect one output row per sample
import tqdm

test_results = []
for test_sample in tqdm.tqdm(packative_test_data):
    sample_query = test_sample["nl_ko_query"]
    sample_sql = test_sample["sql_query"]  # target SQL; not used for inference
    # get similar queries
    few_shot_examples = retriever.query(embedder.embed(sample_query), top_k=3)
    # create chat messages
    messages = formatter.generate_messages(
        schema_description=schema_description,
        query=sample_query,
        few_shot_examples=few_shot_examples,
    )
    # inference
    prediction: str | None = generator.generate(messages)
    # validate
    results: dict = packative_dataset.validate_query("genapostgre", prediction)
    # todo: repair & re-evaluate, when implemented
    # save
    output = test_sample.copy()
    # the test sheet already carries an "execution_result" column holding the
    # *target* result. Move it aside so that, if validation returns no
    # "execution_result" key, the target is not read back as the prediction's result.
    if "execution_result" in output:
        output["target_execution_result"] = output.pop("execution_result")
    output["prediction"] = prediction
    output.update(results)
    test_results.append(output)

100%|██████████| 146/146 [04:02<00:00,  1.66s/it]


## WIP: Evaluation

wrap in some sort of `Evaluator` class to do this, or call from `Experiment` (above)

In [30]:
from text2sql.evaluation.metrics import (
    get_soft_f1_score,
    get_intent_match,
    get_sql_match,
    get_execution_match
)

In [31]:
# targets come from the test sheet (execution results are stored there as JSON strings);
# predictions come from the rows collected by the inference loop
test_target_sql = [target_row.get("sql_query") for target_row in packative_test_data]
test_target_executions = [
    json.loads(target_row.get("execution_result")) for target_row in packative_test_data
]
test_predicted_sql = [result_row.get("prediction") for result_row in test_results]
test_predicted_executions = [result_row.get("execution_result") for result_row in test_results]

In [32]:
# score each prediction against its target, pairing the parallel lists with zip
# instead of indexing both by the length of the predicted-executions list
sql_match_scores = [
    get_sql_match(predicted_sql, target_sql)
    for predicted_sql, target_sql in zip(test_predicted_sql, test_target_sql)
]
execution_match_scores = [
    get_execution_match(predicted_execution, target_execution)
    for predicted_execution, target_execution in zip(test_predicted_executions, test_target_executions)
]

One fix for LT09 not applied, it would re-cause the same error.


Error in formatting query: None. Returning original query.
Error in formatting query: None. Returning original query.
Error in formatting query: None. Returning original query.


In [33]:
# report the mean of the per-sample SQL-match and execution-match scores
print(f"mean sql  match: {np.mean(sql_match_scores)}")
print(f"mean exec match: {np.mean(execution_match_scores)}")

mean sql  match: 0.363013698630137
mean exec match: 0.3356164383561644


In [34]:
# these are buggy, so 0.0 if fail
# `except Exception` (rather than a bare `except`) keeps the best-effort 0.0
# fallback while still letting KeyboardInterrupt / SystemExit stop the loop
soft_f1_scores = []
intent_scores = []
for predicted_execution, target_execution in zip(test_predicted_executions, test_target_executions):
    try:
        soft_f1_scores.append(get_soft_f1_score(predicted_execution, target_execution))
    except Exception:
        soft_f1_scores.append(0.0)
    try:
        intent_scores.append(get_intent_match(predicted_execution, target_execution))
    except Exception:
        intent_scores.append(0.0)

In [35]:
# report the mean soft-F1 and intent scores (samples whose metric call failed count as 0.0)
print(f"mean soft f1: {np.mean(soft_f1_scores)}")
print(f"mean intent : {np.mean(intent_scores)}")

mean soft f1: 0.5657182999648754
mean intent : 0.5753424657534246
