# example of Packative Inference

this should mirror the deployed API

it requires:

1. packative data spreadsheet (exported from google sheets)
2. deployed weaviate, postgres with packative data pre-loaded
3. env file with the Postgres password and AWS & Azure OpenAI credentials

In [1]:
# Make the in-repo `text2sql` package importable from this notebook.
import sys

src_dir = "../src"
# Guard the append so re-running this cell in the same kernel does not
# keep adding duplicate entries to sys.path.
if src_dir not in sys.path:
    sys.path.append(src_dir)

from text2sql import hello

# Smoke test: reaching this line means the import above succeeded.
print(hello.message)

hello, world!


In [2]:
import json
import os

import numpy as np
import pandas as pd
import tqdm

from dotenv import load_dotenv
load_dotenv()

True

## load data

In [3]:
from text2sql.data import PostgresDataset
from text2sql.engine.embeddings import BedrockCohereEmbedder
from text2sql.engine.retrieval import WeaviateRetriever
from text2sql.engine.prompts import LegacyFewShotPromptFormatter
from text2sql.engine.generation import AzureGenerator, BedrockGenerator
from text2sql.engine.generation.postprocessing import extract_first_code_block
from text2sql.evaluation.metrics import (
    get_soft_f1_score,
    get_intent_match,
    get_sql_match,
    get_execution_match
)

  from tqdm.autonotebook import tqdm, trange


### create pipeline modules

In [4]:
# Build the "dataset" reader on top of the local Postgres instance.
# The password is read from the POSTGRES_PASSWORD environment variable
# (None if it is not set).
postgres_dataset_args = (
    "localhost",
    5432,
    "genapostgre",
    os.getenv("POSTGRES_PASSWORD"),
)
packative_dataset = PostgresDataset(*postgres_dataset_args)

In [5]:
# Embedder settings are gathered in one mapping, then handed to the constructor.
embedder_settings = {
    "region_name": "us-east-1",
    "model": "cohere.embed-multilingual-v3",
    "input_type": "clustering",
    "batch_size": 8,
}
embedder = BedrockCohereEmbedder(**embedder_settings)

In [6]:
# Retriever backed by the local Weaviate instance; the collection name is the
# one that gets populated with the train examples further down.
retriever_settings = dict(
    host="localhost",
    port=8081,
    grpc_port=50051,
    collection_name="PackativeQueriesCohereClustering",
)
retriever = WeaviateRetriever(**retriever_settings)

In [7]:
# Schema description that is later passed to the prompt formatter for every query.
schema_description = packative_dataset.describe_database_schema(
    "genapostgre",
    mode="basic",
)

# Prompt formatter for few-shot learning prompts: `few_shot_query_key` and
# `few_shot_target_key` name the fields of each few-shot example to use.
formatter_settings = {
    "database_type": "postgres",
    "few_shot_query_key": "nl_ko_query",
    "few_shot_target_key": "sql_query",
}
formatter = LegacyFewShotPromptFormatter(**formatter_settings)

In [8]:
# Create the LLM generator (Azure OpenAI). Model name, key, endpoint and API
# version all come from environment variables.

# Alternative Bedrock-hosted generator, kept for reference:
# model = "meta.llama3-1-8b-instruct-v1:0"
# generator = BedrockGenerator(
#     region_name="us-west-2",
#     model=model,
#     post_func=extract_first_code_block,
# )

model = os.environ.get("AZURE_OPENAI_GEN_MODEL")
azure_settings = {
    "api_key": os.environ.get("AZURE_OPENAI_API_KEY"),
    "azure_endpoint": os.environ.get("AZURE_OPENAI_API_ENDPOINT"),
    "api_version": os.environ.get("AZURE_OPENAI_API_VERSION"),
    "model": model,
}
# `extract_first_code_block` is supplied as the generator's post-processing function.
generator = AzureGenerator(post_func=extract_first_code_block, **azure_settings)
print(f"using '{model}'")

using 'gena-4o'


### load data

In [9]:
# Read the train and eval CSVs; each becomes a list with one dict per row.
packative_train_file = "./data/packative_nl2sql_ko_train_revised_20241025.csv"
packative_test_file = "./data/packative_nl2sql_eval_20241029(final).csv"
packative_train_data, packative_test_data = (
    pd.read_csv(csv_path).to_dict(orient="records")
    for csv_path in (packative_train_file, packative_test_file)
)

In [10]:
# Embed the train queries, caching the embeddings in a .npy file so later runs
# can skip the embedding step.
packative_train_embeddings_file = "./data/packative_query_cohere_embeddings.npy"
train_queries = [example["nl_ko_query"] for example in packative_train_data]

train_embeddings = None
if os.path.isfile(packative_train_embeddings_file):
    print(f"loading train embeddings from existing file '{packative_train_embeddings_file}'")
    train_embeddings = np.load(packative_train_embeddings_file)
    # The cache file name does not encode which train file produced it, so a
    # stale cache would otherwise be paired with the wrong rows downstream.
    # NOTE(review): assumes one embedding per query along the first axis —
    # confirm against `embedder.embed`.
    if len(train_embeddings) != len(train_queries):
        print(
            f"cached embeddings ({len(train_embeddings)}) do not match "
            f"train queries ({len(train_queries)}); regenerating"
        )
        train_embeddings = None

if train_embeddings is None:
    print(f"generating train embeddings and saving to '{packative_train_embeddings_file}'")
    train_embeddings = embedder.embed(train_queries, verbose=True)
    np.save(packative_train_embeddings_file, train_embeddings)

loading train embeddings from existing file './data/packative_query_cohere_embeddings.npy'


In [11]:
# Load the train examples and their embeddings into the retriever's collection.
# The call is the last expression of the cell, so its return value is displayed.
populate_kwargs = {
    "embeddings": train_embeddings,
    "data": packative_train_data,
}
retriever.populate_collection(**populate_kwargs)

100%|██████████| 3702/3702 [00:00<00:00, 4456.49it/s]


{'collection_name': 'PackativeQueriesCohereClustering',
 'properties': {'invertedIndexConfig': {'bm25': {'b': 0.75, 'k1': 1.2},
   'cleanupIntervalSeconds': 60,
   'indexNullState': False,
   'indexPropertyLength': False,
   'indexTimestamps': False,
   'stopwords': {'preset': 'en'}},
  'multiTenancyConfig': {'enabled': False,
   'autoTenantCreation': False,
   'autoTenantActivation': False},
  'properties': [{'name': 'no_sql_template',
    'dataType': ['int'],
    'indexFilterable': True,
    'indexSearchable': False,
    'indexRangeFilters': False,
    'tokenization': None,
    'moduleConfig': {'none': {}}},
   {'name': 'sql_template_type',
    'dataType': ['text'],
    'indexFilterable': True,
    'indexSearchable': True,
    'indexRangeFilters': False,
    'tokenization': 'word',
    'moduleConfig': {'none': {}}},
   {'name': 'sql_template',
    'dataType': ['text'],
    'indexFilterable': True,
    'indexSearchable': True,
    'indexRangeFilters': False,
    'tokenization': 'word'

### WIP: predict

todo: manage this with a class, and improve throughput with batched and/or threaded inference

In [12]:
# inference all test samples through "pipeline"
# Each sample goes through retrieve -> format -> generate -> validate, one at a
# time; `test_results` collects one dict per test sample, in input order.
test_results = []
for test_sample in tqdm.tqdm(packative_test_data):
    sample_query = test_sample["nl_ko_query"]
    # get similar queries
    few_shot_examples = retriever.query(embedder.embed(sample_query), top_k=3)
    # create chat messages
    messages = formatter.generate_messages(
        schema_description=schema_description,
        query=sample_query,
        few_shot_examples=few_shot_examples,
    )
    # inference (annotated as possibly None; it is passed on to validation as-is)
    prediction: str | None = generator.generate(messages)
    # validate
    results: dict = packative_dataset.validate_query("genapostgre", prediction)
    # todo: repair & re-evaluate, when implemented
    # save: work on a copy so the loaded test data is left untouched
    output = test_sample.copy()
    output["prediction"] = prediction
    # keys in `results` replace any same-named keys copied from the test sample
    output.update(results)
    test_results.append(output)

  datetime_now = datetime.datetime.utcnow()
100%|██████████| 146/146 [04:16<00:00,  1.76s/it]


### WIP: Evaluate

todo: manage this with a class

In [13]:
def _column(rows, key):
    """Collect `key` from every row dict (None where the key is absent)."""
    return [row.get(key) for row in rows]


# Targets come from the loaded test data; their execution results are JSON-decoded.
test_target_sql = _column(packative_test_data, "sql_query")
test_target_executions = [
    json.loads(raw_result)
    for raw_result in _column(packative_test_data, "execution_result")
]
# Predictions come from the inference results and are used as stored.
test_predicted_sql = _column(test_results, "prediction")
test_predicted_executions = _column(test_results, "execution_result")

In [14]:
# Score every sample by position; both metrics iterate over the same index range.
sample_indices = range(len(test_predicted_executions))
sql_match_scores = [
    get_sql_match(test_predicted_sql[idx], test_target_sql[idx])
    for idx in sample_indices
]
execution_match_scores = [
    get_execution_match(test_predicted_executions[idx], test_target_executions[idx])
    for idx in sample_indices
]

One fix for LT09 not applied, it would re-cause the same error.


Error in formatting query: None. Returning original query.
Error in formatting query: None. Returning original query.


In [15]:
# Report the mean of each per-sample match score.
mean_sql_match = np.mean(sql_match_scores)
print(f"mean sql  match: {mean_sql_match}")
mean_exec_match = np.mean(execution_match_scores)
print(f"mean exec match: {mean_exec_match}")

mean sql  match: 0.3561643835616438
mean exec match: 0.363013698630137


In [16]:
# these are buggy, so 0.0 if fail
# Because a failing metric call is scored as 0.0, the means computed from these
# lists count metric failures as misses.
soft_f1_scores = []
intent_scores = []
for i in range(len(test_predicted_executions)):
    try:
        soft_f1_scores.append(get_soft_f1_score(test_predicted_executions[i], test_target_executions[i]))
    except Exception:
        # `Exception` rather than a bare `except`, so interrupting the kernel
        # (KeyboardInterrupt) is not swallowed and recorded as a 0.0 score.
        soft_f1_scores.append(0.0)
    try:
        intent_scores.append(get_intent_match(test_predicted_executions[i], test_target_executions[i]))
    except Exception:
        intent_scores.append(0.0)

In [17]:
# Report the mean soft-F1 and intent scores over all test samples.
mean_soft_f1 = np.mean(soft_f1_scores)
print(f"mean soft f1: {mean_soft_f1}")
mean_intent = np.mean(intent_scores)
print(f"mean intent : {mean_intent}")

mean soft f1: 0.6104236706976434
mean intent : 0.6027397260273972
