In [1]:
from langchain_community.utilities import SQLDatabase

# Open the SQLite database that backs every query in this notebook.
db = SQLDatabase.from_uri("sqlite:///../data/catastici.db")

# Smoke test: show the dialect and the usable tables, then fetch one row.
print(db.dialect, db.get_usable_table_names(), sep="\n")
db.run("SELECT * FROM catastici LIMIT 1;")

sqlite
['catastici']


"[(3183, 'liberal', 'campi', 'casa e bottega da barbier', 70, 'campo vicino alla chiesa')]"

In [2]:
import pandas as pd
# Load the generated queries to be evaluated.
query_res = pd.read_csv(filepath_or_buffer='./test_data_generated.csv')

In [3]:
import ast
def clean_query(sql_query):
    """Parse a stringified list of SQL candidates and trim each to one statement.

    `sql_query` is the text of a Python list literal. Every element is cut at
    its first ';' (if it has one) and a single ';' is appended.

    Returns:
        A list with one trimmed string per element of the parsed input.
    """
    candidates = ast.literal_eval(sql_query)

    trimmed = []
    for candidate in candidates:
        # Keep only the text before the first ';'.
        first_statement, _, _ = candidate.partition(';')
        trimmed.append(first_statement + ';')
    return trimmed

def check_sql_executability(query, db):
    """Run `query` against `db` and report whether it executed.

    Args:
        query: SQL text to execute.
        db: Object exposing a ``run(query)`` method.

    Returns:
        Whatever ``db.run(query)`` returns on success, or the sentinel string
        ``"ERROR"`` if the call raises an ``Exception``.
    """
    try:
        return db.run(query)
    except Exception:
        # Deliberately broad: any failure to run counts as "not executable".
        # Unlike a bare ``except:``, this still lets KeyboardInterrupt and
        # SystemExit propagate, so the evaluation loop can be interrupted.
        return "ERROR"

In [4]:
# clean output
# For each row, keep the first candidate query that executes and record its
# answer; if none executes, keep all candidates (newline-joined) and mark the
# answer as "ERROR".
# NOTE: this overwrites `generated_query` with plain SQL text, so re-running
# this cell without reloading `query_res` will fail inside `clean_query`
# (the column no longer holds a list literal).
for idx, row in query_res.iterrows():
    query_list_clean = clean_query(row['generated_query'])
    final_out = None
    answer = "ERROR"  # default until some candidate executes
    for out in query_list_clean:
        result = check_sql_executability(out, db)
        if result != "ERROR":
            final_out, answer = out, result
            break
    if final_out is None:
        final_out = '\n'.join(query_list_clean)
    query_res.loc[idx, 'generated_answer'] = answer
    query_res.loc[idx, 'generated_query'] = final_out

# Check

In [5]:
# Label rows whose generated query could not be executed, then rows whose
# answer equals the reference answer. The exact-match assignment runs last,
# so it wins for any row where both masks are true.
failed_mask = query_res['generated_answer'] == 'ERROR'
exact_match_mask = query_res['generated_answer'] == query_res['true_answer']
query_res.loc[failed_mask, 'output'] = 'ERROR'
query_res.loc[exact_match_mask, 'output'] = 'EM'

In [6]:
# Rows per label so far (rows with no label yet are not counted).
query_res.output.value_counts()

output
EM       295
ERROR      4
Name: count, dtype: int64

In [7]:
# import os
# os.environ["OPENAI_API_KEY"] = ''

from openai import OpenAI
# No key is passed explicitly; the commented-out lines above suggest it is
# expected in the OPENAI_API_KEY environment variable — TODO confirm.
client = OpenAI()

In [8]:
# System prompt for the LLM judge: it is told it will receive the schema, the
# question, the reference query and the generated query, and must answer
# YES or NO on whether the generated query is acceptable.
system_prompt_1 = """You are an assistant that is an expert in assessing Sqlite SQL queries.
You are given a Database Schema, a question, a true SQL query that answers the question and generated SQL query to answer the question.
Compare the true and generated SQL queries and evaluate if the generated query answers the question and acceptable.
Note that true and generated SQL queries are not the same, but both may be accapted as long as they both answer the question.
Respond with YES if generated query is acceptable, or NO if generated query does not answer the question.
"""

# system_prompt_2 = """You are an assistant that is an expert in assessing Sqlite SQL queries.
# You are given a Database Schema, a question and an SQL query to answer the question.
# Look at the SQL query and assess if the query answers the question and acceptable.
# Respond with YES if the query is acceptable, or NO if generated query does not answer the question.
# """

# User-message template for the judge. The `{question}`, `{true_query}` and
# `{generated_query}` placeholders are filled in with `str.format` per row.
prompt = """### Database Schema
CREATE TABLE [catastici]
(
    [Owner_ID] INT, -- Unique ID of each owner of the property
    [Owner_First_Name] NVARCHAR(30), -- First name of the owner of the property
    [Owner_Family_Name] NVARCHAR(30), -- Family name of the owner of the property
    [Property_Type] NVARCHAR(100), -- Specific type of the property given in Italian. For example, "casa", "bottega da barbier", "bottega da fruttariol".
    [Rent_Income] INT, -- Rent price of the property that the owner receives as income, given in Venice ancient gold coin ducato.
    [Property_Location] NVARCHAR(100) -- Ancient approximate toponym of the property given in Italian.
);

### Question
{question}

### True SQL query
{true_query}

### Generated SQL query
{generated_query}

### Response
"""

In [9]:
# Take one row that has no label yet and show its question and both queries.
unlabelled_rows = query_res[query_res.output.isna()]
example = unlabelled_rows.iloc[10]
for field in ('question', 'true_query', 'generated_query'):
    print(example[field])

What are the names of everyone holding property ownership?
SELECT DISTINCT "Owner_First_Name", "Owner_Family_Name" 
FROM catastici

SELECT "Owner_First_Name", "Owner_Family_Name"
FROM catastici;


In [11]:
# Send the single example through the judge prompt and print the model's reply.
judge_messages = [
    {"role": "system", "content": system_prompt_1},
    {
        "role": "user",
        "content": prompt.format(
            question=example['question'],
            true_query=example['true_query'],
            generated_query=example['generated_query'],
        ),
    },
]
response = client.chat.completions.create(
    model="gpt-4-turbo-preview",
    messages=judge_messages,
)
print(response.choices[0].message.content)

NO


In [12]:
# Ask the judge model about every row that has no label yet and keep the raw
# reply text, keyed by the row's index in `query_res`.
responses_1 = {}
pending_rows = query_res[query_res.output.isna()]
for idx, r in pending_rows.iterrows():
    user_message = prompt.format(
        question=r['question'],
        true_query=r['true_query'],
        generated_query=r['generated_query'],
    )
    response = client.chat.completions.create(
        model="gpt-4-turbo-preview",
        messages=[
            {"role": "system", "content": system_prompt_1},
            {"role": "user", "content": user_message},
        ],
    )
    responses_1[idx] = response.choices[0].message.content

In [59]:
# responses_2 = {}
# for idx, r in query_res[query_res.output.isna()].iterrows():
#     response = client.chat.completions.create(
#         model="gpt-4-turbo-preview",
#         messages=[
#             {"role": "system", "content": system_prompt_2},
#             {"role": "user", "content": prompt.format(question=r['question'],true_query=r['true_query'],generated_query=r['generated_query'])}
#         ]
#     )
#     responses_2[idx] = response.choices[0].message.content

In [13]:
# Copy each judge reply into the `output` column of the row it belongs to.
for row_label in responses_1:
    query_res.loc[row_label, 'output'] = responses_1[row_label]

In [14]:
# Rows per label after the judge replies have been written back.
query_res['output'].value_counts()

output
EM       295
NO       152
YES       49
ERROR      4
Name: count, dtype: int64

In [26]:
# Number of questions per (label, level_len) pair.
query_res.groupby(by=['output', 'level_len']).question_id.count()

output  level_len
EM      0            270
        1             25
ERROR   0              3
        1              1
NO      0             86
        1             66
YES     0             41
        1              8
Name: question_id, dtype: int64

In [29]:
# Number of questions per (label, level_nest) pair.
query_res.groupby(by=['output', 'level_nest']).question_id.count()

output  level_nest
EM      0             264
        1              31
ERROR   0               3
        1               1
NO      0              83
        1              69
YES     0              45
        1               4
Name: question_id, dtype: int64

**Results**
- Wrong -> 201<br>
    - True -> 49
    - False -> 152
- Error -> 4<br>
- Exact Match -> 295
- Total correct (Exact Match + True) -> **344**

In [31]:
# Dump every row labelled "ERROR": question, evidence, reference SQL and the
# generated SQL, with blank lines separating consecutive rows.
error_rows = query_res.loc[query_res['output'] == "ERROR"]
for idx, row in error_rows.iterrows():
    print(
        f"{idx} - {row['question_id']}",
        f"Question: {row['question']}",
        f"Evidence: {row['evidence']}",
        "",
        'True SQL:',
        row['true_query'],
        'Generated SQL:',
        row['generated_query'],
        '\n\n',
        sep='\n',
    )

212 - 42
Question: What's the standard income derived from leasing properties excluding those in "segue rughetta verso calle del fontico"?
Evidence: "Property_Location" = "segue rughetta verso calle del fontico"

True SQL:
SELECT AVG("Rent_Income") 
FROM catastici 
WHERE "Property_Location" != 'segue rughetta verso calle del fontico';
Generated SQL:
SELECT STDDEV("Rent_Income") 
FROM catastici 
WHERE "Property_Location" != 'segue rughetta verso calle del fontico';
SELECT STDDEV("Rent_Income")
FROM catastici
WHERE "Property_Location" != 'segue rughetta verso calle del fontico';
SELECT STDDEV("Rent_Income") 
FROM catastici 
WHERE "Property_Location" <> 'segue rughetta verso calle del fontico';
SELECT STDDEV("Rent_Income") 
FROM catastici
WHERE "Property_Location" != 'segue rughetta verso calle del fontico';



225 - 45
Question: What is the number of households holding ownership of real estate located on "calle corrente dell'occa"?
Evidence: "Property_Location" = "calle corrente dell'occ

In [30]:
# Dump every row the judge labelled "NO": question, evidence, reference SQL
# and the generated SQL, with blank lines separating consecutive rows.
rejected_rows = query_res.loc[query_res['output'] == "NO"]
for idx, row in rejected_rows.iterrows():
    print(
        f"{idx} - {row['question_id']}",
        f"Question: {row['question']}",
        f"Evidence: {row['evidence']}",
        "",
        'True SQL:',
        row['true_query'],
        'Generated SQL:',
        row['generated_query'],
        '\n\n',
        sep='\n',
    )

6 - 1
Question: Can you tell me the aggregate amount of properties detailed within the dataset?
Evidence: nan

True SQL:
SELECT COUNT(*) AS Total_Properties
FROM catastici;
Generated SQL:

SELECT SUM("Rent_Income")
FROM catastici;



7 - 1
Question: What's the cumulative figure for properties found in the dataset?
Evidence: nan

True SQL:
SELECT COUNT(*) AS Total_Properties
FROM catastici;
Generated SQL:

SELECT SUM(Rent_Income)
FROM catastici;



10 - 2
Question: Are you able to list every property location included in the dataset?
Evidence: nan

True SQL:
SELECT DISTINCT "Property_Location" 
FROM catastici;
Generated SQL:

SELECT "Property_Location"
FROM catastici;



12 - 2
Question: Could you provide the names of all property locations that appear in the dataset?
Evidence: nan

True SQL:
SELECT DISTINCT "Property_Location" 
FROM catastici;
Generated SQL:

SELECT "Property_Location"
FROM catastici;



13 - 2
Question: Might you enumerate all the property locations contained in the d