In [1]:
from langchain_community.utilities import SQLDatabase

# Open the SQLite database file through LangChain's SQLDatabase wrapper.
db = SQLDatabase.from_uri("sqlite:///../data/catastici.db")

# Smoke test: print the SQL dialect and the usable table names, then fetch a
# single row from `catastici` (displayed as the cell's output).
print(db.dialect)
print(db.get_usable_table_names())
db.run("SELECT * FROM catastici LIMIT 1;")

sqlite
['catastici']


"[(0, 3183, 'liberal', 'campi', 'casa e bottega da barbier', 70, 'campo vicino alla chiesa')]"

In [2]:
from collections import Counter
import pandas as pd
import ast
import re
import warnings
# NOTE(review): this suppresses every Python warning raised after this point,
# including any pandas warnings triggered by later cells.
warnings.filterwarnings('ignore')

def clean_query(sql_query):
    """Parse a stringified list of SQL queries and normalise each entry.

    ``sql_query`` is the text representation of a Python list of query
    strings. For every entry, only the text before the first ``;`` is kept,
    surrounding whitespace is stripped and a single ``;`` is appended. An
    apostrophe that sits directly between two lowercase letters is then
    doubled (``l'o`` -> ``l''o``).

    Returns the list of cleaned query strings.
    """
    cleaned = []
    for raw in ast.literal_eval(sql_query):
        # keep only the first statement and terminate it explicitly
        statement = raw.partition(';')[0].strip() + ';'
        # double apostrophes embedded inside words
        cleaned.append(re.sub(r"([a-z])'([a-z])", r"\1''\2", statement))
    return cleaned

def check_sql_executability(query, db):
    """Execute ``query`` via ``db.run`` and return its result.

    If the call raises any ``Exception``, the exception's message is returned
    as a string instead of propagating.
    """
    try:
        outcome = db.run(query)
    except Exception as exc:
        outcome = str(exc)
    return outcome
     
def find_most_common_answer(answers):
    """Pick the winning execution result by majority vote.

    Every answer that contains the substring ``"error"`` is treated as a
    failed execution and counted under the sentinel ``"ERROR"``.

    Returns:
        The most frequent answer. When failures are the most frequent
        outcome, the runner-up (a non-failed answer) is returned instead, so a
        working query is preferred whenever one exists. ``"ERROR"`` is
        returned only when every answer failed or ``answers`` is empty. This
        holds for any number of candidates, not just four.
    """
    votes = Counter("ERROR" if "error" in answer else answer for answer in answers)
    # Top two outcomes; ties keep first-seen order.
    ranked = votes.most_common(2)

    if not ranked:
        # nothing was executed at all
        return "ERROR"

    top_answer = ranked[0][0]
    if top_answer != "ERROR":
        return top_answer

    # Failures lead the vote: fall back to the runner-up if there is one.
    if len(ranked) == 1:
        return "ERROR"
    return ranked[1][0]

def clean_answer(answer, to_replace = ['[', ']', '(', ',)', "'", ')']):
    """Strip list/tuple punctuation from an answer string and split it into items.

    Every occurrence of a token in ``to_replace`` is deleted from ``answer``;
    the remaining text is split on commas and each piece is whitespace-trimmed.

    Returns the list of trimmed pieces (a single empty string for empty input).
    """
    stripper = re.compile('|'.join(re.escape(token) for token in to_replace))
    pieces = stripper.sub('', answer).split(',')
    return [piece.strip() for piece in pieces]

def ngram_overlap(true_answer, generated_answer):
    """Fraction of distinct true items that also appear in the generated items.

    Both inputs are de-duplicated first. The result is rounded to three
    decimals; it is 0.0 when ``true_answer`` has no items.
    """
    reference = set(true_answer)
    candidate = set(generated_answer)

    if not reference:
        return 0.0

    shared = reference & candidate
    return round(len(shared) / len(reference), 3)

In [3]:
# import the dataset
query_res = pd.read_csv('./test_data_generated.csv')

# clean output: for every row, run each cleaned candidate query and keep the
# candidate whose result wins the vote
for idx, row in query_res.iterrows():
    query_list_clean = clean_query(row['generated_query'])
    final_out = None
    answers = []
    for out in query_list_clean:
        answers.append(check_sql_executability(out, db))
    answer = find_most_common_answer(answers)

    if answer != "ERROR":
        # first candidate that produced the winning result
        final_out = query_list_clean[answers.index(answer)]
    if final_out is None:
        # every candidate failed: keep the first one and store its own
        # execution result (the error message) as the answer
        final_out = query_list_clean[0]
        answer = check_sql_executability(final_out, db)
    query_res.loc[idx,'generated_answer'] = answer
    query_res.loc[idx,'generated_query'] = final_out

# Label rows: ERROR when the stored answer contains "error", EM when it equals
# the reference answer. EM is assigned last, so it wins if both apply; all
# other rows keep a missing `output`.
query_res.loc[(query_res['generated_answer'].str.contains("error")), 'output'] = 'ERROR'
query_res.loc[(query_res['generated_answer']==query_res['true_answer']), 'output'] = 'EM'
query_res['output'].value_counts()

output
EM       253
ERROR      9
Name: count, dtype: int64

In [4]:
# Rows that are neither an exact match nor an execution error.
# .copy() gives an independent frame, so the column assignments below do not
# operate on a slice of query_res.
query_wrong = query_res[query_res.output.isna()].copy()
# Assign the filled column back rather than calling fillna(inplace=True) on the
# column accessor: that form is chained assignment and is not guaranteed to
# update the frame, which would leave NaN values for clean_answer to choke on.
query_wrong['true_answer'] = query_wrong['true_answer'].fillna("")

# Tokenise both answers into lists of items.
query_wrong['generated_answer_clean'] = query_wrong['generated_answer'].apply(clean_answer)
query_wrong['true_answer_clean'] = query_wrong['true_answer'].apply(clean_answer)

# Share of the distinct true items that also occur in the generated answer.
n_gram = []
for _, row in query_wrong.iterrows():
    n_gram.append(ngram_overlap(row['true_answer_clean'], row['generated_answer_clean']))
query_wrong['n_gram_overlap'] = n_gram
# A row counts as matching when the overlap exceeds 0.33.
query_wrong['n_gram_matching'] = query_wrong['n_gram_overlap']>0.33

query_wrong['n_gram_matching'].value_counts()

n_gram_matching
False    131
True     107
Name: count, dtype: int64

# Check

In [5]:
# Re-label rows: 'ERROR' where the stored answer is exactly the string 'ERROR',
# then 'EM' where it equals the reference answer (EM is applied last, so it
# wins if both conditions hold).
# NOTE(review): the earlier labelling cell tests for the substring "error"
# rather than equality with 'ERROR' — confirm which rule is intended.
query_res.loc[(query_res['generated_answer']=='ERROR'), 'output'] = 'ERROR'
query_res.loc[(query_res['generated_answer']==query_res['true_answer']), 'output'] = 'EM'

In [6]:
# Number of rows per `output` label (rows with a missing label are not counted).
query_res['output'].value_counts()

output
EM       253
ERROR     13
Name: count, dtype: int64

In [7]:
# Optional: set the API key from inside the notebook. Left commented out so
# that no key is stored in the file.
# import os
# os.environ["OPENAI_API_KEY"] = ''

from openai import OpenAI
# NOTE(review): presumably the client reads OPENAI_API_KEY from the
# environment when no key is passed — confirm.
client = OpenAI()

In [8]:
# System prompt for the reference-aware judge: the model is told it will see
# the schema, the question, the true query and the generated query, and must
# reply YES or NO.
# NOTE(review): the text contains typos ("accapted", "and acceptable"); it is
# left untouched here because it is sent verbatim to the model.
system_prompt_1 = """You are an assistant that is an expert in assessing Sqlite SQL queries.
You are given a Database Schema, a question, a true SQL query that answers the question and generated SQL query to answer the question.
Compare the true and generated SQL queries and evaluate if the generated query answers the question and acceptable.
Note that true and generated SQL queries are not the same, but both may be accapted as long as they both answer the question.
Respond with YES if generated query is acceptable, or NO if generated query does not answer the question.
"""

# System prompt for the reference-free judge: the model sees only the schema,
# the question and one query, and must reply YES or NO.
# NOTE(review): contains the typo "asnwers"; left untouched because the text is
# sent verbatim to the model.
system_prompt_2 = """You are an assistant that is an expert in assessing Sqlite SQL queries.
You are given a Database Schema, a question and an SQL query to answer the question.
Look at the SQL query and assess if the query answers the question.
Respond with YES if the query asnwers the question directly or indirectly, or NO if generated query does not answer the question nor include the answer to the question.
"""

# User-message template for the reference-aware judge. It is filled with
# str.format, which expects the fields `question`, `true_query` and
# `generated_query`.
prompt_1 = """### Database Schema
CREATE TABLE [catastici]
(
    [Owner_ID] INT, -- Unique ID of each owner of the property
    [Owner_First_Name] NVARCHAR(30), -- First name of the owner of the property
    [Owner_Family_Name] NVARCHAR(30), -- Family name of the owner of the property
    [Property_Type] NVARCHAR(100), -- Specific type of the property given in Italian. For example, "casa", "bottega da barbier", "bottega da fruttariol".
    [Rent_Income] INT, -- Rent price of the property that the owner receives as income, given in Venice ancient gold coin ducato.
    [Property_Location] NVARCHAR(100) -- Ancient approximate toponym of the property given in Italian.
);

### Question
{question}

### True SQL query
{true_query}

### Generated SQL query
{generated_query}

### Response
"""

# User-message template for the reference-free judge. It is filled with
# str.format, which expects the fields `question` and `generated_query`; the
# true query is deliberately not part of this template.
prompt_2 = """### Database Schema
CREATE TABLE [catastici]
(
    [Owner_ID] INT, -- Unique ID of each owner of the property
    [Owner_First_Name] NVARCHAR(30), -- First name of the owner of the property
    [Owner_Family_Name] NVARCHAR(30), -- Family name of the owner of the property
    [Property_Type] NVARCHAR(100), -- Specific type of the property given in Italian. For example, "casa", "bottega da barbier", "bottega da fruttariol".
    [Rent_Income] INT, -- Rent price of the property that the owner receives as income, given in Venice ancient gold coin ducato.
    [Property_Location] NVARCHAR(100) -- Ancient approximate toponym of the property given in Italian.
);

### Question
{question}

### Generated SQL query
{generated_query}

### Response
"""

In [19]:
# Take one still-unlabelled row (position 20 among the rows whose `output` is
# missing) as a worked example and show its question and both queries.
example = query_res[query_res.output.isna()].iloc[20]
print(example['question'])
print(example['true_query'])
print(example['generated_query'])

Can you determine the quantity of properties categorized under "casa"?
SELECT COUNT("Property_Type") 
FROM catastici 
WHERE "Property_Type" = 'casa';
SELECT "Property_Type", COUNT(*) AS "Number_of_casa_properties"
FROM catastici
WHERE "Property_Type" = 'casa'
GROUP BY "Property_Type";


In [20]:
# Judge the example with the reference-aware prompt: the model receives the
# question, the true query and the generated query, and its reply is printed.
response = client.chat.completions.create(
  model="gpt-3.5-turbo",
  messages=[
    {"role": "system", "content": system_prompt_1},
    {"role": "user", "content": prompt_1.format(question=example['question'],true_query=example['true_query'],generated_query=example['generated_query'])}
  ]
)
print(response.choices[0].message.content)

NO


In [21]:
# Judge the same example with the reference-free prompt: the model receives
# only the question and the generated query, and its reply is printed.
response = client.chat.completions.create(
  model="gpt-3.5-turbo",
  messages=[
    {"role": "system", "content": system_prompt_2},
    {"role": "user", "content": prompt_2.format(question=example['question'],generated_query=example['generated_query'])}
  ]
)
print(response.choices[0].message.content)

YES


In [66]:
# Judge every still-unlabelled row with the reference-aware prompt, one API
# call per row. Replies are stored keyed by the row's index label in query_res.
responses_1 = {}
for idx, r in query_res[query_res.output.isna()].iterrows():
    response = client.chat.completions.create(
        model="gpt-4-turbo-preview",
        messages=[
            {"role": "system", "content": system_prompt_1},
            {"role": "user", "content": prompt_1.format(question=r['question'],true_query=r['true_query'],generated_query=r['generated_query'])}
        ]
    )
    # keep the raw reply text; it is not normalised or validated here
    responses_1[idx] = response.choices[0].message.content

In [22]:
# Judge every still-unlabelled row with the reference-free prompt, one API
# call per row. Replies are stored keyed by the row's index label in query_res.
responses_2 = {}
for idx, r in query_res[query_res.output.isna()].iterrows():
    response = client.chat.completions.create(
        model="gpt-4-turbo-preview",
        messages=[
            {"role": "system", "content": system_prompt_2},
            {"role": "user", "content": prompt_2.format(question=r['question'],generated_query=r['generated_query'])}
        ]
    )
    # keep the raw reply text; it is not normalised or validated here
    responses_2[idx] = response.choices[0].message.content

In [23]:
# Rebuild query_wrong from the still-unlabelled rows, dropping their
# (all-missing) `output` column.
query_wrong = query_res[query_res.output.isna()].drop('output',axis=1)

In [25]:
# Attach the judge replies to query_wrong by index label:
# `output_1` holds the prompt-1 replies, `output_2` the prompt-2 replies.
for k,v in responses_1.items():
    query_wrong.loc[k, 'output_1'] = v

for k,v in responses_2.items():
    query_wrong.loc[k, 'output_2'] = v

In [75]:
# Distribution of the prompt-1 (reference-aware) judge replies.
query_wrong.output_1.value_counts()

output_1
NO     159
YES     75
Name: count, dtype: int64

In [26]:
# Distribution of the prompt-2 (reference-free) judge replies.
query_wrong.output_2.value_counts()

output_2
NO     130
YES    104
Name: count, dtype: int64

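**Results** (500 questions in total)
- Not an exact match, judged by gpt-4-turbo-preview with prompt 2 -> 234<br>
    - Judged correct (YES) -> 104
    - Judged wrong (NO) -> 130
- Execution error -> 13<br>
- Exact Match -> 253
- Total correct (Exact Match + judged correct) -> **357**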

In [32]:
# Copy the prompt-2 judge replies into query_res['output'] for the judged rows,
# so the column holds the judge's reply next to the existing EM/ERROR labels.
for k,v in responses_2.items():
    query_res.loc[k, 'output'] = v

In [35]:
# A row counts as solved when it is an exact match or the judge replied YES.
query_res['output_binary'] = query_res['output'].isin(['EM', 'YES'])

In [36]:
# Share of solved rows (mean of the boolean flag) for each value of `level_len`.
query_res.groupby('level_len')['output_binary'].mean()

level_len
0    0.7775
1    0.4600
Name: output_binary, dtype: float64

In [37]:
# Share of solved rows (mean of the boolean flag) for each value of `level_nest`.
query_res.groupby('level_nest')['output_binary'].mean()

level_nest
0    0.789873
1    0.428571
Name: output_binary, dtype: float64

# Examples

In [30]:
# Spot-check five rows the prompt-2 judge rejected (NO).
# random_state pins the sample so a fresh Restart & Run All shows the same rows.
for idx, row in query_wrong[query_wrong.output_2 == "NO"].sample(5, random_state=0).iterrows():
    print(f"{idx} - {row['question_id']}")
    print(row['output_2'])
    print(f"Question: {row['question']}")
    print(f"Evidence: {row['evidence']}")
    print()
    print('True SQL:')
    print(row['true_query'])
    print('Generated SQL:')
    print(row['generated_query'])
    print('\n\n')

397 - 79
NO
Question: What percentage of the dataset does each property type represent?
Evidence: nan

True SQL:
SELECT "Property_Type", COUNT("Property_Type") AS Property_Count, (COUNT("Property_Type") * 100.0 / (SELECT COUNT(*) FROM catastici)) AS Proportion
FROM catastici
GROUP BY "Property_Type";
Generated SQL:

SELECT "Property_Type", COUNT(*) AS num_properties
FROM catastici
GROUP BY "Property_Type";



185 - 37
NO
Question: Who among the property owners earns the most from a single property in terms of rent income?
Evidence: nan

True SQL:
SELECT "Owner_First_Name", "Owner_Family_Name", MAX("Rent_Income") AS "Max_Rent_Income"
FROM catastici;
Generated SQL:

SELECT "Owner_First_Name", "Owner_Family_Name", SUM("Rent_Income") AS Total_Rent_Income
FROM catastici
GROUP BY "Owner_First_Name", "Owner_Family_Name"
ORDER BY Total_Rent_Income DESC
LIMIT 1;



266 - 53
NO
Question: Can you enumerate the different kinds of properties that are distinct to a particular area?
Evidence: nan

Tr

In [31]:
# Spot-check five rows the prompt-2 judge accepted (YES).
# random_state pins the sample so a fresh Restart & Run All shows the same rows.
for idx, row in query_wrong[query_wrong.output_2 == "YES"].sample(5, random_state=0).iterrows():
    print(f"{idx} - {row['question_id']}")
    print(row['output_2'])
    print(f"Question: {row['question']}")
    print(f"Evidence: {row['evidence']}")
    print()
    print('True SQL:')
    print(row['true_query'])
    print('Generated SQL:')
    print(row['generated_query'])
    print('\n\n')

200 - 40
YES
Question: What's the interval of rental earnings found on "calle de franchi"?
Evidence: "Property_Location" = "calle de franchi"

True SQL:
SELECT MIN("Rent_Income"), MAX("Rent_Income") 
FROM catastici 
WHERE "Property_Location" = 'calle de franchi';
Generated SQL:
SELECT MAX("Rent_Income") - MIN("Rent_Income") 
FROM catastici 
WHERE "Property_Location" = 'calle de franchi';



51 - 10
YES
Question: Who are the entities listed as owners of properties?
Evidence: nan

True SQL:
SELECT DISTINCT "Owner_First_Name", "Owner_Family_Name" 
FROM catastici
Generated SQL:

SELECT "Owner_First_Name", "Owner_Family_Name"
FROM catastici
GROUP BY "Owner_First_Name", "Owner_Family_Name"
ORDER BY "Owner_First_Name", "Owner_Family_Name";



324 - 64
YES
Question: Are there any properties with a rent income lower than 60 ducati?
Evidence: nan

True SQL:
SELECT "Owner_First_Name", "Owner_Family_Name", "Property_Type", "Rent_Income", "Property_Location" 
FROM catastici 
WHERE "Rent_Income" < 6