In [1]:
from langchain.chains import create_sql_query_chain
from langchain_community.utilities import SQLDatabase

# Connect to the catastici SQLite database through LangChain's SQLDatabase wrapper.
DB_URI = "sqlite:///../data/catastici.db"
db = SQLDatabase.from_uri(DB_URI)

# Smoke test: show the dialect and the visible tables, then fetch one row
# from the catastici table to confirm the connection works.
print(db.dialect)
print(db.get_usable_table_names())
db.run("SELECT * FROM catastici LIMIT 1;")

sqlite
['catastici']


"[('liberal', 'campi', 'casa e bottega da barbier', 70, 'campo vicino alla chiesa')]"

In [2]:
import pandas as pd
# Load the evaluation set; later cells read its 'generated_query',
# 'true_answer', 'true_query', 'question', 'question_id' and 'level' columns.
TEST_DATA_PATH = './test_data_generated.csv'
query_res = pd.read_csv(TEST_DATA_PATH)

In [16]:
import re
import ast
def clean_query(sql_query):
    """Parse a stringified list of SQL candidates and keep each one's first statement.

    Args:
        sql_query: String containing a Python list literal of SQL strings;
            it is parsed with ``ast.literal_eval``.

    Returns:
        A list with one entry per candidate: the text that precedes the
        candidate's first ``;`` (the whole candidate when it has none),
        followed by a single ``;``.
    """
    candidates = ast.literal_eval(sql_query)

    first_statements = []
    for candidate in candidates:
        # partition yields everything before the first ';', or the entire
        # string when no ';' is present.
        head, _, _ = candidate.partition(';')
        first_statements.append(head + ';')
    return first_statements

def check_sql_executability(query, db):
    """Run ``query`` against ``db`` and report whether it executed.

    Args:
        query: SQL text to execute.
        db: Object exposing a ``run(query)`` method (here, the SQLDatabase
            created earlier in the notebook).

    Returns:
        Whatever ``db.run(query)`` returns when execution succeeds, or the
        sentinel string ``"ERROR"`` when it raises an ordinary exception.
    """
    try:
        return db.run(query)
    except Exception:
        # Only ordinary errors are mapped to the sentinel. KeyboardInterrupt
        # and SystemExit still propagate, so a long evaluation run can be
        # interrupted by the user.
        return "ERROR"

In [17]:
# Resolve each row's raw model output to one runnable query and its answer.
# NOTE: 'generated_query' is overwritten in place, so this cell is not
# idempotent -- a second run would feed the already-cleaned SQL back into
# clean_query. Reload query_res before running it again.
for idx, row in query_res.iterrows():
    query_list_clean = clean_query(row['generated_query'])
    final_out = None
    # Keep the first candidate that executes without error.
    for out in query_list_clean:
        answer = check_sql_executability(out, db)
        if answer != "ERROR":
            final_out = out
            break
    if final_out is None:
        # Nothing ran: store every candidate so the failure can be inspected.
        final_out = '\n'.join(query_list_clean)
        answer = "ERROR"
    query_res.loc[idx, 'generated_answer'] = answer
    query_res.loc[idx, 'generated_query'] = final_out

In [27]:
# Count rows whose query executed but whose answer differs from the ground truth.
executed_mask = query_res['generated_answer'] != 'ERROR'
mismatch_mask = query_res['generated_answer'] != query_res['true_answer']
query_res[executed_mask & mismatch_mask].shape

(246, 7)

In [21]:
# Count rows for which no generated query could be executed.
query_res.loc[query_res['generated_answer'].eq('ERROR')].shape

(18, 7)

# Check

- Wrong -> 246<br>
    - True -> 162
    - Wrong -> 84 <= 12 + [123, 166, 167, 169, 175-179, 186, 209, 213, 225, 227, 256-258, 266, 276-278, 281 (uppercased), 308, 330, 338, 345-346, 348-349, 351, 370-371, 375-377, 383, 387, 388, 408, 415-419, 422-423, 441, 455-459, 461-464, 476, 479-489, 495-499]
- Error -> 18<br>
- Exact Match -> 230
- True -> **392**

### Wrong Ground Truth
17(distinct), 20, 40, 75, 95, 121, 135, 170(answer is nan), 185(answer is nan), 240, 306, 311 (do sum), 390-393 (median), 417 (this specific question), 480 (tricky median)

### Ambiguous questions <br>
200 - 204, 85 - 89 (total property or single property?), 90-94, 375, 430, 433

### Both correct
- Question: What does the typical rental revenue look like for properties situated at "al ponte di san provolo"? -> TRUE (average), GEN (average by property type)
- What is the range of rent incomes in "calle de franchi"? -> TRUE (min, max), GEN (max-min)

### Wrong generation

Sometimes it adds an extra filter that was not asked for (e.g. a LIMIT to avoid a long answer) - 3, 8, 11, 12, 13, 14, 25, 35<br>
- It sometimes uses LIMIT 1 when the question is phrased in the singular
Sometimes it confuses the feature names, e.g. Rent_Income instead of Property_Type <br>
Example questions:
- Which properties have a rent income higher than 50 ducati? (limit 1)
- Question: What are the different classifications of property present in "teren alli gesouiti"? (uses "Property_Classification")
- How many properties are enumerated in the dataset? (uses "WHERE "Property_Type" = 'enumerazione'")
- Who is the owner earning the highest rental income from a property? (does sum instead of max)
- What is the count of unique locations where properties are situated? (does not count)
- What does the typical rental revenue amount to for each category of the "perina" "capello" properties? (it does "Property_Location" = 'perina')
- What is the total number of properties generating less than 30 ducati in rent? (does sum of rent instead of count)

### Limitations - Error
Non-existent SQL keywords, such as STDDEV: e.g. For each property location, what's the standard rental income? -> uses STDDEV instead of AVG (the question is also ambiguous)<br>
Statistics, i.e. median, standard deviation, variance, ... <br>
More than one argument in COUNT <br>
Non-existent feature names, when the question is ambiguous: e.g. How many real estate properties are on lease for over "38"? <br>
Example questions: 
- Could you provide the median leasing earnings across all assets in "al ponte di san provolo"? (uses PERCENTILE_CONT)
- Can you tell me the total number of different property owners present in the dataset? (>2 args in Count)
- For each property location, what's the standard rental income? (uses STDDEV)
- How many real estate properties are on lease for over "38"? (uses non-existing feature)
- Whose real estate assets are spread over the widest array of locations?
- What is the number of households possessing assets across multiple type categories?
- What is the average rent income variance across all locations? (wrong statistics)

In [None]:
# Dump every row whose query executed but whose answer differs from the
# ground truth, so each case can be reviewed by hand.
executed_mask = query_res['generated_answer'] != 'ERROR'
mismatch_mask = query_res['generated_answer'] != query_res['true_answer']
for idx, row in query_res[executed_mask & mismatch_mask].iterrows():
    # A single print with sep='\n' emits the same lines as one print per field.
    print(
        f"{row['level']} - {row['question_id']} - {idx}",
        f"Question: {row['question']}",
        f"Answer True: {row['true_answer']}",
        f"Answer Generated: {row['generated_answer']}",
        'True SQL:',
        row['true_query'],
        'Generated SQL:',
        row['generated_query'],
        '\n\n',
        sep='\n',
    )