# SQLDatabaseSequentialChain - Mondial FK - GPT 3.5

In [1]:
!pip install langchain_experimental




[notice] A new release of pip is available: 23.2.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
from langchain.sql_database import SQLDatabase
from langchain.chains import LLMChain
from langchain_experimental.sql.base import SQLDatabaseSequentialChain
from langchain.chat_models import ChatOpenAI
from urllib.parse import quote  
from langchain.prompts.prompt import PromptTemplate
from langchain.callbacks import get_openai_callback
import time
from dotenv import load_dotenv
import os
import sys
import json
load_dotenv()

# Make the shared helper modules importable: they live in the `functions`
# folder two directory levels above the notebook's working directory.
path = os.path.abspath('')
# Join the components separately so no path separator is hard-coded
# (the former '..\..' literal was Windows-only and an invalid escape sequence).
module_path = os.path.join(path, '..', '..')
print(module_path)
functions_path = os.path.join(module_path, 'functions')
# Test the exact entry that gets appended so re-running the cell does not
# keep adding duplicates to sys.path.
if functions_path not in sys.path:
    sys.path.append(functions_path)


from sqldatabase_langchain_utils import SQLDatabaseLangchainUtils

# Schema

In [3]:
# Schema name; used to build the connection-file path, the names of the
# excluded helper tables and the results file name.
SCHEMA = 'mondial_gpt'
# Dataset prefix; used to locate the questions file and to drop tables whose
# names start with it.
PREFIX = 'mondial'
# JSON file that save_queries() writes and read_queries() reads.
FILE_NAME_RESULT = f"results/12_sql_sequential_queries_chatgpt_{SCHEMA}_fk_instances.json"


In [4]:
def save_queries(queries):
    """Persist the query records to ``FILE_NAME_RESULT`` as indented JSON.

    The records are wrapped as ``{"queries": [...]}``, which is the layout
    ``read_queries`` unwraps.

    Args:
        queries: JSON-serializable list of query records.
    """
    # Create the output folder on first use so a fresh checkout does not
    # fail with FileNotFoundError.
    result_dir = os.path.dirname(FILE_NAME_RESULT)
    if result_dir:
        os.makedirs(result_dir, exist_ok=True)

    data = {"queries": queries}
    # Explicit UTF-8 matches the encoding read_queries() uses and removes the
    # dependency on the platform's default encoding.
    with open(FILE_NAME_RESULT, "w", encoding="utf-8") as arquivo_json:
        json.dump(data, arquivo_json, indent=4)
        
def read_queries():
    """Load the saved query records from ``FILE_NAME_RESULT``.

    Returns:
        The value stored under the ``"queries"`` key of the JSON document.
    """
    with open(FILE_NAME_RESULT, encoding='utf-8', errors='ignore') as result_file:
        payload = json.load(result_file, strict=False)
    return payload["queries"]


## Conexão

In [None]:
# Connection settings for the selected schema are kept in a JSON file.
json_file_path = f"../../datasets/{SCHEMA}_db_connection.json"
# errors='ignore' drops undecodable bytes; strict=False lets the JSON parser
# accept control characters inside strings.
with open(json_file_path, encoding='utf-8', errors='ignore') as json_data:
    db_connection = json.load(json_data, strict=False)

# NOTE(review): displaying the connection settings may store credentials in
# the saved notebook output — confirm the file holds no secrets.
db_connection

### Utilizando o SQLDatabase para pegar todas as informações do database

In [6]:
# First instance, created without a table filter, is only used to list the
# table names available through this connection.
db = SQLDatabaseLangchainUtils(db_connection=db_connection)

# Tables that must not be exposed to the chain: schema-prefixed helper tables
# followed by a few test tables.
exclusao = [
    f"{SCHEMA}_{suffix}"
    for suffix in (
        "tmdp",
        "tmdpmap",
        "tmds",
        "tmjmap",
        "tpv",
        "tmdc",
        "tmdcmap",
        "tmdej",
        "log_action",
        "log_error",
        "favorite_item",
        "favorite_query",
        "favorite_tag",
        "favorite_tag_item",
        "favorite_visualization",
        "dashboard",
        "history",
    )
] + [
    "teste_cliente",
    "teste_fornecedor",
    "teste_funcionario",
]

# Keep a table only when its name neither starts with PREFIX nor appears in
# the exclusion list.
include_tables = [
    table_name
    for table_name in db.get_table_names()
    if not (table_name.startswith(PREFIX) or table_name in exclusao)
]
# Rebuild the wrapper restricted to the selected tables.
# samples=2 presumably sets the number of sample rows per table — TODO confirm.
db = SQLDatabaseLangchainUtils(
    db_connection=db_connection,
    include_tables=include_tables,
    samples=2,
)
db.get_table_names()

  self._metadata.reflect(
  self._metadata.reflect(
  self._metadata.reflect(


['airport',
 'borders',
 'city',
 'citylocalname',
 'cityothername',
 'citypops',
 'continent',
 'country',
 'countrylocalname',
 'countryothername',
 'countrypops',
 'desert',
 'economy',
 'encompasses',
 'ethnicgroup',
 'geo_desert',
 'geo_estuary',
 'geo_island',
 'geo_lake',
 'geo_mountain',
 'geo_river',
 'geo_sea',
 'geo_source',
 'island',
 'islandin',
 'ismember',
 'lake',
 'lakeonisland',
 'language',
 'located',
 'locatedon',
 'mergeswith',
 'mountain',
 'mountainonisland',
 'organization',
 'politics',
 'population',
 'province',
 'provincelocalname',
 'provinceothername',
 'provpops',
 'religion',
 'river',
 'riveronisland',
 'riverthrough',
 'sea']

In [7]:
# Number of tables exposed by the filtered database wrapper.
len(db.get_table_names())

46

## Criando o SQLChainPrompt especifico para o SQLDatabaseSequential

In [8]:
# Read the prompt template from disk. The context manager closes the file even
# if reading fails, and the encoding arguments mirror the other file reads in
# this notebook instead of relying on the platform default.
with open(
    "prompts/sql_chain_prompt_template.txt",
    "r",
    encoding="utf-8",
    errors="ignore",
) as prompt_file:
    prompt_template = prompt_file.read()

# The template is expected to use the three placeholders declared here.
QUERY_PROMPT = PromptTemplate(
    input_variables=["input", "table_info", "top_k"], template=prompt_template
)

print(QUERY_PROMPT)

input_variables=['input', 'table_info', 'top_k'] output_parser=None partial_variables={} template="Given an input question, first create a syntactically correct Oracle SQL query to run, then look at the results of the query and return the answer. Unless the user specifies in his question a specific number of examples he wishes to obtain, don't limits your query to at most {top_k} results. You can order the results by a relevant column to return the most interesting examples in the database.\n\nNever query for all the columns from a specific table, only ask for a the few relevant columns given the question.\n\nPay attention to use only the column names that you can see in the schema description. Be careful to not query for columns that do not exist. Also, pay attention to which column is in which table.\n\nUse the following format:\n\nQuestion: Question here\nSQLQuery: SQL Query to run\nSQLResult: Result of the SQLQuery\nAnswer: Final answer here\n\nOnly use the following tables:\n{tabl

## Criando o Chain para gerar SQL

In [None]:
# Chat model that generates the SQL; temperature=0 keeps sampling randomness
# to a minimum.
llm = ChatOpenAI(
    temperature=0,
    model_name='gpt-3.5-turbo',
    verbose=True,
)

# Sequential SQL chain built on the filtered database and the custom query
# prompt. The loops below store the 'result' entry of its output as the SQL.
db_chain = SQLDatabaseSequentialChain.from_llm(
    llm,
    db.db,
    query_prompt=QUERY_PROMPT,
    verbose=True,
    use_query_checker=True,
    return_intermediate_steps=False,
    return_sql=True,
)
db_chain

## Preparando as consultas em linguagem natural para rodar no LLM

In [10]:

# Load the natural-language questions of the selected dataset.
json_file_path = f"../../datasets/{PREFIX}/queries_{PREFIX}.json"
with open(json_file_path, encoding='utf-8', errors='ignore') as json_data:
    questions_document = json.load(json_data, strict=False)
# The records live under the top-level "queries" key.
queries = questions_document['queries']
queries

[{'id': '1',
  'question': 'What is the area of Thailand?',
  'query_string': '',
  'type': 'simple'},
 {'id': '2',
  'question': 'What are the provinces with an area greater than 10000?',
  'query_string': '',
  'type': 'simple'},
 {'id': '3',
  'question': 'What are the languages spoken in Poland?',
  'query_string': '',
  'type': 'medium'},
 {'id': '4',
  'question': 'How deep is Lake Kariba?',
  'query_string': '',
  'type': 'simple'},
 {'id': '5',
  'question': 'What is the total of provinces of Netherlands?',
  'query_string': '',
  'type': 'complex'},
 {'id': '6',
  'question': 'What is the percentage of religious people are hindu in thailand?',
  'query_string': '',
  'type': 'complex'},
 {'id': '7',
  'question': 'List the number of provinces each river flows through.',
  'query_string': '',
  'type': 'medium'},
 {'id': '8',
  'question': 'Find all countries that became independent between 8/1/1910 and 8/1/1950.',
  'query_string': '',
  'type': 'complex'},
 {'id': '9',
  'que

# Rodando as consultas no LLM para gerar SQL

In [12]:

# After every `number_of_queries_to_delay` questions, pause for `delay_seconds`
# seconds to avoid being blocked by the API.
number_of_queries_to_delay = 20
delay_seconds = 5
for count, instance in enumerate(queries):
    # Pause before question 21, 41, ... (never before the first one).
    if count and count % number_of_queries_to_delay == 0:
        time.sleep(delay_seconds)

    # The callback collects token usage and cost for the calls made inside it.
    with get_openai_callback() as cb:
        # perf_counter is monotonic, so the elapsed time cannot be distorted
        # by system clock adjustments.
        start_time = time.perf_counter()
        sql_query = db_chain({
            "query": instance["question"]
        })
        end_time = time.perf_counter()
        instance["query_string"] = sql_query['result']
        instance['total_tokens'] = cb.total_tokens
        instance['prompt_tokens'] = cb.prompt_tokens
        instance['completion_tokens'] = cb.completion_tokens
        instance['total_cost'] = cb.total_cost
        instance['time'] = end_time - start_time
        print(instance['id'], instance['question'], instance["query_string"], instance['time'], instance['total_cost'])
    # Save after every question so an interrupted run keeps its progress.
    save_queries(queries)

queries



[1m> Entering new SQLDatabaseSequentialChain chain...[0m




Table names to use:
[33;1m[1;3m['country'][0m

[1m> Entering new SQLDatabaseChain chain...[0m
What is the area of Thailand?
SQLQuery:

  for tbl in self._metadata.sorted_tables



[1m> Finished chain.[0m

[1m> Finished chain.[0m
1 What is the area of Thailand? SELECT area FROM country WHERE name = 'Thailand' 1.6058058738708496 0.0008445


[1m> Entering new SQLDatabaseSequentialChain chain...[0m
Table names to use:
[33;1m[1;3m['province'][0m

[1m> Entering new SQLDatabaseChain chain...[0m
What are the provinces with an area greater than 10000?
SQLQuery:
[1m> Finished chain.[0m

[1m> Finished chain.[0m
2 What are the provinces with an area greater than 10000? SELECT name, country, area
FROM province
WHERE area > 10000 1.5946290493011475 0.000898


[1m> Entering new SQLDatabaseSequentialChain chain...[0m
Table names to use:
[33;1m[1;3m['country', 'language'][0m

[1m> Entering new SQLDatabaseChain chain...[0m
What are the languages spoken in Poland?
SQLQuery:
[1m> Finished chain.[0m

[1m> Finished chain.[0m
3 What are the languages spoken in Poland? SELECT name
FROM language
WHERE country = 'PL' 1.3679816722869873 0.0010185


[1m> Enterin

[{'id': '1',
  'question': 'What is the area of Thailand?',
  'query_string': "SELECT area FROM country WHERE name = 'Thailand'",
  'type': 'simple',
  'total_tokens': 559,
  'prompt_tokens': 547,
  'completion_tokens': 12,
  'total_cost': 0.0008445,
  'time': 1.6058058738708496},
 {'id': '2',
  'question': 'What are the provinces with an area greater than 10000?',
  'query_string': 'SELECT name, country, area\nFROM province\nWHERE area > 10000',
  'type': 'simple',
  'total_tokens': 593,
  'prompt_tokens': 576,
  'completion_tokens': 17,
  'total_cost': 0.000898,
  'time': 1.5946290493011475},
 {'id': '3',
  'question': 'What are the languages spoken in Poland?',
  'query_string': "SELECT name\nFROM language\nWHERE country = 'PL'",
  'type': 'medium',
  'total_tokens': 674,
  'prompt_tokens': 659,
  'completion_tokens': 15,
  'total_cost': 0.0010185,
  'time': 1.3679816722869873},
 {'id': '4',
  'question': 'How deep is Lake Kariba?',
  'query_string': "SELECT depth\nFROM lake\nWHERE 

In [13]:
# Reload the records stored in FILE_NAME_RESULT and display them.
queries = read_queries()
queries

[{'id': '1',
  'question': 'What is the area of Thailand?',
  'query_string': "SELECT area FROM country WHERE name = 'Thailand'",
  'type': 'simple',
  'total_tokens': 559,
  'prompt_tokens': 547,
  'completion_tokens': 12,
  'total_cost': 0.0008445,
  'time': 1.6058058738708496},
 {'id': '2',
  'question': 'What are the provinces with an area greater than 10000?',
  'query_string': 'SELECT name, country, area\nFROM province\nWHERE area > 10000',
  'type': 'simple',
  'total_tokens': 593,
  'prompt_tokens': 576,
  'completion_tokens': 17,
  'total_cost': 0.000898,
  'time': 1.5946290493011475},
 {'id': '3',
  'question': 'What are the languages spoken in Poland?',
  'query_string': "SELECT name\nFROM language\nWHERE country = 'PL'",
  'type': 'medium',
  'total_tokens': 674,
  'prompt_tokens': 659,
  'completion_tokens': 15,
  'total_cost': 0.0010185,
  'time': 1.3679816722869873},
 {'id': '4',
  'question': 'How deep is Lake Kariba?',
  'query_string': "SELECT depth\nFROM lake\nWHERE 

#### Fixing queries

In [11]:
# Zero-based positions in `queries` of the questions to run again
# (position 40 holds the question whose id is 41).
to_fix = [40,59,62,72,85,99]
# Load the persisted results once; they are written back after each fix, so
# re-reading the file on every iteration would only return the same data.
q = read_queries()
for pos in to_fix:
    instance = queries[pos]
    # The callback collects token usage and cost for the calls made inside it.
    with get_openai_callback() as cb:
        # perf_counter is monotonic, so the elapsed time cannot be distorted
        # by system clock adjustments.
        start_time = time.perf_counter()
        sql_query = db_chain({
            "query": instance["question"]
        })
        end_time = time.perf_counter()
        instance["query_string"] = sql_query['result']
        instance['total_tokens'] = cb.total_tokens
        instance['prompt_tokens'] = cb.prompt_tokens
        instance['completion_tokens'] = cb.completion_tokens
        instance['total_cost'] = cb.total_cost
        instance['time'] = end_time - start_time
    # Replace the stored record and save right away so each fix is kept even
    # if a later question fails.
    q[pos] = instance
    print(instance['id'], instance['question'], instance["query_string"], instance['time'], instance['total_cost'])
    save_queries(q)



[1m> Entering new SQLDatabaseSequentialChain chain...[0m




Table names to use:
[33;1m[1;3m['borders', 'country', 'geo_sea'][0m

[1m> Entering new SQLDatabaseChain chain...[0m
How many countries that are close to the Mediterranean Sea?
SQLQuery:

  for tbl in self._metadata.sorted_tables



[1m> Finished chain.[0m

[1m> Finished chain.[0m
41 How many countries that are close to the Mediterranean Sea? SELECT COUNT(DISTINCT country.code)
FROM country
JOIN borders ON country.code = borders.country1 OR country.code = borders.country2
JOIN geo_sea ON country.code = geo_sea.country
WHERE geo_sea.sea = 'Mediterranean Sea' 2.097057342529297 0.0013845


[1m> Entering new SQLDatabaseSequentialChain chain...[0m
Table names to use:
[33;1m[1;3m['city', 'organization'][0m

[1m> Entering new SQLDatabaseChain chain...[0m
 List the names of capital cities which are the base for organizations in alphabetical order
SQLQuery:
[1m> Finished chain.[0m

[1m> Finished chain.[0m
60  List the names of capital cities which are the base for organizations in alphabetical order SELECT DISTINCT c.name
FROM city c
JOIN organization o ON c.name = o.city
ORDER BY c.name ASC 1.7531583309173584 0.0013445


[1m> Entering new SQLDatabaseSequentialChain chain...[0m
Table names to use:
[33;1m