# Experimentos com o Mondial-Foreign Key utilizando o SQLAgent no GPT 3.5

In [None]:

from langchain.sql_database import SQLDatabase
from langchain.chat_models import ChatOpenAI
from urllib.parse import quote  

from custom_sql_toolkit.custom_sql_database_toolkit import CustomSQLDatabaseToolkit
from custom_sql_toolkit.prompt import (
    CUSTOM_SQL_PREFIX,
    CUSTOM_FORMAT_INSTRUCTIONS,    
)
from langchain.callbacks import get_openai_callback
from langchain.agents import create_sql_agent

from langchain.agents.agent_types import AgentType

from dotenv import load_dotenv
import os
import sys
import json
import time

load_dotenv()

# Make the "functions" folder, located two levels above the notebook's working
# directory, importable.
path = os.path.abspath('')
# Join the components separately instead of embedding "\" separators: the
# original '..\..' literal contained an invalid escape sequence and only
# formed a valid path on Windows.
module_path = os.path.join(path, '..', '..')
print(module_path)
functions_path = os.path.join(module_path, 'functions')
# Test the exact entry that gets appended so that re-running the cell does not
# add duplicates to sys.path.
if functions_path not in sys.path:
    sys.path.append(functions_path)

from sqldatabase_langchain_utils import SQLDatabaseLangchainUtils



# Schema

In [2]:
# Schema identifier: selects the connection file and prefixes the excluded table names.
SCHEMA = 'mondial_gpt'
# Dataset prefix: selects the questions file; tables whose names start with it are filtered out.
PREFIX = 'mondial'
# JSON file used by save_queries() / read_queries() to persist the generated queries.
FILE_NAME_RESULT = f"results/7_sql_agent_chatgpt_{SCHEMA}_fk.json"

Funções para salvar e ler as consultas geradas

In [3]:
def save_queries(queries):
    """Write the queries list to FILE_NAME_RESULT as indented JSON.

    The list is stored under the top-level "queries" key, which is the
    layout read_queries() expects.

    Args:
        queries: JSON-serializable list of query records.
    """
    # Create the output folder on first use; otherwise open() raises
    # FileNotFoundError when the folder does not exist yet.
    result_dir = os.path.dirname(FILE_NAME_RESULT)
    if result_dir:
        os.makedirs(result_dir, exist_ok=True)
    data = {"queries": queries}
    # Explicit UTF-8 so the file is written with the encoding read_queries() uses.
    with open(FILE_NAME_RESULT, "w", encoding="utf-8") as arquivo_json:
        json.dump(data, arquivo_json, indent=4)

def read_queries():
    """Return the list stored under "queries" in the FILE_NAME_RESULT JSON file."""
    with open(FILE_NAME_RESULT, encoding='utf-8', errors='ignore') as result_file:
        payload = json.load(result_file, strict=False)
    return payload["queries"]

## Conexão

In [None]:
# Load the database connection settings for the selected schema.
json_file_path = f"../../datasets/{SCHEMA}_db_connection.json"
with open(json_file_path, encoding='utf-8', errors='ignore') as json_data:
    db_connection = json.load(json_data, strict=False)

# Display only the setting names: echoing the full object would store every
# value it holds in the saved notebook output.
# NOTE(review): assumes the connection file contains sensitive values
# (e.g. credentials) — confirm.
sorted(db_connection) if isinstance(db_connection, dict) else type(db_connection).__name__

### Utilizando o SQLDatabase para pegar todas as informações do database

In [5]:
# First connection without filters, used only to discover every table name.
db = SQLDatabaseLangchainUtils(db_connection=db_connection)

# Table names that must never be exposed to the agent.
exclusao = [
    f"{SCHEMA}_tmdp",
    f"{SCHEMA}_tmdpmap",
    f"{SCHEMA}_tmds",
    f"{SCHEMA}_tmjmap",
    f"{SCHEMA}_tpv",
    f"{SCHEMA}_tmdc",
    f"{SCHEMA}_tmdcmap",
    f"{SCHEMA}_tmdej",
    f"{SCHEMA}_log_action",
    f"{SCHEMA}_log_error",
    f"{SCHEMA}_favorite_item",
    f"{SCHEMA}_favorite_query",
    f"{SCHEMA}_favorite_tag",
    f"{SCHEMA}_favorite_tag_item",
    f"{SCHEMA}_favorite_visualization",
    f"{SCHEMA}_dashboard",
    f"{SCHEMA}_history",
    "teste_cliente",
    "teste_fornecedor",
    "teste_funcionario",
]

# Keep a table only when it neither starts with PREFIX nor is explicitly excluded.
all_table_names = db.get_table_names()
include_tables = [
    table_name
    for table_name in all_table_names
    if not (table_name.startswith(PREFIX) or table_name in exclusao)
]

# Second connection, restricted to the tables selected above.
db = SQLDatabaseLangchainUtils(
    db_connection=db_connection,
    include_tables=include_tables,
)
db.get_table_names()

  self._metadata.reflect(
  self._metadata.reflect(
  self._metadata.reflect(


['airport',
 'borders',
 'city',
 'citylocalname',
 'cityothername',
 'citypops',
 'continent',
 'country',
 'countrylocalname',
 'countryothername',
 'countrypops',
 'desert',
 'economy',
 'encompasses',
 'ethnicgroup',
 'geo_desert',
 'geo_estuary',
 'geo_island',
 'geo_lake',
 'geo_mountain',
 'geo_river',
 'geo_sea',
 'geo_source',
 'island',
 'islandin',
 'ismember',
 'lake',
 'lakeonisland',
 'language',
 'located',
 'locatedon',
 'mergeswith',
 'mountain',
 'mountainonisland',
 'organization',
 'politics',
 'population',
 'province',
 'provincelocalname',
 'provinceothername',
 'provpops',
 'religion',
 'river',
 'riveronisland',
 'riverthrough',
 'sea']

In [6]:
# Number of tables visible through the filtered connection.
len(db.get_table_names())

46

In [7]:
# Prompt pieces imported from custom_sql_toolkit.prompt, later passed to create_sql_agent.
# NOTE(review): "cusmom" looks like a typo for "custom"; the name is kept
# because the agent-creation cell references it as spelled.
cusmom_agent_prefix_prompt = CUSTOM_SQL_PREFIX
custom_format_instructions = CUSTOM_FORMAT_INSTRUCTIONS

In [8]:
# Show the format-instructions template that is handed to the agent.
print(custom_format_instructions)

Use the following format:

Question: the input question you must answer
Thought: you should always think about what to do
Action: the action to take, should be one of [{tool_names}]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I now know the SQL QUERY generated 
Final Answer: the SQL QUERY generated 


## Criando o Chain para gerar SQL

In [9]:
# Chat model shared by the toolkit and the agent.
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0,
    verbose=True,
)

# Toolkit built on the underlying database object of the filtered connection.
toolkit = CustomSQLDatabaseToolkit(llm=llm, db=db.db)

# ReAct-style SQL agent configured with the custom prefix and format instructions.
agent_executor = create_sql_agent(
    llm=llm,
    toolkit=toolkit,
    agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    prefix=cusmom_agent_prefix_prompt,
    format_instructions=custom_format_instructions,
    verbose=True,
)


In [10]:
# Smoke test: run one question and report the usage counters collected by the callback.
with get_openai_callback() as cb:
    sql_query = agent_executor({"input": "What is the area of Thailand?"})
    for usage_field in ("total_tokens", "prompt_tokens", "completion_tokens", "total_cost"):
        print(getattr(cb, usage_field))
# The agent's final answer is stored under the 'output' key of the response.
print(sql_query['output'])



[1m> Entering new AgentExecutor chain...[0m


[32;1m[1;3mAction: sql_db_list_tables
Action Input: ""[0m
Observation: [38;5;200m[1;3mairport, borders, city, citylocalname, cityothername, citypops, continent, country, countrylocalname, countryothername, countrypops, desert, economy, encompasses, ethnicgroup, geo_desert, geo_estuary, geo_island, geo_lake, geo_mountain, geo_river, geo_sea, geo_source, island, islandin, ismember, lake, lakeonisland, language, located, locatedon, mergeswith, mountain, mountainonisland, organization, politics, population, province, provincelocalname, provinceothername, provpops, religion, river, riveronisland, riverthrough, sea[0m
Thought:[32;1m[1;3mI should query the schema of the country table to see if it contains information about the area of Thailand.
Action: sql_db_schema
Action Input: "country"[0m
Observation: [33;1m[1;3m
CREATE TABLE country (
	name VARCHAR(50 CHAR) NOT NULL, 
	code VARCHAR(4 CHAR) NOT NULL, 
	capital VARCHAR(50 CHAR), 
	province VARCHAR(50 CHAR), 
	area NUMBER, 
	popu

  for tbl in self._metadata.sorted_tables


[32;1m[1;3mThe country table contains the area of each country, including Thailand.
Action: custom_sql_db_query
Action Input: "SELECT area FROM country WHERE name = 'Thailand'"[0m
Observation: [36;1m[1;3mSELECT area FROM country WHERE name = 'Thailand'[0m
Thought:[32;1m[1;3mI now know the SQL QUERY generated.
Final Answer: SELECT area FROM country WHERE name = 'Thailand'[0m

[1m> Finished chain.[0m
2946
2845
101
0.0044695
SELECT area FROM country WHERE name = 'Thailand'


Total tokens: **3140**<br>
Prompt tokens: **2995**<br>
Completion tokens: **145**<br>
Total cost: **0.0047825**<br>

## Preparando as consultas em linguagem natural para rodar no LLM

In [11]:

# Read the natural-language questions for this dataset.
json_file_path = f"../../datasets/{PREFIX}/queries_{PREFIX}.json"
with open(json_file_path, encoding='utf-8', errors='ignore') as questions_file:
    questions_payload = json.load(questions_file, strict=False)
# Keep only the list of question records stored under 'queries'.
queries = questions_payload['queries']
queries

[{'id': '1',
  'question': 'What is the area of Thailand?',
  'query_string': '',
  'type': 'simple'},
 {'id': '2',
  'question': 'What are the provinces with an area greater than 10000?',
  'query_string': '',
  'type': 'simple'},
 {'id': '3',
  'question': 'What are the languages spoken in Poland?',
  'query_string': '',
  'type': 'medium'},
 {'id': '4',
  'question': 'How deep is Lake Kariba?',
  'query_string': '',
  'type': 'simple'},
 {'id': '5',
  'question': 'What is the total of provinces of Netherlands?',
  'query_string': '',
  'type': 'complex'},
 {'id': '6',
  'question': 'What is the percentage of religious people are hindu in thailand?',
  'query_string': '',
  'type': 'complex'},
 {'id': '7',
  'question': 'List the number of provinces each river flows through.',
  'query_string': '',
  'type': 'medium'},
 {'id': '8',
  'question': 'Find all countries that became independent between 8/1/1910 and 8/1/1950.',
  'query_string': '',
  'type': 'complex'},
 {'id': '9',
  'que

# Rodando as consultas no LLM para gerar SQL

In [11]:

# Generate one SQL query per question.
# After every `number_of_queries_to_delay` questions, pause for 20 seconds to
# avoid being blocked by the API.
number_of_queries_to_delay = 2
count = 0
# Positions (indexes into `queries`) that did not yield a usable SQL query.
errors = []
for position, instance in enumerate(queries):
    if count == number_of_queries_to_delay:
        count = 0
        time.sleep(20)
    try:
        with get_openai_callback() as cb:
            start_time = time.time()
            response = agent_executor({"input": instance["question"]})
            end_time = time.time()
            sql_query = response['output']
            if "Agent stopped" in sql_query:
                # The output is a stop message rather than SQL; mark it for retry.
                errors.append(position)
            else:
                instance["query_string"] = sql_query
                instance['total_tokens'] = cb.total_tokens
                instance['prompt_tokens'] = cb.prompt_tokens
                instance['completion_tokens'] = cb.completion_tokens
                instance['total_cost'] = cb.total_cost
                instance['time'] = end_time - start_time
                print(instance['id'], instance['question'], instance["query_string"], instance['time'], instance['total_cost'])
                # Persist after every success so partial progress survives a crash.
                save_queries(queries)
    except Exception as exc:
        # Catch Exception (not a bare except) so Ctrl+C / kernel interrupt still
        # stops the loop, and report the cause instead of hiding it.
        errors.append(position)
        print(f"Error at position {position} (id {instance.get('id')}): {exc!r}")
    count += 1
queries



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mAction: sql_db_list_tables
Action Input: ""[0m
Observation: [38;5;200m[1;3mairport, borders, city, citylocalname, cityothername, citypops, continent, country, countrylocalname, countryothername, countrypops, desert, economy, encompasses, ethnicgroup, geo_desert, geo_estuary, geo_island, geo_lake, geo_mountain, geo_river, geo_sea, geo_source, island, islandin, ismember, lake, lakeonisland, language, located, locatedon, mergeswith, mountain, mountainonisland, organization, politics, population, province, provincelocalname, provinceothername, provpops, religion, river, riveronisland, riverthrough, sea[0m
Thought:[32;1m[1;3mI should query the schema of the country table to see if it contains information about the area of Thailand.
Action: sql_db_schema
Action Input: "country"[0m
Observation: [33;1m[1;3m
CREATE TABLE country (
	name VARCHAR(50 CHAR) NOT NULL, 
	code VARCHAR(4 CHAR) NOT NULL, 
	capital VARCHAR(50 CHAR), 
	

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600).


[32;1m[1;3mAction: sql_db_list_tables
Action Input: ""[0m
Observation: [38;5;200m[1;3mairport, borders, city, citylocalname, cityothername, citypops, continent, country, countrylocalname, countryothername, countrypops, desert, economy, encompasses, ethnicgroup, geo_desert, geo_estuary, geo_island, geo_lake, geo_mountain, geo_river, geo_sea, geo_source, island, islandin, ismember, lake, lakeonisland, language, located, locatedon, mergeswith, mountain, mountainonisland, organization, politics, population, province, provincelocalname, provinceothername, provpops, religion, river, riveronisland, riverthrough, sea[0m
Thought:[32;1m[1;3mI should query the schema of the "country" table to see what columns are available.
Action: sql_db_schema
Action Input: "country"[0m
Observation: [33;1m[1;3m
CREATE TABLE country (
	name VARCHAR(50 CHAR) NOT NULL, 
	code VARCHAR(4 CHAR) NOT NULL, 
	capital VARCHAR(50 CHAR), 
	province VARCHAR(50 CHAR), 
	area NUMBER, 
	population NUMBER, 
	CONSTRAI

[{'id': '1',
  'question': 'What is the area of Thailand?',
  'query_string': "SELECT area FROM country WHERE name = 'Thailand'",
  'type': 'simple',
  'total_tokens': 2946,
  'prompt_tokens': 2845,
  'completion_tokens': 101,
  'total_cost': 0.0044695,
  'time': 4.5023193359375},
 {'id': '2',
  'question': 'What are the provinces with an area greater than 10000?',
  'query_string': 'SELECT name FROM province WHERE area > 10000',
  'type': 'simple',
  'total_tokens': 3025,
  'prompt_tokens': 2914,
  'completion_tokens': 111,
  'total_cost': 0.004593,
  'time': 6.216726064682007},
 {'id': '3',
  'question': 'What are the languages spoken in Poland?',
  'query_string': "SELECT name FROM language WHERE country = 'POL'",
  'type': 'medium',
  'total_tokens': 2913,
  'prompt_tokens': 2817,
  'completion_tokens': 96,
  'total_cost': 0.0044175,
  'time': 4.470462322235107},
 {'id': '4',
  'question': 'How deep is Lake Kariba?',
  'query_string': "SELECT name, depth FROM lake WHERE name = 'Lak

In [11]:
# Positions in `queries` that the generation loop recorded as failed.
errors

[35]

Se existirem erros

In [12]:

# Retry the questions whose positions were recorded in `errors`.
# After every `number_of_queries_to_delay` attempts, pause for 10 seconds to
# avoid being blocked by the API.
number_of_queries_to_delay = 15
count = 0
# Start from the results already saved on disk.
queries = read_queries()
# Positions that still failed during this retry pass.
retry_failures = []
for pos in errors:
    instance = queries[pos]
    print(instance['question'])
    if count == number_of_queries_to_delay:
        count = 0
        time.sleep(10)
    # Count every attempt: each one calls the API whether or not it succeeds.
    count += 1
    try:
        with get_openai_callback() as cb:
            start_time = time.time()
            response = agent_executor({"input": instance["question"]})
            end_time = time.time()
            sql_query = response['output']
            if "Agent stopped" in sql_query:
                # Same check as the generation loop: do not store a stop
                # message as if it were a SQL query.
                retry_failures.append(pos)
                print(f"Agent stopped at position {pos}")
            else:
                instance["query_string"] = sql_query
                instance['total_tokens'] = cb.total_tokens
                instance['prompt_tokens'] = cb.prompt_tokens
                instance['completion_tokens'] = cb.completion_tokens
                instance['total_cost'] = cb.total_cost
                # Record elapsed time as the generation loop does.
                instance['time'] = end_time - start_time
                queries[pos] = instance
                save_queries(queries)
    except Exception as exc:
        # Report the failure instead of silently ignoring it; Exception (not a
        # bare except) keeps Ctrl+C / kernel interrupt working.
        retry_failures.append(pos)
        print(f"Error at position {pos}: {exc!r}")
if retry_failures:
    print("Positions still failing:", retry_failures)
queries

List African countries by number of neighbors.


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mAction: sql_db_list_tables
Action Input: ""[0m
Observation: [38;5;200m[1;3mairport, borders, city, citylocalname, cityothername, citypops, continent, country, countrylocalname, countryothername, countrypops, desert, economy, encompasses, ethnicgroup, geo_desert, geo_estuary, geo_island, geo_lake, geo_mountain, geo_river, geo_sea, geo_source, island, islandin, ismember, lake, lakeonisland, language, located, locatedon, mergeswith, mountain, mountainonisland, organization, politics, population, province, provincelocalname, provinceothername, provpops, religion, river, riveronisland, riverthrough, sea[0m
Thought:[32;1m[1;3mI should query the schema of the country and borders tables to see what columns are available.
Action: sql_db_schema
Action Input: "country, borders"[0m
Observation: [33;1m[1;3m
CREATE TABLE borders (
	country1 VARCHAR(4 CHAR) NOT NULL, 
	country2 VARCHAR

  for tbl in self._metadata.sorted_tables


[32;1m[1;3mI can use the country and borders tables to list African countries by number of neighbors. I will need to join the country and borders tables and count the number of distinct country2 values for each country1 value.
Action: custom_sql_db_query
Action Input: 
SELECT c.name AS country, COUNT(DISTINCT b.country2) AS num_neighbors
FROM country c
JOIN borders b ON c.code = b.country1
WHERE c.continent = 'Africa'
GROUP BY c.name
ORDER BY num_neighbors DESC[0m
Observation: [36;1m[1;3mError: (oracledb.exceptions.DatabaseError) ORA-00904: "C"."CONTINENT": invalid identifier
[SQL: SELECT c.name AS country, COUNT(DISTINCT b.country2) AS num_neighbors
FROM country c
JOIN borders b ON c.code = b.country1
WHERE c.continent = 'Africa'
GROUP BY c.name
ORDER BY num_neighbors DESC]
(Background on this error at: https://sqlalche.me/e/20/4xp6)[0m
Thought:[32;1m[1;3mI received an error because the "continent" column is not available in the "country" table. I should query the schema of th

[{'id': '1',
  'question': 'What is the area of Thailand?',
  'query_string': "SELECT area FROM country WHERE name = 'Thailand'",
  'type': 'simple',
  'total_tokens': 2946,
  'prompt_tokens': 2845,
  'completion_tokens': 101,
  'total_cost': 0.0044695},
 {'id': '2',
  'question': 'What are the provinces with an area greater than 10000?',
  'query_string': 'SELECT name FROM province WHERE area > 10000',
  'type': 'simple',
  'total_tokens': 3002,
  'prompt_tokens': 2901,
  'completion_tokens': 101,
  'total_cost': 0.0045535},
 {'id': '3',
  'question': 'What are the languages spoken in Poland?',
  'query_string': "SELECT name FROM language WHERE country = 'POL'",
  'type': 'medium',
  'total_tokens': 2913,
  'prompt_tokens': 2817,
  'completion_tokens': 96,
  'total_cost': 0.0044175},
 {'id': '4',
  'question': 'How deep is Lake Kariba?',
  'query_string': "SELECT depth FROM lake WHERE name = 'Lake Kariba'",
  'type': 'simple',
  'total_tokens': 3241,
  'prompt_tokens': 3114,
  'comple

#### Fixing queries

In [12]:

# Positions (indexes into `queries`) whose SQL is regenerated individually.
to_fix = [40,59,62,72,85,99]
# Load the saved results once; each successful fix is written back to the same
# file, so re-reading it on every iteration would return the same data.
q = read_queries()
for pos in to_fix:
    instance = queries[pos]
    try:
        with get_openai_callback() as cb:
            start_time = time.time()
            response = agent_executor({"input": instance["question"]})
            end_time = time.time()
            sql_query = response['output']
            if "Agent stopped" in sql_query:
                # The output is a stop message rather than SQL; leave the saved entry untouched.
                print("Agent stopped" + str(pos))
            else:
                instance["query_string"] = sql_query
                instance['total_tokens'] = cb.total_tokens
                instance['prompt_tokens'] = cb.prompt_tokens
                instance['completion_tokens'] = cb.completion_tokens
                instance['total_cost'] = cb.total_cost
                instance['time'] = end_time - start_time
                print(instance['id'], instance['question'], instance["query_string"], instance['time'], instance['total_cost'])
                q[pos] = instance
                save_queries(q)
    except Exception as exc:
        # Exception (not a bare except) keeps Ctrl+C / kernel interrupt working;
        # also show the cause so the failure can be diagnosed.
        print("Agent error" + str(pos))
        print(repr(exc))



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mAction: sql_db_list_tables
Action Input: ""[0m
Observation: [38;5;200m[1;3mairport, borders, city, citylocalname, cityothername, citypops, continent, country, countrylocalname, countryothername, countrypops, desert, economy, encompasses, ethnicgroup, geo_desert, geo_estuary, geo_island, geo_lake, geo_mountain, geo_river, geo_sea, geo_source, island, islandin, ismember, lake, lakeonisland, language, located, locatedon, mergeswith, mountain, mountainonisland, organization, politics, population, province, provincelocalname, provinceothername, provpops, religion, river, riveronisland, riverthrough, sea[0m
Thought:[32;1m[1;3mI should query the schema of the "country" table to see what columns are available.
Action: sql_db_schema
Action Input: "country"[0m
Observation: [33;1m[1;3m
CREATE TABLE country (
	name VARCHAR(50 CHAR) NOT NULL, 
	code VARCHAR(4 CHAR) NOT NULL, 
	capital VARCHAR(50 CHAR), 
	province VARCHAR(50 CHAR)