# My own sql database

I will create a database with two tables. 
- One will have names of people and their height
- one will have names of people and their weight.

The names of the tables and of the features will be gibberish; randomly generated strings

In [1]:
import random
import string

def generate_random_string(length):
    letters = string.ascii_lowercase
    return ''.join(random.choice(letters) for _ in range(length))

# Generate a random string of length 3
generate_random_string(3)

'kqi'

In [2]:
generate_random_string(3)

'fgm'

In [3]:
import sqlite3

# Connect to the database (creates a new database if it doesn't exist)
conn = sqlite3.connect('trial.sqlite')

# Create a cursor object to execute SQL statements
cursor = conn.cursor()

# Create the first table
cursor.execute('''
    CREATE TABLE IF NOT EXISTS mml (
        id INTEGER PRIMARY KEY,
        afe TEXT,
        tio REAL
    )
''')
y_data = [
    (1, 'Alice', 205.1),
    (2, 'Bob', 201.2),
    (3, 'Cat', 199.4)
]
cursor.executemany(
    'INSERT INTO mml VALUES (?, ?, ?)', 
    y_data
)


# Create the second table
cursor.execute('''
    CREATE TABLE IF NOT EXISTS fsr (
        id INTEGER PRIMARY KEY,
        hpx TEXT,
        cnv REAL
    )
''')
gmass_data = [
    (1, 'Alice',175.2 ),
    (2, 'Bob', 156.2),
    (3, 'Cat', 123.8)
]
cursor.executemany(
    'INSERT INTO fsr VALUES (?, ?, ?)', 
    gmass_data
)
# Commit the changes and close the connection
conn.commit()
conn.close()

Now the schema does not sematically reveal anything about the meaning of the tables or features. That information is stored in data_dictionary_variaton.txt.

In [17]:
conn = sqlite3.connect('trial.sqlite')
cursor = conn.cursor()
cursor.execute("SELECT sql FROM sqlite_master WHERE type='table';")

# Fetch all rows containing table schema
table_schemas = cursor.fetchall()

# Print the table schemas
for schema in table_schemas:
    print(schema[0])
conn.commit()
conn.close()

CREATE TABLE mml (
        id INTEGER PRIMARY KEY,
        afe TEXT,
        tio REAL
    )
CREATE TABLE fsr (
        id INTEGER PRIMARY KEY,
        hpx TEXT,
        cnv REAL
    )


# Queries

In [6]:
import sqlite3


# Define a finction to be made into a tool. 
def run_sqlite_query(query):
    conn = sqlite3.connect('trial.sqlite')
    c =  conn.cursor() #Cursor is the object that allows us to access to the db.
    try:
        c.execute(query)
        result = c.fetchall() 
    except sqlite3.OperationalError as err:
        result = f"The following error occuerd: {str(err)}"
    conn.commit()
    conn.close()
    return result

# check that query works
print(run_sqlite_query("""
SELECT * FROM mml;
"""))
print(run_sqlite_query("""
SELECT * FROM fsr;
"""))

[(1, 'Alice', 205.1), (2, 'Bob', 201.2), (3, 'Cat', 199.4)]
[(1, 'Alice', 175.2), (2, 'Bob', 156.2), (3, 'Cat', 123.8)]


## Create query tool 

In [7]:
from langchain.tools import Tool

run_query_tool = Tool.from_function(
    name="run_sqlite_query", 
    description="Run a sqlite query.",
    func=run_sqlite_query,
)
# Check that it works:
run_query_tool("""
SELECT * FROM mml 
""")

[(1, 'Alice', 205.1), (2, 'Bob', 201.2), (3, 'Cat', 199.4)]

## Agent aware of Query Tool

In [9]:
from langchain.agents import OpenAIFunctionsAgent

from langchain.prompts import ChatPromptTemplate
from langchain.schema import SystemMessage
from langchain.prompts import HumanMessagePromptTemplate
from langchain.prompts import MessagesPlaceholder

from langchain.agents import AgentExecutor

from langchain_openai import ChatOpenAI
from dotenv import load_dotenv

load_dotenv() # for David Cherney's OpenAI account.
chat=ChatOpenAI()
tools=[run_query_tool]

prompt = ChatPromptTemplate(
    messages=[
        SystemMessage(
            content = (
                "You are an AI that has access to a sqlite database. "
                )
            ),
        HumanMessagePromptTemplate.from_template("{input}"),
        MessagesPlaceholder(
            variable_name = "agent_scratchpad"
            )
        ]
    )

agent = OpenAIFunctionsAgent(
    llm=chat,
    prompt=prompt,
    tools=tools
    # memory = 
)

agent_executor = AgentExecutor(
    agent=agent,
    verbose=True,
    tools=tools # Must be the same as in agent.
)

nl_query="What is the average height of the people in the database?"
agent_executor(nl_query)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `run_sqlite_query` with `{'query': 'SELECT AVG(height) FROM people'}`


[0m[36;1m[1;3mThe following error occuerd: no such table: people[0m[32;1m[1;3mI'm sorry, but it seems that there is no table named "people" in the database. Please make sure that the table exists and try again.[0m

[1m> Finished chain.[0m


{'input': 'What is the average height of the people in the database?',
 'output': 'I\'m sorry, but it seems that there is no table named "people" in the database. Please make sure that the table exists and try again.'}

The LLM guessed that there was a table named people and a feature in that table called height. It needs to be informed of the names of the taabls and the schema. To do that we will
1. create a function to list the tables and add that list to the systemprompt
2. create a function that returns the schema, create a tool for that function, and thus allow the agent to look up the schema as needed. 

# List tables function
This is needed so that the query tool can be informed of the names of the tables.

In [11]:
import sqlite3

def list_tables():
    conn = sqlite3.connect("trial.sqlite")
    c = conn.cursor()
    # List tables in the database using SQL.
    c.execute("SELECT name FROM sqlite_master WHERE type='table';")
    rows = c.fetchall() 
    # SQL format issue:
    # Each item in rows was a 2-tuple with first item a table name as a string, 
    # second item blank. Replace all toples with their fist entry.
    rows = [row[0] for row in rows if row[0] is not None]
    # Reformat as a string with the name of one table on each line. 
    tables = " ".join(rows)
    conn.commit()
    conn.close()
    return tables

tables = list_tables()
tables

'mml fsr'

## Tables informed Agent with Query Tool

In [12]:
from langchain.agents import OpenAIFunctionsAgent

from langchain.prompts import ChatPromptTemplate
from langchain.schema import SystemMessage
from langchain.prompts import HumanMessagePromptTemplate
from langchain.prompts import MessagesPlaceholder

from langchain.agents import AgentExecutor

from langchain_openai import ChatOpenAI
from dotenv import load_dotenv

load_dotenv() # for David Cherney's OpenAI account.
chat=ChatOpenAI()
tools=[run_query_tool]

prompt = ChatPromptTemplate(
    messages=[
        SystemMessage(
            content = (
                "You are an AI that has access to a sqlite database. "
                f"The database has the following tables: {list_tables().split(' ')}\n"
                "Do not make any assumptions about what tables exist. "
                )
            ),
        HumanMessagePromptTemplate.from_template("{input}"),
        MessagesPlaceholder(
            variable_name = "agent_scratchpad"
            )
        ]
    )

agent = OpenAIFunctionsAgent(
    llm=chat,
    prompt=prompt,
    tools=tools
    # memory = 
)

agent_executor = AgentExecutor(
    agent=agent,
    verbose=True,
    tools=tools # Must be the same as in agent.
)

nl_query="What is the average height of the people in the database?"
agent_executor(nl_query)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `run_sqlite_query` with `{'query': 'SELECT AVG(height) FROM mml'}`


[0m[36;1m[1;3mThe following error occuerd: no such column: height[0m[32;1m[1;3m
Invoking: `run_sqlite_query` with `{'query': 'SELECT AVG(height) FROM fsr'}`


[0m[36;1m[1;3mThe following error occuerd: no such column: height[0m[32;1m[1;3mI apologize, but it seems that there is no "height" column in any of the tables in the database. Therefore, I cannot calculate the average height of the people in the database.[0m

[1m> Finished chain.[0m


{'input': 'What is the average height of the people in the database?',
 'output': 'I apologize, but it seems that there is no "height" column in any of the tables in the database. Therefore, I cannot calculate the average height of the people in the database.'}

The LLM 
- knew that the names of the tables 
- assumed there was a feature called height in one of those tables and repeatedly went looking for it.

## Tool for examining schema

In [16]:
def describe_tables(table_names):
    conn = sqlite3.connect("trial.sqlite")
    c = conn.cursor()
    # Create a string that is a list of tables for SQL syntax. 
    # e.g. tables = "WHERE name in ('users','orders','products');" .
    tables = ', '.join("'"+ table + "'" for table in table_names) 
    schemas_tuples = c.execute(
        "SELECT sql "
        "FROM sqlite_master " 
        f"WHERE type='table' and name IN ({tables});"
        )
    schemas_tuples = list(schemas_tuples) # Seems to make it more permananet.
    schemas = [schema_tuple[0] for schema_tuple in schemas_tuples 
           if schema_tuple[0] is not None]
    schemas_string = '\n'.join(schemas)
    conn.commit()
    conn.close()
    return schemas_string

describe_tables_tool = Tool.from_function(
    name="describe_tables",
    description=(
        "Gven a list of table names, " 
        + "returns the schema of those tables."
        ),
    func=describe_tables,
 #   args_schema=DescribeTablesArgsSchema 
)
print(describe_tables(table_names = list_tables().split(' ')))

CREATE TABLE mml (
        id INTEGER PRIMARY KEY,
        afe TEXT,
        tio REAL
    )
CREATE TABLE fsr (
        id INTEGER PRIMARY KEY,
        hpx TEXT,
        cnv REAL
    )


## Table informed, query and schema tools

In [18]:
load_dotenv() # for David Cherney's OpenAI account.
chat=ChatOpenAI()
# Give the agent the new tool.
tools=[run_query_tool, describe_tables_tool]

prompt = ChatPromptTemplate(
    messages=[
        SystemMessage(
            content = (
                "You are an AI that has access to a sqlite database. "
                f"The database has the following tables: {list_tables().split(' ')}\n"
                "Do not make any assumptions about what tables or columns exist. "
                "Instead, pass the list of tables to the 'descibe_tables' function to obtain the schema of the tables."
                "You might need to join the tables on the feature N"
                )
            ),
        HumanMessagePromptTemplate.from_template("{input}"),
        MessagesPlaceholder(
            variable_name = "agent_scratchpad"
            )
        ]
    )

agent = OpenAIFunctionsAgent(
    llm=chat,
    prompt=prompt,
    tools=tools
    # memory = 
)

agent_executor = AgentExecutor(
    agent=agent,
    verbose=True,
    tools=tools # Must be the same as in agent.
)

nl_query = "What is the average height of the people in the database?"
agent_executor(nl_query)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `run_sqlite_query` with `{'query': 'SELECT AVG(height) FROM mml'}`


[0m[36;1m[1;3mThe following error occuerd: no such column: height[0m[32;1m[1;3m
Invoking: `describe_tables` with `{'table_names': 'mml'}`


[0m[33;1m[1;3m[0m[32;1m[1;3m
Invoking: `describe_tables` with `{'table_names': 'mml'}`


[0m[33;1m[1;3m[0m[32;1m[1;3m
Invoking: `describe_tables` with `{'table_names': 'mml'}`


[0m[33;1m[1;3m[0m[32;1m[1;3m
Invoking: `describe_tables` with `{'table_names': 'mml'}`


[0m[33;1m[1;3m[0m[32;1m[1;3mIt seems that the 'mml' table does not have a column named 'height'. Can you provide the correct column name for the height information?[0m

[1m> Finished chain.[0m


{'input': 'What is the average height of the people in the database?',
 'output': "It seems that the 'mml' table does not have a column named 'height'. Can you provide the correct column name for the height information?"}

# Data dictionary DB

A data dictionary named `data_dictoinary.txt` will be fed through a text splitter on split charater "\n" and the result put in the chroma database `data_dictionary_vector_store`.

In [24]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter 
import tiktoken
from langchain_openai import OpenAIEmbeddings
from langchain.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings
    )
from langchain.vectorstores.chroma import Chroma
from langchain.text_splitter import CharacterTextSplitter 
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv 

In [30]:
embeddings = OpenAIEmbeddings()

text_splitter = CharacterTextSplitter(
    is_separator_regex=False, 
    chunk_size = 87, 
    separator = "\n", 
    chunk_overlap = 0  
)

loader = TextLoader("data_dictionary_variation.txt")
docs = loader.load_and_split(
    text_splitter=text_splitter,
)
db = Chroma.from_documents(
    documents = docs,
    embedding=embeddings,
    # Name the directory for the vector store.
    persist_directory="vs_variation"
)
print(f"The databse is of type {type(db)} \n" 
      f"and has {db._collection.count()} entries.")

The databse is of type <class 'langchain_community.vectorstores.chroma.Chroma'> 
and has 6 entries.


In [33]:
# Open the db if you do not want to create it. 
embeddings = OpenAIEmbeddings()

db =  Chroma(
    persist_directory="vs_variation", 
    embedding_function=embeddings)

## Similarity Search Tool

In [38]:
# test the vector store
results = db.similarity_search(
    query = "what symbol is used to represent a person's height inside a table?",
    k = 2 #Number of results to display. Default 4
    )

print("\n".join([result.page_content for result in results]))

Column tio represents the height of the person in centimeters.
Table mml contains data about peoples' height.


In [37]:
"\n".join([result.page_content for result in results])

"Column tio represents the height of the person in centimeters.\nTable mml contains data about peoples' height."

In [39]:
def data_dictionary_similarity_search(query):
    results = db.similarity_search(
        query = query,
        k = 2 #Number of results to display. Default 4
    )
    return "\n".join([result.page_content for result in results])
    
query = "Which feature of which table is about peoples' weight?"
data_dictionary_similarity_search(query)

"Table fsr contains data about peoples' weight.\nTable mml contains data about peoples' height."

In [61]:
dd_ss_tool = Tool.from_function(
    name="data_dictionary_similarity_search", 
    description="""
    Gives information from the data dictionary that should be used to form an sql query.
    """,
    func=data_dictionary_similarity_search,
    return_direct=True
)
# Check that it works:
dd_ss_tool("""
which table has a feature about weight?""")

"Table fsr contains data about peoples' weight.\nTable mml contains data about peoples' height."

## Table informed, query, schema, and DD equipted

In [62]:
load_dotenv() # for David Cherney's OpenAI account.
chat=ChatOpenAI()
tools=[run_query_tool, describe_tables_tool, dd_ss_tool]

prompt = ChatPromptTemplate(messages=[
    SystemMessage(content=(
        "You are an AI that has access to an sqlite database."
        "You will not understand the names of the tables and columns."
        "You can ask the data_dictionary_similarity_search tool questions about the data dictionary for the database."
    )),
    HumanMessagePromptTemplate.from_template(
        """
        {input} 
        """),
    MessagesPlaceholder(variable_name = "agent_scratchpad")
    ]
)

agent = OpenAIFunctionsAgent(
    llm=chat,
    prompt=prompt,
    tools=tools
    # memory = 
)

agent_executor = AgentExecutor(
    agent=agent,
    verbose=True,
    tools=tools # Must be the same as in agent.
)

nl_query = """
What is the average height of the people in the database?
"""
agent_executor(nl_query)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `run_sqlite_query` with `SELECT AVG(height) FROM people`


[0m[36;1m[1;3mThe following error occuerd: no such table: people[0m[32;1m[1;3m
Invoking: `data_dictionary_similarity_search` with `people`


[0m[38;5;200m[1;3mColumn hpx represents the name of the person
Column afe represents the name of the person[0m
[32;1m[1;3m[0m

[1m> Finished chain.[0m


{'input': '\nWhat is the average height of the people in the database?\n',
 'output': 'Column hpx represents the name of the person\nColumn afe represents the name of the person'}

In [46]:
nl_query = """
What is the average ratio of height to weight of the people in the database?
"""
agent_executor(nl_query)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `run_sqlite_query` with `{'query': 'SELECT AVG(height/weight) as average_ratio FROM mml'}`


[0m[36;1m[1;3mThe following error occuerd: no such column: height[0m[32;1m[1;3m
Invoking: `run_sqlite_query` with `{'query': 'SELECT AVG(height_weight_ratio) as average_ratio FROM mml'}`


[0m[36;1m[1;3mThe following error occuerd: no such column: height_weight_ratio[0m[32;1m[1;3m
Invoking: `run_sqlite_query` with `{'query': 'SELECT AVG(height) / AVG(weight) as average_ratio FROM mml'}`


[0m[36;1m[1;3mThe following error occuerd: no such column: weight[0m[32;1m[1;3m
Invoking: `run_sqlite_query` with `{'query': 'SELECT AVG(height) as average_height, AVG(weight) as average_weight FROM mml'}`


[0m[36;1m[1;3mThe following error occuerd: no such column: height[0m[32;1m[1;3mI'm sorry, but it seems like the database does not have the necessary columns for calculating the average ratio of height to weight.

{'input': '\nWhat is the average ratio of height to weight of the people in the database?\n',
 'output': "I'm sorry, but it seems like the database does not have the necessary columns for calculating the average ratio of height to weight. Can you please provide more information about the structure of the database and the columns available?"}