In [150]:
from dotenv import load_dotenv

from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain import OpenAI
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI

load_dotenv()


True

In [14]:
# Load the code to analyse
loader = DirectoryLoader("source_code/sql_server", glob="**/*.sql", loader_cls=TextLoader, show_progress=True)
documents = loader.load()
documents

100%|██████████| 2/2 [00:00<00:00, 1403.25it/s]


[Document(page_content='/*\n** Copyright Microsoft, Inc. 1994 - 2000\n** All Rights Reserved.\n*/\n\n-- This script does not create a database.\n-- Run this script in the database you want the objects to be created.\n-- Default schema is dbo.\n\nSET NOCOUNT ON\nGO\n\nset quoted_identifier on\nGO\n\n/* Set DATEFORMAT so that the date strings are interpreted correctly regardless of\n   the default DATEFORMAT on the server.\n*/\nSET DATEFORMAT mdy\nGO\n\ngo\nif exists (select * from sysobjects where id = object_id(\'dbo.Employee Sales by Country\') and sysstat & 0xf = 4)\n\tdrop procedure "dbo"."Employee Sales by Country"\nGO\nif exists (select * from sysobjects where id = object_id(\'dbo.Sales by Year\') and sysstat & 0xf = 4)\n\tdrop procedure "dbo"."Sales by Year"\nGO\nif exists (select * from sysobjects where id = object_id(\'dbo.Ten Most Expensive Products\') and sysstat & 0xf = 4)\n\tdrop procedure "dbo"."Ten Most Expensive Products"\nGO\nif exists (select * from sysobjects where id

In [15]:
splitter = RecursiveCharacterTextSplitter(separators=["GO\n", "\n\n", "\n"], chunk_size=2500, chunk_overlap=0)
chunks = splitter.split_documents(documents)
chunks

[Document(page_content='/*\n** Copyright Microsoft, Inc. 1994 - 2000\n** All Rights Reserved.\n*/\n\n-- This script does not create a database.\n-- Run this script in the database you want the objects to be created.\n-- Default schema is dbo.\n\nSET NOCOUNT ON\nGO\n\nset quoted_identifier on\nGO\n\n/* Set DATEFORMAT so that the date strings are interpreted correctly regardless of\n   the default DATEFORMAT on the server.\n*/\nSET DATEFORMAT mdy\nGO\n\ngo\nif exists (select * from sysobjects where id = object_id(\'dbo.Employee Sales by Country\') and sysstat & 0xf = 4)\n\tdrop procedure "dbo"."Employee Sales by Country"\nGO\nif exists (select * from sysobjects where id = object_id(\'dbo.Sales by Year\') and sysstat & 0xf = 4)\n\tdrop procedure "dbo"."Sales by Year"\nGO\nif exists (select * from sysobjects where id = object_id(\'dbo.Ten Most Expensive Products\') and sysstat & 0xf = 4)\n\tdrop procedure "dbo"."Ten Most Expensive Products"\nGO\nif exists (select * from sysobjects where id

In [16]:
embeddings = OpenAIEmbeddings()
docsearch = Chroma.from_documents(chunks, embeddings)


In [179]:
llm = OpenAI(temperature=0, verbose=True)
# llm = OpenAI(model="ada", temperature=0, max_tokens=2049, verbose=True)
# llm = OpenAI(model="text-davinci-003", temperature=0, verbose=True)
# llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0, verbose=True)

qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=docsearch.as_retriever()
)


In [180]:
def fetch_procs():
    query = """
As a Microsoft SQL server programmer reading ANSI-SQL, list all the stored procedure names.  Note that procedure names can also have spaces within the name, as long as the name is in quotes.

Output:
procedure_name, procedure_name, procedure_name, ...
"""
    print("Query: ", query)
    ans = qa.run(query)
    print("Answer: ", ans)
    list = ans.split(',')
    return list


In [181]:
procedures = fetch_procs()
procedures

Query:  
As a Microsoft SQL server programmer reading ANSI-SQL, list all the stored procedure names.  Note that procedure names can also have spaces within the name, as long as the name is in quotes.

Output:
procedure_name, procedure_name, procedure_name, ...

Answer:  
"Sales by Year", CustOrdersDetail, CustOrdersOrders, CustOrderHist, SalesByCategory, reptq1, reptq2, reptq3, SalesByCategory, "Ten Most Expensive Products", "Employee Sales by Country"


['\n"Sales by Year"',
 ' CustOrdersDetail',
 ' CustOrdersOrders',
 ' CustOrderHist',
 ' SalesByCategory',
 ' reptq1',
 ' reptq2',
 ' reptq3',
 ' SalesByCategory',
 ' "Ten Most Expensive Products"',
 ' "Employee Sales by Country"']

In [165]:
for proc in procedures:
    print(proc)

CustOrdersDetail
 CustOrdersOrders
 CustOrderHist
 SalesByCategory
 byroyalty
 reptq1
 reptq2
 reptq3
 "Ten Most Expensive Products"
 "Employee Sales by Country"


In [139]:

def fetch_tables_called_by_proc(procedure_name):
    print("Getting Tables for: ", procedure_name)

    prompt = PromptTemplate(
        input_variables=["procedure_name"],
        template="""
            As a Microsoft SQL server programmer reading ANSI-SQL, list all the tables queried or updated by the stored procedure called {procedure_name}.

            Don't list table names unless they occur within the FROM or INTO clauses of a SELECT or UPDATE statement.

            Output:
            table_name, table_name, table_name, ...
        """
    )
    query = prompt.format(procedure_name=procedure_name)
    ans = qa.run(query)
    print("Query: ", query)
    print("Answer: ", ans)
    list = ans.split(',')
    return list

In [166]:
print(fetch_tables_called_by_proc("CustOrdersOrders"))

Getting Tables for:  CustOrdersOrders
Query:  
            As a Microsoft SQL server programmer reading ANSI-SQL, list all the tables queried or updated by the stored procedure called CustOrdersOrders.

            Don't list table names unless they occur within the FROM or INTO clauses of a SELECT or UPDATE statement.

            Output:
            table_name, table_name, table_name, ...
        
Answer:  Orders
['Orders']


In [167]:

tables_by_procedure_struct = {}
for proc in procedures:
    tables_by_procedure_struct[proc] = fetch_tables_called_by_proc(proc)



Getting Tables for:  CustOrdersDetail
Query:  
            As a Microsoft SQL server programmer reading ANSI-SQL, list all the tables queried or updated by the stored procedure called CustOrdersDetail.

            Don't list table names unless they occur within the FROM or INTO clauses of a SELECT or UPDATE statement.

            Output:
            table_name, table_name, table_name, ...
        
Answer:  "Products", "[Order Details]"
Getting Tables for:   CustOrdersOrders
Query:  
            As a Microsoft SQL server programmer reading ANSI-SQL, list all the tables queried or updated by the stored procedure called  CustOrdersOrders.

            Don't list table names unless they occur within the FROM or INTO clauses of a SELECT or UPDATE statement.

            Output:
            table_name, table_name, table_name, ...
        
Answer:  The stored procedure "CustOrdersOrders" queries the "Orders" table.
Getting Tables for:   CustOrderHist
Query:  
            As a Microsoft SQL 

In [168]:
tables_by_procedure_struct

{'CustOrdersDetail': ['"Products"', ' "[Order Details]"'],
 ' CustOrdersOrders': ['The stored procedure "CustOrdersOrders" queries the "Orders" table.'],
 ' CustOrderHist': ['The tables queried by the stored procedure CustOrderHist are:\n\n- Products\n- [Order Details]\n- Orders\n- Customers'],
 ' SalesByCategory': ['[Order Details]',
  ' Orders',
  ' Products',
  ' Categories'],
 ' byroyalty': ['titleauthor', ' titleauthor'],
 ' reptq1': ['titles'],
 ' reptq2': ['The stored procedure "reptq2" queries the following tables:\n\n- titles\n- Categories\n- Products\n- Orders\n- "Order Details Extended"'],
 ' reptq3': ['titles'],
 ' "Ten Most Expensive Products"': ['Based on the given context',
  ' the stored procedure "Ten Most Expensive Products" only queries the "Products" table. Therefore',
  ' the output would be:\n\nProducts'],
 ' "Employee Sales by Country"': ['Orders',
  ' "Order Subtotals"',
  ' Employees']}

In [169]:
def fetch_tables():
    query = """
As a Microsoft SQL Server syntax parser, find all the CREATE TABLE statements and list all the table names.

Table names can be a single word, or several words that are surrounded by quotes or square brackets.

Example:
CREATE TABLE TableName
Name: TableName

Example:
CREATE TABLE "Table Name"
Name: "Table Name"

Example:
CREATE TABLE "TableName"
Name: TableName

Example:
CREATE TABLE [dbo].[TableName]
Name: TableName

Example:
CREATE TABLE [Table Name]
Name: "Table Name"

Example:
CREATE TABLE [TableName]
Name: TableName


Output:
"TableName", "TableName", "TableName", ...
"""
    print("Query: ", query)
    ans = qa.run(query)
    print("Answer: ", ans)
    list = ans.split(',')
    return list

In [171]:
tables = fetch_tables()
tables

Query:  
As a Microsoft SQL Server syntax parser, find all the CREATE TABLE statements and list all the table names.

Table names can be a single word, or several words that are surrounded by quotes or square brackets.

Example:
CREATE TABLE TableName
Name: TableName

Example:
CREATE TABLE "Table Name"
Name: "Table Name"

Example:
CREATE TABLE "TableName"
Name: TableName

Example:
CREATE TABLE [dbo].[TableName]
Name: TableName

Example:
CREATE TABLE [Table Name]
Name: "Table Name"

Example:
CREATE TABLE [TableName]
Name: TableName


Output:
"TableName", "TableName", "TableName", ...

Answer:  Table Names: "Employees", "Categories", "Customers", "Shippers", "Suppliers"


['Table Names: "Employees"',
 ' "Categories"',
 ' "Customers"',
 ' "Shippers"',
 ' "Suppliers"']

In [172]:
def fetch_foreign_keys_for_table(table_name):
    print("Getting FKs for Table: ", table_name)

    prompt = PromptTemplate(
        input_variables=["table_name"],
        template="""
            As a Microsoft SQL server programmer reading ANSI-SQL, list all the tables that have a foreign key foreign key constraint that references the {table_name} table.

            Example:
            CREATE TABLE RelatedTableName (

            CONSTRAINT "FK_Name" FOREIGN KEY 
            (
                "{table_name}ID"
            ) REFERENCES "dbo"."{table_name}" (
                "{table_name}ID"
            )
            )
            GO
            table_name: RelatedTableName

            Example:
            ALTER TABLE RelatedTableName
            ADD CONSTRAINT [FK_Name] FOREIGN KEY 
            (
                [{table_name}ID]
            ) REFERENCES [dbo].[{table_name}] (
                [{table_name}ID]
            )
            GO
            table_name: RelatedTableName

            Output:
            table_name, table_name, table_name, ...
        """
    )
    query = prompt.format(table_name=table_name)
    ans = qa.run(query)
    print("Query: ", query)
    print("Answer: ", ans)
    list = ans.split(',')
    return list

In [173]:
print(fetch_foreign_keys_for_table("Employees"))

Getting FKs for Table:  Employees
Query:  
            As a Microsoft SQL server programmer reading ANSI-SQL, list all the tables that have a foreign key foreign key constraint that references the Employees table.

            Example:
            CREATE TABLE RelatedTableName (

            CONSTRAINT "FK_Name" FOREIGN KEY 
            (
                "EmployeesID"
            ) REFERENCES "dbo"."Employees" (
                "EmployeesID"
            )
            )
            GO
            table_name: RelatedTableName

            Example:
            ALTER TABLE RelatedTableName
            ADD CONSTRAINT [FK_Name] FOREIGN KEY 
            (
                [EmployeesID]
            ) REFERENCES [dbo].[Employees] (
                [EmployeesID]
            )
            GO
            table_name: RelatedTableName

            Output:
            table_name, table_name, table_name, ...
        
Answer:  table_name: Territories, EmployeeTerritories
['table_name: Territories', ' Em