In [45]:
from dotenv import load_dotenv
import openai
import os
import langchain.llms as llms
from langchain.chat_models import ChatOpenAI
from langchain.chains import create_sql_query_chain
from langchain import HuggingFaceHub, LLMChain
import pandas as pd
import sqlite3
from langchain_community.utilities import SQLDatabase
from langchain.prompts import PromptTemplate
import openpyxl
from langchain.sql_database import SQLDatabase
from langchain_community.agent_toolkits import create_sql_agent

In [26]:
working_dir = "C:/Users/nithi/OneDrive - University of Illinois Chicago/Documents/Capstone project/LLM project"
os.chdir(working_dir)

print(f"The current working directory: {os.getcwd()}")

The current working directory: C:\Users\nithi\OneDrive - University of Illinois Chicago\Documents\Capstone project\LLM project


In [27]:
# Load Excel file
excel_file = 'dataset.xlsx' 
df = pd.read_excel(excel_file, sheet_name=0)  # Reads the first sheet

# Save to SQLite DB
db_file = 'dataset.db'  
conn = sqlite3.connect(db_file)
table_name = 'models'
df.to_sql(table_name, conn, if_exists='replace', index=False)
conn.close()

print(f"Excel data from '{excel_file}' has been successfully saved to '{db_file}' in the table '{table_name}'.")


Excel data from 'dataset.xlsx' has been successfully saved to 'dataset.db' in the table 'models'.


In [64]:
db = SQLDatabase.from_uri("sqlite:///dataset.db")
print(db.dialect)
print(db.get_usable_table_names())
db.run('SELECT * FROM models limit 3;')

sqlite
['models']


"[('Model 1', 'Multi-Class', '[1,2,3]', '2022-04-07 00:00:00', '[1218, 1873, 1245]', '[[0.71, 0.76, 0.81], [0.76, 0.81, 0.81], [0.78, 0.88, 0.93]]', 'Nationwide'), ('Model 1', 'Multi-Class', '[1,2,3]', '2022-04-08 00:00:00', '[2014, 4559, 4492]', '[[0.83, 0.91, 0.74], [0.86, 0.75, 0.89], [0.9, 0.81, 0.75]]', 'Nationwide'), ('Model 1', 'Multi-Class', '[1,2,3]', '2022-04-09 00:00:00', '[2270, 4990, 3276]', '[[0.81, 0.76, 0.8], [0.92, 0.8, 0.73], [0.8, 0.77, 0.86]]', 'Nationwide')]"

In [63]:
# Using GPT2 large model
llm = HuggingFaceHub(repo_id="openai-community/gpt2-large")

prompt_template = PromptTemplate(
    input_variables=["input", "top_k", "table_info"],
    template="Generate a SQL query to answer this question: {input}. Using the table info: {table_info}, provide up to {top_k} SQL variations."
)
chain = create_sql_query_chain(prompt=prompt_template, llm=llm, db=db)
response = chain.invoke({"question": "How many records are there in models table", "top_k": 5}) 
print(response)

Generate a SQL query to answer this question: How many records are there in models table
SQLQuery: . Using the table info: 
CREATE TABLE models (
	"Model Name" TEXT, 
	"Model Type" TEXT, 
	"Model Version" TEXT, 
	"Date" TIMESTAMP, 
	"Daily Volume" TEXT, 
	"Performance Metrics" TEXT, 
	"Company Name" TEXT
)

/*
3 rows from models table:
Model Name	Model Type	Model Version	Date	Daily Volume	Performance Metrics	Company Name
Model 1	Multi-Class	[1,2,3]	2022-04-07 00:00:00	[1218, 1873, 1245]	[[0.71, 0.76, 0.81], [0.76, 0.81, 0.81], [0.78, 0.88, 0.93]]	Nationwide
Model 1	Multi-Class	[1,2,3]	2022-04-08 00:00:00	[2014, 4559, 4492]	[[0.83, 0.91, 0.74], [0.86, 0.75, 0.89], [0.9, 0.81, 0.75]]	Nationwide
Model 1	Multi-Class	[1,2,3]	2022-04-09 00:00:00	[2270, 4990, 3276]	[[0.81, 0.76, 0.8], [0.92, 0.8, 0.73], [0.8, 0.77, 0.86]]	Nationwide
*/, provide up to 5 SQL variations.


In [59]:
from langchain_core.prompts import PromptTemplate

template = '''Given an input question, first create a syntactically correct query to run, then look at the results of the query and return the answer.
Use the following format:

Question: "How many records are there in models table"
SQLQuery: "SELECT COUNT(*) FROM models"
SQLResult: "3116"
Answer: "3116"

Only use the following tables:

{table_info}.

Question: {input}
provide up to {top_k} SQL variations.'''
prompt = PromptTemplate.from_template(template)
prompt

PromptTemplate(input_variables=['input', 'table_info', 'top_k'], template='Given an input question, first create a syntactically correct query to run, then look at the results of the query and return the answer.\nUse the following format:\n\nQuestion: "How many records are there in models table"\nSQLQuery: "SELECT COUNT(*) FROM models"\nSQLResult: "3116"\nAnswer: "3116"\n\nOnly use the following tables:\n\n{table_info}.\n\nQuestion: {input}\nprovide up to {top_k} SQL variations.')

In [60]:


chain = create_sql_query_chain(prompt=prompt, llm=llm, db=db)

response = chain.invoke({"question": "How many records are there in models table", "top_k": 5}) 

print(response)

Given an input question, first create a syntactically correct query to run, then look at the results of the query and return the answer.
Use the following format:

Question: "How many records are there in models table"
SQLQuery: "SELECT COUNT(*) FROM models"
