In [73]:
from dotenv import load_dotenv
import openai
import os
import langchain.llms as llms
from langchain.chains import create_sql_query_chain
from langchain import HuggingFaceHub, LLMChain
import pandas as pd
import sqlite3
from langchain_community.utilities import SQLDatabase
from langchain.prompts import PromptTemplate
import openpyxl
from langchain.sql_database import SQLDatabase
from langchain_community.agent_toolkits import create_sql_agent
from langchain_community.tools.sql_database.tool import QuerySQLDataBaseTool
from operator import itemgetter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI

In [74]:
working_dir = "C:/Users/nithi/OneDrive - University of Illinois Chicago/Documents/Capstone project/LLM project"
os.chdir(working_dir)

print(f"The current working directory: {os.getcwd()}")

The current working directory: C:\Users\nithi\OneDrive - University of Illinois Chicago\Documents\Capstone project\LLM project


In [75]:
# Load CSV file
csv_file = 'dataset_flattened.csv' 
df = pd.read_csv(csv_file)  # Reads the first sheet

# Save to SQLite DB
db_file = 'dataset.db'  
conn = sqlite3.connect(db_file)
table_name = 'models'
df.to_sql(table_name, conn, if_exists='replace', index=False)
conn.close()

print(f"Excel data from '{csv_file}' has been successfully saved to '{db_file}' in the table '{table_name}'.")


Excel data from 'dataset_flattened.csv' has been successfully saved to 'dataset.db' in the table 'models'.


In [86]:
db = SQLDatabase.from_uri("sqlite:///dataset.db")

llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.5)

write_query = create_sql_query_chain(llm, db)
response = write_query.invoke({"question": "How many companies are there"})
response 

'SELECT COUNT(DISTINCT "Company Name") AS "Total Companies" FROM models;'

In [80]:
execute_query = QuerySQLDataBaseTool(db=db)
chain = write_query | execute_query
chain.invoke({"question": "Different companies in our table"})


"[('Nationwide',), ('State Farm',), ('Liberty Mutual',), ('Allstate',)]"

In [123]:
SQl_prompt = PromptTemplate.from_template(
    """You are a SQLite expert. Given an input question, create a syntactically correct SQLite query to run, to answer the input question.
The "Performance Metrics" column in the database stores a list of three values for each entry: [accuracy, precision, MAE]. To query based on one of these metrics, you will need to extract the appropriate value from the list.
For questions about accuracy, focus on the first value in the list. For precision, the second, and for MAE, the third.
Remember, your query should aim to answer the question with a single SQL statement and limit the results appropriately.
You MUST double check your query before executing it. If you get an error while executing a query, rewrite the query and try again.
Never query for all columns from a table. Only query the columns that are needed to answer the question. Wrap each column name in double quotes (") to denote them as delimited identifiers.
Be careful not to query for columns that do not exist. Also, pay attention to which column is in which table.
Pay attention to use the date('now') function to get the current date if the question involves "today".
Only use the following tables:

{table_info}.

Question: {input}
Provide up to {top_k} SQL variations."""
)


# Initialize Chain 1: Generate SQL Query (this part seems correct based on your setup)
generate_sql_chain = create_sql_query_chain(prompt=SQl_prompt, llm=llm, db=db)

answer_prompt = PromptTemplate.from_template(
    """Given the following user question, corresponding SQL query, and SQL result, answer the user question.
    
Question: {question}
SQL Query: {query}
SQL Result: {result}
Answer: """
)

# Initialize Chain 2: Execute SQL Query and generate structured answer
execute_sql_chain = answer_prompt | llm | StrOutputParser()


def execute_combined_chain(question):
    # Step 1: Generate SQL Query
    sql_query = generate_sql_chain.invoke({"question": question, "top_k": 1})
    
    # Step 2: Execute the SQL Query to get the result
    sql_result = execute_query(sql_query)  # Ensure this returns the result of executing the SQL query
    
    # Step 3: Pass the necessary inputs to the final chain and format the output to include both SQL query and result
    final_response = execute_sql_chain.invoke({"question": question, "query": sql_query, "result": sql_result})
    
    # Format the final answer to include both the SQL query and its result
    final_answer = f"SQL Query: {sql_query}\n Answer: {final_response}"
    return final_answer


# Example invocation
question = "What is the MAE for Model 1 version 2 during March 2020?"
final_answer = execute_combined_chain(question)
print(final_answer)


SQL Query: ```sql
SELECT SUBSTR("Performance Metrics", INSTR("Performance Metrics", ',', 1) + 2, LENGTH("Performance Metrics") - INSTR("Performance Metrics", ',', 1) - 2) AS "MAE"
FROM models
WHERE "Model Name" = 'Model 1' AND "Model Version" = 2 AND "Date" LIKE '%3/2020%'
LIMIT 1;
```
 Answer: The error message indicates a syntax error in the SQL query provided. The query needs to be corrected in order to retrieve the MAE for Model 1 version 2 during March 2020. Once the syntax error is fixed, the query should be executed again to obtain the desired result.


In [122]:
# Example "Performance Metrics" values
metrics = ['[0.71, 0.76, 0.81]', '[0.76, 0.81, 0.81]']

# Preprocess to extract metrics
for metric_str in metrics:
    # Strip brackets and split by comma
    accuracy, precision, mae = metric_str.strip('[]').split(', ')
    print(f"Accuracy: {accuracy}, Precision: {precision}, MAE: {mae}")


Accuracy: 0.71, Precision: 0.76, MAE: 0.81
Accuracy: 0.76, Precision: 0.81, MAE: 0.81
