<!-- # Data Exploration -->


In [1]:
# Built-in library
from pathlib import Path
import re
import json
from typing import Any, Literal, Optional, Union
import logging
import warnings

# Standard imports
import numpy as np
import numpy.typing as npt
from pprint import pprint
import pandas as pd
import polars as pl
from rich.console import Console
from rich.theme import Theme

# Rich console theme: style names mapped to hex colours.
custom_theme = Theme(
    styles=dict(
        white="#FFFFFF",  # Bright white
        info="#00FF00",  # Bright green
        warning="#FFD700",  # Bright gold
        error="#FF1493",  # Deep pink
        success="#00FFFF",  # Cyan
        highlight="#FF4500",  # Orange-red
    )
)
console = Console(theme=custom_theme)

# Visualization
# import matplotlib.pyplot as plt

# NumPy: print floats with four digits of precision
np.set_printoptions(precision=4)

# Pandas: raise the display limits for rows, columns and cell width
pd.set_option("display.max_rows", 1_000)
pd.set_option("display.max_columns", 1_000)
pd.set_option("display.max_colwidth", 600)

# Polars: raise the string-length and column-count display limits
pl.Config.set_fmt_str_lengths(1_000)
pl.Config.set_tbl_cols(1_000)

# Silence every warning for the rest of the session
warnings.filterwarnings(action="ignore")

# Black code formatter (Optional)
%load_ext lab_black

# auto reload imports
%load_ext autoreload
%autoreload 2

In [2]:
def go_up_from_current_directory(*, go_up: int = 1) -> None:
    """Put an ancestor of the current working directory at the front of ``sys.path``.

    Params:
    -------
    go_up: int, default=1
        Number of directory levels to climb from the current working directory.
        A value of 0 (or less) resolves to the current working directory itself.

    Returns:
    --------
    None
        The resolved absolute path is printed and placed at ``sys.path[0]``.
    """
    import os
    import sys

    # Relative hop such as "../../"; an empty string when go_up <= 0.
    relative_hops: str = "../" * go_up

    # abspath resolves against the current working directory, which is what the
    # previous `os.path.dirname(__name__)` expression evaluated to in practice
    # (`__name__` is a module name, not a file path).
    abs_path_prev_directory: str = os.path.abspath(relative_hops)

    # Re-running the cell must not stack duplicate entries: drop earlier copies
    # before putting the directory back at the highest-priority position.
    while abs_path_prev_directory in sys.path:
        sys.path.remove(abs_path_prev_directory)
    sys.path.insert(0, abs_path_prev_directory)
    print(abs_path_prev_directory)

<!-- ### SQL Database(s) -->

In [24]:
# Put the parent of the working directory on sys.path so that the `src`
# package imported below can be resolved from this notebook.
go_up_from_current_directory(go_up=1)


from src.db_utils import SQLFromTabularData
from src.chatbot import Chatbot, get_response

/Users/neidu/Desktop/Projects/Personal/My_Projects/Gen-AI-Projects/QA_and_RAG


In [4]:
# Configure the CSV -> SQLite helper for the `titanic` table.
# NOTE(review): what `run()` does is defined in src.db_utils; the captured output
# below shows it reporting that the table already exists.
create_db: SQLFromTabularData = SQLFromTabularData(
    file_path="../data/flat_files/titanic-data.csv",
    db_path="../data/flat_files/stored_data.db",
    table_name="titanic",
)

create_db.run()

Error creating/updating the DB: Table 'titanic' already exists.
DB Path: sqlite:///../data/flat_files/stored_data.db
Available table Names: ['breast_cancer', 'diabetes', 'titanic']


In [5]:
# Climb two levels so that the `QA_and_RAG` and `config` packages imported
# below can be resolved from this notebook.
go_up_from_current_directory(go_up=2)

from QA_and_RAG import PACKAGE_ROOT_PATH
from QA_and_RAG.src.utils.utilities import ProcessFiles
from config import config, settings

/Users/neidu/Desktop/Projects/Personal/My_Projects/Gen-AI-Projects


In [6]:
import os

# Copy the configured working directory into the process environment.
os.environ["WORKING_DIR"] = settings.WORKING_DIR

In [7]:
# Show the configured package path.
# NOTE(review): the captured output contains a double slash before "QA_and_RAG".
print(config.QA_and_RAG.path)

/Users/neidu/Desktop/Projects/Personal/My_Projects/Gen-AI-Projects//QA_and_RAG


In [25]:
logger = logging.getLogger()


class ModelManager:
    """A singleton class for managing the chatbot and its dependencies.

    Only one successfully loaded instance is kept per process; a failed load is
    never cached, so the next ``ModelManager()`` call retries from scratch.

    NOTE(review): ``_load_model`` reads the notebook-level names ``chatbot``,
    ``message``, ``chat_type`` and ``app_functionality``; they must be defined
    before the first instantiation.

    Attributes
    ----------
    _instance : ModelManager | None
        Singleton instance of the class
    _model : Chatbot | None
        Whatever ``get_response`` returned on the last load.
        NOTE(review): annotated as ``Chatbot`` -- confirm against ``get_response``.
    """

    _instance: "ModelManager | None" = None
    _model: "Chatbot | None" = None

    def __new__(cls) -> "ModelManager":
        """Return the shared instance, creating and loading it on first use.

        Returns
        -------
        ModelManager
            The singleton instance of ModelManager

        Raises
        ------
        Exception
            Propagated from ``_load_model`` when loading fails.
        """
        if cls._instance is None:
            candidate = super().__new__(cls)
            # Load BEFORE publishing: if loading raises, `_instance` stays None
            # instead of pointing at a half-initialised object.
            candidate._load_model()
            cls._instance = candidate
        return cls._instance

    def _load_model(self) -> None:
        """Call ``get_response`` and store its result on ``self._model``.

        Raises
        ------
        Exception
            If ``get_response`` fails; the original error is attached as
            ``__cause__``.
        """
        try:
            logger.info("Loading chatbot")
            self._model = get_response(chatbot, message, chat_type, app_functionality)

        except Exception as e:
            logger.error(f"Error loading chatbot: {e}")
            # Chain the original exception so the root cause stays visible.
            raise Exception("Error loading chatbot") from e

    def clear_cache(self) -> None:
        """Drop the stored result and load it again."""
        self._model = None
        self._load_model()

In [26]:
# Instantiate the singleton and display it; the captured output below shows the
# load raising "Error loading chatbot".
m = ModelManager()
m

Error loading chatbot: get_response() missing 4 required positional arguments: 'chatbot', 'message', 'chat_type', and 'app_functionality'


Exception: Error loading chatbot

In [None]:
from sqlalchemy import create_engine


db_path: str = "../data/sql/chinook.db"
# `create_engine` returns an Engine (the name `conn` is kept because later cells use it).
conn = create_engine(f"sqlite:///{db_path}")
query: str = "SELECT name FROM sqlite_master WHERE type='table';"

# Borrow a connection only for the duration of the read so it is closed
# deterministically instead of being left to garbage collection.
with conn.connect() as connection:
    table_names: list[str] = (
        pl.read_database(query=query, connection=connection).to_series().to_list()
    )
table_names

In [None]:
query = "SELECT * FROM Track LIMIT 10"

# Close the connection as soon as the preview has been read.
with conn.connect() as connection:
    track_preview: pl.DataFrame = pl.read_database(query=query, connection=connection)
track_preview

In [None]:
query = "SELECT * FROM Artist LIMIT 10"

# Close the connection as soon as the preview has been read.
with conn.connect() as connection:
    artist_preview: pl.DataFrame = pl.read_database(query=query, connection=connection)
artist_preview

In [None]:
# Read the breast-cancer CSV into a Polars DataFrame and preview the first rows.
# `df` is reused by the write_database cell that follows.
fp: str = "../data/flat_files/breast-cancer.csv"
df: pl.DataFrame = pl.read_csv(fp)

df.head()

In [None]:
# Requires Pyarrow
# Write the first rows of `df` to the `sample_table` table of the local `test.db`
# SQLite file. Any exception is printed instead of propagating.
# `full_db_path` is read again by the inspector cell further down.
try:
    full_db_path: str = "sqlite:///test.db"
    df.head().write_database(table_name="sample_table", connection=full_db_path)

except Exception as e:
    print(e)

In [10]:
from sqlalchemy import inspect, Inspector
from pydantic import BaseModel, computed_field, Field, PrivateAttr

In [None]:
# Rebinds `conn` (previously the chinook engine) to an engine for `full_db_path`,
# then lists the tables that the SQLAlchemy inspector finds there.
conn = create_engine(full_db_path)
insp: Inspector = inspect(conn)

insp.get_table_names()

In [12]:
import os
import langchain
from langchain.chains import create_sql_query_chain
from langchain_community.utilities import SQLDatabase
from langchain_community.tools.sql_database.tool import QuerySQLDatabaseTool
from langchain_community.agent_toolkits import create_sql_agent
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableLambda, RunnablePassthrough


# Set LangChain's module-level `debug` flag (it is set again inside the first chat cell).
langchain.debug = True

In [None]:
from omegaconf import OmegaConf, DictConfig


# NOTE(review): this rebinds `config`, which an earlier cell imported via
# `from config import config, settings`; the cells below use this OmegaConf object.
config: DictConfig = OmegaConf.load("../../config/config.yaml")

# Dump the loaded configuration as YAML with interpolations resolved.
print(OmegaConf.to_yaml(config, resolve=True))

In [None]:
# NOTE(review): `ROOT_PATH` is not defined in any cell visible here (an earlier
# cell imports `PACKAGE_ROOT_PATH`) -- confirm where it comes from.
# NOTE(review): if `ROOT_PATH` is a pathlib.Path, joining it with a string that
# starts with "/" discards the left operand, so this evaluates to
# "/data/sql/chinook.db"; later cells prepend "../" to that value.
file_path: str = str(ROOT_PATH / "/data/sql/chinook.db")
file_path

# ROOT_PATH

In [None]:
from langchain_openai import ChatOpenAI

# Chat model shared by the SQL chains below; temperature is set to 0 and the API
# key is read from `settings` rather than hard-coded.
llm = ChatOpenAI(model="gpt-4o-mini", api_key=settings.OPENAI_API_KEY, temperature=0)
llm

In [None]:
from operator import itemgetter


# Inline prototype of the stored-DB question-answering flow. The same steps are
# wrapped in `_qa_sql_agent` in the next cell.
# NOTE(review): every name assigned here (`db`, `exec_query`, `write_query`,
# `answer`, `chain`, `response`, ...) becomes notebook-global state, and `response`
# is only bound when all three conditions below hold.
chat_type: Literal["qa-with-stored-sql-db", "qa-with-flat-file-sql-db"] = (
    "qa-with-stored-sql-db"
)
app_functionality: str = "chat"
message: str = "How many employees are there"

if app_functionality == "chat":
    if chat_type == "qa-with-stored-sql-db":
        # if os.path.exists(config.QA_and_RAG.chinook_db_path):
        if os.path.exists(f"../{file_path}"):  # for jupyter notebook

            # db = SQLDatabase.from_uri(config.QA_and_RAG.chinook_db_path)
            db = SQLDatabase.from_uri(
                f"sqlite:///../{file_path}"
            )  # for jupyter notebook
            # Tool that runs a SQL string against `db`
            exec_query = QuerySQLDatabaseTool(db=db)
            # Chain that asks the LLM to write SQL for a question.
            # NOTE(review): a later cell rebinds `write_query` to a plain function.
            write_query = create_sql_query_chain(llm, db=db)

            # Prompt text comes from the loaded configuration
            answer_prompt = PromptTemplate.from_template(
                template=config.QA_and_RAG.sql_agent_prompt
            )

            # prompt -> LLM -> plain string
            answer = answer_prompt | llm | StrOutputParser()

            def execute_and_format(data: dict[str, str]) -> str:
                """Execute SQL query and format the answer.

                Parameters
                ----------
                data : dict[str, str]
                    Dictionary containing 'query' and 'question' keys with string values.
                    The 'query' value may optionally start with 'SQLQuery:'.

                Returns
                -------
                str
                    Formatted answer based on the question and query result.
                """
                # Execute query
                sql: str = data["query"]
                # Only the "SQLQuery:" label is handled here; every occurrence is removed
                if sql.startswith("SQLQuery:"):
                    sql = sql.replace("SQLQuery:", "").strip()
                result: str = exec_query.invoke(sql)

                return answer.invoke({"question": data["question"], "result": result})

            chain = (
                RunnablePassthrough()
                # Add the key `query` to the input dict
                .assign(query=write_query)
                # Chain the output of the prev step with the next step
                .pipe(RunnableLambda(execute_and_format))
            )

            langchain.debug = True
            response = chain.invoke({"question": message})

In [71]:
def _qa_sql_agent(db_path: str, config: DictConfig, message: str, llm: Any) -> str:
    """Answer a natural-language question by generating and running SQL.

    Parameters
    ----------
    db_path : str
        Path appended to ``sqlite:///../`` to build the database URI
        (the ``../`` prefix is there for running from the notebook).
    config : DictConfig
        Configuration providing ``QA_and_RAG.sql_agent_prompt``.
    message : str
        The user's question.
    llm : Any
        Model used both to write the SQL and to phrase the final answer.

    Returns
    -------
    str
        The answer produced from the question and the query result.
    """
    db = SQLDatabase.from_uri(f"sqlite:///../{db_path}")  # for jupyter notebook
    print(db.dialect)
    print(db.get_usable_table_names())
    exec_query = QuerySQLDatabaseTool(db=db)
    write_query = create_sql_query_chain(llm, db=db)

    answer_prompt = PromptTemplate.from_template(
        template=config.QA_and_RAG.sql_agent_prompt
    )

    # prompt -> LLM -> plain string
    answer = answer_prompt | llm | StrOutputParser()

    def execute_and_format(data: dict[str, str]) -> str:
        """Execute SQL query and format the answer.

        Parameters
        ----------
        data : dict[str, str]
            Dictionary containing 'query' and 'question' keys with string values.
            The 'query' value may carry a 'SQLQuery:' / 'sql:' label and/or be
            wrapped in a Markdown code fence.

        Returns
        -------
        str
            Formatted answer based on the question and query result.
        """
        print("===" * 20)
        print("sql: ", data["query"])
        sql: str = _strip_sql_wrappers(data["query"])
        result: str = exec_query.invoke(sql)

        return answer.invoke({"question": data["question"], "result": result})

    chain = (
        RunnablePassthrough()
        # Add the key `query` to the input dict
        .assign(query=write_query)
        # Chain the output of the prev step with the next step
        .pipe(RunnableLambda(execute_and_format))
    )
    response: str = chain.invoke({"question": message})
    return response


def _strip_sql_wrappers(raw_sql: str) -> str:
    """Return the bare SQL from an LLM reply that may be labelled or fenced.

    Only the wrappers are removed: the contents of the first Markdown fence (if
    any) are kept, and a *leading* 'SQLQuery:' / 'sql:' label is dropped. Text
    inside the statement itself (e.g. a literal containing 'sql:') is untouched.
    """
    cleaned: str = raw_sql.strip()

    # Prefer what is inside a complete ```...``` / ```sql ...``` fence.
    fenced = re.search(
        r"```(?:sql\b)?(.*?)```", cleaned, flags=re.DOTALL | re.IGNORECASE
    )
    if fenced is not None:
        cleaned = fenced.group(1).strip()
    else:
        # Unterminated fence: drop only the stray marker(s).
        cleaned = re.sub(r"```(?:sql\b)?", "", cleaned, flags=re.IGNORECASE).strip()

    # Drop a label only when it prefixes the statement.
    return re.sub(r"^(?:SQLQuery|sql):\s*", "", cleaned, flags=re.IGNORECASE).strip()

In [None]:
# Rebinds `fp` / `df` (previously the breast-cancer data) to the Titanic CSV and
# summarises the rows whose "Sex" column equals "female".
# NOTE(review): the DB-creation cell reads "titanic-data.csv" while this cell reads
# "titanic.csv" -- confirm which file name is correct.
fp: str = "../data/flat_files/titanic.csv"
df: pl.DataFrame = pl.read_csv(fp)

# print(df["BloodPressure"].mean())
df.filter(pl.col("Sex").eq("female")).describe()
# df.head()
# df.head()

In [58]:
# db_path: str = "/data/flat_files/stored_data.db"
# message: str = "What is the total smoothnessmean?"
# _qa_sql_agent(db_path=db_path, config=config, message=message, llm=llm)

In [None]:
from enum import Enum


class ChatType(Enum):
    """Chat modes for the SQL question-answering flow.

    The cell below compares ``chat_type`` against each member's ``.value`` string.
    NOTE(review): these values start with "q&a-", whereas the earlier prototype
    cell used "qa-with-stored-sql-db" / "qa-with-flat-file-sql-db".
    """

    qa_with_stored_sql_db = "q&a-with-stored-sql-db"
    qa_with_stored_flat_file_sql_db = "q&a-with-stored-flat-file-sql-db"
    qa_with_uploaded_flat_file_sql_db = "q&a-with-uploaded-flat-file-sql-db"


# The selected mode is stored as the enum member's string value, so the
# annotation is `str` (the previous Literal[...] listed enum members instead).
chat_type: str = ChatType.qa_with_stored_sql_db.value
app_functionality: str = "chat"
message: str = "How many employees are there"
# Chat history as (user message, bot reply) pairs
chatbot: list[tuple[str, str]] = []
# NOTE(review): `ROOT_PATH` is not defined in any visible cell, and joining with a
# string that starts with "/" drops the left operand of a pathlib join -- confirm.
file_path: str = str(ROOT_PATH / "/data/sql/chinook.db")
# NOTE(review): `flat_file_path` is never read; the flat-file branch below uses
# `file_path`. Confirm which path that branch is meant to check.
flat_file_path: str = str(ROOT_PATH / "/data/sql/chinook.db")

if app_functionality == "chat":
    if chat_type == ChatType.qa_with_stored_sql_db.value:
        # if os.path.exists(config.QA_and_RAG.chinook_db_path):
        if os.path.exists(f"../{file_path}"):  # for jupyter notebook
            response = _qa_sql_agent(
                db_path=file_path, config=config, message=message, llm=llm
            )
        else:
            # Record the failure in the chat history instead of raising
            chatbot.append(
                (
                    message,
                    "SQL DB does not exist. Please first create the `chinook` db",
                )
            )
    # elif chat_type == "qa-with-flat-file-sql-db":
    elif chat_type == ChatType.qa_with_uploaded_flat_file_sql_db.value:
        if os.path.exists(f"../{file_path}"):  # for jupyter notebook
            db = SQLDatabase.from_uri(f"sqlite:///../{file_path}")
            print(db.dialect)
        else:
            # NOTE(review): the wording "SQL DB from does not exist" looks truncated.
            chatbot.append(
                (
                    message,
                    "SQL DB from does not exist. Please first create the `chinook` db",
                )
            )

In [60]:
def my_func(db_path: str, llm: Any) -> tuple[Any, ...]:
    """Open the SQLite database for ``db_path`` and return it alongside ``llm``.

    Parameters
    ----------
    db_path : str
        Path appended to ``sqlite:///../`` to form the database URI.
    llm : Any
        Passed through unchanged.

    Returns
    -------
    tuple[Any, ...]
        ``(database, llm)``; the database dialect is printed as a side effect.
    """
    sqlite_uri = f"sqlite:///../{db_path}"  # for jupyter notebook
    database = SQLDatabase.from_uri(sqlite_uri)
    print(database.dialect)
    return (database, llm)

In [None]:
db_path: str = "/data/flat_files/stored_data.db"
# Alternative question (previously assigned and then immediately overwritten):
# message: str = "How many people survived in the Titanic?"
message: str = "What is the average blood pressure?"

_qa_sql_agent(db_path=db_path, config=config, message=message, llm=llm)

In [None]:
db_path: str = "/data/flat_files/stored_data.db"
# message: str = "How many people survived in the Titanic?"
# NOTE(review): `message` is not assigned in this cell; it is whatever an earlier
# cell last bound.
# Rebinds the notebook-level `db` and `llm` (the `llm` object is returned unchanged).
db, llm = my_func(db_path=db_path, llm=llm)

# Build a SQL agent over `db` and keep only the "output" entry of its reply.
agent_executor = create_sql_agent(
    llm=llm, db=db, agent_type="openai-tools", verbose=False
)
response = agent_executor.invoke(message)["output"]
response

In [None]:
from langchain import hub
from langchain_core.prompts.chat import ChatPromptTemplate


# Pull the query-writing prompt from the LangChain hub; `write_query` below reads
# this notebook-level name.
query_prompt_template: ChatPromptTemplate = hub.pull(
    "langchain-ai/sql-query-system-prompt"
)
# console.print(query_prompt_template.messages[0])
# Show the first message of the template
query_prompt_template.messages[0].pretty_print()

In [128]:
from typing import Annotated, TypedDict


# Chat model used by `write_query` and `generate_answer` below; it is built with
# the same arguments as the earlier `llm`.
chat_llm: ChatOpenAI = ChatOpenAI(
    model="gpt-4o-mini", api_key=settings.OPENAI_API_KEY, temperature=0
)


class State(TypedDict):
    """Keys handed between the query-writing, execution and answer steps."""

    question: str  # read by write_query and generate_answer
    query: str  # produced by write_query, read by execute_query and generate_answer
    result: str  # produced by execute_query, read by generate_answer
    answer: str  # produced by generate_answer


# Schema handed to `chat_llm.with_structured_output` in `write_query`.
# NOTE(review): the docstring and the Annotated description are presumably part of
# what the model sees, so they are left exactly as written -- confirm before rewording.
class QueryOutput(TypedDict):
    """Generated SQL query."""

    query: Annotated[str, ..., "Syntactically valid SQL query."]


def write_query(state: State):
    """Ask the LLM for a SQL query that answers ``state["question"]``.

    Reads the notebook-level ``db``, ``query_prompt_template`` and ``chat_llm``.
    Returns a dict with the single key ``"query"``.
    """
    prompt_inputs = {
        "dialect": db.dialect,
        "top_k": 10,
        "table_info": db.get_table_info(),
        "input": state["question"],
    }
    rendered_prompt = query_prompt_template.invoke(prompt_inputs)

    # Constrain the model's reply to the QueryOutput schema
    query_writer = chat_llm.with_structured_output(QueryOutput)
    generated = query_writer.invoke(rendered_prompt)
    return {"query": generated["query"]}


def execute_query(state: State):
    """Run ``state["query"]`` against the notebook-level ``db``.

    Returns a dict with the single key ``"result"``.
    """
    sql_runner = QuerySQLDatabaseTool(db=db)
    query_output = sql_runner.invoke(state["query"])
    return {"result": query_output}


def generate_answer(state: State):
    """Have ``chat_llm`` answer the question from the query and its result.

    Returns a dict with the single key ``"answer"``.
    """
    instruction = (
        "Given the following user question, corresponding SQL query, "
        "and SQL result, answer the user question."
    )
    context_lines = [
        f'Question: {state["question"]}',
        f'SQL Query: {state["query"]}',
        f'SQL Result: {state["result"]}',
    ]
    # Blank line after the instruction, then one context item per line
    prompt = instruction + "\n\n" + "\n".join(context_lines)
    reply = chat_llm.invoke(prompt)
    return {"answer": reply.content}

In [None]:
# Drive the three steps by hand: write the SQL, execute it, then phrase the answer.
# Only "question" is supplied up front; the other State keys are filled in below.
state_data: State = State(
    {"question": "How many men were there in the Titanic that survived?"}
)
# NOTE(review): rebinds the notebook-level `query` used by the earlier chinook cells.
query: str = write_query(state_data)["query"]
state_data["query"] = query

# Annotated as str to agree with State["result"].
query_result: str = execute_query(state_data)["result"]
state_data["result"] = query_result

generate_answer(state=state_data)

In [None]:
# TODO:
# 1. Remove the Titanic dataset.
# 2. Update the sql agent using the latest version.
# 3. Create the Python scripts/modules.