# Data Exploration


In [1]:
# Built-in library
from pathlib import Path
import re
import json
from typing import Any, Literal, Optional, Union
import logging
import warnings

# Standard imports
import numpy as np
import numpy.typing as npt
from pprint import pprint
import pandas as pd
import polars as pl
from rich.console import Console
from rich.theme import Theme

custom_theme = Theme(
    {
        "white": "#FFFFFF",  # Bright white
        "info": "#00FF00",  # Bright green
        "warning": "#FFD700",  # Bright gold
        "error": "#FF1493",  # Deep pink
        "success": "#00FFFF",  # Cyan
        "highlight": "#FF4500",  # Orange-red
    }
)
console = Console(theme=custom_theme)

# Visualization
# import matplotlib.pyplot as plt

# NumPy settings
np.set_printoptions(precision=4)

# Pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

# Polars settings
pl.Config.set_fmt_str_lengths(1_000)
pl.Config.set_tbl_cols(n=1_000)

warnings.filterwarnings("ignore")

# Black code formatter (Optional)
%load_ext lab_black

# auto reload imports
%load_ext autoreload
%autoreload 2

In [2]:
def go_up_from_current_directory(*, go_up: int = 1) -> None:
    """This is used to up a number of directories.

    Params:
    -------
    go_up: int, default=1
        This indicates the number of times to go back up from the current directory.

    Returns:
    --------
    None
    """
    import os
    import sys

    CONST: str = "../"
    NUM: str = CONST * go_up

    # Goto the previous directory
    prev_directory = os.path.join(os.path.dirname(__name__), NUM)
    # Get the 'absolute path' of the previous directory
    abs_path_prev_directory = os.path.abspath(prev_directory)

    # Add the path to the System paths
    sys.path.insert(0, abs_path_prev_directory)
    print(abs_path_prev_directory)

### SQL Database(s)

In [19]:
go_up_from_current_directory(go_up=1)

from src.db_utils import SQLFromTabularData
from QA_and_RAG import ROOT_PATH

/Users/neidu/Desktop/Projects/Personal/My_Projects/Gen-AI-Projects/QA_and_RAG


In [20]:
go_up_from_current_directory(go_up=2)

from config import settings

/Users/neidu/Desktop/Projects/Personal/My_Projects/Gen-AI-Projects


In [34]:
from sqlalchemy import create_engine


db_path: str = "../data/sql/chinook.db"
conn = create_engine(f"sqlite:///{db_path}")
query: str = "SELECT name FROM sqlite_master WHERE type='table';"
pl.read_database(query=query, connection=conn.connect()).to_series().to_list()

['Album',
 'Artist',
 'Customer',
 'Employee',
 'Genre',
 'Invoice',
 'InvoiceLine',
 'MediaType',
 'Playlist',
 'PlaylistTrack',
 'Track']

In [35]:
query = "SELECT * FROM Track LIMIT 10"

pl.read_database(query=query, connection=conn.connect())

TrackId,Name,AlbumId,MediaTypeId,GenreId,Composer,Milliseconds,Bytes,UnitPrice
i64,str,i64,i64,i64,str,i64,i64,f64
1,"""For Those About To Rock (We Salute You)""",1,1,1,"""Angus Young, Malcolm Young, Brian Johnson""",343719,11170334,0.99
2,"""Balls to the Wall""",2,2,1,"""U. Dirkschneider, W. Hoffmann, H. Frank, P. Baltes, S. Kaufmann, G. Hoffmann""",342562,5510424,0.99
3,"""Fast As a Shark""",3,2,1,"""F. Baltes, S. Kaufman, U. Dirkscneider & W. Hoffman""",230619,3990994,0.99
4,"""Restless and Wild""",3,2,1,"""F. Baltes, R.A. Smith-Diesel, S. Kaufman, U. Dirkscneider & W. Hoffman""",252051,4331779,0.99
5,"""Princess of the Dawn""",3,2,1,"""Deaffy & R.A. Smith-Diesel""",375418,6290521,0.99
6,"""Put The Finger On You""",1,1,1,"""Angus Young, Malcolm Young, Brian Johnson""",205662,6713451,0.99
7,"""Let's Get It Up""",1,1,1,"""Angus Young, Malcolm Young, Brian Johnson""",233926,7636561,0.99
8,"""Inject The Venom""",1,1,1,"""Angus Young, Malcolm Young, Brian Johnson""",210834,6852860,0.99
9,"""Snowballed""",1,1,1,"""Angus Young, Malcolm Young, Brian Johnson""",203102,6599424,0.99
10,"""Evil Walks""",1,1,1,"""Angus Young, Malcolm Young, Brian Johnson""",263497,8611245,0.99


In [8]:
query = "SELECT * FROM Artist LIMIT 10"

pl.read_database(query=query, connection=conn.connect())

ArtistId,Name
i64,str
1,"""AC/DC"""
2,"""Accept"""
3,"""Aerosmith"""
4,"""Alanis Morissette"""
5,"""Alice In Chains"""
6,"""Antônio Carlos Jobim"""
7,"""Apocalyptica"""
8,"""Audioslave"""
9,"""BackBeat"""
10,"""Billy Cobham"""


In [9]:
fp: str = "../data/flat_files/breast-cancer.csv"
df: pl.DataFrame = pl.read_csv(fp)

df.head()

id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,radius_se,texture_se,perimeter_se,area_se,smoothness_se,compactness_se,concavity_se,concave points_se,symmetry_se,fractal_dimension_se,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
i64,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
842302,"""M""",17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
842517,"""M""",20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
84300903,"""M""",19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.00615,0.04006,0.03832,0.02058,0.0225,0.004571,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
84348301,"""M""",11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,0.4956,1.156,3.445,27.23,0.00911,0.07458,0.05661,0.01867,0.05963,0.009208,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
84358402,"""M""",20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,0.7572,0.7813,5.438,94.44,0.01149,0.02461,0.05688,0.01885,0.01756,0.005115,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [11]:
# Requires Pyarrow
try:
    full_db_path: str = "sqlite:///test.db"
    df.head().write_database(table_name="sample_table", connection=full_db_path)

except Exception as e:
    print(e)

Table 'sample_table' already exists.


In [12]:
from sqlalchemy import inspect, Inspector
from pydantic import BaseModel, computed_field, Field, PrivateAttr

In [13]:
conn = create_engine(full_db_path)
insp: Inspector = inspect(conn)

insp.get_table_names()

['diabetees', 'sample_table']

In [14]:
import os
import langchain
from langchain.chains import create_sql_query_chain
from langchain_community.utilities import SQLDatabase
from langchain_community.tools.sql_database.tool import QuerySQLDatabaseTool
from langchain_community.agent_toolkits import create_sql_agent
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough


langchain.debug = True

In [52]:
from omegaconf import OmegaConf, DictConfig


config: DictConfig = OmegaConf.load("../../config/config.yaml")

print(OmegaConf.to_yaml(config, resolve=True))

QA_and_RAG:
  path: QA_and_RAG
  chinook_db_path: sqlite:///QA_and_RAG/data/sql/chinook.db
  sql_agent_prompt: 'Given the following user question, corresponding SQL query, and
    SQL query result, answer the user question.

    Question: {question}

    SQL Query: {query}

    SQL Result: {result}

    Answer:

    '



In [53]:
file_path: str = str(ROOT_PATH / "/data/sql/chinook.db")
file_path

# ROOT_PATH

'/data/sql/chinook.db'

In [61]:
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4o-mini", api_key=settings.OPENAI_API_KEY, temperature=0)
llm

ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x1250e1a00>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x1250db800>, root_client=<openai.OpenAI object at 0x1250cb530>, root_async_client=<openai.AsyncOpenAI object at 0x1250e0c50>, model_name='gpt-4o-mini', temperature=0.0, model_kwargs={}, openai_api_key=SecretStr('**********'))

In [62]:
config.QA_and_RAG.sql_agent_prompt

'Given the following user question, corresponding SQL query, and SQL query result, answer the user question.\nQuestion: {question}\nSQL Query: {query}\nSQL Result: {result}\nAnswer:\n'

In [59]:
from operator import itemgetter


chat_type: Literal["qa-with-stored-sql-db", "qa-with-flat-file-sql-db"] = (
    "qa-with-stored-sql-db"
)
app_functionality: str = "chat"
message: str = "What is the album id of the track `Princess of the Dawn`?"
message: str = "How many employees are there"

if app_functionality == "chat":
    if chat_type == "qa-with-stored-sql-db":
        # if os.path.exists(config.QA_and_RAG.chinook_db_path):
        if os.path.exists(f"../{file_path}"):  # for jupyter notebook

            # db = SQLDatabase.from_uri(config.QA_and_RAG.chinook_db_path)
            db = SQLDatabase.from_uri(f"sqlite:///../{file_path}")
            exec_query = QuerySQLDatabaseTool(db=db)
            write_query = create_sql_query_chain(llm, db=db)
            answer_prompt = PromptTemplate.from_template(
                template=config.QA_and_RAG.sql_agent_prompt
            )
            answer = answer_prompt | llm | StrOutputParser()
            chain = (
                RunnablePassthrough.assign(query=write_query).assign(
                    result=itemgetter("query") | exec_query
                )
                | answer
            )
            response = chain.invoke({"question": message})

[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence] Entering Chain run with input:
[0m{
  "question": "How many employees are there"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence > chain:RunnableAssign<query>] Entering Chain run with input:
[0m{
  "question": "How many employees are there"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence > chain:RunnableAssign<query> > chain:RunnableParallel<query>] Entering Chain run with input:
[0m{
  "question": "How many employees are there"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence > chain:RunnableAssign<query> > chain:RunnableParallel<query> > chain:RunnableSequence] Entering Chain run with input:
[0m{
  "question": "How many employees are there"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence > chain:RunnableAssign<query> > chain:RunnableParallel<query> > chain:RunnableSequence > chain:RunnableAssign<input,table_info>] Entering Chain run with input:
[0m{
  "question": "How m

In [60]:
response

'It seems there was an error in executing the SQL query due to incorrect formatting. The SQL query should not include the markdown syntax or the "SQLQuery:" label. Here is the corrected SQL query:\n\n```sql\nSELECT COUNT("EmployeeId") AS "NumberOfEmployees" FROM "Employee";\n```\n\nTo answer the user question "How many employees are there?", the query needs to be executed correctly in a database environment that contains the "Employee" table. Since the query execution resulted in an error, I cannot provide the number of employees without the actual query result. Please ensure the query is executed correctly in the database to obtain the desired result.'

In [43]:
from langchain_core.prompts import ChatPromptTemplate

prompt = ChatPromptTemplate.from_template("tell me a joke about {topic}")

chain = prompt | llm | StrOutputParser()
chain.invoke({"topic": "bears"})

[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence] Entering Chain run with input:
[0m{
  "topic": "bears"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence > prompt:ChatPromptTemplate] Entering Prompt run with input:
[0m{
  "topic": "bears"
}
[36;1m[1;3m[chain/end][0m [1m[chain:RunnableSequence > prompt:ChatPromptTemplate] [1ms] Exiting Prompt run with output:
[0m[outputs]
[32;1m[1;3m[llm/start][0m [1m[chain:RunnableSequence > llm:ChatOpenAI] Entering LLM run with input:
[0m{
  "prompts": [
    "Human: tell me a joke about bears"
  ]
}
[36;1m[1;3m[llm/end][0m [1m[chain:RunnableSequence > llm:ChatOpenAI] [1.11s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "Why do bears have hairy coats?\n\nBecause they look silly in sweaters!",
        "generation_info": {
          "finish_reason": "stop",
          "logprobs": null
        },
        "type": "ChatGeneration",
        "message": {
          "lc": 1,
          "

'Why do bears have hairy coats?\n\nBecause they look silly in sweaters!'

In [39]:
# db = SQLDatabase.from_uri("sqlite:///Chinook.db")
# llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
chain = create_sql_query_chain(llm, db)
response = chain.invoke({"question": "How many employees are there"})
response

[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence] Entering Chain run with input:
[0m{
  "question": "How many employees are there"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence > chain:RunnableAssign<input,table_info>] Entering Chain run with input:
[0m{
  "question": "How many employees are there"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence > chain:RunnableAssign<input,table_info> > chain:RunnableParallel<input,table_info>] Entering Chain run with input:
[0m{
  "question": "How many employees are there"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence > chain:RunnableAssign<input,table_info> > chain:RunnableParallel<input,table_info> > chain:RunnableLambda] Entering Chain run with input:
[0m{
  "question": "How many employees are there"
}
[36;1m[1;3m[chain/end][0m [1m[chain:RunnableSequence > chain:RunnableAssign<input,table_info> > chain:RunnableParallel<input,table_info> > chain:RunnableLambda] [1ms] Exiting Chain run wit

'SQLQuery: SELECT COUNT("EmployeeId") AS "EmployeeCount" FROM "Employee"'

In [None]:
"""You are an agent designed to interact with a SQL database.
Given an input question, create a syntactically correct [33;1m[1;3m{dialect}[0m query to run, then look at the results of the query and return the answer.
Unless the user specifies a specific number of examples they wish to obtain, always limit your query to at most [33;1m[1;3m{top_k}[0m results.
You can order the results by a relevant column to return the most interesting examples in the database.
Never query for all the columns from a specific table, only ask for the relevant columns given the question.
You have access to tools for interacting with the database.
Only use the below tools. Only use the information returned by the below tools to construct your final answer.
You MUST double check your query before executing it. If you get an error while executing a query, rewrite the query and try again.

DO NOT make any DML statements (INSERT, UPDATE, DELETE, DROP etc.) to the database.

To start you should ALWAYS look at the tables in the database to see what you can query.
Do NOT skip this step.
Then you should query the schema of the most relevant tables.
"""