<!-- # Data Exploration -->


In [1]:
# Built-in library
from pathlib import Path
import re
import json
from typing import Any, Literal, Optional, Union
import logging
import warnings

# Standard imports
import numpy as np
import numpy.typing as npt
from pprint import pprint
import pandas as pd
import polars as pl
from rich.console import Console
from rich.theme import Theme

# Named colour styles available to the rich console created below.
custom_theme = Theme(
    styles=dict(
        white="#FFFFFF",  # Bright white
        info="#00FF00",  # Bright green
        warning="#FFD700",  # Bright gold
        error="#FF1493",  # Deep pink
        success="#00FFFF",  # Cyan
        highlight="#FF4500",  # Orange-red
    )
)
console = Console(theme=custom_theme)

# Visualization
# import matplotlib.pyplot as plt

# NumPy: print floats with 4 digits of precision
np.set_printoptions(precision=4)

# Pandas: widen the display limits for rows, columns and cell text
pd.set_option("display.max_rows", 1_000)
pd.set_option("display.max_columns", 1_000)
pd.set_option("display.max_colwidth", 600)

# Polars: widen the string length and column-count display limits
pl.Config.set_fmt_str_lengths(n=1_000)
pl.Config.set_tbl_cols(1_000)

# Silence every warning for the rest of the session
warnings.filterwarnings(action="ignore")

# Black code formatter (optional): loads the `lab_black` IPython extension.
# NOTE(review): presumably this line fails when `lab_black` is not installed,
# so remove it in environments without the package — confirm.
%load_ext lab_black

# Load the autoreload extension and switch it to mode 2, so that edits to
# imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
def go_up_from_current_directory(*, go_up: int = 1) -> None:
    """Put an ancestor of the current working directory first on ``sys.path``.

    The target directory is found by climbing ``go_up`` levels from the
    current working directory. Its absolute path is placed at index 0 of
    ``sys.path`` and then printed. Any earlier occurrence of the same path is
    removed first, so re-running a cell does not accumulate duplicates.

    Params:
    -------
    go_up: int, default=1
        Number of directory levels to climb from the current working
        directory. ``0`` selects the working directory itself.

    Returns:
    --------
    None

    Raises:
    -------
    ValueError
        If ``go_up`` is negative.
    """
    import os
    import sys

    if go_up < 0:
        raise ValueError(f"go_up must be >= 0, got {go_up}")

    # Resolve relative to the working directory explicitly. The previous
    # `os.path.dirname(__name__)` always evaluated to "" (a module name holds
    # no path separator), so the result was implicitly cwd-relative anyway.
    target_dir: str = os.path.abspath(
        os.path.join(os.getcwd(), *([os.pardir] * go_up))
    )

    # Drop stale copies in place (same list object), then give the path top
    # priority, exactly as a fresh insert at position 0 would.
    sys.path[:] = [entry for entry in sys.path if entry != target_dir]
    sys.path.insert(0, target_dir)
    print(target_dir)

In [3]:
# Put the directory two levels above the working directory at the front of
# sys.path. This must run before the imports below; presumably they rely on
# that directory being importable — confirm against the project layout.
go_up_from_current_directory(go_up=2)

from QA_and_RAG import PACKAGE_ROOT_PATH
from QA_and_RAG.src.utils.utilities import ProcessFiles
from config import config, settings

/Users/neidu/Desktop/Projects/Personal/My_Projects/Gen-AI-Projects


<!-- ### SQL Database(s) -->

In [4]:
# Put the parent of the working directory at the front of sys.path. This must
# run before the imports below; presumably the `src` package lives in that
# directory — confirm against the project layout.
go_up_from_current_directory(go_up=1)


from src.db_utils import SQLFromTabularData
from src.chatbot import Chatbot

/Users/neidu/Desktop/Projects/Personal/My_Projects/Gen-AI-Projects/QA_and_RAG


In [5]:
# Configure the CSV -> SQLite loader for the Titanic data and run it.
# NOTE(review): the recorded output suggests run() reports an error when the
# table already exists instead of overwriting it — confirm in SQLFromTabularData.
create_db: SQLFromTabularData = SQLFromTabularData(
    file_path="../data/flat_files/titanic-data.csv",
    db_path="../data/flat_files/stored_data.db",
    table_name="titanic",
)

create_db.run()

Error creating/updating the DB: Table 'titanic' already exists.
DB Path: sqlite:///../data/flat_files/stored_data.db
Available table Names: ['breast_cancer', 'diabetes', 'titanic']


In [6]:
# Read the Titanic passenger CSV and preview its first five rows.
titanic_data: pl.DataFrame = pl.read_csv(source="../data/flat_files/titanic-data.csv")
titanic_data.head(n=5)

pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
i64,i64,str,str,f64,i64,i64,str,f64,str,str,str,i64,str
1,1,"""Allen, Miss. Elisabeth Walton""","""female""",29.0,0,0,"""24160""",211.3375,"""B5""","""S""","""2""",,"""St Louis, MO"""
1,1,"""Allison, Master. Hudson Trevor""","""male""",0.9167,1,2,"""113781""",151.55,"""C22 C26""","""S""","""11""",,"""Montreal, PQ / Chesterville, ON"""
1,0,"""Allison, Miss. Helen Loraine""","""female""",2.0,1,2,"""113781""",151.55,"""C22 C26""","""S""",,,"""Montreal, PQ / Chesterville, ON"""
1,0,"""Allison, Mr. Hudson Joshua Creighton""","""male""",30.0,1,2,"""113781""",151.55,"""C22 C26""","""S""",,135.0,"""Montreal, PQ / Chesterville, ON"""
1,0,"""Allison, Mrs. Hudson J C (Bessie Waldo Daniels)""","""female""",25.0,1,2,"""113781""",151.55,"""C22 C26""","""S""",,,"""Montreal, PQ / Chesterville, ON"""


In [7]:
query: str = "How many men were in the Titanic?"

# Keep the rows whose `sex` is "male" and count them.
male_passengers = titanic_data.filter(pl.col("sex") == "male").select("name")
male_passengers.height

843

In [8]:
query: str = "How many women survived the Titanic?"

# Both conditions must hold: `sex` is "female" and `survived` equals 1.
is_female = pl.col("sex") == "female"
did_survive = pl.col("survived") == 1
titanic_data.filter(is_female & did_survive).select("name").height

339

### Diabetes Dataset

In [9]:
# Read the diabetes CSV and preview its first five rows.
diabetes_data: pl.DataFrame = pl.read_csv(source="../data/flat_files/diabetes.csv")
diabetes_data.head(n=5)

Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
i64,i64,i64,i64,i64,f64,f64,i64,i64
6,148,72,35,0,33.6,0.627,50,1
1,85,66,29,0,26.6,0.351,31,0
8,183,64,0,0,23.3,0.672,32,1
1,89,66,23,94,28.1,0.167,21,0
0,137,40,35,168,43.1,2.288,33,1


In [10]:
query: str = "What is the average age of people with diabetes?"

# Restrict to rows with Outcome == 1, then average their `Age` column.
diabetic_rows = diabetes_data.filter(pl.col("Outcome") == 1)
diabetic_rows.get_column("Age").mean()

37.06716417910448

In [11]:
# Show the distinct values of `Pregnancies`. unique() does not promise any
# particular order, so sort explicitly to keep the printed list reproducible.
print(diabetes_data["Pregnancies"].unique().sort().to_list())

query: str = "What is the average blood pressure of pregnant people?"
# Mean of `BloodPressure` over the rows where `Pregnancies` is at least 1.
# NOTE(review): confirm that "Pregnancies >= 1" is the intended reading of
# "pregnant people" for this dataset.
diabetes_data.filter(pl.col("Pregnancies").ge(1)).select(
    ["BloodPressure"]
).mean().to_series().to_list()[0]

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17]


69.43531202435312

In [12]:
from sqlalchemy import create_engine


db_path: str = "../data/sql/chinook.db"
# Despite its name, `conn` is the SQLAlchemy engine; connections are opened
# from it with conn.connect() wherever a query is run.
conn = create_engine(f"sqlite:///{db_path}")
query: str = "SELECT name FROM sqlite_master WHERE type='table';"

# Run the query on a short-lived connection. The context manager closes it on
# exit, instead of leaving an unclosed connection behind on every run.
with conn.connect() as connection:
    table_names = (
        pl.read_database(query=query, connection=connection).to_series().to_list()
    )
table_names

['Album',
 'Artist',
 'Customer',
 'Employee',
 'Genre',
 'Invoice',
 'InvoiceLine',
 'MediaType',
 'Playlist',
 'PlaylistTrack',
 'Track']

In [13]:
query = "SELECT * FROM Track LIMIT 10"

# Use a context manager so the connection is closed once the rows are read.
with conn.connect() as connection:
    track_preview = pl.read_database(query=query, connection=connection)
track_preview

TrackId,Name,AlbumId,MediaTypeId,GenreId,Composer,Milliseconds,Bytes,UnitPrice
i64,str,i64,i64,i64,str,i64,i64,f64
1,"""For Those About To Rock (We Salute You)""",1,1,1,"""Angus Young, Malcolm Young, Brian Johnson""",343719,11170334,0.99
2,"""Balls to the Wall""",2,2,1,"""U. Dirkschneider, W. Hoffmann, H. Frank, P. Baltes, S. Kaufmann, G. Hoffmann""",342562,5510424,0.99
3,"""Fast As a Shark""",3,2,1,"""F. Baltes, S. Kaufman, U. Dirkscneider & W. Hoffman""",230619,3990994,0.99
4,"""Restless and Wild""",3,2,1,"""F. Baltes, R.A. Smith-Diesel, S. Kaufman, U. Dirkscneider & W. Hoffman""",252051,4331779,0.99
5,"""Princess of the Dawn""",3,2,1,"""Deaffy & R.A. Smith-Diesel""",375418,6290521,0.99
6,"""Put The Finger On You""",1,1,1,"""Angus Young, Malcolm Young, Brian Johnson""",205662,6713451,0.99
7,"""Let's Get It Up""",1,1,1,"""Angus Young, Malcolm Young, Brian Johnson""",233926,7636561,0.99
8,"""Inject The Venom""",1,1,1,"""Angus Young, Malcolm Young, Brian Johnson""",210834,6852860,0.99
9,"""Snowballed""",1,1,1,"""Angus Young, Malcolm Young, Brian Johnson""",203102,6599424,0.99
10,"""Evil Walks""",1,1,1,"""Angus Young, Malcolm Young, Brian Johnson""",263497,8611245,0.99


In [14]:
query = "SELECT * FROM Artist LIMIT 10"

# Use a context manager so the connection is closed once the rows are read.
with conn.connect() as connection:
    artist_preview = pl.read_database(query=query, connection=connection)
artist_preview

ArtistId,Name
i64,str
1,"""AC/DC"""
2,"""Accept"""
3,"""Aerosmith"""
4,"""Alanis Morissette"""
5,"""Alice In Chains"""
6,"""Antônio Carlos Jobim"""
7,"""Apocalyptica"""
8,"""Audioslave"""
9,"""BackBeat"""
10,"""Billy Cobham"""


In [15]:
# Read the breast-cancer CSV and preview its first five rows.
fp: str = "../data/flat_files/breast-cancer.csv"
df: pl.DataFrame = pl.read_csv(source=fp)

df.head(n=5)

id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,radius_se,texture_se,perimeter_se,area_se,smoothness_se,compactness_se,concavity_se,concave points_se,symmetry_se,fractal_dimension_se,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
i64,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
842302,"""M""",17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
842517,"""M""",20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
84300903,"""M""",19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.00615,0.04006,0.03832,0.02058,0.0225,0.004571,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
84348301,"""M""",11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,0.4956,1.156,3.445,27.23,0.00911,0.07458,0.05661,0.01867,0.05963,0.009208,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
84358402,"""M""",20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,0.7572,0.7813,5.438,94.44,0.01149,0.02461,0.05688,0.01885,0.01756,0.005115,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678
