# Data Exploration


In [1]:
# Built-in library
from pathlib import Path
import re
import json
from typing import Any, Optional, Union
import logging
import warnings

# Third-party imports
import numpy as np
import numpy.typing as npt
from pprint import pprint
import pandas as pd
import polars as pl
from rich.console import Console
from rich.theme import Theme

# Colour palette for rich console output, keyed by style name
custom_theme = Theme(
    dict(
        white="#FFFFFF",  # Bright white
        info="#00FF00",  # Bright green
        warning="#FFD700",  # Bright gold
        error="#FF1493",  # Deep pink
        success="#00FFFF",  # Cyan
        highlight="#FF4500",  # Orange-red
    )
)
console = Console(theme=custom_theme)

# Visualization
# import matplotlib.pyplot as plt

# NumPy: print floating point values with a precision of 4
np.set_printoptions(precision=4)

# Pandas: raise the display limits for rows, columns and column width
pd.set_option("display.max_rows", 1_000)
pd.set_option("display.max_columns", 1_000)
pd.set_option("display.max_colwidth", 600)

# Polars: raise the display limits for string length and number of columns
pl.Config.set_fmt_str_lengths(1_000)
pl.Config.set_tbl_cols(n=1_000)

# Suppress all warnings from this point on
warnings.filterwarnings(action="ignore")

# Black code formatter (Optional); loaded through the lab_black IPython extension
%load_ext lab_black

# Auto reload imports: mode 2 reloads imported modules before each cell runs,
# so edits to local modules are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

In [2]:
def go_up_from_current_directory(*, go_up: int = 1) -> None:
    """Put an ancestor of the current working directory at the front of `sys.path`.

    The target directory is found by going up `go_up` levels from the current
    working directory. Its absolute path is moved to position 0 of `sys.path`
    (any earlier occurrence is removed first, so re-running a notebook cell
    does not accumulate duplicate entries) and then printed.

    Params:
    -------
    go_up: int, default=1
        This indicates the number of times to go back up from the current directory.
        Values of 0 or less resolve to the current working directory itself.

    Returns:
    --------
    None
    """
    import os
    import sys

    # Resolve against the working directory explicitly instead of relying on
    # `os.path.dirname(__name__)`, which only worked because it evaluates to "".
    levels: list[str] = [os.pardir] * max(go_up, 0)
    abs_path_prev_directory: str = os.path.abspath(
        os.path.join(os.getcwd(), *levels)
    )

    # Drop any existing occurrence so the path ends up exactly once, at the front
    sys.path[:] = [
        entry for entry in sys.path if entry != abs_path_prev_directory
    ]
    sys.path.insert(0, abs_path_prev_directory)
    print(abs_path_prev_directory)

### SQL Database(s)

In [3]:
from sqlalchemy import create_engine


db_path: str = "../data/chinook.db"
conn = create_engine(f"sqlite:///{db_path}")
query: str = "SELECT name FROM sqlite_master WHERE type='table';"

# Open the connection in a `with` block so it is closed once the query has
# been read, instead of leaving an unclosed connection behind on every run.
with conn.connect() as db_connection:
    table_names: list[str] = (
        pl.read_database(query=query, connection=db_connection)
        .to_series()
        .to_list()
    )

table_names

['Album',
 'Artist',
 'Customer',
 'Employee',
 'Genre',
 'Invoice',
 'InvoiceLine',
 'MediaType',
 'Playlist',
 'PlaylistTrack',
 'Track']

In [4]:
query = "SELECT * FROM Artist LIMIT 10"

# Use a context manager so the connection is closed after the query is read,
# rather than leaking a new open connection each time the cell runs.
with conn.connect() as db_connection:
    artist_sample: pl.DataFrame = pl.read_database(
        query=query, connection=db_connection
    )

artist_sample

ArtistId,Name
i64,str
1,"""AC/DC"""
2,"""Accept"""
3,"""Aerosmith"""
4,"""Alanis Morissette"""
5,"""Alice In Chains"""
6,"""Antônio Carlos Jobim"""
7,"""Apocalyptica"""
8,"""Audioslave"""
9,"""BackBeat"""
10,"""Billy Cobham"""


In [5]:
# Relative path of the breast-cancer CSV file
fp: str = "../data/flat_files/breast-cancer.csv"
df: pl.DataFrame = pl.read_csv(source=fp)

# Preview the first five rows
df.head(n=5)

id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,radius_se,texture_se,perimeter_se,area_se,smoothness_se,compactness_se,concavity_se,concave points_se,symmetry_se,fractal_dimension_se,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
i64,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
842302,"""M""",17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
842517,"""M""",20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
84300903,"""M""",19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.00615,0.04006,0.03832,0.02058,0.0225,0.004571,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
84348301,"""M""",11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,0.4956,1.156,3.445,27.23,0.00911,0.07458,0.05661,0.01867,0.05963,0.009208,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
84358402,"""M""",20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,0.7572,0.7813,5.438,94.44,0.01149,0.02461,0.05688,0.01885,0.01756,0.005115,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [12]:
# Requires Pyarrow
full_db_path: str = "sqlite:///test.db"

# Replace the table when it already exists: with the default ("fail") this
# cell raises "Table ... already exists" on every run after the first, which
# breaks Restart Kernel -> Run All once test.db is on disk.
df.head().write_database(
    table_name="sample_table",
    connection=full_db_path,
    if_table_exists="replace",
)

In [62]:
from sqlalchemy import inspect, Inspector
from pydantic import BaseModel, computed_field, Field, PrivateAttr

In [15]:
# Engine for the SQLite URL held in `full_db_path`, and an inspector bound to it
conn = create_engine(url=full_db_path)
insp: Inspector = inspect(conn)

# Names of the tables currently present in the database
insp.get_table_names()

['sample_table']

In [86]:
class SQLFromTabularData(BaseModel):
    """Load a CSV or Parquet file into a table of a SQLite database.

    Attributes:
    -----------
    file_path: str
        Path of the input file. Must end with `.csv` or `.parquet`.
    db_path: str
        Path of the SQLite database file. Must end with `.db`.
    table_name: str
        Name of the table the data is written to.
    if_table_exists: str, default="fail"
        One of "fail", "replace" or "append"; passed to polars'
        `write_database` to decide what happens when `table_name` already
        exists. The default keeps the previous behaviour (report an error).
    """

    # The patterns are anchored with `$`: an unanchored pattern also accepts
    # names such as "data.csv.bak", which would then be sent to `read_parquet`.
    file_path: str = Field(pattern=r"\.(csv|parquet)$")
    db_path: str = Field(pattern=r"\.db$")
    table_name: str
    if_table_exists: str = Field(default="fail", pattern=r"^(fail|replace|append)$")

    @computed_field
    @property
    def full_db_path(self) -> str:
        """SQLite connection URL built from `db_path`."""
        return f"sqlite:///{self.db_path}"

    def _read_data(self) -> pl.DataFrame | None:
        """Read `file_path` as CSV or Parquet.

        Returns:
        --------
        pl.DataFrame | None
            The loaded data, or None when reading failed (the error is printed).
        """
        try:
            df: pl.DataFrame = (
                pl.read_csv(self.file_path)
                if self.file_path.endswith(".csv")
                else pl.read_parquet(self.file_path)
            )
            return df
        except Exception as e:
            print(f"Error loading the data: {e}")
            return None

    def _create_connection(self) -> Any:
        """Create a SQLAlchemy engine for `full_db_path`."""
        return create_engine(self.full_db_path)

    def _create_database(self) -> None:
        """Write the loaded data to `table_name`; errors are printed, not raised."""
        data: pl.DataFrame | None = self._read_data()
        if data is None:
            # The load error was already printed by `_read_data`; stop here
            # instead of failing later with an unrelated 'NoneType' error.
            print(
                "Error creating/updating the DB: "
                f"no data could be loaded from {self.file_path}."
            )
            return None

        try:
            data.write_database(
                table_name=self.table_name,
                connection=self.full_db_path,
                if_table_exists=self.if_table_exists,
            )
            print(
                f"DB at {self.full_db_path} successfully "
                f"created/updated with {self.table_name} table."
            )

        except Exception as e:
            print(f"Error creating/updating the DB: {e}")

        return None

    def _validate_db(self) -> None:
        """Print the names of the tables found in the database."""
        conn = None
        try:
            conn = self._create_connection()
            insp: Inspector = inspect(conn)

            table_names: list[str] = insp.get_table_names()
            print(
                f"DB Path: {self.full_db_path}"
                "\n================================"
                f"\nAvailable table Names: {table_names}"
                "\n================================"
            )
        except Exception as e:
            print(f"Error validating the DB: {e}")
        finally:
            # Release the engine's pooled connections once validation is done
            if conn is not None:
                conn.dispose()

    def run(self) -> None:
        """Create/update the table, then print the tables present in the DB."""
        self._create_database()
        self._validate_db()

In [87]:
# Load the file at `fp` into table "diabetes" of test.db and list the DB's tables.
# NOTE(review): `fp` points at breast-cancer.csv, yet the table is named
# "diabetes" -- confirm the intended table name.
db_ = SQLFromTabularData(
    table_name="diabetes",
    db_path="test.db",
    file_path=fp,
)
db_.run()

Error creating/updating the DB: Table 'diabetees' already exists.
DB Path: sqlite:///test.db
Available table Names: ['diabetees']
