In [1]:
import json
import logging
import re
import warnings
from datetime import datetime
from pathlib import Path
from pprint import pprint
from typing import Annotated, Any, Generator, Literal, Type, TypeVar

# Standard imports
import numpy as np
import numpy.typing as npt
import pandas as pd
import polars as pl

# Visualization
# import matplotlib.pyplot as plt

# NumPy settings: print arrays with a precision of 4
np.set_printoptions(precision=4)

# Pandas settings: raise the display limits for rows, columns and cell width
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

# Polars settings: raise the display limits for string length, columns and rows
pl.Config.set_fmt_str_lengths(1_000)
pl.Config.set_tbl_cols(n=1_000)
pl.Config.set_tbl_rows(n=200)

# NOTE(review): no category or module filter is passed, so this appears to
# silence every warning for the rest of the kernel session (deprecations
# included) — consider narrowing it to the specific warnings that are noisy.
warnings.filterwarnings("ignore")

# Black code formatter (Optional)
# NOTE(review): presumably this line errors when the `lab_black` extension is
# not installed — confirm, or remove it when sharing the notebook.
%load_ext lab_black

# auto reload imports so edits to local modules are picked up without a restart
%load_ext autoreload
%autoreload 2

In [2]:
from rich.console import Console
from rich.theme import Theme

# Named styles for console output; each name maps to a hex colour.
custom_theme = Theme(
    styles=dict(
        white="#FFFFFF",  # Bright white
        info="#00FF00",  # Bright green
        warning="#FFD700",  # Bright gold
        error="#FF1493",  # Deep pink
        success="#00FFFF",  # Cyan
        highlight="#FF4500",  # Orange-red
    )
)
# Shared console used by the cells below for pretty-printing.
console = Console(theme=custom_theme)


def create_path(path: str | Path) -> None:
    """
    Create parent directories for the given path if they don't exist.

    Parameters
    ----------
    path : str | Path
        The file path for which to create parent directories.

    """
    Path(path).parent.mkdir(parents=True, exist_ok=True)


def go_up_from_current_directory(*, go_up: int = 1) -> None:
    """
    Put an ancestor of the current working directory at the front of ``sys.path``.

    The directory ``go_up`` levels above the current working directory is
    resolved to an absolute path, inserted at position 0 of ``sys.path`` and
    printed. Calling the function repeatedly (e.g. re-running the cell) keeps
    a single entry for that directory instead of stacking duplicates.

    Parameters
    ----------
    go_up : int, default=1
        Number of directory levels to climb from the current working
        directory. ``0`` selects the working directory itself.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``go_up`` is negative.
    """
    import os
    import sys

    if go_up < 0:
        raise ValueError(f"go_up must be >= 0, got {go_up}")

    # The previous implementation anchored on os.path.dirname(__name__), which
    # is always "" (a module name has no path separator), so the result was
    # only relative to the working directory by accident. Anchor explicitly.
    abs_path_prev_directory = os.path.abspath(
        os.path.join(os.getcwd(), *([os.pardir] * go_up))
    )

    # Drop earlier copies so the entry stays unique but still has top priority.
    sys.path[:] = [entry for entry in sys.path if entry != abs_path_prev_directory]
    sys.path.insert(0, abs_path_prev_directory)
    print(abs_path_prev_directory)

In [3]:
# Put the parent of the working directory on sys.path — presumably so the
# project packages imported in later cells (e.g. `schemas`) can be resolved.
go_up_from_current_directory(go_up=1)

/Users/mac/Desktop/MyProjects/batch-process


### db_models.py

```py
from contextlib import contextmanager
from datetime import datetime
from typing import Any, Generator, Type, TypeVar

from pydantic import BaseModel
from sqlalchemy import JSON, Float, String, Text, create_engine
from sqlalchemy.engine import Engine
from sqlalchemy.orm import DeclarativeBase, Mapped, Session, mapped_column

from config import app_config
from config.settings import refresh_settings

settings = refresh_settings()

# Resolve the connection URL for the configured environment.
DATABASE_URL: str
if settings.ENVIRONMENT == "test":
    DATABASE_URL = app_config.db.db_path
elif settings.ENVIRONMENT in ["dev", "prod"]:
    # NOTE(review): the credentials are interpolated without URL-escaping and
    # the host is fixed to localhost — a password containing characters such
    # as "@" or "/" would presumably yield an invalid URL; confirm.
    DATABASE_URL = (
        f"postgresql+psycopg2://{settings.POSTGRES_USER}:{settings.POSTGRES_PASSWORD.get_secret_value()}"
        f"@localhost:{settings.POSTGRES_PORT}/{settings.POSTGRES_DB}"
    )
else:
    # Fail here with a clear message; previously an unrecognised environment
    # left DATABASE_URL unbound and surfaced as a NameError at create_engine().
    raise ValueError(
        f"Unsupported ENVIRONMENT {settings.ENVIRONMENT!r}; expected 'test', 'dev' or 'prod'."
    )
print(f"Connected to {settings.ENVIRONMENT!r} environment database.")

# Module-wide engine shared by every session created in this file.
engine: Engine = create_engine(DATABASE_URL, echo=False)
# T: validation schema classes; D: ORM model classes derived from Base.
T = TypeVar("T", bound="BaseModel")
D = TypeVar("D", bound="Base")


class Base(DeclarativeBase):
    """Declarative base class shared by every ORM model in this module."""


class NERResult(Base):
    """ORM model for a stored Named Entity Recognition (NER) result."""

    __tablename__: str = "ner_results"

    # Primary key.
    id: Mapped[int] = mapped_column(primary_key=True)
    # Status label (String(50)).
    status: Mapped[str] = mapped_column(String(50))
    # JSON payload of the result.
    data: Mapped[dict[str, Any]] = mapped_column(JSON)
    # Both columns default to `datetime.now`; `created_at` uses the column name "createdAt".
    # NOTE(review): annotated as str while the default produces a datetime — confirm the intended type.
    timestamp: Mapped[str | None] = mapped_column("timestamp", default=datetime.now)
    created_at: Mapped[str | None] = mapped_column("createdAt", default=datetime.now)

    def __repr__(self) -> str:
        """
        Build a debug representation showing id, status and data.

        Returns
        -------
        str
        """
        cls_name = type(self).__name__
        return f"{cls_name}(id={self.id!r}, status={self.status!r}, data={self.data!r})"

    def output_fields(self) -> list[str]:
        """Return the attribute names exposed when a record is serialised."""
        fields = ("id", "status", "data", "timestamp", "created_at")
        return list(fields)


class TaskResult(Base):
    """ORM model for the outcome of a task."""

    __tablename__: str = "task_results"

    # Primary key.
    id: Mapped[int] = mapped_column(primary_key=True)
    # Task identifier: unique and indexed.
    task_id: Mapped[str] = mapped_column(String(50), unique=True, index=True)
    task_name: Mapped[str] = mapped_column(String(50), index=True)
    # Defaults to "pending".
    status: Mapped[str] = mapped_column(String(20), default="pending")
    # NOTE(review): `result` and `error_message` declare no default and are not
    # marked nullable — presumably both must always be supplied; confirm.
    result: Mapped[dict[str, Any]] = mapped_column(JSON)
    error_message: Mapped[str] = mapped_column(Text)
    # Defaults to `datetime.now`; column name is "createdAt".
    created_at: Mapped[str | None] = mapped_column("createdAt", default=datetime.now)
    # Nullable; column name is "completedAt".
    completed_at: Mapped[str] = mapped_column("completedAt", nullable=True)

    def __repr__(self) -> str:
        """
        Build a debug representation showing task_id, task_name and status.

        Returns
        -------
        str
        """
        cls_name = type(self).__name__
        identity = f"task_id={self.task_id!r}, task_name={self.task_name!r}, "
        return f"{cls_name}({identity}status={self.status!r})"

    def output_fields(self) -> list[str]:
        """Return the attribute names exposed when a record is serialised."""
        fields = (
            "id",
            "task_id",
            "task_name",
            "status",
            "result",
            "error_message",
            "created_at",
            "completed_at",
        )
        return list(fields)


class EmailLog(Base):
    """ORM model for a logged e-mail."""

    __tablename__: str = "email_logs"

    # Primary key.
    id: Mapped[int] = mapped_column(primary_key=True)
    # Indexed recipient address (String(50)).
    recipient: Mapped[str] = mapped_column(String(50), index=True)
    subject: Mapped[str] = mapped_column(String(100))
    body: Mapped[str] = mapped_column(Text)
    # Defaults to "pending".
    status: Mapped[str] = mapped_column(String(20), default="pending")
    # Defaults to `datetime.now`; column name is "createdAt".
    created_at: Mapped[str | None] = mapped_column("createdAt", default=datetime.now)
    # Nullable; column name is "sentAt".
    sent_at: Mapped[str] = mapped_column("sentAt", nullable=True)

    def __repr__(self) -> str:
        """
        Build a debug representation showing recipient, subject and status.

        Returns
        -------
        str
        """
        cls_name = type(self).__name__
        identity = f"recipient={self.recipient!r}, subject={self.subject!r}, "
        return f"{cls_name}({identity}status={self.status!r})"

    def output_fields(self) -> list[str]:
        """Return the attribute names exposed when a record is serialised.

        `body` is not part of the list.
        """
        fields = ("id", "recipient", "subject", "status", "sent_at", "created_at")
        return list(fields)


class DataProcessingJob(Base):
    """ORM model for a data processing job."""

    __tablename__: str = "data_processing_jobs"

    # Primary key.
    id: Mapped[int] = mapped_column(primary_key=True)
    # Indexed job name (String(50)).
    job_name: Mapped[str] = mapped_column(String(50), index=True)
    input_data: Mapped[str] = mapped_column(Text)
    output_data: Mapped[str] = mapped_column(Text)
    # NOTE(review): unit of `processing_time` is not stated here — confirm.
    processing_time: Mapped[float] = mapped_column(Float)
    # Defaults to "pending".
    status: Mapped[str] = mapped_column(String(20), default="pending")
    # Defaults to `datetime.now`; column name is "createdAt".
    created_at: Mapped[str | None] = mapped_column("createdAt", default=datetime.now)
    # Nullable; column name is "completedAt".
    completed_at: Mapped[str] = mapped_column("completedAt", nullable=True)

    def __repr__(self) -> str:
        """
        Build a debug representation showing job_name, created_at and status.

        Returns
        -------
        str
        """
        cls_name = type(self).__name__
        identity = f"job_name={self.job_name!r}, created_at={self.created_at!r}, "
        return f"{cls_name}({identity}status={self.status!r})"

    def output_fields(self) -> list[str]:
        """Return the attribute names exposed when a record is serialised."""
        fields = (
            "id",
            "job_name",
            "input_data",
            "output_data",
            "processing_time",
            "status",
            "created_at",
            "completed_at",
        )
        return list(fields)


@contextmanager
def get_db_session() -> Generator[Session, None, None]:
    """
    Provide a transactional database session as a context manager.

    A session bound to the module-level engine is opened and handed to the
    caller. When the caller's block finishes normally the work is committed;
    when an exception escapes (from the block or from the commit itself) the
    work is rolled back and the exception is re-raised. The session is closed
    in every case.

    Yields
    ------
    Session
        An active SQLAlchemy database session.

    Raises
    ------
    Exception
        Any exception that occurs during database operations.
    """
    # Leaving the `with` block closes the session, on success and on error.
    with Session(engine) as db_session:
        try:
            yield db_session
            db_session.commit()
        except Exception:
            db_session.rollback()
            raise


def add_record_to_db(data: dict[str, Any], schema: Type[T], data_model: Type[D]) -> dict[str, Any]:
    """
    Validate ``data`` with ``schema`` and store it as one ``data_model`` row.

    Parameters
    ----------
    data : dict[str, Any]
        Dictionary containing the data to be added to the database.
    schema : Type[T]
        Schema class used for validation; its instances must provide
        ``to_data_model_dict()``.
    data_model : Type[D]
        Database model class where the record will be stored; its instances
        must provide ``output_fields()``.

    Returns
    -------
    dict[str, Any]
        The fields listed by ``record.output_fields()`` after the record has
        been flushed. An empty dictionary is returned when ``data`` is not a
        dictionary; nothing is written in that case.

    Raises
    ------
    Exception
        Errors raised by schema validation or by the database are not
        swallowed; the session is rolled back by ``get_db_session``.
    """
    # Guard first: a non-dict payload used to skip validation and then crash
    # with UnboundLocalError, leaving the documented `{}` fallback unreachable.
    if not isinstance(data, dict):
        return {}

    data_dict: dict[str, Any] = schema(**data).to_data_model_dict()  # type: ignore
    with get_db_session() as db:
        record = data_model(**data_dict)
        db.add(record)
        # Flush before reading so values assigned on INSERT are available.
        db.flush()

        # Read the values while the session is still open.
        return {key: getattr(record, key) for key in record.output_fields()}  # type: ignore


def bulk_insert_records(data: list[dict[str, Any]], schema: Type[T], data_model: Type[D]) -> None:
    """
    Validate every row with ``schema`` and bulk insert them as ``data_model`` rows.

    Parameters
    ----------
    data : list[dict[str, Any]]
        List of dictionaries containing the data to be added to the database.
        An empty list is a no-op.
    schema : Type[T]
        Schema class used for validation; its instances must provide
        ``to_data_model_dict()``.
    data_model : Type[D]
        Database model class where the records will be stored.

    Returns
    -------
    None

    Raises
    ------
    TypeError
        If ``data`` is not a list.
    """
    # A non-list payload used to fall through and fail with UnboundLocalError;
    # since this function returns nothing, raise so the caller notices.
    if not isinstance(data, list):
        raise TypeError(f"data must be a list of dictionaries, got {type(data).__name__}")

    # Nothing to insert: skip opening a session and transaction altogether.
    if not data:
        return

    # Validate all rows up front so a bad row fails before anything is written.
    data_list: list[dict[str, Any]] = [schema(**row).to_data_model_dict() for row in data]  # type: ignore

    with get_db_session() as db:
        db.bulk_insert_mappings(data_model, data_list)  # type: ignore


def init_db() -> None:
    """
    Create the tables for every model registered on ``Base``.

    The module-level engine is used as the target database.

    Returns
    -------
    None
    """
    Base.metadata.create_all(bind=engine)

```

In [4]:
from sqlalchemy import delete, insert, select, update

from schemas import EmailSchema
from schemas.db_models import EmailLog, get_db_session, init_db

Connected to 'test' environment database.


In [5]:
# Create the tables for the ORM models before running any statements below.
init_db()

## [Docs](https://docs.sqlalchemy.org/en/20/orm/queryguide/select.html)

### [Insert](https://docs.sqlalchemy.org/en/20/orm/queryguide/dml.html#orm-bulk-insert-statements)

- Old API (unit-of-work style with `session.add`; still fully supported in SQLAlchemy 2.0)

```python
with get_db_session() as session:
    data_dict = input_data.to_data_model_dict()
    record = EmailLog(**data_dict)
    session.add(record)
    session.flush()
    output_data = {key: getattr(record, key) for key in record.output_fields()}
```

<br>

- New API (SQLAlchemy 2.0 style: `session.execute(insert(...), [rows])`)

```py
with get_db_session() as session:
    data_dict = input_data.to_data_model_dict()
    session.execute(insert(EmailLog), [data_dict])
```

In [6]:
# Sample e-mail used by the single-record insert in the next cell.
input_data: EmailSchema = EmailSchema(
    recipient="marketing@client.com",
    subject="Partnership Proposal",
    body="We would like to discuss a potential partnership opportunity.",
    status="pending",
)
console.print(input_data)

In [7]:
# Insert one record through the ORM and read its fields back.
with get_db_session() as session:
    data_dict = input_data.to_data_model_dict()
    record = EmailLog(**data_dict)
    session.add(record)
    # Flush so the INSERT is emitted before the fields are read.
    session.flush()
    # Collect the values while the session is still open.
    output_data = dict(
        (key, getattr(record, key)) for key in record.output_fields()
    )


console.print(output_data)

In [8]:
# Three sample e-mails used by the bulk insert further below.
input_data_2: EmailSchema = EmailSchema(
    recipient="emeka2@example.com",
    subject="test!!!",
    body="this is an example body",
    status="pending",
)
input_data_3: EmailSchema = EmailSchema(
    recipient="john.doe@example.com",
    subject="Meeting Reminder",
    body="Hi John, just a reminder about our meeting tomorrow at 10 AM.",
    status="pending",
)
# This sample also sets the timestamps explicitly.
# NOTE(review): `created_at` is given as a datetime and `sent_at` as an ISO
# string — presumably EmailSchema normalises both; confirm.
input_data_4: EmailSchema = EmailSchema(
    recipient="info@company.org",
    subject="New Product Launch",
    body="Dear valued customer, check out our exciting new product!",
    status="sent",
    created_at=datetime(2025, 7, 10, 9, 0, 0),
    sent_at="2025-07-10T09:05:00",
)
console.print((input_data_2, input_data_3, input_data_4))

### [Bulk Insert](https://docs.sqlalchemy.org/en/20/orm/queryguide/dml.html#orm-bulk-insert-statements)

- Old API (`Session.bulk_insert_mappings`, a legacy method in SQLAlchemy 2.0)

```py
with get_db_session() as session:
    data_list: list[dict[str, Any]] = [_data.to_data_model_dict() for _data in (input_data_2, input_data_3, input_data_4)]
    session.bulk_insert_mappings(EmailLog, data_list)
```

<br>

- New API

```py
with get_db_session() as session:
    data_list: list[dict[str, Any]] = [
        _data.to_data_model_dict()
        for _data in (input_data_2, input_data_3, input_data_4)
    ]
    session.execute(insert(EmailLog), data_list)
```

In [9]:
# Bulk insert: one insert() statement executed with a list of row dictionaries.
with get_db_session() as session:
    data_list: list[dict[str, Any]] = [
        input_data_2.to_data_model_dict(),
        input_data_3.to_data_model_dict(),
        input_data_4.to_data_model_dict(),
    ]
    session.execute(insert(EmailLog), data_list)

### Select

In [None]:
# Select a single record
with get_db_session() as session:
    statement = select(EmailLog).where(EmailLog.id == 1, EmailLog.status == "pending")
    # scalar_one() raises NoResultFound when nothing matches, which happens on
    # any re-run after the update cell below has set record 1 to "sent".
    # scalar_one_or_none() keeps the cell re-runnable and yields None instead.
    record = session.execute(statement).scalar_one_or_none()
    # Read the fields while the session is open; empty dict when no row matched.
    output_data = (
        {key: getattr(record, key) for key in record.output_fields()}
        if record is not None
        else {}
    )


console.print(output_data)

In [11]:
# Fetch every EmailLog row and convert each one to a plain dictionary.
with get_db_session() as session:
    statement = select(EmailLog)
    # `record` is the iterable of ORM objects returned for the statement.
    record = session.scalars(statement)

    output_data = [
        dict((key, getattr(row, key)) for key in row.output_fields())
        for row in record
    ]

console.print(output_data)

### [Update](https://docs.sqlalchemy.org/en/20/orm/queryguide/dml.html#orm-update-and-delete-with-custom-where-criteria)

In [12]:
# Mark record 1 as sent and stamp the send time.
with get_db_session() as session:
    statement = (
        update(EmailLog)
        .where(EmailLog.id == 1)
        .values(status="sent", sent_at=datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    )
    # The return value is not used; the commit and the close of the session
    # happen when the get_db_session() block exits.
    session.execute(statement)

# Verify the update by listing all records
with get_db_session() as session:
    statement = select(EmailLog)
    record = session.scalars(statement)

    output_data = [
        dict((key, getattr(row, key)) for key in row.output_fields())
        for row in record
    ]

console.print(output_data)

### [Delete](https://docs.sqlalchemy.org/en/20/orm/queryguide/dml.html#orm-update-and-delete-with-custom-where-criteria)

In [13]:
# Delete record 2.
with get_db_session() as session:
    statement = delete(EmailLog).where(EmailLog.id == 2)
    # The return value is not used; the commit and the close of the session
    # happen when the get_db_session() block exits.
    session.execute(statement)

# Verify the deletion by listing the remaining records
with get_db_session() as session:
    statement = select(EmailLog)
    record = session.scalars(statement)

    output_data = [
        dict((key, getattr(row, key)) for key in row.output_fields())
        for row in record
    ]

console.print(output_data)