# Lab For Experimentation

In [4]:
import warnings
from collections.abc import Callable
from typing import Any

import numpy as np
import pandas as pd
import polars as pl
import polars.selectors as cs
from rich.console import Console
from rich.theme import Theme

# Named Rich styles: each key maps a style name to the hex colour it renders with.
custom_theme = Theme(
    {
        "white": "#FFFFFF",  # Bright white
        "info": "#00FF00",  # Bright green
        "warning": "#FFD700",  # Bright gold
        "error": "#FF1493",  # Deep pink
        "success": "#00FFFF",  # Cyan
        "highlight": "#FF4500",  # Orange-red
    }
)
# Shared console; later cells call `console.print(...)` to display results.
console = Console(theme=custom_theme)

# Visualization
# import matplotlib.pyplot as plt

# NumPy settings
# Print array values with 4 digits of precision.
np.set_printoptions(precision=4)

# Pandas settings
# Raise the display limits so long/wide frames and long cell values are shown.
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

# Polars settings
# Same intent for Polars tables: string length, column count and row count limits.
pl.Config.set_fmt_str_lengths(1_000)
pl.Config.set_tbl_cols(n=1_000)
pl.Config.set_tbl_rows(n=200)

# NOTE(review): with no category or module given this filter hides every
# warning raised after this cell, including deprecation notices from the
# libraries above -- consider narrowing it.
warnings.filterwarnings("ignore")

# Black code formatter (Optional)
%load_ext lab_black

# auto reload imports
%load_ext autoreload
%autoreload 2

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
def go_up_from_current_directory(*, go_up: int = 1) -> None:
    """Put an ancestor of the current working directory at the front of ``sys.path``.

    Parameters
    ----------
    go_up : int, default=1
        Number of directory levels to climb from the current working
        directory. Zero (or a negative value) resolves to the current working
        directory itself.

    Returns
    -------
    None
        The resolved absolute path is printed and becomes ``sys.path[0]``.
    """
    import os
    import sys

    # Resolve against the working directory explicitly. The earlier
    # ``os.path.dirname(__name__)`` only produced the same result because a
    # module name such as "__main__" has an empty dirname.
    levels = [os.pardir] * go_up
    target_directory = os.path.abspath(os.path.join(os.getcwd(), *levels))

    # Re-running the cell must not keep stacking the same entry onto sys.path:
    # drop any existing occurrence before putting the path in front.
    while target_directory in sys.path:
        sys.path.remove(target_directory)
    sys.path.insert(0, target_directory)
    print(target_directory)

In [3]:
# Put the directory one level above the working directory on sys.path
# (presumably so modules in the project root can be imported from this notebook).
go_up_from_current_directory(go_up=1)

/Users/mac/Desktop/Projects/Bike-Rental-Prediction


In [5]:
# Create sample data
# The generator is seeded, so the values are reproducible as long as the draws
# below keep their current order. Each column holds 1000 draws.
rng = np.random.default_rng(42)
dummy_data = pl.DataFrame(
    {
        # Three float columns drawn from normal, exponential and uniform distributions.
        "age": rng.normal(35, 10, 1000),
        "salary": rng.exponential(50000, 1000),
        "score": rng.uniform(0, 100, 1000),
        # Two string columns; `experience` is sampled with explicit probabilities.
        "department": rng.choice(["Sales", "Engineering", "Marketing"], 1000),
        "experience": rng.choice(["Junior", "Mid", "Senior"], 1000, p=[0.4, 0.4, 0.2]),
        "target": rng.normal(75, 15, 1000),
    }
)
dummy_data.head()

age,salary,score,department,experience,target
f64,f64,f64,str,str,f64
38.047171,19259.722733,50.454524,"""Sales""","""Junior""",89.792611
24.600159,11388.44911,85.953079,"""Engineering""","""Mid""",83.077468
42.504512,15860.245889,44.825235,"""Sales""","""Junior""",74.819551
44.405647,6392.342913,27.948737,"""Sales""","""Junior""",70.27595
15.489648,54707.920841,85.547536,"""Engineering""","""Mid""",111.400016


In [22]:
EMPTY_DATAFRAME: str = "🚫 Empty dataframe"
summary_stats: list[dict[str, Any]] = []

# Exploratory, Polars-only pass: build one summary dict per numeric column.
for col in dummy_data.select(cs.numeric()).columns:
    series = dummy_data[col]

    # A zero-length column has nothing to summarise (and would divide by zero below).
    if len(series) == 0:
        print(EMPTY_DATAFRAME)
        continue

    # Central tendency: mean, median and mode
    mean: float = round(series.mean(), 2)
    median: float = round(series.median(), 2)
    # Only the first five values returned by mode() are kept.
    mode: list[float] = series.mode().to_list()[:5]

    # Spread: std, variance, range, min, max
    std: float = round(series.std(), 2)
    variance: float = round(series.var(), 2)
    min_value: float = series.min()
    max_value: float = series.max()
    data_range: float = round(max_value - min_value, 2)

    # Others: count, missing_values, unique_values
    count: int = series.count()
    missing_values: int = series.is_null().sum()
    # Expressed as a percentage (0-100), the unit used by the categorical summaries.
    missing_pct: float = round(missing_values / series.shape[0] * 100, 2)
    unique_values: int = series.n_unique()

    summary_stats.append(
        {
            "column": col,
            "mean": mean,
            "median": median,
            "mode": mode,
            "std": std,
            "variance": variance,
            "range": data_range,
            "min": min_value,
            "max": max_value,
            "count": count,
            "missing_values": missing_values,
            "missing_pct": missing_pct,
            "unique_values": unique_values,
        }
    )

summary_stats

[{'column': 'age',
  'mean': 34.71,
  'median': 35.06,
  'mode': [21.31840800754335,
   22.496844610768537,
   28.94999287191269,
   33.33072069891435,
   64.05067169240407],
  'std': 9.89,
  'variance': 97.86,
  'range': 68.27,
  'min': -1.484128252147836,
  'max': 66.78853679367535,
  'count': 1000,
  'missing_values': 0,
  'missing_pct': 0.0,
  'unique_values': 1000},
 {'column': 'salary',
  'mean': 50779.11,
  'median': 35946.37,
  'mode': [3970.268708037914,
   22362.40975376458,
   53140.12284063175,
   3415.9706897536016,
   9555.41836822662],
  'std': 51331.62,
  'variance': 2634935157.53,
  'range': 380729.73,
  'min': 3.5613078892469914,
  'max': 380733.2959969836,
  'count': 1000,
  'missing_values': 0,
  'missing_pct': 0.0,
  'unique_values': 1000},
 {'column': 'score',
  'mean': 49.57,
  'median': 49.89,
  'mode': [77.01869494441829,
   61.243932925318155,
   39.60652512784873,
   20.763717209391974,
   71.80623366342505],
  'std': 28.78,
  'variance': 828.26,
  'range': 9

In [55]:
import narwhals as nw
import narwhals.selectors as n_cs
import pendulum
from narwhals.typing import IntoDataFrameT, IntoFrameT

# Message printed by the summary helpers below when a column has zero rows.
EMPTY_DATAFRAME: str = "🚫 Empty dataframe"


def _get_numeric_summary_stats(data: IntoFrameT) -> list[dict[str, Any]]:
    """Summarise every numeric column of ``data``.

    Parameters
    ----------
    data : IntoFrameT
        Native frame accepted by ``nw.from_native``. Columns are read with
        ``nw_data[col]``, so an eager frame is expected.

    Returns
    -------
    list[dict[str, Any]]
        One dict per numeric column with the keys ``column``, ``mean``,
        ``median``, ``mode``, ``std``, ``variance``, ``range``, ``min``,
        ``max``, ``count``, ``missing_values``, ``missing_pct`` and
        ``unique_values``. ``missing_pct`` is a percentage (0-100), the same
        unit ``_get_categorical_summary_stats`` reports. Columns with zero
        rows are skipped after printing ``EMPTY_DATAFRAME``.
    """

    def _rounded(value: Any) -> Any:
        # An aggregate may come back as None (presumably for an all-null
        # column); pass it through instead of failing on the rounding call.
        return None if value is None else round(value, 2)

    nw_data = nw.from_native(data)
    numeric_summary_stats: list[dict[str, Any]] = []

    for col in nw_data.select(n_cs.numeric()).columns:
        series = nw_data[col]

        # Nothing to summarise, and the percentage below would divide by zero.
        if len(series) == 0:
            print(EMPTY_DATAFRAME)
            continue

        # min/max are computed once and reused for the range.
        min_value = series.min()
        max_value = series.max()
        has_bounds = min_value is not None and max_value is not None
        missing_values: int = series.is_null().sum()

        numeric_summary_stats.append(
            {
                "column": col,
                # Central tendency
                "mean": _rounded(series.mean()),
                "median": _rounded(series.median()),
                # Only the first five values returned by mode() are kept.
                "mode": series.mode().to_list()[:5],
                # Spread
                "std": _rounded(series.std()),
                "variance": _rounded(series.var()),
                "range": _rounded(max_value - min_value) if has_bounds else None,
                "min": min_value,
                "max": max_value,
                # Completeness / cardinality
                "count": series.count(),
                "missing_values": missing_values,
                "missing_pct": round(missing_values / series.shape[0] * 100, 2),
                "unique_values": series.n_unique(),
            }
        )

    return numeric_summary_stats


def _get_categorical_summary_stats(data: IntoFrameT) -> list[dict[str, Any]]:
    """Summarise every string column of ``data``.

    Parameters
    ----------
    data : IntoFrameT
        Native frame accepted by ``nw.from_native``.

    Returns
    -------
    list[dict[str, Any]]
        One dict per string column with the keys ``column``, ``total_count``,
        ``unique_values``, ``value_counts``, ``missing_values`` and
        ``missing_pct`` (a percentage rounded to two decimals). Columns with
        zero rows are skipped after printing ``EMPTY_DATAFRAME``.
    """
    frame = nw.from_native(data)
    collected: list[dict[str, Any]] = []
    string_columns = frame.select(n_cs.string()).columns

    for name in string_columns:
        column = frame[name]

        # Zero-length column: report it and move on to the next one.
        if not len(column):
            print(EMPTY_DATAFRAME)
            continue

        # Sorted frequency table, flattened to plain nested lists.
        frequency_table = column.value_counts(sort=True)
        n_missing: int = column.is_null().sum()

        entry: dict[str, Any] = {
            "column": name,
            "total_count": column.count(),
            "unique_values": column.n_unique(),
            "value_counts": frequency_table.to_numpy().tolist(),
            "missing_values": n_missing,
            "missing_pct": round(n_missing / column.shape[0] * 100, 2),
        }
        collected.append(entry)

    return collected


def data_validator(data: IntoDataFrameT) -> dict[str, Any]:
    """Build a validation report for an eager dataframe.

    Parameters
    ----------
    data : IntoDataFrameT
        Native dataframe accepted by ``nw.from_native``.

    Returns
    -------
    dict[str, Any]
        Report with the keys ``data_schema``, ``data_shape``, ``cardinality``,
        ``summary_statistics``, ``null_info``, ``num_duplicated_rows``,
        ``memory_usage_MB`` and ``validation_timestamp``.

    Notes
    -----
    A later cell of this notebook defines another ``data_validator`` that takes
    a ``DataValidatorConfig``; running that cell rebinds this name.
    """
    nw_data = nw.from_native(data)

    # Evaluate each selector once and reuse the result for schema and cardinality.
    numeric_frame = nw_data.select(n_cs.numeric())
    string_frame = nw_data.select(n_cs.string())

    numeric_schema: dict[str, Any] = {
        k: str(v) for k, v in numeric_frame.collect_schema().items()
    }
    string_schema: dict[str, Any] = {
        k: str(v) for k, v in string_frame.collect_schema().items()
    }
    data_types: dict[str, Any] = {
        "numeric_schema": numeric_schema,
        "string_schema": string_schema,
    }
    data_shape: dict[str, int] = {
        "total_rows": nw_data.shape[0],
        "total_columns": nw_data.shape[1],
        "number_of_numeric_columns": len(numeric_schema),
        "number_of_string_columns": len(string_schema),
    }
    cardinality: dict[str, dict[str, int]] = {
        "num_unique_numeric_rows": {
            col: nw_data[col].n_unique() for col in numeric_frame.columns
        },
        "num_unique_string_rows": {
            col: nw_data[col].n_unique() for col in string_frame.columns
        },
    }

    # null_count() is evaluated once and read back through Narwhals itself, so
    # the report does not depend on converting the frame to Polars. A frame
    # without columns yields no row, hence the guard.
    null_rows = nw_data.null_count().rows(named=True)
    nulls_per_column: dict[str, int] = {
        col: int(n) for col, n in (null_rows[0] if null_rows else {}).items()
    }
    null_info: dict[str, Any] = {
        "data_nulls": nulls_per_column,
        "total_nulls": sum(nulls_per_column.values()),
    }
    num_duplicated_rows = nw_data.is_duplicated().sum()

    return {
        "data_schema": data_types,
        "data_shape": data_shape,
        "cardinality": cardinality,
        "summary_statistics": {
            "numeric": _get_numeric_summary_stats(data),
            "categorical": _get_categorical_summary_stats(data),
        },
        "null_info": null_info,
        "num_duplicated_rows": num_duplicated_rows,
        # Ask the Narwhals wrapper, not the native object: `estimated_size` is
        # not guaranteed to exist on every backend's native dataframe.
        "memory_usage_MB": round(nw_data.estimated_size(unit="mb"), 2),
        "validation_timestamp": pendulum.now().isoformat(timespec="seconds"),
    }


# Run the validator on the sample frame and print the report with the themed console.
res = data_validator(dummy_data)
console.print(res)

In [54]:
type MetricFn = callable[[pd.DataFrame], dict[str, Any]]

MetricFn

MetricFn

In [None]:
from dataclasses import dataclass, field

from narwhals.typing import IntoDataFrameT, IntoFrameT

EMPTY_DATAFRAME: str = "🚫 Empty dataframe"

type InfoFn = callable[[nw.DataFrame], dict[str, Any]]
type SchemaFn = callable[[nw.DataFrame], dict[str, Any]]
type SummaryStatsFn = callable[[nw.DataFrame], list[dict[str, Any]]]


@dataclass
class DataValidatorConfig:
    """Input frame plus the callables that ``data_validator`` runs on it.

    Each callable list defaults to a fresh empty list.
    """

    # Native frame to validate; `data_validator` wraps it with `to_nw_df` first.
    data: IntoFrameT
    # Contains numeric and categorical schema functions in that order
    schema_fns: list[SchemaFn] = field(default_factory=list)
    # Each function returns a dict; `data_validator` merges them into one mapping.
    info_fns: list[InfoFn] = field(default_factory=list)
    # `data_validator` reads these as [numeric, categorical] when exactly two are given.
    summary_fn: list[SummaryStatsFn] = field(default_factory=list)


def get_numeric_summary_stats(data: nw.DataFrame) -> list[dict[str, Any]]:
    """Summarise every numeric column of a Narwhals dataframe.

    Parameters
    ----------
    data : nw.DataFrame
        Frame whose numeric columns (``n_cs.numeric()``) are summarised.

    Returns
    -------
    list[dict[str, Any]]
        One dict per numeric column with the keys ``column``, ``mean``,
        ``median``, ``mode``, ``std``, ``variance``, ``range``, ``min``,
        ``max``, ``count``, ``missing_values``, ``missing_pct`` and
        ``unique_values``. ``missing_pct`` is a percentage (0-100), the same
        unit ``get_categorical_summary_stats`` reports. Columns with zero rows
        are skipped after printing ``EMPTY_DATAFRAME``.
    """

    def _rounded(value: Any) -> Any:
        # An aggregate may come back as None (presumably for an all-null
        # column); pass it through instead of failing on the rounding call.
        return None if value is None else round(value, 2)

    numeric_summary_stats: list[dict[str, Any]] = []

    for col in data.select(n_cs.numeric()).columns:
        series = data[col]

        # Nothing to summarise, and the percentage below would divide by zero.
        if len(series) == 0:
            print(EMPTY_DATAFRAME)
            continue

        # min/max are computed once and reused for the range.
        min_value = series.min()
        max_value = series.max()
        has_bounds = min_value is not None and max_value is not None
        missing_values: int = series.is_null().sum()

        numeric_summary_stats.append(
            {
                "column": col,
                # Central tendency
                "mean": _rounded(series.mean()),
                "median": _rounded(series.median()),
                # Only the first five values returned by mode() are kept.
                "mode": series.mode().to_list()[:5],
                # Spread
                "std": _rounded(series.std()),
                "variance": _rounded(series.var()),
                "range": _rounded(max_value - min_value) if has_bounds else None,
                "min": min_value,
                "max": max_value,
                # Completeness / cardinality
                "count": series.count(),
                "missing_values": missing_values,
                "missing_pct": round(missing_values / series.shape[0] * 100, 2),
                "unique_values": series.n_unique(),
            }
        )

    return numeric_summary_stats


def get_categorical_summary_stats(data: nw.DataFrame) -> list[dict[str, Any]]:
    """Summarise every string column of a Narwhals dataframe.

    Parameters
    ----------
    data : nw.DataFrame
        Frame whose string columns (``n_cs.string()``) are summarised.

    Returns
    -------
    list[dict[str, Any]]
        One dict per string column with the keys ``column``, ``total_count``,
        ``unique_values``, ``value_counts``, ``missing_values`` and
        ``missing_pct`` (a percentage rounded to two decimals). Columns with
        zero rows are skipped after printing ``EMPTY_DATAFRAME``.
    """
    collected: list[dict[str, Any]] = []
    string_columns = data.select(n_cs.string()).columns

    for name in string_columns:
        column = data[name]

        # Zero-length column: report it and move on to the next one.
        if not len(column):
            print(EMPTY_DATAFRAME)
            continue

        # Sorted frequency table, flattened to plain nested lists.
        frequency_table = column.value_counts(sort=True)
        n_missing: int = column.is_null().sum()

        entry: dict[str, Any] = {
            "column": name,
            "total_count": column.count(),
            "unique_values": column.n_unique(),
            "value_counts": frequency_table.to_numpy().tolist(),
            "missing_values": n_missing,
            "missing_pct": round(n_missing / column.shape[0] * 100, 2),
        }
        collected.append(entry)

    return collected


def to_nw_df(data: IntoFrameT) -> nw.DataFrame:
    """Wrap a native frame with ``nw.from_native`` and return the result.

    Parameters
    ----------
    data : IntoFrameT
        Native frame to wrap.

    Returns
    -------
    nw.DataFrame
        Whatever ``nw.from_native`` returns for ``data``.
    """
    wrapped = nw.from_native(data)
    return wrapped


def get_numeric_schema(data: nw.DataFrame) -> dict[str, Any]:
    """Map each numeric column name to the string form of its dtype.

    Parameters
    ----------
    data : nw.DataFrame
        Frame to inspect.

    Returns
    -------
    dict[str, Any]
        ``{column_name: str(dtype)}`` for the columns matched by
        ``n_cs.numeric()``.
    """
    schema = data.select(n_cs.numeric()).collect_schema()
    return {name: str(dtype) for name, dtype in schema.items()}


def get_string_schema(data: nw.DataFrame) -> dict[str, Any]:
    """Map each string column name to the string form of its dtype.

    Parameters
    ----------
    data : nw.DataFrame
        Frame to inspect.

    Returns
    -------
    dict[str, Any]
        ``{column_name: str(dtype)}`` for the columns matched by
        ``n_cs.string()``.
    """
    schema = data.select(n_cs.string()).collect_schema()
    return {name: str(dtype) for name, dtype in schema.items()}


def get_cardinality_info(data: nw.DataFrame) -> dict[str, Any]:
    """Count distinct values per column, grouped by numeric and string columns.

    Parameters
    ----------
    data : nw.DataFrame
        Frame to inspect.

    Returns
    -------
    dict[str, Any]
        Two nested dicts, ``num_unique_numeric_rows`` and
        ``num_unique_string_rows``, each mapping a column name to the result
        of ``n_unique()`` on that column.
    """

    def _unique_counts(column_names: list[str]) -> dict[str, int]:
        # n_unique() per listed column, keyed by column name.
        return {name: data[name].n_unique() for name in column_names}

    numeric_names = data.select(n_cs.numeric()).columns
    numeric_counts = _unique_counts(numeric_names)

    string_names = data.select(n_cs.string()).columns
    string_counts = _unique_counts(string_names)

    return {
        "num_unique_numeric_rows": numeric_counts,
        "num_unique_string_rows": string_counts,
    }


def get_null_info(data: nw.DataFrame) -> dict[str, Any]:
    """Report null counts per column and in total.

    Parameters
    ----------
    data : nw.DataFrame
        Frame to inspect.

    Returns
    -------
    dict[str, Any]
        ``data_nulls`` maps each column name to its null count and
        ``total_nulls`` is the sum of those counts. A frame without columns
        yields an empty mapping and a total of ``0``.
    """
    # null_count() is evaluated once and read back through Narwhals itself, so
    # no conversion to Polars is needed. A frame without columns produces no
    # row, hence the guard before indexing.
    null_rows = data.null_count().rows(named=True)
    first_row = null_rows[0] if null_rows else {}
    nulls_per_column: dict[str, int] = {
        name: int(n_nulls) for name, n_nulls in first_row.items()
    }
    return {
        "data_nulls": nulls_per_column,
        "total_nulls": sum(nulls_per_column.values()),
    }


def get_duplicated_rows_info(data: nw.DataFrame) -> dict[str, Any]:
    """Count the rows flagged by ``is_duplicated()``.

    Parameters
    ----------
    data : nw.DataFrame
        Frame to inspect.

    Returns
    -------
    dict[str, Any]
        ``{"num_duplicated_rows": n}`` where ``n`` is the sum of the mask
        returned by ``data.is_duplicated()``.
    """
    duplicate_mask = data.is_duplicated()
    return {"num_duplicated_rows": duplicate_mask.sum()}


def get_memory_usage_info(data: nw.DataFrame) -> dict[str, Any]:
    """Report the frame's estimated size together with a timestamp.

    Parameters
    ----------
    data : nw.DataFrame
        Frame to measure.

    Returns
    -------
    dict[str, Any]
        ``memory_usage_MB`` is ``data.estimated_size(unit="mb")`` rounded to
        two decimals; ``validation_timestamp`` is ``pendulum.now()`` in ISO
        format with second precision.
    """
    size_mb = round(data.estimated_size(unit="mb"), 2)
    stamp = pendulum.now().isoformat(timespec="seconds")
    return {"memory_usage_MB": size_mb, "validation_timestamp": stamp}

In [None]:
def data_validator(input_data: DataValidatorConfig) -> dict[str, Any]:
    """Run the configured callables and assemble the validation report.

    Parameters
    ----------
    input_data : DataValidatorConfig
        Frame to validate plus the schema, info and summary callables.

    Returns
    -------
    dict[str, Any]
        Report with the keys ``data_schema``, ``data_shape``,
        ``summary_statistics`` and ``other_info``.

    Notes
    -----
    ``data_schema`` is a ``{"numeric", "string"}`` dict only when exactly two
    schema functions are configured; otherwise the raw list of results is
    returned. ``summary_statistics`` is filled only when exactly two summary
    functions are configured; with any other number both entries are empty
    lists and the computed results are not included in the report.
    """
    frame = to_nw_df(input_data.data)

    # Schema callables, in the order they were supplied.
    schema_parts = [build_schema(frame) for build_schema in input_data.schema_fns]

    # Info callables all write into one flat mapping; later keys overwrite earlier ones.
    other_info: dict[str, Any] = {}
    for collect_info in input_data.info_fns:
        other_info.update(collect_info(frame))

    # Summary callables, in the order they were supplied.
    stats_parts = [summarise(frame) for summarise in input_data.summary_fn]

    if len(schema_parts) == 2:
        numeric_schema, string_schema = schema_parts
        data_schema: Any = {"numeric": numeric_schema, "string": string_schema}
    else:
        data_schema = schema_parts

    if len(stats_parts) == 2:
        numeric_stats, categorical_stats = stats_parts
    else:
        numeric_stats, categorical_stats = [], []

    n_rows, n_columns = frame.shape
    data_shape: dict[str, int] = {
        "total_rows": n_rows,
        "total_columns": n_columns,
        "number_of_numeric_columns": len(frame.select(n_cs.numeric()).columns),
        "number_of_string_columns": len(frame.select(n_cs.string()).columns),
    }

    return {
        "data_schema": data_schema,
        "data_shape": data_shape,
        "summary_statistics": {
            "numeric": numeric_stats,
            "categorical": categorical_stats,
        },
        "other_info": other_info,
    }


# Example run: wire the helper functions into a config and print the report.
if __name__ == "__main__":
    config = DataValidatorConfig(
        data=dummy_data,
        # Order matters: data_validator labels the first result "numeric"
        # and the second "string".
        schema_fns=[get_numeric_schema, get_string_schema],
        info_fns=[
            get_cardinality_info,
            get_null_info,
            get_duplicated_rows_info,
            get_memory_usage_info,
        ],
        # Order matters here as well: numeric first, categorical second.
        summary_fn=[
            get_numeric_summary_stats,
            get_categorical_summary_stats,
        ],
    )
    res = data_validator(config)
    console.print(res)
    # TODO: add docstrings using NumPy style