# Lab For Experimentation

In [4]:
import warnings
from typing import Any

import numpy as np
import pandas as pd
import polars as pl
import polars.selectors as cs
from rich.console import Console
from rich.theme import Theme

# Style names registered on the rich console, each mapped to a hex colour.
custom_theme = Theme(
    dict(
        white="#FFFFFF",  # bright white
        info="#00FF00",  # bright green
        warning="#FFD700",  # bright gold
        error="#FF1493",  # deep pink
        success="#00FFFF",  # cyan
        highlight="#FF4500",  # orange-red
    )
)
# Console instance used for printing in this notebook; it carries the theme above.
console = Console(theme=custom_theme)

# Visualization
# import matplotlib.pyplot as plt

# NumPy settings
np.set_printoptions(precision=4)

# Pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

# Polars settings
pl.Config.set_fmt_str_lengths(1_000)
pl.Config.set_tbl_cols(n=1_000)
pl.Config.set_tbl_rows(n=200)

warnings.filterwarnings("ignore")

# Black code formatter (Optional)
%load_ext lab_black

# auto reload imports
%load_ext autoreload
%autoreload 2

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
def go_up_from_current_directory(*, go_up: int = 1) -> None:
    """Put an ancestor of the current working directory at the front of ``sys.path``.

    The directory ``go_up`` levels above the current working directory is
    resolved to an absolute path, inserted at position 0 of ``sys.path`` and
    printed.

    Params:
    -------
    go_up: int, default=1
        The number of levels to go up from the current working directory.
        ``0`` selects the current working directory itself.

    Returns:
    --------
    None

    Raises:
    -------
    ValueError
        If ``go_up`` is negative.
    """
    import os
    import sys

    if go_up < 0:
        raise ValueError(f"go_up must be >= 0, got {go_up}")

    # Anchor on the working directory explicitly: `__name__` is a module name,
    # not a file path, so it cannot be used to locate a directory.
    parent_steps = [os.pardir] * go_up
    target_directory = os.path.abspath(os.path.join(os.getcwd(), *parent_steps))

    # Remove earlier copies first so that re-running the cell does not keep
    # growing `sys.path`, then give the directory top import priority.
    sys.path[:] = [entry for entry in sys.path if entry != target_directory]
    sys.path.insert(0, target_directory)
    print(target_directory)

In [3]:
# Put the directory one level above the working directory on sys.path
# (presumably the project root, so project modules can be imported — confirm).
go_up_from_current_directory(go_up=1)

/Users/mac/Desktop/Projects/Bike-Rental-Prediction


In [5]:
# Synthetic data set used for experimentation in this notebook.
SAMPLE_SEED = 42
SAMPLE_ROWS = 1_000
rng = np.random.default_rng(SAMPLE_SEED)

# All columns are drawn from the one seeded generator, in the order they are
# listed in the dict, so reordering the entries would change the values.
dummy_data = pl.DataFrame(
    {
        "age": rng.normal(loc=35, scale=10, size=SAMPLE_ROWS),
        "salary": rng.exponential(scale=50000, size=SAMPLE_ROWS),
        "score": rng.uniform(low=0, high=100, size=SAMPLE_ROWS),
        "department": rng.choice(
            ["Sales", "Engineering", "Marketing"], size=SAMPLE_ROWS
        ),
        "experience": rng.choice(
            ["Junior", "Mid", "Senior"], size=SAMPLE_ROWS, p=[0.4, 0.4, 0.2]
        ),
        "target": rng.normal(loc=75, scale=15, size=SAMPLE_ROWS),
    }
)
dummy_data.head()

age,salary,score,department,experience,target
f64,f64,f64,str,str,f64
38.047171,19259.722733,50.454524,"""Sales""","""Junior""",89.792611
24.600159,11388.44911,85.953079,"""Engineering""","""Mid""",83.077468
42.504512,15860.245889,44.825235,"""Sales""","""Junior""",74.819551
44.405647,6392.342913,27.948737,"""Sales""","""Junior""",70.27595
15.489648,54707.920841,85.547536,"""Engineering""","""Mid""",111.400016


In [8]:
import pendulum


def data_validator(data: pl.DataFrame) -> dict[str, Any]:
    """Build a summary report of a frame's schema, shape, nulls and duplicates.

    Params:
    -------
    data: pl.DataFrame
        The frame to inspect. It is only read by this function.

    Returns:
    --------
    dict[str, Any]
        A report with the following keys:

        - "data_schema": ``{"numeric_schema": ..., "string_schema": ...}``, each
          mapping column name to the dtype rendered as a string.
        - "data_shape": total rows/columns plus the number of numeric and
          string columns.
        - "nulls_info": ``{"data_nulls": per-column null counts,
          "total_nulls": their sum}``.
        - "num_duplicated_rows": the number of rows flagged by
          ``DataFrame.is_duplicated()``.
        - "validation_timestamp": ``pendulum.now()`` in ISO format, truncated
          to seconds.
    """
    numeric_schema: dict[str, str] = {
        name: str(dtype)
        for name, dtype in data.select(cs.numeric()).collect_schema().items()
    }
    string_schema: dict[str, str] = {
        name: str(dtype)
        for name, dtype in data.select(cs.string()).collect_schema().items()
    }
    data_types: dict[str, Any] = {
        "numeric_schema": numeric_schema,
        "string_schema": string_schema,
    }
    data_shape: dict[str, int] = {
        "total_rows": data.height,
        "total_columns": data.width,
        "number_of_numeric_columns": len(numeric_schema),
        "number_of_string_columns": len(string_schema),
    }

    # Count nulls once per column and derive the total from those counts.
    # This also yields an empty dict and a total of 0 for a frame with no columns.
    data_nulls: dict[str, int] = {
        column.name: column.null_count() for column in data.get_columns()
    }
    total_nulls: int = sum(data_nulls.values())
    nulls_info: dict[str, Any] = {"data_nulls": data_nulls, "total_nulls": total_nulls}

    # `is_duplicated()` returns a boolean mask over rows; summing it counts the
    # flagged rows. A frame with no columns has no rows to compare, so skip it.
    num_duplicated_rows: int = int(data.is_duplicated().sum()) if data.width else 0

    return {
        "data_schema": data_types,
        "data_shape": data_shape,
        "nulls_info": nulls_info,
        "num_duplicated_rows": num_duplicated_rows,
        "validation_timestamp": pendulum.now().isoformat(timespec="seconds"),
    }


# Run the validator on the synthetic frame and print the resulting report
# through the themed rich console.
res = data_validator(dummy_data)
console.print(res)