# Lab For Experimentation

In [1]:
import warnings

import numpy as np
import pandas as pd
import polars as pl
from rich.console import Console
from rich.theme import Theme

# Named styles for rich console output: each style name maps to a hex colour.
custom_theme = Theme(
    dict(
        white="#FFFFFF",  # Bright white
        info="#00FF00",  # Bright green
        warning="#FFD700",  # Bright gold
        error="#FF1493",  # Deep pink
        success="#00FFFF",  # Cyan
        highlight="#FF4500",  # Orange-red
    )
)
# Shared console that resolves the style names declared above.
console = Console(theme=custom_theme)

# Visualization
# import matplotlib.pyplot as plt

# NumPy: print floating-point values with a precision of 4.
np.set_printoptions(precision=4)

# Pandas: raise the display limits for rows, columns and cell width.
pd.set_option("display.max_rows", 1_000)
pd.set_option("display.max_columns", 1_000)
pd.set_option("display.max_colwidth", 600)

# Polars: raise the limits for string length, columns and rows shown in tables.
pl.Config.set_fmt_str_lengths(1_000)
pl.Config.set_tbl_cols(1_000)
pl.Config.set_tbl_rows(200)

# Silence every warning for the rest of the session.
warnings.filterwarnings(action="ignore")

# Black code formatter (optional): presumably formats cells with Black when they run.
# NOTE(review): this line errors if the extension is not installed in the kernel
# environment — remove it when the formatter is not needed.
%load_ext lab_black

# Auto-reload imports: mode 2 reloads all imported modules before each cell executes,
# so edits to local source files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
def go_up_from_current_directory(*, go_up: int = 1) -> None:
    """Put an ancestor of the current working directory at the front of ``sys.path``.

    Params:
    -------
    go_up: int, default=1
        Number of directory levels to climb from the current working directory.
        ``0`` selects the current working directory itself.

    Returns:
    --------
    None
        The resolved absolute path is printed and ends up at ``sys.path[0]``.

    Raises:
    -------
    ValueError
        If ``go_up`` is negative.
    """
    import os
    import sys

    if go_up < 0:
        raise ValueError(f"go_up must be non-negative, got {go_up}")

    # Resolve against the working directory explicitly. `__name__` is a module
    # name rather than a file path, so taking its dirname always yields "".
    target_dir = os.path.abspath(os.path.join(os.getcwd(), *([os.pardir] * go_up)))

    # Remove any earlier copies first so that re-running the cell keeps exactly
    # one entry, still at the front of the import search path.
    while target_dir in sys.path:
        sys.path.remove(target_dir)
    sys.path.insert(0, target_dir)
    print(target_dir)

In [3]:
# Put the parent of the working directory on sys.path. Presumably this is the
# project root that holds the `src` package imported further down — confirm
# against the project layout.
go_up_from_current_directory(go_up=1)

/Users/mac/Desktop/Projects/Bike-Rental-Prediction


In [4]:
# Create sample data
# The seed and the row count are declared once, so every column has the same
# length and the sample size can be changed in a single place.
SEED = 42
N_ROWS = 1_000

rng = np.random.default_rng(SEED)
# All columns are drawn from the same generator in the order listed here;
# reordering the entries changes the values produced for a given seed.
# NOTE(review): "age" is an unbounded normal draw, so it can contain negative
# values — clip it if realistic ages matter.
dummy_data = pl.DataFrame(
    {
        "age": rng.normal(loc=35, scale=10, size=N_ROWS),
        "salary": rng.exponential(scale=50_000, size=N_ROWS),
        "score": rng.uniform(low=0, high=100, size=N_ROWS),
        "department": rng.choice(["Sales", "Engineering", "Marketing"], size=N_ROWS),
        "experience": rng.choice(
            ["Junior", "Mid", "Senior"], size=N_ROWS, p=[0.4, 0.4, 0.2]
        ),
        "target": rng.normal(loc=75, scale=15, size=N_ROWS),
    }
)
dummy_data.head()

age,salary,score,department,experience,target
f64,f64,f64,str,str,f64
38.047171,19259.722733,50.454524,"""Sales""","""Junior""",89.792611
24.600159,11388.44911,85.953079,"""Engineering""","""Mid""",83.077468
42.504512,15860.245889,44.825235,"""Sales""","""Junior""",74.819551
44.405647,6392.342913,27.948737,"""Sales""","""Junior""",70.27595
15.489648,54707.920841,85.547536,"""Engineering""","""Mid""",111.400016


In [5]:
from src.utilities.data_validator import data_validator

In [None]:
# Convert the polars frame to pandas; the validator below is called with this
# pandas frame (presumably it expects pandas input — confirm against its signature).
pandas_df = dummy_data.to_pandas()

In [10]:
# Run the project's validator on the pandas frame. As the last expression in the
# cell, its return value (a nested report dict) is rendered as the cell output.
data_validator(pandas_df)

{'data_schema': {'numeric': {'age': 'Float64',
   'salary': 'Float64',
   'score': 'Float64',
   'target': 'Float64'},
  'string': {'department': 'String', 'experience': 'String'}},
 'data_shape': {'total_rows': 1000,
  'total_columns': 6,
  'number_of_numeric_columns': 4,
  'number_of_string_columns': 2},
 'summary_statistics': {'numeric': [{'column': 'age',
    'mean': np.float64(34.71),
    'median': np.float64(35.06),
    'mode': [-1.484128252147836,
     5.354711621583488,
     5.694056241472243,
     8.27165566897062,
     9.333415590687025],
    'std': np.float64(9.89),
    'variance': np.float64(97.86),
    'range': np.float64(68.27),
    'min': np.float64(-1.484128252147836),
    'max': np.float64(66.78853679367535),
    'count': np.int64(1000),
    'missing_values': np.int64(0),
    'missing_pct': np.float64(0.0),
    'unique_values': 1000},
   {'column': 'salary',
    'mean': np.float64(50779.11),
    'median': np.float64(35946.37),
    'mode': [3.5613078892469914,
     24.2