In [1]:
import tomllib
import pandas as pd
import time
from my_pipeline.extract import extract_data
from my_pipeline.transform import transform_data
from my_pipeline.outliers import remove_outliers
from my_pipeline.normalize import normalize_data
from my_pipeline.encode import encode_categorical
from my_pipeline.load import save_data
from my_pipeline.progress import StepProgress
from my_pipeline.profiler import profile_step
from my_pipeline.logger import get_logger
from pathlib import Path

# ---------------------------------------------------
# 1. Load TOML config
# ---------------------------------------------------
CONFIG_PATH = "config/settings.toml"

# tomllib.load() only accepts a binary file object, hence the "rb" mode.
with open(CONFIG_PATH, "rb") as f:
    config = tomllib.load(f)

# ---------------------------------------------------
# 2. Setup logger
# ---------------------------------------------------
# The log level is read from the "level" key of the config's "logging" table,
# so a missing table/key surfaces here as a KeyError before any step runs.
logger = get_logger(name="pipeline_notebook", level=config["logging"]["level"])
logger.info("Notebook pipeline started")


19:20:43 | INFO | Notebook pipeline started


In [2]:
# Registry mapping each step name (as it appears in the config's "steps"
# list) to the pipeline function that implements that step.
STEP_FUNCTIONS = dict(
    extract=extract_data,
    transform=transform_data,
    outliers=remove_outliers,
    normalize=normalize_data,
    encode=encode_categorical,
    load=save_data,
)

In [3]:
def _invoke_step(step, func, df, params):
    """Call one step function using the call convention tied to its step name.

    Args:
        step: Step name as listed in the config's "steps" sequence.
        func: Callable registered for that step in STEP_FUNCTIONS.
        df: Data produced by the previous steps (None until a step has
            returned something).
        params: The step's config section (an empty dict when absent).

    Returns:
        Whatever the step function returns; for "load" the function's return
        value is ignored and the incoming ``df`` is passed through unchanged.

    Raises:
        KeyError: A required parameter is missing from ``params``, or the
            step name has no call convention defined in this helper.
    """
    if step == "extract":
        # extract is the only step that does not receive the current data.
        return func(params["input_path"])

    if step == "transform":
        return func(
            df,
            method=params["method"],
            fill_value=params.get("fill_value"),  # optional parameter
        )

    if step == "outliers":
        return func(
            df,
            method=params["method"],
            threshold=params["threshold"],
        )

    if step == "normalize":
        return func(df, method=params["method"])

    if step == "encode":
        return func(
            df,
            method=params["method"],
            target_column=params.get("target_column"),  # optional parameter
        )

    if step == "load":
        # load is called for its side effect only; keep the data flowing.
        func(df, output_path=params["output_path"])
        return df

    # A registered step without a call convention used to be skipped silently
    # while still being logged as "finished"; fail loudly instead.
    raise KeyError(f"No call convention defined for pipeline step: {step!r}")


def run_pipeline_from_config(config):
    """Run the pipeline steps listed in ``config["steps"]``, in order.

    For every step the function logs its start, advances the progress
    display, calls the function registered in STEP_FUNCTIONS with the
    parameters found in ``config[step]`` and logs the elapsed time.

    Args:
        config: Mapping with a "steps" sequence of step names and, per step,
            an optional section of parameters keyed by the step name.

    Returns:
        The data returned by the last data-producing step (presumably a
        pandas DataFrame -- confirm against the step modules), or None when
        no step produced any data.

    Raises:
        KeyError: "steps" is missing, a step name is not registered in
            STEP_FUNCTIONS, or a step is missing a required parameter.
    """
    steps = config["steps"]

    # Fail fast: reject unknown step names before anything runs, so a typo
    # late in the list cannot leave a half-executed pipeline behind.
    unknown = [step for step in steps if step not in STEP_FUNCTIONS]
    if unknown:
        raise KeyError(f"Unknown pipeline step(s) in config: {unknown}")

    progress = StepProgress(total=len(steps))
    df = None

    for step in steps:
        logger.info(f"➡ Running step: {step}")

        params = config.get(step, {})
        func = STEP_FUNCTIONS[step]

        # Show progress before the step starts doing work.
        progress.update(step)

        # perf_counter() is monotonic and high resolution, unlike time.time(),
        # which can jump when the system clock is adjusted.
        start = time.perf_counter()
        df = _invoke_step(step, func, df, params)
        elapsed = time.perf_counter() - start

        logger.info(f"⏱ Step {step} finished in {elapsed:.3f}s")

    progress.finish()
    return df


In [4]:
# Execute every configured step and keep the result for inspection.
df_final = run_pipeline_from_config(config=config)
# Preview the first five rows (the default row count of head()).
df_final.head(n=5)

19:20:48 | INFO | ➡ Running step: extract


▶️  Step 1/6 [█████-------------------------]  16.7% | extract | elapsed 0.0s

19:20:48 | INFO | ⏱ Step extract finished in 0.004s
19:20:48 | INFO | ➡ Running step: transform


▶️  Step 2/6 [██████████--------------------]  33.3% | transform | elapsed 0.0s

19:20:48 | INFO | ⏱ Step transform finished in 0.004s
19:20:48 | INFO | ➡ Running step: outliers


Running transform step...
Filling missing numeric values with mean...
✔ Missing value handling done
▶️  Step 3/6 [███████████████---------------]  50.0% | outliers | elapsed 0.0s

19:20:48 | INFO | ⏱ Step outliers finished in 0.008s
19:20:48 | INFO | ➡ Running step: normalize


Applying IQR outlier removal...
▶️  Step 4/6 [████████████████████----------]  66.7% | normalize | elapsed 0.0s

19:20:48 | INFO | ⏱ Step normalize finished in 0.002s
19:20:48 | INFO | ➡ Running step: encode


Applying Min-Max Normalization...
▶️  Step 5/6 [█████████████████████████-----]  83.3% | encode | elapsed 0.0s

19:20:48 | INFO | ⏱ Step encode finished in 0.005s
19:20:48 | INFO | ➡ Running step: load


Target Encoding mainroad using target 'price'
Target Encoding guestroom using target 'price'
Target Encoding basement using target 'price'
Target Encoding hotwaterheating using target 'price'
Target Encoding airconditioning using target 'price'
Target Encoding prefarea using target 'price'
Target Encoding furnishingstatus using target 'price'
▶️  Step 6/6 [██████████████████████████████] 100.0% | load | elapsed 0.0s

19:20:48 | INFO | ⏱ Step load finished in 0.006s



Completed all 6 steps in 0.04s


Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
15,1.0,0.499426,0.649781,,0.5,0.334897,,,0.309151,0.281273,1.0,,0.358124
20,0.952381,0.401895,0.666667,,0.5,0.334897,0.288314,0.394948,0.389864,0.281273,1.0,0.277918,0.358124
22,0.938095,0.734788,0.666667,,0.0,0.334897,0.44564,0.394948,0.309151,0.41273,0.5,0.277918,0.361577
27,0.904762,0.829506,0.666667,,0.0,0.334897,0.288314,0.271289,0.309151,0.281273,0.5,0.277918,0.358124
40,0.833333,0.562572,0.666667,,0.5,0.334897,0.288314,0.394948,0.309151,0.41273,0.0,0.433284,0.361577
