In [1]:
from __future__ import annotations

import sys
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.datasets import load_breast_cancer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Resolve the local Skyulf Core checkout when the package is not already importable.
# The notebook may be launched from the repo root or from inside skyulf-core/, so both
# the current directory and its parent are probed.
try:
    import skyulf  # noqa: F401
except ImportError:
    cwd = Path.cwd()
    search_roots = (cwd, cwd / 'skyulf-core', cwd.parent, cwd.parent / 'skyulf-core')
    # First directory that actually contains the package wins.
    package_root = next(
        (root for root in search_roots if (root / 'skyulf' / '__init__.py').exists()),
        None,
    )
    if package_root is not None:
        sys.path.insert(0, str(package_root))

from skyulf import SkyulfPipeline
from skyulf.data.dataset import SplitDataset

# Seed the global NumPy RNG for the rest of the notebook.
np.random.seed(42)
print('Imports OK')

Imports OK


In [5]:
# Create a dataset with numeric + categorical features + missing values
N_MISSING = 25  # number of 'mean texture' cells blanked out below

raw = load_breast_cancer(as_frame=True)
df = raw.frame.copy().rename(columns={'target': 'label'})

# Add a categorical feature derived from a numeric feature
df['radius_band'] = pd.cut(
    df['mean radius'],
    bins=[0, 12, 15, 100],
    labels=['small', 'medium', 'large'],
    include_lowest=True,
)

# Introduce missing values in a numeric column.
# A locally seeded generator makes this cell idempotent: drawing from the global NumPy RNG
# would blank a different set of rows every time the cell is re-executed.
# RandomState(42) starts from the same state as np.random.seed(42), so a fresh
# Restart & Run All picks the same rows as before, assuming nothing else consumed the
# global RNG ahead of this cell.
rng = np.random.RandomState(42)
missing_idx = rng.choice(df.index, size=N_MISSING, replace=False)
df.loc[missing_idx, 'mean texture'] = np.nan

target_col = 'label'
cat_cols = ['radius_band']
num_cols = [c for c in df.columns if c not in [target_col, *cat_cols]]

X = df[num_cols + cat_cols]
y = df[target_col]

# Stratified hold-out split so both partitions keep the class balance of `y`.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

# SplitDataset is given full frames (features + target), so re-attach the label column.
train_df = X_train.copy()
train_df[target_col] = y_train
test_df = X_test.copy()
test_df[target_col] = y_test

# Invariant: the split is a partition of the original rows.
assert len(train_df) + len(test_df) == len(df)

dataset = SplitDataset(train=train_df, test=test_df, validation=None)

print('Train:', dataset.train.shape, 'Test:', dataset.test.shape)
dataset.train.head()

Train: (426, 32) Test: (143, 32)


Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,radius_band,label
517,19.89,20.26,130.5,1214.0,0.1037,0.131,0.1411,0.09431,0.1802,0.06188,...,160.5,1646.0,0.1417,0.3309,0.4185,0.1613,0.2549,0.09136,large,0
287,12.89,13.12,81.89,515.9,0.06955,0.03729,0.0226,0.01171,0.1337,0.05581,...,87.4,577.0,0.09616,0.1147,0.1186,0.05366,0.2309,0.06915,medium,1
25,17.14,16.4,116.0,912.7,0.1186,0.2276,0.2229,0.1401,0.304,0.07413,...,152.4,1461.0,0.1545,0.3949,0.3853,0.255,0.4066,0.1059,large,0
253,17.3,17.08,113.0,928.2,0.1008,0.1041,0.1266,0.08353,0.1813,0.05613,...,130.9,1222.0,0.1416,0.2405,0.3378,0.1857,0.3138,0.08113,large,0
369,22.01,21.9,147.2,1482.0,0.1063,0.1954,0.2448,0.1501,0.1824,0.0614,...,195.0,2227.0,0.1294,0.3885,0.4756,0.2432,0.2741,0.08574,large,0


In [6]:
# --- scikit-learn baseline ---
# Ask for a dense one-hot matrix so the output mirrors Skyulf's encoder.
# Newer scikit-learn spells the option `sparse_output`; older releases only accept `sparse`.
ohe_common = {'handle_unknown': 'ignore'}
try:
    ohe = OneHotEncoder(sparse_output=False, **ohe_common)
except TypeError:  # older scikit-learn
    ohe = OneHotEncoder(sparse=False, **ohe_common)

# Numeric branch: mean-impute, then standardise.
numeric = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: mode-impute, then one-hot encode.
categorical = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', ohe),
])

# Route each column group to its branch; anything not listed is dropped.
preprocess = ColumnTransformer(
    [('num', numeric, num_cols), ('cat', categorical, cat_cols)],
    remainder='drop',
)

classifier = LogisticRegression(max_iter=1000, random_state=42)
sk_pipe = Pipeline([('preprocess', preprocess), ('model', classifier)])

sk_pipe.fit(X_train, y_train)

# Keep the test index on the predictions so they can be compared row-for-row later.
test_predictions = sk_pipe.predict(X_test)
sk_pred = pd.Series(test_predictions, index=X_test.index)

sk_acc = accuracy_score(y_test, sk_pred)
print(f'scikit-learn test accuracy: {sk_acc:.4f}')
sk_pred.head()

scikit-learn test accuracy: 0.9790


519    1
408    0
291    1
518    1
385    0
dtype: int32

In [7]:
# --- SkyulfPipeline equivalent ---
# Declarative version of the baseline: impute numeric and categorical columns, one-hot
# encode the categorical ones, scale the numeric ones, then fit a logistic regression
# with the same max_iter / random_state as the scikit-learn model.
preprocessing_steps = [
    {
        'name': 'impute_numeric',
        'transformer': 'SimpleImputer',
        'params': {'strategy': 'mean', 'columns': num_cols},
    },
    {
        'name': 'impute_categorical',
        'transformer': 'SimpleImputer',
        'params': {'strategy': 'most_frequent', 'columns': cat_cols},
    },
    {
        'name': 'encode_categorical',
        'transformer': 'OneHotEncoder',
        'params': {'columns': cat_cols, 'drop_original': True, 'handle_unknown': 'ignore'},
    },
    {
        'name': 'scale_numeric',
        'transformer': 'StandardScaler',
        'params': {'columns': num_cols},
    },
]
modeling_step = {
    'type': 'logistic_regression',
    'node_id': 'model',
    'params': {'max_iter': 1000, 'random_state': 42},
}
config = {'preprocessing': preprocessing_steps, 'modeling': modeling_step}

sky_pipe = SkyulfPipeline(config)
metrics = sky_pipe.fit(dataset, target_column=target_col)

sky_pred = sky_pipe.predict(X_test)
sky_acc = accuracy_score(y_test, sky_pred)
delta = abs(sk_acc - sky_acc)
print(f'Skyulf test accuracy: {sky_acc:.4f}')
print(f'delta accuracy: {delta:.4f}')

# Sanity checks on the prediction object: it is a Series, it carries the test index,
# and it contains no missing values.
assert isinstance(sky_pred, pd.Series)
assert sky_pred.index.equals(X_test.index)
assert not sky_pred.isna().any()

sky_pred.head()

Skyulf test accuracy: 0.9790
delta accuracy: 0.0000


519    1
408    0
291    1
518    1
385    0
dtype: int32

In [8]:
# --- Classification metrics: side-by-side ---
# classification_report and confusion_matrix come from the notebook's import cell.

REPORT_COLS = ['precision', 'recall', 'f1-score', 'support']
SUMMARY_ROWS = ['accuracy', 'macro avg', 'weighted avg']

# Class labels present in the test target; computed once and reused by both views.
labels = sorted(pd.unique(y_test))


def _report_frame(y_true: pd.Series, y_pred: pd.Series) -> pd.DataFrame:
    """Return sklearn's classification report as a DataFrame with one row per report entry.

    The 'accuracy' entry of the report dict is a single scalar, so pandas repeats it in
    every column of that row; the value shown under 'support' for 'accuracy' is therefore
    the accuracy itself, not a sample count.
    """
    report = classification_report(y_true, y_pred, output_dict=True, zero_division=0)
    return pd.DataFrame(report).T


def _confusion_frame(y_true: pd.Series, y_pred: pd.Series, class_labels: list) -> pd.DataFrame:
    """Return the confusion matrix as a DataFrame labelled true_<label> x pred_<label>."""
    matrix = confusion_matrix(y_true, y_pred, labels=class_labels)
    return pd.DataFrame(
        matrix,
        index=[f'true_{label}' for label in class_labels],
        columns=[f'pred_{label}' for label in class_labels],
    )


sk_df = _report_frame(y_test, sk_pred)
sky_df = _report_frame(y_test, sky_pred)

# Keep a consistent row order: class labels first, then summary rows (if present).
# The order is derived from the sklearn report and applied to both frames.
label_rows = [str(label) for label in labels]
summary_rows = [row for row in SUMMARY_ROWS if row in sk_df.index]
row_order = [row for row in label_rows if row in sk_df.index] + summary_rows

sk_df = sk_df.loc[row_order]
sky_df = sky_df.loc[row_order]

side_by_side = pd.concat(
    {'sklearn': sk_df[REPORT_COLS], 'skyulf': sky_df[REPORT_COLS]},
    axis=1,
)

print('Classification report (side-by-side):')
display(side_by_side)

# Confusion matrices
cm_sk_df = _confusion_frame(y_test, sk_pred, labels)
cm_sky_df = _confusion_frame(y_test, sky_pred, labels)

print('Confusion matrix (sklearn):')
display(cm_sk_df)
print('Confusion matrix (skyulf):')
display(cm_sky_df)

Classification report (side-by-side):


Unnamed: 0_level_0,sklearn,sklearn,sklearn,sklearn,skyulf,skyulf,skyulf,skyulf
Unnamed: 0_level_1,precision,recall,f1-score,support,precision,recall,f1-score,support
0,0.962963,0.981132,0.971963,53.0,0.962963,0.981132,0.971963,53.0
1,0.988764,0.977778,0.98324,90.0,0.988764,0.977778,0.98324,90.0
accuracy,0.979021,0.979021,0.979021,0.979021,0.979021,0.979021,0.979021,0.979021
macro avg,0.975864,0.979455,0.977601,143.0,0.975864,0.979455,0.977601,143.0
weighted avg,0.979201,0.979021,0.97906,143.0,0.979201,0.979021,0.97906,143.0


Confusion matrix (sklearn):


Unnamed: 0,pred_0,pred_1
true_0,52,1
true_1,2,88


Confusion matrix (skyulf):


Unnamed: 0,pred_0,pred_1
true_0,52,1
true_1,2,88


In [9]:
# Compare the two prediction Series by index label rather than by position, so a reordered
# or differently sized result shows up as a mismatch instead of being compared against the
# wrong rows. On failure, say how many predictions differ and where.
mismatch = sk_pred.ne(sky_pred)
assert not mismatch.any(), (
    f'{int(mismatch.sum())} of {len(mismatch)} predictions differ; '
    f'first differing indices: {mismatch[mismatch].index[:5].tolist()}'
)
print("Predictions match exactly.")

Predictions match exactly.


## Notes

- This notebook does **not** claim that the two pipelines will produce identical predictions in general; they are different implementations and may differ in preprocessing details. The exact-match check above applies only to this dataset and configuration.
- The goal is to demonstrate a familiar workflow and validate basic invariants (shapes/index alignment, no NaNs, runnable end-to-end).