# Skyulf Core — Getting Started (Notebook 3)

This notebook is a minimal, runnable smoke test for the Skyulf Core SDK in a notebook environment.

It checks that the required pandas, scikit-learn, and Skyulf imports resolve, sets up seeds and logging, runs a tiny end-to-end preprocessing + training example on the Iris dataset, evaluates it, and (optionally) runs a quick pytest smoke check.

In [3]:
# Fail-fast import check: if pandas, scikit-learn, or any of the Skyulf modules
# below cannot be imported, this cell raises immediately instead of partway
# through the example.
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Skyulf Core pieces used by the end-to-end example: the train/test container,
# the preprocessing pipeline, and the estimator wrapper with its
# RandomForest calculator/applier pair.
from skyulf.data.dataset import SplitDataset
from skyulf.preprocessing.pipeline import FeatureEngineer
from skyulf.modeling.base import StatefulEstimator
from skyulf.modeling.classification import (
    RandomForestClassifierApplier,
    RandomForestClassifierCalculator,
)

# Reaching this line means every import above succeeded.
print("Imports OK")

Imports OK


In [4]:
import logging
import random
from pathlib import Path

# Notebook is in skyulf-core/examples -> project root is two levels up.
# NOTE(review): this assumes the kernel's working directory is the notebook's
# own folder; PROJECT_ROOT will be wrong if Jupyter was launched elsewhere.
NOTEBOOK_DIR = Path.cwd()
PROJECT_ROOT = (NOTEBOOK_DIR / ".." / "..").resolve()

# Configure logging before anything can log, so the warning below and the
# INFO messages emitted by later cells are actually displayed.
logging.basicConfig(level=logging.INFO, format="%(levelname)s:%(name)s:%(message)s")

# Seed the RNGs for reproducible runs. NumPy seeding stays best-effort, but
# only a failed import is tolerated (and reported) -- any other error is a
# real problem and should surface instead of being silently swallowed.
random.seed(42)
try:
    import numpy as np
except ImportError:
    logging.warning("NumPy is not importable; skipping NumPy RNG seeding")
else:
    np.random.seed(42)

print("Project root:", PROJECT_ROOT)

Project root: C:\Users\Murat\Desktop\skyulf-mlflow


End-to-end: load Iris → split → scale features → train RandomForest → predict, followed by evaluation on the test split and prediction on new samples.

In [10]:
# --- Load and prepare data -------------------------------------------------
# Column names are declared once and used both to label the frame and to
# select features later.
feature_cols = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
target_col = "target"

iris = load_iris()
df = pd.DataFrame(iris.data, columns=feature_cols)
df[target_col] = iris.target

# Hold out 20% of the rows, stratifying on the target column.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df[target_col],
)

dataset = SplitDataset(
    train=train_df.reset_index(drop=True),
    test=test_df.reset_index(drop=True),
    validation=None,
)

print("Train shape:", dataset.train.shape)
print("Test shape:", dataset.test.shape)

# --- Preprocess features ---------------------------------------------------
# Single pipeline step: standard-scale the four feature columns.
steps = [
    {
        "name": "scale_features",
        "transformer": "StandardScaler",
        "params": {"columns": feature_cols},
    },
]

engineer = FeatureEngineer(steps)
# Fit on the training features only, then reuse the fitted pipeline for test.
X_train_scaled, metrics = engineer.fit_transform(dataset.train[feature_cols])
X_test_scaled = engineer.transform(dataset.test[feature_cols])

print("Preprocessing metrics keys:", list(metrics.keys()))
print("X_train_scaled:", X_train_scaled.shape)
print("X_test_scaled:", X_test_scaled.shape)


# --- Train + predict -------------------------------------------------------
def _attach_target(features, source):
    """Return a copy of ``features`` with ``source``'s target column added."""
    combined = features.copy()
    combined[target_col] = source[target_col].reset_index(drop=True)
    return combined


estimator = StatefulEstimator(
    calculator=RandomForestClassifierCalculator(),
    applier=RandomForestClassifierApplier(),
    node_id="iris_rf",
)

# The estimator takes features and target together in one frame per split.
train_model_df = _attach_target(X_train_scaled, dataset.train)
test_model_df = _attach_target(X_test_scaled, dataset.test)

predictions = estimator.fit_predict(
    dataset=SplitDataset(train=train_model_df, test=test_model_df, validation=None),
    target_column=target_col,
    config={"n_estimators": 200, "random_state": 42},
)

# Sanity checks: both splits were predicted, one prediction per input row.
assert "train" in predictions and "test" in predictions
assert len(predictions["train"]) == len(dataset.train)
assert len(predictions["test"]) == len(dataset.test)

print("Predictions OK")

INFO:skyulf.preprocessing.pipeline:Running step 0: scale_features (StandardScaler)
INFO:skyulf.modeling.sklearn_wrapper:Initializing RandomForestClassifier with params: {'n_estimators': 200, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 2, 'n_jobs': -1, 'random_state': 42}


0      0
1      0
2      0
3      0
4      0
      ..
145    2
146    2
147    2
148    2
149    2
Name: target, Length: 150, dtype: int32
Train shape: (120, 5)
Test shape: (30, 5)
Preprocessing metrics keys: ['mean', 'scale', 'var', 'columns']
X_train_scaled: (120, 4)
X_test_scaled: (30, 4)
Predictions OK


In [6]:
# Evaluate the test-split predictions against the held-out labels
from sklearn.metrics import accuracy_score, classification_report

y_true = dataset.test[target_col]
y_pred = predictions["test"]

test_acc = accuracy_score(y_true, y_pred)
print(f"Test accuracy: {test_acc:.4f}")

# Per-class precision/recall/F1, labelled with the Iris species names.
report = classification_report(y_true, y_pred, target_names=iris.target_names)
print(report)

Test accuracy: 0.9667
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
  versicolor       1.00      0.90      0.95        10
   virginica       0.91      1.00      0.95        10

    accuracy                           0.97        30
   macro avg       0.97      0.97      0.97        30
weighted avg       0.97      0.97      0.97        30



In [7]:
# Predict on new data: two hand-written flower measurements, one per row
new_samples = pd.DataFrame(
    [
        [5.1, 3.5, 1.4, 0.2],
        [6.3, 2.5, 4.9, 1.5],
    ],
    columns=["sepal_length", "sepal_width", "petal_length", "petal_width"],
)

# Reuse the already-fitted preprocessing pipeline on the new rows.
new_scaled = engineer.transform(new_samples[feature_cols])

# StatefulEstimator keeps the trained model in-memory as estimator.model
new_preds = estimator.applier.predict(new_scaled, estimator.model)
pred_ids = new_preds.tolist()
print("Raw preds:", pred_ids)
# Map each predicted class index to its species name.
print("Species:", [iris.target_names[class_idx] for class_idx in pred_ids])

Raw preds: [0, 1]
Species: ['setosa', 'versicolor']
