In [1]:
from __future__ import annotations

import os
import sys
from pathlib import Path

import numpy as np
import pandas as pd

# Use skyulf as-is when it is already importable. Otherwise search a few
# directories around the current working directory (repo root or skyulf-core/)
# for a local checkout and put the first match at the front of sys.path.
try:
    import skyulf  # noqa: F401
except ImportError:
    cwd = Path.cwd()
    search_roots = (cwd, cwd / 'skyulf-core', cwd.parent, cwd.parent / 'skyulf-core')
    package_root = next(
        (root for root in search_roots if (root / 'skyulf' / '__init__.py').exists()),
        None,
    )
    # When nothing is found sys.path is left untouched.
    if package_root is not None:
        sys.path.insert(0, str(package_root))

from skyulf import SkyulfPipeline

In [2]:
def create_dummy_data(n: int = 200, seed: int = 42) -> pd.DataFrame:
    """Build a small synthetic customer dataset for the demo.

    Parameters
    ----------
    n : int, default 200
        Number of rows to generate.
    seed : int, default 42
        Value passed to ``np.random.seed``. This reseeds NumPy's *global*
        legacy RNG, so subsequent ``np.random.*`` calls in the same kernel
        are affected too.

    Returns
    -------
    pd.DataFrame
        Columns ``age`` (integers drawn from [18, 80)), ``income`` (normal
        draws with mean 50000 and standard deviation 15000), ``city`` (one of
        'New York', 'London', 'Paris') and ``is_customer`` (0 or 1). The
        ``income`` of the rows labelled 0 through 10 is set to NaN.
    """
    np.random.seed(seed)
    # The columns are drawn in this order; reordering them would change the
    # generated values for a given seed.
    df = pd.DataFrame({
        'age': np.random.randint(18, 80, n),
        'income': np.random.normal(50000, 15000, n),
        'city': np.random.choice(['New York', 'London', 'Paris'], n),
        'is_customer': np.random.choice([0, 1], n),
    })
    # .loc slices by label and includes both endpoints, so this blanks
    # 11 rows (labels 0..10), or every row when n < 11.
    df.loc[0:10, 'income'] = np.nan
    return df

# Generate the demo dataset with the function's defaults and preview the
# first rows; the leading rows show the NaN incomes injected by the generator.
data = create_dummy_data()
data.head()

Unnamed: 0,age,income,city,is_customer
0,56,,Paris,1
1,69,,London,1
2,46,,Paris,1
3,32,,New York,1
4,60,,New York,1


In [3]:
# Preprocessing steps, kept in the order they are listed in the config.
preprocessing_steps = [
    # Train/test split on the label column.
    dict(
        name='split_data',
        transformer='TrainTestSplitter',
        params=dict(test_size=0.2, target_column='is_customer'),
    ),
    # Fill the missing 'income' values with the column mean.
    dict(
        name='impute_income',
        transformer='SimpleImputer',
        params=dict(columns=['income'], strategy='mean'),
    ),
    # One-hot encode the categorical 'city' column.
    dict(
        name='encode_city',
        transformer='OneHotEncoder',
        params=dict(columns=['city']),
    ),
    # Standardize the numeric features.
    dict(
        name='scale_features',
        transformer='StandardScaler',
        params=dict(columns=['age', 'income']),
    ),
]

# Estimator specification: a small random forest classifier.
model_spec = dict(
    type='random_forest_classifier',
    params=dict(n_estimators=50, max_depth=5),
)

config = dict(preprocessing=preprocessing_steps, modeling=model_spec)

# Fit the whole pipeline on the demo data; fit() returns the metrics report
# that is displayed as the cell output.
pipeline = SkyulfPipeline(config)
metrics = pipeline.fit(data, target_column='is_customer')
metrics

{'preprocessing': {'missing_counts': {'income': 10},
  'total_missing': 10,
  'fill_values': {'income': 51031.74519845609},
  'new_features_count': 3,
  'encoded_columns_count': 1,
  'mean': [48.71875, 51031.74519845609],
  'scale': [18.861061699636636, 14067.363767993564],
  'var': [355.7396484375, 197890723.3810581],
  'columns': ['age', 'income']},
 'modeling': {'problem_type': 'classification',
  'splits': {'train': ModelEvaluationReport(dataset_name='train', metrics={'accuracy': 0.85625, 'precision_weighted': 0.8588462752525252, 'recall_weighted': 0.85625, 'f1_weighted': 0.8558951173635331, 'precision': 0.8295454545454546, 'recall': 0.9012345679012346, 'f1': 0.863905325443787, 'g_score': 0.8556803856238504, 'roc_auc': 0.9454602281606501, 'pr_auc': 0.9476538297837627}, classification=ClassificationEvaluation(confusion_matrix=ConfusionMatrixData(labels=['0', '1'], matrix=[[0, 0], [0, 0]]), roc_curves=[CurveData(name='ROC (Class 1)', points=[CurvePoint(x=0.0, y=0.0), CurvePoint(x=0.0

In [4]:
# Persist the fitted pipeline to disk, then reload it to show that a saved
# artifact can be used for inference on its own.
# NOTE(review): the .pkl extension suggests pickle serialization — if so, only
# load artifacts from trusted sources; confirm the format used by save()/load().
artifact_path = 'my_model.pkl'
pipeline.save(artifact_path)

loaded = SkyulfPipeline.load(artifact_path)

# Two unseen rows without the target column; the second row has a missing
# income. Presumably the loaded pipeline applies its fitted preprocessing
# (imputation, encoding, scaling) before predicting — TODO confirm.
new_data = pd.DataFrame({
    'age': [25, 40],
    'income': [60000, np.nan],
    'city': ['London', 'Paris'],
})

# Predict with the reloaded pipeline and display the result.
predictions = loaded.predict(new_data)
predictions

0    0
1    1
dtype: int32

In [5]:
# Cleanup: delete the artifact written by the save/load demo.
# Attempt the removal directly instead of checking os.path.exists() first:
# the check-then-remove pattern can fail if the file disappears in between.
try:
    os.remove(artifact_path)
except FileNotFoundError:
    # Already gone (e.g. the cell was re-run) — nothing to clean up.
    pass
artifact_path

'my_model.pkl'