In [None]:
!pip install git+https://github.com/SAP-samples/sap-rpt-1-oss

Collecting git+https://github.com/SAP-samples/sap-rpt-1-oss
  Cloning https://github.com/SAP-samples/sap-rpt-1-oss to /tmp/pip-req-build-2vxuo3az
  Running command git clone --filter=blob:none --quiet https://github.com/SAP-samples/sap-rpt-1-oss /tmp/pip-req-build-2vxuo3az
  Resolved https://github.com/SAP-samples/sap-rpt-1-oss to commit 4310453f3bbb9c9c403b029e966565e7fe652d97
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting torcheval>=0.0.7 (from sap_rpt_oss==1.0.1)
  Downloading torcheval-0.0.7-py3-none-any.whl.metadata (8.6 kB)
Collecting pandas>=2.2.3 (from sap_rpt_oss==1.0.1)
  Downloading pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyarrow>=20.0.0 (from sap_rpt_oss==1.0.1)
  Downloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Downloading pandas-2.3.3-cp3

In [None]:
!pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [None]:
import time

from google.colab import userdata
from huggingface_hub import login
import numpy as np
import pandas as pd
from sap_rpt_oss import SAP_RPT_OSS_Classifier, SAP_RPT_OSS_Regressor
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer, SimpleImputer
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error, r2_score, roc_auc_score
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.utils.multiclass import type_of_target
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from ucimlrepo import fetch_ucirepo

In [None]:
# Read the Hugging Face token from Colab's userdata store and log in with it.
HF_TOKEN = userdata.get('HF_TOKEN')
if not HF_TOKEN:
    print("Token is not set. Please save the token first.")
else:
    login(HF_TOKEN)
    print("Successfully logged in to Hugging Face!")

Successfully logged in to Hugging Face!


In [None]:
def load_and_split_data():
    """Read the salary CSV and return a stratified 80/20 train/test split.

    The target column is ``income`` when present, otherwise the last column.
    It is binarised to 1 when the label contains ``>50K`` and 0 otherwise.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test, categorical_cols,
        numerical_cols)``, where the two column lists are derived from the
        feature dtypes (object/category vs. int64/float64).
    """
    path = '...'
    print(f"Loading dataset from {path}...")

    # '?' markers become NaN; whitespace following delimiters is skipped.
    frame = pd.read_csv(path, na_values=['?', ' ?'], skipinitialspace=True)

    if 'income' in frame.columns:
        target_col = 'income'
    else:
        target_col = frame.columns[-1]

    labels = frame[target_col]
    X = frame.drop(columns=[target_col])

    # Binary classification target: 1 for the '>50K' class, 0 for everything else.
    y = labels.astype(str).str.strip().apply(lambda label: int('>50K' in label))

    numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

    # The test split is returned as loaded; nothing in this function corrupts it.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    return X_train, X_test, y_train, y_test, categorical_cols, numerical_cols

In [None]:
def load_gas_turbine_data():
    """Load the UCI Gas Turbine emission dataset and split it for CO regression.

    Fetches dataset id 551 through ``ucimlrepo``, uses ``CO`` as the regression
    target and removes the other emission column (NOx) so that it cannot be
    used as a feature by the imputers or the downstream model.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test, categorical_cols,
        numerical_cols)``. ``categorical_cols`` is always an empty list; it is
        returned only to keep the same shape as ``load_and_split_data``.
    """
    print("Loading Gas Turbine dataset...")

    # Fetch via ucimlrepo and put features and targets into one frame.
    dataset = fetch_ucirepo(id=551)
    X = dataset.data.features
    y = dataset.data.targets
    df = pd.concat([X, y], axis=1)

    print(f"Data Loaded: {df.shape}")

    # Regression task: emission prediction.
    target_col = 'CO'

    # Drop the other emission target. The match is case-insensitive because
    # the column arrives as 'NOX'; the previous exact test for 'NOx' never
    # matched, so the column silently stayed in the feature matrix.
    other_targets = [c for c in df.columns if str(c).upper() == 'NOX']
    if other_targets:
        df = df.drop(columns=other_targets)

    y = df[target_col]
    X = df.drop(columns=[target_col])

    numerical_cols = X.select_dtypes(include=['float64', 'int64']).columns.tolist()
    categorical_cols = []  # none in this dataset, but needed for return consistency with the other loader

    # Same 80/20 split and seed as the classification loader (no stratification).
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    return X_train, X_test, y_train, y_test, categorical_cols, numerical_cols

In [None]:
def corrupt_data(X, numerical_cols, mechanism='MCAR', ratio=0.3, min_prob=0.05, random_state=None):
    """
    Return a copy of ``X`` with values in ``numerical_cols`` replaced by NaN.

    Parameters:
    - X: input DataFrame; it is not modified.
    - numerical_cols: names of the columns to corrupt.
    - mechanism: 'MCAR' (uniform random) or 'MNAR' (U-shaped distribution:
      values far from the column median are more likely to be removed).
    - ratio: Missing rate for MCAR, or corruption strength for MNAR.
    - min_prob: base deletion probability for MNAR (ignored for MCAR).
    - random_state: optional seed for reproducible masks. When None the
      global NumPy random state is used, exactly as before.

    Returns:
    - The corrupted copy. Corrupted integer columns come back as float,
      because NaN cannot be stored in an integer column.

    Raises:
    - ValueError: if ``mechanism`` is neither 'MCAR' nor 'MNAR'.
    """
    X_corrupt = X.copy()
    n_rows = len(X_corrupt)

    # Uniform [0, 1) sampler: seeded generator when requested, global state otherwise.
    if random_state is None:
        draw = np.random.rand
    else:
        draw = np.random.default_rng(random_state).random

    if mechanism == 'MCAR':
        print(f"  > Applying MCAR corruption (rate={ratio})...")
        for col in numerical_cols:
            mask = draw(n_rows) < ratio
            # Series.mask upcasts integer columns to float explicitly instead of
            # relying on implicit upcasting during a .loc assignment of NaN.
            X_corrupt[col] = X_corrupt[col].mask(mask)

    elif mechanism == 'MNAR':
        print(f"  > Applying MNAR corruption (strength={ratio})...")
        for col in numerical_cols:
            # percentile ranking
            p_rank = X_corrupt[col].rank(pct=True)

            # distance from median. x2 to normalize distance [0, 1]
            dist_from_median = 2 * np.abs(p_rank - 0.5)

            # probability of deletion = Base + (Strength * Normalized_Distance)
            prob_missing = min_prob + (ratio * dist_from_median)

            # Positional comparison; rows whose rank is NaN compare False and are left alone.
            mask = draw(n_rows) < prob_missing.to_numpy()
            X_corrupt[col] = X_corrupt[col].mask(mask)

    else:
        raise ValueError(f"Unknown mechanism: {mechanism}")

    return X_corrupt

In [None]:
def impute_baseline(X_train, categorical_cols, numerical_cols):
    """Fill numeric columns with their mean and categorical columns with their mode.

    Returns:
        tuple: ``(imputed copy of X_train, elapsed seconds)``.
    """
    t0 = time.time()
    X_filled = X_train.copy()

    # Numeric columns first, then categorical; an empty column list is skipped.
    plan = (
        (numerical_cols, 'mean'),
        (categorical_cols, 'most_frequent'),
    )
    for cols, strategy in plan:
        if cols:
            filler = SimpleImputer(strategy=strategy)
            X_filled[cols] = filler.fit_transform(X_train[cols])

    elapsed = time.time() - t0
    return X_filled, elapsed

In [None]:
def impute_knn(X_train, categorical_cols, numerical_cols):
    """KNN Imputation

    Categorical columns are ordinal-encoded (missing values stay NaN), the
    whole frame is imputed with ``KNNImputer(n_neighbors=5)``, and the
    categorical codes are decoded back to their original labels.

    ``numerical_cols`` is accepted only so that all imputers share the same
    signature; it is not used here.

    Returns:
        tuple: ``(imputed DataFrame, elapsed seconds)``.
    """
    start = time.time()
    X_working = X_train.copy()

    # Ordinal encoding for categoricals, one encoder per column.
    # (An earlier version also built a stringified copy of each column here
    # but never passed it to the encoder; that dead code has been removed.)
    encoders = {}
    for col in categorical_cols:
        oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan, encoded_missing_value=np.nan)
        X_working[[col]] = oe.fit_transform(X_working[[col]])
        encoders[col] = oe

    imputer = KNNImputer(n_neighbors=5)
    X_filled = imputer.fit_transform(X_working)
    X_filled = pd.DataFrame(X_filled, columns=X_working.columns, index=X_working.index)

    # Decode categoricals back.
    for col in categorical_cols:
        oe = encoders[col]

        # Imputed codes are neighbour averages and can be fractional. Snap them
        # to the nearest real category code, as impute_mice does, rather than
        # leaving the conversion to inverse_transform.
        # NOTE(review): inverse_transform appears to cast codes to int, i.e.
        # truncate - confirm against the installed scikit-learn version.
        n_valid = int(pd.notna(oe.categories_[0]).sum())  # excludes the NaN category
        X_filled[col] = X_filled[col].round().clip(0, max(n_valid - 1, 0))
        X_filled[[col]] = oe.inverse_transform(X_filled[[col]])

    return X_filled, time.time() - start

In [None]:
def impute_mice(X_train, categorical_cols, numerical_cols):
    """MICE Imputation (IterativeImputer)

    Categorical columns are ordinal-encoded (missing values stay NaN), the
    whole frame is imputed with ``IterativeImputer(max_iter=10,
    random_state=777)``, and the categorical predictions are rounded, clipped
    to the valid code range and decoded back to their original labels.

    ``numerical_cols`` is accepted only so that all imputers share the same
    signature; it is not used here.

    Returns:
        tuple: ``(imputed DataFrame, elapsed seconds)``.
    """
    start = time.time()
    X_working = X_train.copy()

    # Ordinal encoding, with a separate encoder for every column.
    encoders = {}
    for col in categorical_cols:
        oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan, encoded_missing_value=np.nan)
        X_working[[col]] = oe.fit_transform(X_working[[col]])
        encoders[col] = oe

    imputer = IterativeImputer(max_iter=10, random_state=777)
    X_filled = imputer.fit_transform(X_working)
    X_filled = pd.DataFrame(X_filled, columns=X_working.columns, index=X_working.index)

    # Decoding back.
    for col in categorical_cols:
        oe = encoders[col]
        series = X_filled[col].round()

        # Clip the prediction to the range of real category codes. When the
        # column had missing values, categories_ also lists NaN as its last
        # entry; counting it would let a high prediction decode back to NaN,
        # so only non-missing categories are counted.
        n_valid = int(pd.notna(oe.categories_[0]).sum())
        series = series.clip(0, max(n_valid - 1, 0))

        X_filled[col] = series
        X_filled[[col]] = oe.inverse_transform(X_filled[[col]])

    return X_filled, time.time() - start

In [None]:
def impute_sap_rpt(X_train, categorical_cols, numerical_cols):
    """Impute each incomplete column with an SAP RPT model fitted on its observed rows.

    Columns are repaired one after another on a working copy, so a column
    repaired earlier is already filled in when it serves as a feature for a
    later one. Columns listed in ``numerical_cols`` use the regressor; every
    other column uses the classifier.

    Returns:
        tuple: ``(imputed copy of X_train, elapsed seconds)``.
    """
    started_at = time.time()
    X_imputed = X_train.copy()

    incomplete_cols = X_train.columns[X_train.isna().any()].tolist()

    for col in incomplete_cols:
        is_missing = X_imputed[col].isna()
        n_missing = int(is_missing.sum())
        if n_missing == 0:
            continue

        # Rows where the column is observed are the context; rows where it is
        # missing are the prediction targets. The column itself is never a feature.
        observed = ~is_missing
        other_features = X_imputed.drop(columns=[col])
        X_context = other_features[observed]
        y_context = X_imputed.loc[observed, col]
        X_target = other_features[is_missing]

        print(f"  > RPT repairing column: {col} ({n_missing} missing)")

        model_cls = SAP_RPT_OSS_Regressor if col in numerical_cols else SAP_RPT_OSS_Classifier
        model = model_cls(max_context_size=1024)

        model.fit(X_context, y_context)
        X_imputed.loc[is_missing, col] = model.predict(X_target)

    elapsed = time.time() - started_at
    return X_imputed, elapsed

In [None]:
def evaluate_reconstruction(X_clean, X_imputed, categorical_cols, numerical_cols):
    """Score how closely ``X_imputed`` matches the clean data on the same rows.

    Returns:
        tuple: ``(rmse, acc)``. ``rmse`` is the RMSE over all numeric cells and
        ``acc`` is the cell-wise accuracy over all categorical cells, compared
        as strings. Either value is 0.0 when its column list is empty.
    """
    # Align the clean frame to the rows present in the imputed frame.
    reference = X_clean.loc[X_imputed.index]

    rmse = 0.0
    if len(numerical_cols) > 0:
        mse = mean_squared_error(reference[numerical_cols], X_imputed[numerical_cols])
        rmse = np.sqrt(mse)

    acc = 0.0
    if len(categorical_cols) > 0:
        # Flatten both sides so every cell counts as one prediction.
        truth = reference[categorical_cols].astype(str).values.flatten()
        guess = X_imputed[categorical_cols].astype(str).values.flatten()
        acc = accuracy_score(truth, guess)

    return rmse, acc

In [None]:
def evaluate_downstream(X_train_imp, y_train, X_test_clean, y_test, cat_cols, num_cols):
    """Train a random forest on the imputed training data and score it on the clean test set.

    The task type is chosen with ``type_of_target(y_train)``: continuous
    targets are treated as regression, everything else as classification.

    Returns:
        dict: ``{'R2', 'RMSE', 'Type'}`` for regression, or
        ``{'AUC', 'F1', 'Type'}`` for classification.
    """
    # Scale numeric columns; one-hot encode categoricals only when there are any.
    steps = [('num', StandardScaler(), num_cols)]
    if cat_cols:
        steps.append(('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols))
    preprocessor = ColumnTransformer(steps)

    is_regression = type_of_target(y_train) in ['continuous', 'continuous-multioutput']

    forest_cls = RandomForestRegressor if is_regression else RandomForestClassifier
    forest = forest_cls(n_estimators=50, max_depth=10, n_jobs=-1, random_state=777)
    model = make_pipeline(preprocessor, forest)

    model.fit(X_train_imp, y_train)
    y_pred = model.predict(X_test_clean)

    if not is_regression:
        # Second probability column is used as the AUC score.
        positive_scores = model.predict_proba(X_test_clean)[:, 1]
        return {
            'AUC': roc_auc_score(y_test, positive_scores),
            'F1': f1_score(y_test, y_pred),
            'Type': 'Classification'
        }

    return {
        'R2': r2_score(y_test, y_pred),
        'RMSE': np.sqrt(mean_squared_error(y_test, y_pred)),
        'Type': 'Regression'
    }

In [None]:
def run_benchmark(mechanism='MCAR', corruption_ratio=0.3):
    """Corrupt the Gas Turbine training split and compare every imputer on it.

    For each imputer this records wall-clock time, reconstruction RMSE, the
    mean ratio of imputed to original column standard deviation, and the
    downstream model metrics. An imputer that raises is reported and skipped.

    Parameters:
    - mechanism: missingness mechanism forwarded to ``corrupt_data``.
    - corruption_ratio: forwarded to ``corrupt_data`` as ``ratio``.

    Returns:
        pandas.DataFrame: one row per imputer that completed.
    """
    X_train, X_test, y_train, y_test, cat_cols, num_cols = load_gas_turbine_data()
    original_std = X_train[num_cols].std()

    X_train_corrupt = corrupt_data(X_train, num_cols, mechanism=mechanism, ratio=corruption_ratio)

    imputers = (
        ('Baseline (Mean/Mode)', impute_baseline),
        ('KNN', impute_knn),
        ('MICE', impute_mice),
        ('SAP_RPT', impute_sap_rpt),
    )
    rows = []

    print(f"\n--- Starting Benchmark ({mechanism}) ---")
    for label, imputer in imputers:
        print(f"Running {label}...")

        try:
            X_imputed, elapsed = imputer(X_train_corrupt, cat_cols, num_cols)
        except Exception as e:
            print(f"Method {label} failed: {e}")
            continue

        # Categorical accuracy is returned as well but is not reported here.
        rmse, _ = evaluate_reconstruction(X_train, X_imputed, cat_cols, num_cols)

        # Values near 1 mean the imputation kept the original per-column spread.
        std_recovery_ratio = (X_imputed[num_cols].std() / original_std).mean()

        downstream = evaluate_downstream(X_imputed, y_train, X_test, y_test, cat_cols, num_cols)

        row = {
            'Method': label,
            'Mechanism': mechanism,
            'Time (s)': round(elapsed, 2),
            'Rec_RMSE': round(rmse, 4),
            'Std_Dev_Ratio': round(std_recovery_ratio, 4),
            'Task_Type': downstream.get('Type', 'Unknown')
        }

        if downstream.get('Type') != 'Regression':
            row['Downstream_AUC'] = round(downstream.get('AUC', 0), 4)
            row['Downstream_F1'] = round(downstream.get('F1', 0), 4)
            print(f"    -> Done. Time: {row['Time (s)']}s | AUC: {row['Downstream_AUC']}")
        else:
            row['Downstream_R2'] = round(downstream.get('R2', 0), 4)
            row['Downstream_RMSE'] = round(downstream.get('RMSE', 0), 4)
            print(f"    -> Done. Time: {row['Time (s)']}s | R2: {row['Downstream_R2']}")

        rows.append(row)

    return pd.DataFrame(rows)

In [None]:
# Run the benchmark with MNAR corruption at strength 0.7 and print the summary table.
df_results = run_benchmark(corruption_ratio=0.7, mechanism='MNAR')
print("\n--- Final Results ---", df_results, sep="\n")

Loading Gas Turbine dataset...
Data Loaded: (36733, 12)
  > Applying MNAR corruption (strength=0.7)...

--- Starting Benchmark (MNAR) ---
Running Baseline (Mean/Mode)...
    -> Done. Time: 0.02s | R2: 0.7151
Running KNN...
    -> Done. Time: 95.71s | R2: 0.683
Running MICE...
    -> Done. Time: 0.74s | R2: 0.7334
Running SAP_RPT...
  > RPT repairing column: year (11463 missing)
  > RPT repairing column: AT (11799 missing)
  > RPT repairing column: AP (11862 missing)
  > RPT repairing column: AH (11747 missing)
  > RPT repairing column: AFDP (11850 missing)
  > RPT repairing column: GTEP (11658 missing)
  > RPT repairing column: TIT (11621 missing)
  > RPT repairing column: TAT (11780 missing)
  > RPT repairing column: TEY (11851 missing)
  > RPT repairing column: CDP (11805 missing)
  > RPT repairing column: NOX (11708 missing)
    -> Done. Time: 201.66s | R2: 0.7237

--- Final Results ---
                 Method Mechanism  Time (s)  Rec_RMSE  Std_Dev_Ratio  \
0  Baseline (Mean/Mode)  