In [1]:
%reload_ext autoreload
%autoreload 2

import warnings

import polars as pl
from sklearn import set_config

warnings.filterwarnings("ignore")

_ = pl.Config.set_tbl_rows(5)
_ = pl.Config.set_tbl_cols(20)

set_config(transform_output="polars")

In [2]:
from dttk.data import load_credit_fraud_dataset

# The loader's result is unpacked into two tables. Later cells read the "ID"
# and "fraud_flag" columns from the first and the "basket_ID" column from the
# second. NOTE(review): presumably one row per basket / one row per product
# line respectively — confirm against the loader.
baskets_df, products_df = load_credit_fraud_dataset()

In [3]:
# Start the feature frame from the basket identifier alone; the target is kept
# as its own single-column frame.
X_raw = baskets_df.select("ID")
y_raw = baskets_df.select("fraud_flag")

In [4]:
import skrub

# Leave the join key untouched so it can still be used to group and join
# after vectorization; every other column goes through the default encoders.
key_passthrough = [("passthrough", ["basket_ID"])]
vectorizer = skrub.TableVectorizer(specific_transformers=key_passthrough)

encoded_products = vectorizer.fit_transform(products_df)
vectorized_products = pl.DataFrame(encoded_products)

In [5]:
# Reduce the product table to one row per basket by averaging every
# non-key column within each "basket_ID" group.
aggregated_products = (
    vectorized_products
    .group_by("basket_ID")
    .agg(pl.all().mean())
)

# Left join: every row of X_raw is kept, matched on ID == basket_ID.
X_train = X_raw.join(
    aggregated_products,
    how="left",
    left_on="ID",
    right_on="basket_ID",
)

In [6]:
from skrub import AggTarget

# Columns used as grouping keys; each one gets its own AggTarget pass that
# appends mean/max/min statistics of the target to X_train.
column_groups = ["Nbr_of_prod_purchas"]

# NOTE(review): the aggregator is fitted on every row of X_train / y_raw, and
# the resulting frame is cross-validated in a later cell, so target statistics
# from validation rows are visible at training time (target leakage). Also,
# X_train is overwritten in place, so this cell is not idempotent on re-run.
for col in column_groups:
    aggregator = AggTarget(main_key=col, operations=["mean", "max", "min"])
    X_train = aggregator.fit_transform(X_train, y_raw)  # type: ignore

In [7]:
from sklearn.model_selection import cross_validate
from skrub import tabular_learner

# Estimators below consume plain NumPy arrays.
# NOTE(review): X still carries the "ID" join key as a column — confirm that
# it is meant to be used as a feature.
X = X_train.to_numpy()

# y_raw is a one-column DataFrame, so to_numpy() yields shape (n, 1).
# scikit-learn expects a 1-D target and otherwise only copes by emitting a
# DataConversionWarning (which is silenced in this notebook), so flatten it.
y = y_raw.to_numpy().ravel()

# Baseline: skrub's default tabular regression pipeline, scored with R2 under
# cross_validate's default cross-validation splitter.
model = tabular_learner("regressor")
results = cross_validate(model, X, y, scoring="r2")

print(f"R2: {results['test_score'].mean()}")

R2: 0.11037235406204257


In [8]:
from tabpfn import TabPFNRegressor

# Only the first 1000 rows are evaluated, for memory reasons.
TABPFN_ROWS = slice(1000)

model = TabPFNRegressor(fit_mode="low_memory", ignore_pretraining_limits=True)
results = cross_validate(
    model,
    X[TABPFN_ROWS],
    y[TABPFN_ROWS],
    scoring="r2",
)

print(f"R2: {results['test_score'].mean()}")

R2: -0.013318538665771484


In [9]:
from flaml import AutoML

# Search settings: optimise R2 within a 10-second time budget.
automl_settings = {"metric": "r2", "time_budget": 10}
automl = AutoML(**automl_settings)

# Run the search as a regression task on the full feature matrix and target.
automl.fit(X, y, task="regression")

[flaml.automl.logger: 04-20 19:53:49] {1728} INFO - task = regression
[flaml.automl.logger: 04-20 19:53:49] {1739} INFO - Evaluation method: holdout
[flaml.automl.logger: 04-20 19:53:49] {1838} INFO - Minimizing error metric: 1-r2
[flaml.automl.logger: 04-20 19:53:49] {1955} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'sgd']
[flaml.automl.logger: 04-20 19:53:49] {2258} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 04-20 19:53:50] {2393} INFO - Estimated sufficient time budget=36609s. Estimated necessary time budget=262s.
[flaml.automl.logger: 04-20 19:53:50] {2442} INFO -  at 0.6s,	estimator lgbm's best error=0.9956,	best estimator lgbm's best error=0.9956
[flaml.automl.logger: 04-20 19:53:50] {2258} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 04-20 19:53:50] {2442} INFO -  at 0.7s,	estimator lgbm's best error=0.9956,	best estimator lgbm's best error=0.9956
[flaml.automl.logger: 04-20 19:53:50] 