In [1]:
# (Re)load IPython's autoreload extension and switch it to mode 2, so edits to
# imported modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

import warnings

import polars as pl

# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation/convergence warnings from the
# modelling libraries used below — confirm that is intended.
warnings.simplefilter("ignore")

# Keep polars table reprs compact: at most 5 rows and 20 columns.
_ = pl.Config.set_tbl_rows(5).set_tbl_cols(20)

In [2]:
from dttk.data import load_credit_fraud_dataset

# Load the two tables of the credit-fraud dataset. Later cells read the "ID"
# and "fraud_flag" columns of `baskets_df` and the "basket_ID" column of
# `products_df`.
baskets_df, products_df = load_credit_fraud_dataset()

In [3]:
# Split the baskets table into the key column (features are joined onto it
# later) and the target column, each kept as a one-column frame.
X_raw = baskets_df.select("ID")
y_raw = baskets_df.select("fraud_flag")

In [8]:
import skrub

# Vectorize the products table with skrub, leaving "basket_ID" untouched
# ("passthrough") so it can still be used as the grouping / join key.
vectorizer = skrub.TableVectorizer(
    specific_transformers=[("passthrough", ["basket_ID"])],
)
vectorized_products = pl.DataFrame(vectorizer.fit_transform(products_df))

# Collapse to one row per basket: every vectorized column is averaged over the
# products that belong to the basket.
aggregated_products = vectorized_products.group_by("basket_ID").mean()

# Attach the per-basket aggregates to the basket IDs. A left join keeps baskets
# that have no matching product rows (their feature columns are null).
# NOTE(review): later cells pair these rows positionally with `y_raw`, which
# assumes the join keeps the row order of `X_raw` — confirm for the polars
# version in use.
X_train = X_raw.join(
    aggregated_products,
    how="left",
    left_on="ID",
    right_on="basket_ID",
)

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.pipeline import FeatureUnion, Pipeline
from skrub import AggTarget, tabular_learner

# Move to pandas for scikit-learn: the target as a one-column frame, the
# features with the "ID" key dropped.
# NOTE(review): X and y are matched by row position only; this assumes the
# left join that built `X_train` preserved the order of the baskets — confirm.
y = y_raw.to_pandas()
X = X_train.drop("ID").to_pandas()

# Baseline learner from skrub, configured for regression.
# NOTE(review): the target column is named "fraud_flag", which looks like a
# binary label — confirm that a regressor scored with R2 is the intended setup.
model = tabular_learner("regressor")

# Target encoding: aggregate the target (mean / max / min) per value of
# "Nbr_of_prod_purchas". Wrapped in a FeatureUnion with a single entry.
target_encodings = FeatureUnion(
    transformer_list=[
        (
            "target_encoding_nbr_of_prod_purchas",
            AggTarget(
                main_key="Nbr_of_prod_purchas",
                operations=["mean", "max", "min"],
            ),
        ),
    ]
)

# Full estimator: target encoding followed by the learner.
pipeline = Pipeline(
    steps=[
        ("target_encoding", target_encodings),
        ("model", model),
    ]
)

# Cross-validate with the default splitter and report the mean test R2.
results = cross_validate(pipeline, X, y, scoring="r2")

print(f"R2: {results['test_score'].mean()}")

In [None]:
from dttk.utils import compute_feature_importance, plot_feature_importance

# Fit `model` on the full X / y. Note this is the bare `tabular_learner`
# assigned to `model` in the cross-validation cell, not the `pipeline` that
# adds the target-encoding step, so the importances below do not cover the
# target-encoded columns.
model.fit(X, y)

# NOTE(review): `compute_feature_importance` is a project helper; the result is
# indexed with "importances_mean" below, which suggests a permutation-importance
# style output — confirm against dttk.utils.
f = compute_feature_importance(model, X, y)

# Plot the mean importance of each feature against the input column names
# recorded by the fitted model.
plot_feature_importance(
    f["importances_mean"],
    model.feature_names_in_,
)

In [None]:
from tabpfn import TabPFNRegressor

# TabPFN baseline. This rebinds `model`, replacing the tabular_learner that
# earlier cells stored under the same name.
model = TabPFNRegressor(
    fit_mode="low_memory",
    ignore_pretraining_limits=True,
)

# Limit to 1000 rows for memory reasons
results = cross_validate(
    estimator=model,
    X=X.iloc[:1000],
    y=y.iloc[:1000],
    scoring="r2",
)

print(f"R2: {results['test_score'].mean()}")

In [None]:
from flaml import AutoML

# FLAML AutoML baseline: search for a regressor that maximises R2 within a
# 10-second time budget.
automl = AutoML(metric="r2", time_budget=10)

# `y` is a one-column pandas DataFrame (it comes from `y_raw.to_pandas()`),
# but FLAML documents the labels as a numpy array or pandas Series of shape
# (n,). Flatten to a 1-D array here; `.to_numpy().ravel()` works the same
# whether `y` is a DataFrame or already a Series.
automl.fit(X, y.to_numpy().ravel(), task="regression")