In [1]:
# Reload edited modules automatically before each cell is executed.
%reload_ext autoreload
%autoreload 2

import warnings

import polars as pl

# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")

# Cap polars table previews at 5 rows and 20 columns. Binding the result to `_`
# keeps the returned value from being echoed as the cell output.
_ = pl.Config.set_tbl_rows(5)
_ = pl.Config.set_tbl_cols(20)

In [2]:
from dttk.data import load_credit_fraud_dataset

# Load the two tables used below: the basket table (holds `ID` and `fraud_flag`)
# and the product table (holds `basket_ID`, which is joined against `ID` later).
baskets_df, products_df = load_credit_fraud_dataset()

In [3]:
# Single-column frames: the fraud flag is the target, the basket ID is the key
# that product-level features get joined onto.
y_raw = baskets_df.select("fraud_flag")
X_raw = baskets_df.select("ID")

In [5]:
import skrub

# Vectorize the product table while leaving the `basket_ID` key column as-is,
# so it can still be used for grouping and joining afterwards.
vectorizer = skrub.TableVectorizer(
    n_jobs=-1,
    specific_transformers=[("passthrough", ["basket_ID"])],
)

product_features = vectorizer.fit_transform(products_df)
vectorized_products = pl.DataFrame(product_features)

# One row per basket: every vectorized product column is averaged per `basket_ID`.
mean_of_each_column = pl.all().mean()
aggregated_products = vectorized_products.group_by("basket_ID").agg(
    mean_of_each_column
)

# Left join keeps every basket from `X_raw`, in its original order.
X_train = X_raw.join(
    aggregated_products,
    how="left",
    left_on="ID",
    right_on="basket_ID",
)

In [6]:
# Preview the joined feature table (basket `ID` plus the per-basket mean features).
X_train

ID,"item: peripherals, aerials, paper","item: fulfilment, charge, men","item: computers, computer, printers","item: televisions, cinema, home","item: living, dining, reading","item: telephones, telephone, machines","item: bedroom, bathroom, furniture","item: warranty, fragrance, and","item: fax, two, way",…,"goods_code: 239001532, 239001518, 239001531","goods_code: 239841522, 239841523, 239841491","goods_code: 239482926, 239482920, 239482921","goods_code: 240376603, 240376608, 240376609","goods_code: 239091963, 239091958, 239091952","goods_code: 240040984, 240040987, 240040959","goods_code: skucode42, skucode43, skucode44","goods_code: 237253797, 237397625, 237775721","goods_code: 238282622, 238349162, 238282623",Nbr_of_prod_purchas
i64,f32,f32,f32,f32,f32,f32,f32,f32,f32,…,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32
85517,0.064949,0.05,10.486229,0.051607,0.05,0.050502,0.051221,0.05,0.050429,…,0.058657,0.061801,0.052335,0.06101,0.058234,0.053104,0.053715,0.0546,0.05318,1.0
51113,41.356621,0.05,0.345978,0.052461,0.05,0.05401,0.051227,0.053021,0.05315,…,10.297161,0.061371,0.051582,0.05,0.066862,0.052951,0.053154,0.052466,0.052808,1.0
83008,0.051761,0.052304,0.055225,31.45616,0.053342,0.059577,0.051227,0.05,0.057309,…,0.056295,9.664343,0.072373,0.060538,0.24308,0.346795,0.151389,0.058133,0.063747,1.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
43567,0.064949,0.05,10.486229,0.051607,0.05,0.050502,0.051221,0.05,0.050429,…,0.073263,0.052561,0.053731,0.055283,0.054406,0.193967,0.057367,0.055042,0.056871,1.0
68268,0.051761,0.052304,0.055225,31.45616,0.053342,0.059577,0.051227,0.05,0.057309,…,0.072215,2.343755,0.051104,0.057986,0.069873,0.054548,0.052549,0.054347,0.054506,1.0


In [9]:
from category_encoders import TargetEncoder

# `drop_invariant` is an encoder option and has to be set on the constructor.
# When handed to `fit_transform` it is only forwarded as an extra fit keyword
# and never changes the encoder's configuration, so constant columns were kept.
target_encoder = TargetEncoder(cols=["Nbr_of_prod_purchas"], drop_invariant=True)

# Encode `Nbr_of_prod_purchas` against the `fraud_flag` target; the encoded
# frame is shown as the cell output.
target_encoder.fit_transform(X_train.to_pandas(), y_raw.to_pandas())

Unnamed: 0,ID,"item: peripherals, aerials, paper","item: fulfilment, charge, men","item: computers, computer, printers","item: televisions, cinema, home","item: living, dining, reading","item: telephones, telephone, machines","item: bedroom, bathroom, furniture","item: warranty, fragrance, and","item: fax, two, way",...,"goods_code: 239001532, 239001518, 239001531","goods_code: 239841522, 239841523, 239841491","goods_code: 239482926, 239482920, 239482921","goods_code: 240376603, 240376608, 240376609","goods_code: 239091963, 239091958, 239091952","goods_code: 240040984, 240040987, 240040959","goods_code: skucode42, skucode43, skucode44","goods_code: 237253797, 237397625, 237775721","goods_code: 238282622, 238349162, 238282623",Nbr_of_prod_purchas
0,85517,0.064949,0.050000,10.486229,0.051607,0.050000,0.050502,0.051221,0.050000,0.050429,...,0.058657,0.061801,0.052335,0.061010,0.058234,0.053104,0.053715,0.054600,0.053180,0.014474
1,51113,41.356621,0.050000,0.345978,0.052461,0.050000,0.054010,0.051227,0.053021,0.053150,...,10.297161,0.061371,0.051582,0.050000,0.066862,0.052951,0.053154,0.052466,0.052808,0.014474
2,83008,0.051761,0.052304,0.055225,31.456160,0.053342,0.059577,0.051227,0.050000,0.057309,...,0.056295,9.664343,0.072373,0.060538,0.243080,0.346795,0.151389,0.058133,0.063747,0.014474
3,78712,20.710785,0.050000,5.416104,0.052034,0.050000,0.052256,0.051224,0.051511,0.051790,...,0.079829,0.357238,0.056493,0.051082,0.059819,0.052119,0.060750,0.055927,0.058321,0.014474
4,77846,0.051761,0.052304,0.055225,31.456160,0.053342,0.059577,0.051227,0.050000,0.057309,...,0.064374,0.070993,0.082190,0.050000,0.075931,0.050812,0.052743,0.052705,0.053086,0.014474
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92785,21243,20.703310,11.276594,0.197989,0.052042,0.050390,0.052230,0.051172,0.054675,0.051790,...,0.106307,0.051285,0.055450,0.050000,0.087817,0.050000,0.084096,0.062902,0.054986,0.014474
92786,45891,0.064949,0.050000,10.486229,0.051607,0.050000,0.050502,0.051221,0.050000,0.050429,...,0.058657,0.061801,0.052335,0.061010,0.058234,0.053104,0.053715,0.054600,0.053180,0.014474
92787,42613,0.050920,0.051121,0.052310,0.050490,0.081582,0.050445,14.975650,0.050000,0.050426,...,0.074491,0.115084,0.052250,0.050000,0.076832,0.051137,3.786293,0.577861,0.776001,0.014474
92788,43567,0.064949,0.050000,10.486229,0.051607,0.050000,0.050502,0.051221,0.050000,0.050429,...,0.073263,0.052561,0.053731,0.055283,0.054406,0.193967,0.057367,0.055042,0.056871,0.014474


In [11]:
from sklearn.model_selection import cross_validate
from sklearn.pipeline import FeatureUnion, Pipeline
from skrub import AggTarget, tabular_learner

# Model inputs as pandas objects; the basket `ID` is a key, not a feature.
X = X_train.drop("ID").to_pandas()
y = y_raw.to_pandas()

# Target statistics (mean / max / min) aggregated per `Nbr_of_prod_purchas` value.
nbr_of_prod_target_stats = AggTarget(
    main_key="Nbr_of_prod_purchas",
    operations=["mean", "max", "min"],
)
target_encodings = FeatureUnion(
    transformer_list=[
        ("target_encoding_nbr_of_prod_purchas", nbr_of_prod_target_stats),
    ]
)

# skrub's default learner for regression on tabular data.
model = tabular_learner("regressor")

pipeline = Pipeline(
    steps=[
        ("target_encoding", target_encodings),
        ("model", model),
    ]
)

# Default cross-validation splitting, scored with R².
results = cross_validate(pipeline, X, y, scoring="r2")

print(f"R2: {results['test_score'].mean()}")

R2: 0.08106775938260107


In [None]:
from dttk.utils import compute_feature_importance, plot_feature_importance

# Fit `model` on all rows; it must be fitted before `feature_names_in_` is read below.
model.fit(X, y)

# NOTE(review): `compute_feature_importance` is a project helper; its result is
# indexed with "importances_mean", so it presumably resembles a permutation
# importance result — confirm against dttk.utils.
f = compute_feature_importance(model, X, y)

# Plot the mean importance per feature, labelled with the names seen at fit time.
plot_feature_importance(
    f["importances_mean"],
    model.feature_names_in_,
)

In [None]:
from tabpfn import TabPFNRegressor

# Number of leading rows handed to TabPFN; kept small for memory reasons.
tabpfn_max_rows = 1000

model = TabPFNRegressor(fit_mode="low_memory", ignore_pretraining_limits=True)

# Cross-validate on the first `tabpfn_max_rows` rows only, scored with R².
X_head = X[:tabpfn_max_rows]
y_head = y[:tabpfn_max_rows]
results = cross_validate(model, X_head, y_head, scoring="r2")

print(f"R2: {results['test_score'].mean()}")

In [None]:
from flaml import AutoML

# AutoML search optimising R² with a 10-second time budget.
automl = AutoML(metric="r2", time_budget=10)

# FLAML requires `y_train` to be a numpy array or a pandas Series, while `y` is
# a one-column DataFrame here, so the `fraud_flag` column is passed as a Series.
automl.fit(X_train=X, y_train=y["fraud_flag"], task="regression")