In [1]:
import skrub
from sklearn.ensemble import HistGradientBoostingClassifier

import sempipes
from sempipes import sem_choose

# Build the fraud-detection pipeline step by step. Statement order matters:
# the log below reports that sempipes runs in interactive "preview" mode, so
# the LLM-backed steps are executed as they are defined.

# Fetch the credit-fraud dataset through skrub and wrap its two tables
# (products, baskets) as named skrub variables.
dataset = skrub.datasets.fetch_credit_fraud()
products = skrub.var("products", dataset.products)
baskets = skrub.var("baskets", dataset.baskets)
# Randomly subsample 5000 baskets (n=5000, how="random").
baskets = baskets.skb.subsample(n=5000, how="random")

# Mark the basket IDs as X and the fraud flag as y, each with a
# natural-language description handed to sempipes.
basket_ids = sempipes.as_X(baskets[["ID"]], "Shopping baskets with product transactions")
fraud_flags = sempipes.as_y(baskets["fraud_flag"], "A binary flag indicating a fraudulent shopping basket")

# Fill in the "make" column of the products table, guided by the prompt.
# impute_with_existing_values_only=True presumably restricts imputed values to
# ones already present in the column -- TODO confirm against the sempipes docs.
# In the recorded run this fitted a RandomForestClassifier on
# ['item', 'model', 'goods_code'] and imputed 1273 values.
products = products.sem_fillna(
    target_column="make",
    nl_prompt="Infer the manufacturer from relevant product-related attributes like title or description.",
    impute_with_existing_values_only=True,
)

# Keep only the products whose basket_ID appears among the selected basket IDs,
# then request 5 additional features from sempipes. The name "brand_features"
# is the same string a later cell passes to sempipes.inspect_generated_code.
kept_products = products[products["basket_ID"].isin(basket_ids["ID"])]
kept_products = kept_products.sem_gen_features(
    nl_prompt="""
    Generate additional brand- and manufacturer-related product features. Make sure that they can be efficiently computed
    on large datasets, and that they work across a large number of brands and manufacturers. Use your intrinsic knowledge 
    about what products and brands fraudsters focus on to make sure that the new features are helpful for the prediction task 
    at hand.
    """,
    name="brand_features",
    how_many=5,
)

# Apply skrub's TableVectorizer to every product column except the join key
# basket_ID, which is excluded so it survives for the group-by below.
vectorizer = skrub.TableVectorizer()
vectorized_products = kept_products.skb.apply(vectorizer, exclude_cols="basket_ID")

# Collapse to one row per basket (mean of each vectorized column), join the
# result onto the basket IDs, and drop both key columns so only features remain.
# NOTE(review): no `how=` is passed to merge, so its default join type applies;
# if that is an inner join, baskets with no product rows would be dropped --
# confirm the rows stay aligned with fraud_flags.
aggregated_products = vectorized_products.groupby("basket_ID").agg("mean").reset_index()
augmented_baskets = basket_ids.merge(aggregated_products, left_on="ID", right_on="basket_ID").drop(
    columns=["ID", "basket_ID"]
)

# Attach a HistGradientBoostingClassifier as the final estimator. sem_choose
# supplies a natural-language description for max_depth; in the recorded run
# sempipes queried the LLM and suggested choose_from([7, 3, 5, 10, None]).
hgb = HistGradientBoostingClassifier()
fraud_detector = augmented_baskets.skb.apply_with_sem_choose(
    hgb,
    y=fraud_flags,
    choices=sem_choose(
        name="hgb_choices", max_depth="Common range of values for the maximum depth of the learned trees"
    ),
)

2025-11-29 17:26:10,563 - INFO - SEMPIPES> Interactive mode detected, enabling code generation in preview mode.
2025-11-29 17:26:10,840 - INFO - SEMPIPES> Querying 'openai/gpt-4.1' with 2 messages...'
[92m17:26:10 - LiteLLM:INFO[0m: utils.py:3416 - 
LiteLLM completion() model= gpt-4.1; provider = openai
2025-11-29 17:26:10,850 - INFO - SEMPIPES> 
LiteLLM completion() model= gpt-4.1; provider = openai


--- sempipes.sem_fillna('make', 'Infer the manufacturer from relevant product-related attributes like title or description.')


[92m17:26:12 - LiteLLM:INFO[0m: utils.py:1301 - Wrapper: Completed Call, calling success_handler
2025-11-29 17:26:12,915 - INFO - SEMPIPES> Wrapper: Completed Call, calling success_handler


	> Fitting imputation model RandomForestClassifier(random_state=0) on columns ['item', 'model', 'goods_code'] of 108107 rows...
	> Imputing 1273 values...


2025-11-29 17:26:50,748 - INFO - SEMPIPES> Fitting sempipes.sem_gen_features('Generate additional brand- and manu...', 5) on dataframe of shape (9014, 7) in mode 'preview'.
2025-11-29 17:26:50,752 - INFO - SEMPIPES> Querying 'openai/gpt-4.1' with 2 messages...'
[92m17:26:50 - LiteLLM:INFO[0m: utils.py:3416 - 
LiteLLM completion() model= gpt-4.1; provider = openai
2025-11-29 17:26:50,754 - INFO - SEMPIPES> 
LiteLLM completion() model= gpt-4.1; provider = openai
[92m17:26:53 - LiteLLM:INFO[0m: utils.py:1301 - Wrapper: Completed Call, calling success_handler
2025-11-29 17:26:53,539 - INFO - SEMPIPES> Wrapper: Completed Call, calling success_handler
2025-11-29 17:26:53,566 - INFO - SEMPIPES> Computed 5 new feature columns: ['brand_is_apple', 'item_is_computer', 'make_cash_price_mean', 'make_is_retailer', 'make_item_combo_freq'], 


--- sempipes.apply_with_sem_choose(HistGradientBoostingClassifier(), SemChoices(name='hgb_choices', params_and_prompts={'max_depth': 'Common range of values for the maximum depth of the learned trees'}))


2025-11-29 17:26:56,928 - INFO - SEMPIPES> Querying 'openai/gpt-4.1' with 2 messages...'
[92m17:26:56 - LiteLLM:INFO[0m: utils.py:3416 - 
LiteLLM completion() model= gpt-4.1; provider = openai
2025-11-29 17:26:56,931 - INFO - SEMPIPES> 
LiteLLM completion() model= gpt-4.1; provider = openai
[92m17:26:58 - LiteLLM:INFO[0m: utils.py:1301 - Wrapper: Completed Call, calling success_handler
2025-11-29 17:26:58,102 - INFO - SEMPIPES> Wrapper: Completed Call, calling success_handler


	Suggested choices for max_depth: choose_from([7, 3, 5, 10, None], name='__sempipes__...es__max_depth')


In [2]:
# Bare last expression: lets the notebook render its rich preview of the
# pipeline built in the previous cell.
fraud_detector

Unnamed: 0_level_0,fraud_flag
Unnamed: 0_level_1,fraud_flag
0.0,0.0
1.0,0.0
2.0,0.0
3.0,0.0
4.0,0.0
,
4995.0,0.0
4996.0,0.0
4997.0,0.0
4998.0,0.0

Column,Column name,dtype,Is sorted,Null values,Unique values,Mean,Std,Min,Median,Max
0,fraud_flag,Int64DType,False,0 (0.0%),2 (< 0.1%),0.0038,0.0615,0,0,1


In [3]:
# "brand_features" matches the name= given to sem_gen_features when the
# pipeline was built; presumably this shows the code generated for that
# step -- confirm against the sempipes docs.
sempipes.inspect_generated_code(fraud_detector, "brand_features")