In [1]:
import pandas as pd
import skrub
import skrub.datasets
from sklearn.ensemble import HistGradientBoostingClassifier

from gyyre import *
from gyyre.optimisers._dag_summary import summarise_dag

In [2]:
# Fetch the credit-fraud dataset and wrap its two tables in named skrub variables.
dataset = skrub.datasets.fetch_credit_fraud()
products = skrub.var("products", dataset.products)

# Work on a random subsample of 5000 baskets.
# NOTE(review): no seed is passed here, so the sampled rows may differ between runs.
baskets = skrub.var("baskets", dataset.baskets).skb.subsample(n=5000, how="random")

# The basket IDs are marked as X and the fraud flag as y.
basket_ids = baskets[["ID"]].skb.mark_as_X()
fraud_flags = baskets["fraud_flag"].skb.mark_as_y()

In [4]:
# Fill the missing entries of the "make" column from a natural-language instruction.
# The instruction is a plain string literal: it contains no placeholders to format.
products_new = products.sem_fillna(
    "make",
    "Infer the manufacturer from product descriptions.",
    with_existing_vals=False,
    impute_with_llm=True,
)

--- gyyre.sem_fillna('make', 'Infer the manufacturer from product descriptions.')
	> Querying 'openai/gpt-4.1' with 2 messages...'
	> Fitting imputation model RandomForestClassifier(random_state=0) on columns ['item', 'model', 'goods_code'] of 108107 rows...
	> Imputing 1273 values...


In [5]:
# Show the "make" column after and before imputation, restricted to the rows
# whose "make" is missing in the un-imputed `products` table.
missing_make = products["make"].isna()
print(products_new.loc[missing_make, "make"])
print(products.loc[missing_make, "make"])

<GetItem (<CallMethod 'isna'>, 'make')>
Result:
―――――――
44                   APPLE
45                   APPLE
46                   APPLE
440               RETAILER
441               RETAILER
                ...       
163042            RETAILER
163043            RETAILER
163044            RETAILER
163045    CROFT COLLECTION
163046            RETAILER
Name: make, Length: 1273, dtype: object
<GetItem (<CallMethod 'isna'>, 'make')>
Result:
―――――――
44        NaN
45        NaN
46        NaN
440       NaN
441       NaN
         ... 
163042    NaN
163043    NaN
163044    NaN
163045    NaN
163046    NaN
Name: make, Length: 1273, dtype: object


In [6]:
# Keep only the products whose basket is among the selected basket IDs, then
# request five additional brand-related feature columns under the name
# "brand_features".
# NOTE(review): this starts from `products`, not the imputed `products_new` —
# confirm that leaving the imputation out of the pipeline is intended.
in_kept_baskets = products["basket_ID"].isin(basket_ids["ID"])
kept_products = products[in_kept_baskets].with_sem_features(
    "Compute additional brand-related product features",
    name="brand_features",
    how_many=5,
)

--- Fitting gyyre.with_sem_features('Compute additional brand-related product...', 5)
	> Querying 'openai/gpt-4.1' with 2 messages...'
	> Computed 5 new feature columns: ['Apple_in_Item', 'Brand_in_Model', 'Is_Apple', 'Is_Retailer', 'Model_Contains_Year'], removed 0 feature columns: []




In [None]:
# Vectorize every product column except the "basket_ID" join key.
# The `high_cardinality` option of the TableVectorizer is described in natural
# language through `sem_choose` rather than being set by hand.
vectorizer = skrub.TableVectorizer()
encoder_choices = sem_choose(
    high_cardinality="Different encoders for messy columns with potentially invalid data"
)
vectorized_products = kept_products.skb.apply_with_sem_choose(
    vectorizer,
    exclude_cols="basket_ID",
    choices=encoder_choices,
)

In [None]:
# Collapse the vectorized products to one row per basket by averaging each column.
aggregated_products = (
    vectorized_products
    .groupby("basket_ID")
    .agg("mean")
    .reset_index()
)

# Attach the per-basket aggregates to the basket IDs, then drop both key
# columns so only the feature columns remain.
augmented_baskets = (
    basket_ids
    .merge(aggregated_products, left_on="ID", right_on="basket_ID")
    .drop(columns=["ID", "basket_ID"])
)

In [None]:
# Apply a gradient-boosting classifier to the basket features with the fraud
# flags as target; the candidate learning rates are described in natural
# language through `sem_choose`.
hgb = HistGradientBoostingClassifier()
learning_rate_choices = sem_choose(learning_rate="Three learning rates to try")
predictions = augmented_baskets.skb.apply_with_sem_choose(
    hgb,
    y=fraud_flags,
    choices=learning_rate_choices,
)

In [None]:
# Display the `predictions` object as the cell output.
predictions

In [None]:
# Build a learner from the `predictions` pipeline; `fitted=False` is passed,
# and the learner is fitted explicitly in a later cell.
learner = predictions.skb.make_learner(fitted=False)

In [None]:
# Display the `learner` object as the cell output.
learner

In [None]:
# Summarise the pipeline DAG; the summary is passed to the learner as one of
# the gyyre-specific inputs below.
dag_summary = summarise_dag(predictions)

# Fit on the full tables of the `dataset` fetched at the top of the notebook
# (it is not fetched a second time here).
model = learner.fit({
    "products": dataset.products,
    "baskets": dataset.baskets,
    # TODO Ugly, we need a workaround to automatically set those
    "gyyre_dag_summary__brand_features": dag_summary,
    "gyyre_memory__brand_features": None,
    "gyyre_prefitted_state__brand_features": None,
})

In [None]:
# Display the object returned by `learner.fit` as the cell output.
model

In [None]:
# Score a single hand-written basket containing one product.
# `pd` is the pandas module imported in the notebook's first cell.
new_baskets = pd.DataFrame([{"ID": "abc"}])
new_products = pd.DataFrame(
    [
        {
            "basket_ID": "abc",  # matches the ID of the basket above
            "item": "COMPUTER",
            "cash_price": 200,
            "make": "APPLE",
            "model": "XXX-X",
            "goods_code": "239246782",
            "Nbr_of_prod_purchas": 1,
        }
    ]
)
model.predict_proba({"baskets": new_baskets, "products": new_products})