In [1]:
import getml
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from relbench.datasets import get_dataset
from relbench.tasks import get_task

from sklearn.metrics import roc_auc_score

# Start from a clean slate: if an engine is still running, shut it down
# before this notebook configures its own project.
engine_running = getml.engine.is_alive()
if engine_running:
    print("Killing the engine...")
    getml.engine.shutdown()

# Enable textual, monochrome progress output to avoid rendering issues in
# certain JupyterLab environments.
progress_settings = getml.utilities.progress
progress_settings.FORCE_TEXTUAL_OUTPUT = True
progress_settings.FORCE_MONOCHROME_OUTPUT = True

# Launch the getML engine and select the project used by this notebook.
getml.set_project("az-item-churn")

# Download the dataset and the item-churn task definition from RelBench.
# Both calls refer to the same RelBench dataset, so its name is spelled once.
RELBENCH_DATASET = "rel-amazon"
dataset = get_dataset(RELBENCH_DATASET, download=True)
task = get_task(RELBENCH_DATASET, "item-churn", download=True)

In [2]:
# Column roles shared by every population split: the product key used for
# joins, the event time stamp, and the churn label to predict.
population_roles = getml.data.Roles(
    time_stamp=["timestamp"],
    target=["churn"],
    join_key=["product_id"],
)

# Population tables, keyed by split name. Each split is read from the parquet
# file of the same name in the task's cache directory.
subsets = ("train", "test", "val")
item_churn = {}
for subset in subsets:
    parquet_path = f"{task.cache_dir}/{subset}.parquet"
    item_churn[subset] = getml.data.DataFrame.from_parquet(
        parquet_path,
        name=f"population_{subset}",
        roles=population_roles,
    )

def _second_level_category(raw_category):
    """Return the second entry of a product's category path, or None.

    Parameters
    ----------
    raw_category : numpy.ndarray, list, tuple, or any missing-value marker
        The raw value of the ``category`` column for one product.

    Returns
    -------
    object or None
        ``raw_category[1]`` when the value is a sequence with at least two
        entries; ``None`` for shorter sequences and for anything that is not
        a sequence (``None``, float NaN, ...), so a stray missing-value
        marker cannot raise a ``TypeError`` in ``len()``.
    """
    if isinstance(raw_category, np.ndarray):
        # Arrays are converted so the result is a plain Python object.
        raw_category = raw_category.tolist()
    if not isinstance(raw_category, (list, tuple)):
        return None
    return raw_category[1] if len(raw_category) > 1 else None


product_df = pd.read_parquet(f"{dataset.cache_dir}/db/product.parquet")

# Derive the second category level in a single pass over the column; the raw
# `category` column itself is not kept, so it is not rewritten first.
product_df["category_level_2"] = product_df["category"].apply(_second_level_category)

# Drop the raw category path and the free-text columns.
product_df = product_df.drop(columns=["category", "brand", "title", "description"])

# Roles for the product table: joined on the product key, with the price as a
# numerical column and the derived second-level category as a categorical one.
product_roles = getml.data.Roles(
    categorical=["category_level_2"],
    numerical=["price"],
    join_key=["product_id"],
)

# Hand the prepared pandas frame over to getML under the name "product".
product = getml.DataFrame.from_pandas(
    product_df,
    name="product",
    roles=product_roles,
)

# Roles that every use of the review table has in common: both join keys,
# the review time stamp, and the numerical rating.
review_roles_common = getml.data.Roles(
    join_key=["product_id", "customer_id"],
    time_stamp=["review_time"],
    numerical=["rating"],
)

# Base review table, loaded once from the dataset cache.
review_parquet_path = f"{dataset.cache_dir}/db/review.parquet"
review_base = getml.data.DataFrame.from_parquet(
    review_parquet_path,
    name="review",
    roles=review_roles_common,
)

# Variant of the review table that additionally treats `verified` as a
# categorical column. `review_base` stays bound to the original object.
review_all = review_base.with_role(["verified"], getml.data.roles.categorical)

Launching ./getML --allow-push-notifications=true --allow-remote-ips=false --home-directory=/home/jupyter/.getML --in-memory=true --install=false --launch-browser=true --log=false --project-directory=/home/jupyter/.getML/projects in /opt/conda/lib/python3.10/site-packages/getml/.getML/getml-community-1.5.0-amd64-linux...
Launched the getML Engine. The log output will be stored in /home/jupyter/.getML/logs/getml_20250117100703.log
[2K  Loading pipelines... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% • 00:00
[?25h

In [3]:


# Build the data model around a placeholder for the training population.
population_placeholder = item_churn["train"].to_placeholder("population-item_churn")
dm = getml.data.DataModel(population=population_placeholder)

# Register the peripheral placeholders. The base review table is registered
# twice under different names so it can take part in two different joins.
dm.add(
    product.to_placeholder(),
    review_all.to_placeholder("review_all"),
    review_base.to_placeholder("review_customer"),
    review_base.to_placeholder("review_recent"),
)

many_to_one = getml.data.relationship.many_to_one

# population -> product: product attributes for each population row.
dm.population.join(dm.product, on="product_id", relationship=many_to_one)

# population -> review_recent: reviews of the same product, joined on the
# population time stamp with a 365-day `memory` argument
# (NOTE(review): presumably limits how far back reviews are considered -
# confirm against the getML join documentation).
dm.population.join(
    dm.review_recent,
    on="product_id",
    time_stamps=("timestamp", "review_time"),
    memory=getml.data.time.days(365),
)

# population -> review_all: reviews of the same product, without a memory
# argument.
dm.population.join(
    dm.review_all,
    on="product_id",
    time_stamps=("timestamp", "review_time"),
)

# review_all -> review_customer: other reviews by the same customer, matched
# on the review time stamps of both sides.
dm.review_all.join(
    dm.review_customer,
    on="customer_id",
    time_stamps=("review_time", "review_time"),
)

# review_customer -> product: product attributes of those other reviews.
dm.review_customer.join(dm.product, on="product_id", relationship=many_to_one)


# Bundle the actual data: the population splits become the container's
# subsets, and the peripheral frames are attached under the same names that
# the data model's placeholders use.
container = getml.data.Container(**item_churn)

review_peripherals = {
    "review_all": review_all,
    "review_customer": review_base,
    "review_recent": review_base,
}
container.add(product, **review_peripherals)

# Display the data model as the cell's output.
dm

Unnamed: 0,data frames,staging table
0,"population-item_churn, product",POPULATION-ITEM_CHURN__STAGING_TABLE_1
1,review_all,REVIEW_ALL__STAGING_TABLE_2
2,"review_customer, product",REVIEW_CUSTOMER__STAGING_TABLE_3
3,review_recent,REVIEW_RECENT__STAGING_TABLE_4


In [5]:
# Feature learner: FastProp with the default aggregation set. `num_threads`
# is not set here.
fast_prop = getml.feature_learning.FastProp(
    n_most_frequent=0,
    # Translated from the original German note: must be above the number of
    # features so that they are not built. The fit log reports 702 candidate
    # features. NOTE(review): presumably this refers to an internal
    # selection step - confirm.
    num_features=1000,
    aggregation=getml.feature_learning.FastProp.agg_sets.default,
)

# Predictor: a gradient-boosted classifier with trees of depth 5.
xgb_classifier = getml.predictors.XGBoostClassifier(max_depth=5)

pred_pipe = getml.Pipeline(
    data_model=dm,
    feature_learners=[fast_prop],
    predictors=[xgb_classifier],
    loss_function=getml.feature_learning.loss_functions.CrossEntropyLoss,
)

# Fit on the training subset with `check=False`, then display the scores
# table as the cell's output.
pred_pipe.fit(container.train, check=False)
pred_pipe.scores


[2K  Staging... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% • 00:30
[2K  FastProp: Trying 702 features... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% • 00:00
[2K  FastProp: Building subfeatures... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% • 01:25
[2K  FastProp: Building features... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% • 17:20
[2K  XGBoost: Training as predictor... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% • 22:48
[?25h

Time taken: 0:42:09.296499.



Unnamed: 0,date time,set used,target,accuracy,auc,cross entropy
0,2025-01-16 21:18:44,train,churn,0.7408,0.8212,0.5043


In [None]:
# Score the fitted pipeline on the validation subset and then on the test
# subset. Only the value of the last expression is displayed by the notebook;
# the scores table shown later in the notebook lists a row for each of the
# two subsets.
pred_pipe.score(container.val)
pred_pipe.score(container.test)

[2K  Staging... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% • 00:00
[2K  Preprocessing... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% • 00:00
[2K  FastProp: Building subfeatures... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% • 01:25
[2K⠋ FastProp: Building features...                                      0% • 01:25

In [7]:
# Display the pipeline's score table (one row per scored subset: train, val
# and test in the output below).
pred_pipe.scores

Unnamed: 0,date time,set used,target,accuracy,auc,cross entropy
0,2025-01-16 21:18:44,train,churn,0.7408,0.8212,0.5043
1,2025-01-16 21:28:54,val,churn,0.7466,0.8252,0.5014
2,2025-01-16 21:31:57,test,churn,0.7554,0.8296,0.4824


In [8]:
# Export the most important learned features for downstream tuning.
#
# Performance on test is already above the human data science baseline, but
# we think there is more room with a hyperparameter optimization on top of
# the extracted features.
# Next steps: select the top features, export them, and run the LightGBM
# hyperopt script on them.

# Number of features to export. The file-name suffix is derived from it so
# the two can never disagree.
n_top_features = 200
pref = f"_{n_top_features}"
target = "churn"

# Rank explicitly in descending order so the slice keeps the MOST important
# features regardless of the sort's default direction.
important_features = pred_pipe.features.sort(
    by="importance", descending=True
)[:n_top_features].names

# Exported columns: the selected features plus the target.
col_export = important_features + [target]

print(len(pred_pipe.features))

# Transform each subset with the fitted pipeline and write the selected
# columns to parquet, in the order train, val, test.
for split_name in ("train", "val", "test"):
    transformed = pred_pipe.transform(
        getattr(container, split_name),
        df_name=f"{split_name}_transform",
    )
    transformed[col_export].to_parquet(fname=f"{split_name}_transform{pref}")

703
[2K  Staging... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% • 00:01
[2K  Preprocessing... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% • 00:00
[2K  FastProp: Building subfeatures... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% • 01:32
[2K  FastProp: Building features... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% • 22:36
[2K  Staging... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% • 00:00
[2K  Preprocessing... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% • 00:00
[2K  FastProp: Building subfeatures... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% • 01:31
[2K  FastProp: Building features... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% • 01:35
[2K  Staging... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% • 00:00
[2K  Preprocessing... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% • 00:00
[2K  FastProp: Building subfeatures... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% • 01:30
[2K  FastProp: Building features... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% • 01:27
[?25h