# How to use

In [1]:
import warnings
# Ignore every FutureWarning for the rest of the session to keep cell output readable.
warnings.simplefilter(action='ignore', category=FutureWarning)

## Init Pyspark

In [2]:
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

In [3]:
# Local Spark session using all available cores, with 6 GB configured for
# both the executor and the driver.
session_builder = SparkSession.builder.master("local[*]")
session_builder = session_builder.config("spark.executor.memory", "6g")
session_builder = session_builder.config("spark.driver.memory", "6g")
spark = session_builder.getOrCreate()

# Restrict Spark logging to errors.
spark.sparkContext.setLogLevel("ERROR")

# Turn on eager evaluation so a DataFrame left as the last expression of a
# cell is rendered directly.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/02/26 12:48:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Create a sample dataset

In [4]:
import numpy as np
import pandas as pd

In [5]:
def make_transactions(n=10000, seed=None):
    """Build a synthetic transactions table as a pandas DataFrame.

    Parameters
    ----------
    n : int, default 10000
        Number of transactions (rows) to generate.
    seed : int or None, default None
        Seed for the random number generator. Pass an integer to obtain the
        same table on every call; ``None`` draws fresh random values each
        time.

    Returns
    -------
    pandas.DataFrame
        One row per transaction with the columns ``transaction_id``
        (0 .. n-1), ``consumer_id`` (integer in 0..99), ``paid_value``
        (normal, mean 1000, std 100), ``discount`` (one of 10, 0, 25),
        ``product_type`` (one of "a".."h"), ``buy_type`` (one of
        "f_1".."f_9") and ``paymnent_date`` (n evenly spaced timestamps
        from 2022-01-01 to 2023-01-01).
    """
    # A local generator keeps the draws reproducible for a given seed without
    # touching NumPy's global random state.
    rng = np.random.default_rng(seed)

    product_types = ["a", "b", "c", "d", "e", "f", "g", "h"]
    buy_types = ["f_1", "f_2", "f_3", "f_4", "f_5", "f_6", "f_7", "f_8", "f_9"]

    return pd.DataFrame(
        data={
            "transaction_id": range(n),
            # Vectorized draws: one call per column instead of one per row.
            "consumer_id": rng.integers(0, 100, size=n),
            "paid_value": rng.normal(1000, 100, n),
            "discount": rng.choice([10, 0, 25], size=n),
            # .tolist() turns NumPy string scalars into plain Python str values.
            "product_type": rng.choice(product_types, size=n).tolist(),
            "buy_type": rng.choice(buy_types, size=n).tolist(),
            "paymnent_date": pd.date_range("2022-01-01", "2023-01-01", periods=n),
        }
    )


In [6]:
# Preview the first rows of a freshly generated sample.
make_transactions().head()

Unnamed: 0,transaction_id,consumer_id,paid_value,discount,product_type,buy_type,paymnent_date
0,0,84,938.706334,25,d,f_3,2022-01-01 00:00:00.000000000
1,1,74,978.164629,0,f,f_5,2022-01-01 00:52:33.915391539
2,2,95,1013.975424,25,h,f_1,2022-01-01 01:45:07.830783078
3,3,21,1000.291093,0,g,f_9,2022-01-01 02:37:41.746174617
4,4,32,1115.160083,0,f,f_3,2022-01-01 03:30:15.661566156


## Generate public

We will select the monthly active users as our public.

In [7]:
# Load a freshly generated pandas sample into Spark.
transactions = spark.createDataFrame(make_transactions())

# One row per (consumer, month) pair that has at least one transaction:
# the payment timestamp is truncated to the first day of its month.
month_start = F.to_date(F.date_trunc("month", "paymnent_date"))
public = transactions.select(
    F.col("consumer_id").alias("consumer_id_ref"),
    month_start.alias("date_ref"),
).distinct()

## Generate Dataset

We will define a 30-day window to build features.

> In our case, this means that we will select every transaction in the window between ***date_ref - 30 days*** and ***date_ref***.

In [8]:
from autofeats.types import Dataset

In [9]:
df = Dataset(
    # Transactions table and the columns used to identify and join its rows.
    table=transactions,
    primary_key_col="transaction_id",
    table_join_key_col="consumer_id",
    table_join_date_col="paymnent_date",
    # Public table (consumer / reference-date pairs) and its join columns.
    public=public,
    public_join_key_col="consumer_id_ref",
    public_join_date_col="date_ref",
    # Columns that features are built from.
    numerical_cols=["paid_value", "discount"],
    categorical_cols=["product_type", "buy_type"],
    # NOTE(review): presumably the window length in days counted back from
    # date_ref - confirm against the autofeats documentation.
    subtract_in_end=30,
)


In [10]:
from autofeats import make_features

## numerical_statistics example

In [11]:
# Run only the "numerical_statistics" suite, with an empty options dict.
features = make_features.run(df=df, suites=["numerical_statistics"], options={})

# Report the generated column names and how many there are.
print(
    f"Features created: {features.columns}",
    f"Number of features: {len(features.columns)}",
    sep="\n",
)

Features created: ['consumer_id_ref', 'date_ref', 'sum___paid_value', 'sum___discount', 'mean___paid_value', 'mean___discount', 'stddev___paid_value', 'stddev___discount', 'min___paid_value', 'min___discount', 'max___paid_value', 'max___discount', 'kurtosis___paid_value', 'kurtosis___discount', 'skewness___paid_value', 'skewness___discount']
Number of features: 16


In [12]:
# Show five rows of the numerical_statistics features.
features.limit(5)

                                                                                

consumer_id_ref,date_ref,sum___paid_value,sum___discount,mean___paid_value,mean___discount,stddev___paid_value,stddev___discount,min___paid_value,min___discount,max___paid_value,max___discount,kurtosis___paid_value,kurtosis___discount,skewness___paid_value,skewness___discount
29,2022-01-01,,,,,,,,,,,,,,
29,2022-02-01,8853.40968120304,135.0,983.7121868003378,15.0,86.83003813511638,12.24744871391589,861.5299671745906,0.0,1126.4961447050034,25.0,-0.9476942065305236,-1.7343749999999998,0.078090487490907,-0.3788861141556918
26,2022-02-01,11788.166930957694,95.0,1071.6515391779722,8.636363636363637,90.9490634918916,11.2006493318265,869.5921197423122,0.0,1165.640254915553,25.0,0.1602414056843994,-1.263311279143037,-1.1180188780284326,0.6930338494981549
26,2022-01-01,,,,,,,,,,,,,,
29,2022-03-01,7154.16997403849,120.0,1022.0242820054984,17.142857142857142,80.78779032712255,10.350983390135314,913.2907840738208,0.0,1114.7487402931952,25.0,-1.5832487921323397,-1.1907407407407404,-0.1951677930825345,-0.6211299937499416


## numerical_in_categorical_groups example

In [13]:
# Run only the "numerical_in_categorical_groups" suite, with an empty options dict.
features = make_features.run(
    df=df, suites=["numerical_in_categorical_groups"], options={}
)

# Report the generated column names and how many there are.
print(
    f"Features created: {features.columns}",
    f"Number of features: {len(features.columns)}",
    sep="\n",
)

Features created: ['consumer_id_ref', 'date_ref', 'product_type=g__sum___paid_value', 'product_type=g__sum___discount', 'product_type=g__mean___paid_value', 'product_type=g__mean___discount', 'product_type=g__stddev___paid_value', 'product_type=g__stddev___discount', 'product_type=g__min___paid_value', 'product_type=g__min___discount', 'product_type=g__max___paid_value', 'product_type=g__max___discount', 'product_type=g__kurtosis___paid_value', 'product_type=g__kurtosis___discount', 'product_type=g__skewness___paid_value', 'product_type=g__skewness___discount', 'product_type=f__sum___paid_value', 'product_type=f__sum___discount', 'product_type=f__mean___paid_value', 'product_type=f__mean___discount', 'product_type=f__stddev___paid_value', 'product_type=f__stddev___discount', 'product_type=f__min___paid_value', 'product_type=f__min___discount', 'product_type=f__max___paid_value', 'product_type=f__max___discount', 'product_type=f__kurtosis___paid_value', 'product_type=f__kurtosis___disco

In [14]:
# Show five rows of the numerical_in_categorical_groups features (a very wide table).
features.limit(5)

                                                                                

consumer_id_ref,date_ref,product_type=g__sum___paid_value,product_type=g__sum___discount,product_type=g__mean___paid_value,product_type=g__mean___discount,product_type=g__stddev___paid_value,product_type=g__stddev___discount,product_type=g__min___paid_value,product_type=g__min___discount,product_type=g__max___paid_value,product_type=g__max___discount,product_type=g__kurtosis___paid_value,product_type=g__kurtosis___discount,product_type=g__skewness___paid_value,product_type=g__skewness___discount,product_type=f__sum___paid_value,product_type=f__sum___discount,product_type=f__mean___paid_value,product_type=f__mean___discount,product_type=f__stddev___paid_value,product_type=f__stddev___discount,product_type=f__min___paid_value,product_type=f__min___discount,product_type=f__max___paid_value,product_type=f__max___discount,product_type=f__kurtosis___paid_value,product_type=f__kurtosis___discount,product_type=f__skewness___paid_value,product_type=f__skewness___discount,product_type=e__sum___paid_value,product_type=e__sum___discount,product_type=e__mean___paid_value,product_type=e__mean___discount,product_type=e__stddev___paid_value,product_type=e__stddev___discount,product_type=e__min___paid_value,product_type=e__min___discount,product_type=e__max___paid_value,product_type=e__max___discount,product_type=e__kurtosis___paid_value,product_type=e__kurtosis___discount,product_type=e__skewness___paid_value,product_type=e__skewness___discount,product_type=h__sum___paid_value,product_type=h__sum___discount,product_type=h__mean___paid_value,product_type=h__mean___discount,product_type=h__stddev___paid_value,product_type=h__stddev___discount,product_type=h__min___paid_value,product_type=h__min___discount,product_type=h__max___paid_value,product_type=h__max___discount,product_type=h__kurtosis___paid_value,product_type=h__kurtosis___discount,product_type=h__skewness___paid_value,product_type=h__skewness___discount,product_type=d__sum___paid_value,product_type=d__sum___discount,product
_type=d__mean___paid_value,product_type=d__mean___discount,product_type=d__stddev___paid_value,product_type=d__stddev___discount,product_type=d__min___paid_value,product_type=d__min___discount,product_type=d__max___paid_value,product_type=d__max___discount,product_type=d__kurtosis___paid_value,product_type=d__kurtosis___discount,product_type=d__skewness___paid_value,product_type=d__skewness___discount,product_type=c__sum___paid_value,product_type=c__sum___discount,product_type=c__mean___paid_value,product_type=c__mean___discount,product_type=c__stddev___paid_value,product_type=c__stddev___discount,product_type=c__min___paid_value,product_type=c__min___discount,product_type=c__max___paid_value,product_type=c__max___discount,product_type=c__kurtosis___paid_value,product_type=c__kurtosis___discount,product_type=c__skewness___paid_value,product_type=c__skewness___discount,product_type=b__sum___paid_value,product_type=b__sum___discount,product_type=b__mean___paid_value,product_type=b__mean___discount,product_type=b__stddev___paid_value,product_type=b__stddev___discount,product_type=b__min___paid_value,product_type=b__min___discount,product_type=b__max___paid_value,product_type=b__max___discount,product_type=b__kurtosis___paid_value,product_type=b__kurtosis___discount,product_type=b__skewness___paid_value,product_type=b__skewness___discount,product_type=a__sum___paid_value,product_type=a__sum___discount,product_type=a__mean___paid_value,product_type=a__mean___discount,product_type=a__stddev___paid_value,product_type=a__stddev___discount,product_type=a__min___paid_value,product_type=a__min___discount,product_type=a__max___paid_value,product_type=a__max___discount,product_type=a__kurtosis___paid_value,product_type=a__kurtosis___discount,product_type=a__skewness___paid_value,product_type=a__skewness___discount,buy_type=f_3__sum___paid_value,buy_type=f_3__sum___discount,buy_type=f_3__mean___paid_value,buy_type=f_3__mean___discount,buy_type=f_3__stddev___paid_value,buy_type=f_
3__stddev___discount,buy_type=f_3__min___paid_value,buy_type=f_3__min___discount,buy_type=f_3__max___paid_value,buy_type=f_3__max___discount,buy_type=f_3__kurtosis___paid_value,buy_type=f_3__kurtosis___discount,buy_type=f_3__skewness___paid_value,buy_type=f_3__skewness___discount,buy_type=f_5__sum___paid_value,buy_type=f_5__sum___discount,buy_type=f_5__mean___paid_value,buy_type=f_5__mean___discount,buy_type=f_5__stddev___paid_value,buy_type=f_5__stddev___discount,buy_type=f_5__min___paid_value,buy_type=f_5__min___discount,buy_type=f_5__max___paid_value,buy_type=f_5__max___discount,buy_type=f_5__kurtosis___paid_value,buy_type=f_5__kurtosis___discount,buy_type=f_5__skewness___paid_value,buy_type=f_5__skewness___discount,buy_type=f_8__sum___paid_value,buy_type=f_8__sum___discount,buy_type=f_8__mean___paid_value,buy_type=f_8__mean___discount,buy_type=f_8__stddev___paid_value,buy_type=f_8__stddev___discount,buy_type=f_8__min___paid_value,buy_type=f_8__min___discount,buy_type=f_8__max___paid_value,buy_type=f_8__max___discount,buy_type=f_8__kurtosis___paid_value,buy_type=f_8__kurtosis___discount,buy_type=f_8__skewness___paid_value,buy_type=f_8__skewness___discount,buy_type=f_4__sum___paid_value,buy_type=f_4__sum___discount,buy_type=f_4__mean___paid_value,buy_type=f_4__mean___discount,buy_type=f_4__stddev___paid_value,buy_type=f_4__stddev___discount,buy_type=f_4__min___paid_value,buy_type=f_4__min___discount,buy_type=f_4__max___paid_value,buy_type=f_4__max___discount,buy_type=f_4__kurtosis___paid_value,buy_type=f_4__kurtosis___discount,buy_type=f_4__skewness___paid_value,buy_type=f_4__skewness___discount,buy_type=f_2__sum___paid_value,buy_type=f_2__sum___discount,buy_type=f_2__mean___paid_value,buy_type=f_2__mean___discount,buy_type=f_2__stddev___paid_value,buy_type=f_2__stddev___discount,buy_type=f_2__min___paid_value,buy_type=f_2__min___discount,buy_type=f_2__max___paid_value,buy_type=f_2__max___discount,buy_type=f_2__kurtosis___paid_value,buy_type=f_2__kurtosis___discou
nt,buy_type=f_2__skewness___paid_value,buy_type=f_2__skewness___discount,buy_type=f_9__sum___paid_value,buy_type=f_9__sum___discount,buy_type=f_9__mean___paid_value,buy_type=f_9__mean___discount,buy_type=f_9__stddev___paid_value,buy_type=f_9__stddev___discount,buy_type=f_9__min___paid_value,buy_type=f_9__min___discount,buy_type=f_9__max___paid_value,buy_type=f_9__max___discount,buy_type=f_9__kurtosis___paid_value,buy_type=f_9__kurtosis___discount,buy_type=f_9__skewness___paid_value,buy_type=f_9__skewness___discount,buy_type=f_1__sum___paid_value,buy_type=f_1__sum___discount,buy_type=f_1__mean___paid_value,buy_type=f_1__mean___discount,buy_type=f_1__stddev___paid_value,buy_type=f_1__stddev___discount,buy_type=f_1__min___paid_value,buy_type=f_1__min___discount,buy_type=f_1__max___paid_value,buy_type=f_1__max___discount,buy_type=f_1__kurtosis___paid_value,buy_type=f_1__kurtosis___discount,buy_type=f_1__skewness___paid_value,buy_type=f_1__skewness___discount,buy_type=f_7__sum___paid_value,buy_type=f_7__sum___discount,buy_type=f_7__mean___paid_value,buy_type=f_7__mean___discount,buy_type=f_7__stddev___paid_value,buy_type=f_7__stddev___discount,buy_type=f_7__min___paid_value,buy_type=f_7__min___discount,buy_type=f_7__max___paid_value,buy_type=f_7__max___discount,buy_type=f_7__kurtosis___paid_value,buy_type=f_7__kurtosis___discount,buy_type=f_7__skewness___paid_value,buy_type=f_7__skewness___discount,buy_type=f_6__sum___paid_value,buy_type=f_6__sum___discount,buy_type=f_6__mean___paid_value,buy_type=f_6__mean___discount,buy_type=f_6__stddev___paid_value,buy_type=f_6__stddev___discount,buy_type=f_6__min___paid_value,buy_type=f_6__min___discount,buy_type=f_6__max___paid_value,buy_type=f_6__max___discount,buy_type=f_6__kurtosis___paid_value,buy_type=f_6__kurtosis___discount,buy_type=f_6__skewness___paid_value,buy_type=f_6__skewness___discount
29,2022-01-01,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
29,2022-02-01,978.1700967289302,25.0,978.1700967289302,25.0,,,978.1700967289302,25.0,978.1700967289302,25.0,,,,,,,,,,,,,,,,,,,2005.366048040184,50.0,1002.6830240200918,25.0,175.09819447233872,0.0,878.8699033351803,25.0,1126.4961447050034,25.0,-2.0000000000000004,,0.0,,1908.3430273105937,10.0,954.1715136552968,5.0,48.71265890243976,7.071067811865476,919.7264622157544,0.0,988.6165650948392,10.0,-2.0,-2.0,0.0,0.0,,,,,,,,,,,,,,,1926.6506829967448,25.0,963.3253414983726,12.5,143.96039895553838,17.67766952966369,861.5299671745906,0.0,1065.1207158221546,25.0,-1.9999999999999991,-2.0,0.0,0.0,,,,,,,,,,,,,,,2034.879826126586,25.0,1017.439913063293,12.5,30.92859641042705,17.67766952966369,995.5700928088982,0.0,1039.309733317688,25.0,-2.000000000000001,-2.0,0.0,0.0,1065.1207158221546,25.0,1065.1207158221546,25.0,,,1065.1207158221546,25.0,1065.1207158221546,25.0,,,,,4811.646288406451,75.0,962.3292576812904,15.0,63.36864552021739,13.693063937629152,878.8699033351803,0.0,1039.309733317688,25.0,-1.290434009969086,-1.8333333333333333,-0.1776892000819694,-0.4082482904638631,,,,,,,,,,,,,,,988.6165650948392,10.0,988.6165650948392,10.0,,,988.6165650948392,10.0,988.6165650948392,10.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1988.026111879594,25.0,994.013055939797,12.5,187.35938091683352,17.67766952966369,861.5299671745906,0.0,1126.4961447050034,25.0,-2.0,-2.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,
26,2022-02-01,4361.685585232964,35.0,1090.421396308241,8.75,89.23134611680102,11.81453906563152,961.9595311462252,0.0,1165.640254915553,25.0,-0.8498265554253606,-1.141902428157718,-0.8841013532743225,0.6892544771146777,1148.7468889856796,25.0,1148.7468889856796,25.0,,,1148.7468889856796,25.0,1148.7468889856796,25.0,,,,,869.5921197423122,25.0,869.5921197423122,25.0,,,869.5921197423122,25.0,869.5921197423122,25.0,,,,,2168.860067298752,0.0,1084.430033649376,0.0,52.41536136399558,0.0,1047.3667761905513,0.0,1121.4932911082003,0.0,-2.000000000000001,,0.0,,1009.6418154519044,10.0,1009.6418154519044,10.0,,,1009.6418154519044,10.0,1009.6418154519044,10.0,,,,,,,,,,,,,,,,,,,1113.5043630627235,0.0,1113.5043630627235,0.0,,,1113.5043630627235,0.0,1113.5043630627235,0.0,,,,,1116.136091183358,0.0,1116.136091183358,0.0,,,1116.136091183358,0.0,1116.136091183358,0.0,,,,,869.5921197423122,25.0,869.5921197423122,25.0,,,869.5921197423122,25.0,869.5921197423122,25.0,,,,,1113.5043630627235,0.0,1113.5043630627235,0.0,,,1113.5043630627235,0.0,1113.5043630627235,0.0,,,,,961.9595311462252,0.0,961.9595311462252,0.0,,,961.9595311462252,0.0,961.9595311462252,0.0,,,,,1104.613840392189,10.0,1104.613840392189,10.0,,,1104.613840392189,10.0,1104.613840392189,10.0,,,,,3342.4789898851004,25.0,1114.1596632950334,8.333333333333334,60.60531113004448,14.433756729740644,1047.3667761905513,0.0,1165.640254915553,25.0,-1.5,-1.5000000000000004,-0.4345291137240896,0.7071067811865478,2264.8829801690376,25.0,1132.4414900845188,12.5,23.059316265924885,17.67766952966369,1116.136091183358,0.0,1148.7468889856796,25.0,-1.999999999999999,-2.0,0.0,0.0,,,,,,,,,,,,,,,1009.6418154519044,10.0,1009.6418154519044,10.0,,,1009.6418154519044,10.0,1009.6418154519044,10.0,,,,,1121.4932911082003,0.0,1121.4932911082003,0.0,,,1121.4932911082003,0.0,1121.4932911082003,0.0,,,,
26,2022-01-01,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
29,2022-03-01,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1114.7487402931952,25.0,1114.7487402931952,25.0,,,1114.7487402931952,25.0,1114.7487402931952,25.0,,,,,2017.6059546732924,35.0,1008.8029773366462,17.5,60.73613126509213,10.606601717798211,965.8560470560632,10.0,1051.7499076172292,25.0,-2.000000000000004,-2.0,3.674123667735640...,0.0,,,,,,,,,,,,,,,1065.1207158221546,25.0,1065.1207158221546,25.0,,,1065.1207158221546,25.0,1065.1207158221546,25.0,,,,,2956.694563249848,35.0,985.564854416616,11.666666666666666,101.68215577844472,12.583057392117915,913.2907840738208,0.0,1101.8368062649242,25.0,-1.4999999999999991,-1.4999999999999998,0.6460883288015307,0.2390631469295449,,,,,,,,,,,,,,,2116.870623439384,50.0,1058.435311719692,25.0,9.454589151647497,0.0,1051.7499076172292,25.0,1065.1207158221546,25.0,-1.9999999999999976,,-3.80475937533656...,,,,,,,,,,,,,,,,1101.8368062649242,0.0,1101.8368062649242,0.0,,,1101.8368062649242,0.0,1101.8368062649242,0.0,,,,,1879.146831129884,35.0,939.573415564942,17.5,37.16925390959781,10.606601717798211,913.2907840738208,10.0,965.8560470560632,25.0,-2.0,-2.0,0.0,0.0,,,,,,,,,,,,,,,1114.7487402931952,25.0,1114.7487402931952,25.0,,,1114.7487402931952,25.0,1114.7487402931952,25.0,,,,,,,,,,,,,,,,,,,941.5669729111024,10.0,941.5669729111024,10.0,,,941.5669729111024,10.0,941.5669729111024,10.0,,,,,,,,,,,,,,,,,,


## correlation example

In [15]:
# Run only the "correlation" suite, with an empty options dict.
features = make_features.run(df=df, suites=["correlation"], options={})

# Report the generated column names and how many there are.
print(
    f"Features created: {features.columns}",
    f"Number of features: {len(features.columns)}",
    sep="\n",
)

Features created: ['consumer_id_ref', 'date_ref', 'corr_between___paid_value_discount']
Number of features: 3


In [16]:
# Show five rows of the correlation features.
features.limit(5)

consumer_id_ref,date_ref,corr_between___paid_value_discount
29,2022-01-01,
29,2022-02-01,0.5035732318016096
26,2022-02-01,-0.1045322492695291
26,2022-01-01,
29,2022-03-01,0.0107765119032923


In [17]:
# Run only the "categorical_statistics" suite, with an empty options dict.
features = make_features.run(df=df, suites=["categorical_statistics"], options={})

# Report the generated column names and how many there are.
print(
    f"Features created: {features.columns}",
    f"Number of features: {len(features.columns)}",
    sep="\n",
)

Features created: ['consumer_id_ref', 'date_ref', 'product_type=g__count___product_type', 'product_type=f__count___product_type', 'product_type=e__count___product_type', 'product_type=h__count___product_type', 'product_type=d__count___product_type', 'product_type=c__count___product_type', 'product_type=b__count___product_type', 'product_type=a__count___product_type', 'buy_type=f_3__count___buy_type', 'buy_type=f_5__count___buy_type', 'buy_type=f_8__count___buy_type', 'buy_type=f_4__count___buy_type', 'buy_type=f_2__count___buy_type', 'buy_type=f_9__count___buy_type', 'buy_type=f_1__count___buy_type', 'buy_type=f_7__count___buy_type', 'buy_type=f_6__count___buy_type', 'count___product_type', 'countDistinct___product_type', 'count___buy_type', 'countDistinct___buy_type']
Number of features: 23


In [18]:
# Show five rows of the categorical_statistics features.
features.limit(5)

                                                                                

consumer_id_ref,date_ref,product_type=g__count___product_type,product_type=f__count___product_type,product_type=e__count___product_type,product_type=h__count___product_type,product_type=d__count___product_type,product_type=c__count___product_type,product_type=b__count___product_type,product_type=a__count___product_type,buy_type=f_3__count___buy_type,buy_type=f_5__count___buy_type,buy_type=f_8__count___buy_type,buy_type=f_4__count___buy_type,buy_type=f_2__count___buy_type,buy_type=f_9__count___buy_type,buy_type=f_1__count___buy_type,buy_type=f_7__count___buy_type,buy_type=f_6__count___buy_type,count___product_type,countDistinct___product_type,count___buy_type,countDistinct___buy_type
70,2022-02-01,3,0,3,1,1,1,1,1,3,1,1,3,1,1,0,0,1,11,7,11,7
46,2022-03-01,3,1,0,0,0,1,2,0,2,2,0,0,0,1,1,1,0,7,4,7,5
55,2022-03-01,2,0,1,3,2,1,0,1,2,0,1,1,1,0,0,4,1,10,6,10,6
1,2022-04-01,1,1,2,2,1,1,1,1,2,0,1,2,1,0,1,1,2,10,8,10,7
80,2022-11-01,0,3,2,0,1,1,0,0,2,0,1,1,0,1,0,1,1,7,4,7,6


## first_observation_features example

In [19]:
# Run only the "first_observation_features" suite, with an empty options dict.
features = make_features.run(
    df=df, suites=["first_observation_features"], options={}
)

# Report the generated column names and how many there are.
print(
    f"Features created: {features.columns}",
    f"Number of features: {len(features.columns)}",
    sep="\n",
)

Features created: ['consumer_id_ref', 'date_ref', 'first___paid_value', 'first___discount']
Number of features: 4


In [20]:
# Show five rows of the first_observation_features output.
features.limit(5)

consumer_id_ref,date_ref,first___paid_value,first___discount
0,2022-01-01,,
0,2022-02-01,1160.6002074014891,0.0
0,2022-03-01,907.0417846729196,25.0
0,2022-04-01,1146.7341607305582,10.0
0,2022-05-01,1112.5181517372505,10.0


## last_observation_features example

In [21]:
# Run only the "last_observation_features" suite, with an empty options dict.
features = make_features.run(
    df=df, suites=["last_observation_features"], options={}
)

# Report the generated column names and how many there are.
print(
    f"Features created: {features.columns}",
    f"Number of features: {len(features.columns)}",
    sep="\n",
)

Features created: ['consumer_id_ref', 'date_ref', 'last___paid_value', 'last___discount']
Number of features: 4


In [22]:
# Show five rows of the last_observation_features output.
features.limit(5)

consumer_id_ref,date_ref,last___paid_value,last___discount
0,2022-01-01,,
0,2022-02-01,1017.1085418841382,25.0
0,2022-03-01,1014.1861518669832,10.0
0,2022-04-01,1232.613528271461,25.0
0,2022-05-01,1014.6397369752316,25.0


## lags example

In [23]:
# Run "numerical_statistics" followed by "lags"; the n_lags option requests
# lagged copies of the statistics for lags 1, 3, 4 and 5.
features = make_features.run(
    df=df,
    suites=["numerical_statistics", "lags"],
    options={"n_lags": [1, 3, 4, 5]},
)

# Report the generated column names and how many there are.
print(
    f"Features created: {features.columns}",
    f"Number of features: {len(features.columns)}",
    sep="\n",
)

Features created: ['consumer_id_ref', 'date_ref', 'sum___paid_value', 'sum___discount', 'mean___paid_value', 'mean___discount', 'stddev___paid_value', 'stddev___discount', 'min___paid_value', 'min___discount', 'max___paid_value', 'max___discount', 'kurtosis___paid_value', 'kurtosis___discount', 'skewness___paid_value', 'skewness___discount', 'lag=1_sum___paid_value', 'lag=1_sum___discount', 'lag=1_mean___paid_value', 'lag=1_mean___discount', 'lag=1_stddev___paid_value', 'lag=1_stddev___discount', 'lag=1_min___paid_value', 'lag=1_min___discount', 'lag=1_max___paid_value', 'lag=1_max___discount', 'lag=1_kurtosis___paid_value', 'lag=1_kurtosis___discount', 'lag=1_skewness___paid_value', 'lag=1_skewness___discount', 'lag=3_sum___paid_value', 'lag=3_sum___discount', 'lag=3_mean___paid_value', 'lag=3_mean___discount', 'lag=3_stddev___paid_value', 'lag=3_stddev___discount', 'lag=3_min___paid_value', 'lag=3_min___discount', 'lag=3_max___paid_value', 'lag=3_max___discount', 'lag=3_kurtosis___pa

In [24]:
# Show five rows of the statistics together with their lagged columns.
features.limit(5)

                                                                                

consumer_id_ref,date_ref,sum___paid_value,sum___discount,mean___paid_value,mean___discount,stddev___paid_value,stddev___discount,min___paid_value,min___discount,max___paid_value,max___discount,kurtosis___paid_value,kurtosis___discount,skewness___paid_value,skewness___discount,lag=1_sum___paid_value,lag=1_sum___discount,lag=1_mean___paid_value,lag=1_mean___discount,lag=1_stddev___paid_value,lag=1_stddev___discount,lag=1_min___paid_value,lag=1_min___discount,lag=1_max___paid_value,lag=1_max___discount,lag=1_kurtosis___paid_value,lag=1_kurtosis___discount,lag=1_skewness___paid_value,lag=1_skewness___discount,lag=3_sum___paid_value,lag=3_sum___discount,lag=3_mean___paid_value,lag=3_mean___discount,lag=3_stddev___paid_value,lag=3_stddev___discount,lag=3_min___paid_value,lag=3_min___discount,lag=3_max___paid_value,lag=3_max___discount,lag=3_kurtosis___paid_value,lag=3_kurtosis___discount,lag=3_skewness___paid_value,lag=3_skewness___discount,lag=4_sum___paid_value,lag=4_sum___discount,lag=4_mean___paid_value,lag=4_mean___discount,lag=4_stddev___paid_value,lag=4_stddev___discount,lag=4_min___paid_value,lag=4_min___discount,lag=4_max___paid_value,lag=4_max___discount,lag=4_kurtosis___paid_value,lag=4_kurtosis___discount,lag=4_skewness___paid_value,lag=4_skewness___discount,lag=5_sum___paid_value,lag=5_sum___discount,lag=5_mean___paid_value,lag=5_mean___discount,lag=5_stddev___paid_value,lag=5_stddev___discount,lag=5_min___paid_value,lag=5_min___discount,lag=5_max___paid_value,lag=5_max___discount,lag=5_kurtosis___paid_value,lag=5_kurtosis___discount,lag=5_skewness___paid_value,lag=5_skewness___discount
0,2022-01-01,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
0,2022-02-01,6425.02303262522,45.0,1070.837172104203,7.5,70.58560769477131,9.874208829065749,991.1823185540896,0.0,1160.6002074014891,25.0,-1.4862128264472745,-0.3905325443786989,0.33803836307444013,0.9387234089965416,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
0,2022-03-01,1921.2279365399029,35.0,960.6139682699514,17.5,75.76250860886378,10.606601717798211,907.0417846729196,10.0,1014.1861518669832,25.0,-2.000000000000001,-2.0,-3.78583866001275...,0.0,6425.02303262522,45.0,1070.837172104203,7.5,70.58560769477131,9.874208829065749,991.1823185540896,0.0,1160.6002074014891,25.0,-1.4862128264472745,-0.3905325443786989,0.33803836307444013,0.9387234089965416,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
0,2022-04-01,7241.335698029857,90.0,1034.4765282899796,12.857142857142858,121.03820134423728,9.063269671749657,913.4362107269887,0.0,1232.613528271461,25.0,-1.0770315605844292,-0.9517958412098296,0.5482194738419417,0.3293992662501247,1921.2279365399029,35.0,960.6139682699514,17.5,75.76250860886378,10.606601717798211,907.0417846729196,10.0,1014.1861518669832,25.0,-2.000000000000001,-2.0,-3.78583866001275...,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
0,2022-05-01,13005.825538275483,155.0,1000.4481183288832,11.923076923076923,78.80920113421799,11.46343126687016,848.2943132152986,0.0,1112.5181517372505,25.0,-0.6377760480965216,-1.7071386079714457,-0.639341122500975,0.1545915837587542,7241.335698029857,90.0,1034.4765282899796,12.857142857142858,121.03820134423728,9.063269671749657,913.4362107269887,0.0,1232.613528271461,25.0,-1.0770315605844292,-0.9517958412098296,0.5482194738419417,0.3293992662501247,6425.02303262522,45.0,1070.837172104203,7.5,70.58560769477131,9.874208829065749,991.1823185540896,0.0,1160.6002074014891,25.0,-1.4862128264472745,-0.3905325443786989,0.3380383630744401,0.9387234089965416,,,,,,,,,,,,,,,,,,,,,,,,,,,,


## increase_rate example

In [25]:
# Run "numerical_statistics" followed by "increase_rate", with an empty options dict.
features = make_features.run(
    df=df,
    suites=["numerical_statistics", "increase_rate"],
    options={},
)

# Report the generated column names and how many there are.
print(
    f"Features created: {features.columns}",
    f"Number of features: {len(features.columns)}",
    sep="\n",
)

Features created: ['consumer_id_ref', 'date_ref', 'sum___paid_value', 'sum___discount', 'mean___paid_value', 'mean___discount', 'stddev___paid_value', 'stddev___discount', 'min___paid_value', 'min___discount', 'max___paid_value', 'max___discount', 'kurtosis___paid_value', 'kurtosis___discount', 'skewness___paid_value', 'skewness___discount', 'increase_rate_sum___paid_value', 'increase_rate_sum___discount', 'increase_rate_mean___paid_value', 'increase_rate_mean___discount', 'increase_rate_stddev___paid_value', 'increase_rate_stddev___discount', 'increase_rate_min___paid_value', 'increase_rate_min___discount', 'increase_rate_max___paid_value', 'increase_rate_max___discount', 'increase_rate_kurtosis___paid_value', 'increase_rate_kurtosis___discount', 'increase_rate_skewness___paid_value', 'increase_rate_skewness___discount']
Number of features: 30


In [26]:
# Show five rows of the statistics together with their increase_rate columns.
features.limit(5)



consumer_id_ref,date_ref,sum___paid_value,sum___discount,mean___paid_value,mean___discount,stddev___paid_value,stddev___discount,min___paid_value,min___discount,max___paid_value,max___discount,kurtosis___paid_value,kurtosis___discount,skewness___paid_value,skewness___discount,increase_rate_sum___paid_value,increase_rate_sum___discount,increase_rate_mean___paid_value,increase_rate_mean___discount,increase_rate_stddev___paid_value,increase_rate_stddev___discount,increase_rate_min___paid_value,increase_rate_min___discount,increase_rate_max___paid_value,increase_rate_max___discount,increase_rate_kurtosis___paid_value,increase_rate_kurtosis___discount,increase_rate_skewness___paid_value,increase_rate_skewness___discount
0,2022-01-01,,,,,,,,,,,,,,,,,,,,,,,,,,,,
0,2022-02-01,6425.02303262522,45.0,1070.837172104203,7.5,70.58560769477131,9.874208829065749,991.1823185540896,0.0,1160.6002074014891,25.0,-1.4862128264472745,-0.3905325443786989,0.33803836307444013,0.9387234089965416,,,,,,,,,,,,,,
0,2022-03-01,1921.2279365399029,35.0,960.6139682699514,17.5,75.76250860886378,10.606601717798211,907.0417846729196,10.0,1014.1861518669832,25.0,-2.000000000000001,-2.0,-3.78583866001275...,0.0,-0.7009772685974478,-0.2222222222222222,-0.10293180579234323,1.3333333333333333,0.0733421597286319,0.0741723110591494,-0.0848890585577756,,-0.12615373890232,0.0,0.3457022873237557,4.121212121212112,-1.000000000000001,-1.0
0,2022-04-01,7241.335698029857,90.0,1034.4765282899796,12.857142857142858,121.03820134423728,9.063269671749657,913.4362107269887,0.0,1232.613528271461,25.0,-1.0770315605844292,-0.9517958412098296,0.5482194738419417,0.3293992662501247,2.7691184686140744,1.5714285714285714,0.0768909910325927,-0.2653061224489795,0.5976002321823396,-0.1455067407177924,0.0070497590762866,-1.0,0.2153720754344572,0.0,-0.4614842197077856,-0.5241020793950852,-1.44807933743297...,
0,2022-05-01,13005.825538275483,155.0,1000.4481183288832,11.923076923076923,78.80920113421799,11.46343126687016,848.2943132152986,0.0,1112.5181517372505,25.0,-0.6377760480965216,-1.7071386079714457,-0.639341122500975,0.1545915837587542,0.796053391339662,0.7222222222222222,-0.03289432774018...,-0.0726495726495726,-0.3488898524682997,0.264822926167787,-0.0713152125421487,,-0.0974314931482416,0.0,-0.4078390351435519,0.7935974649789376,-2.1662138121808217,-0.5306863141541813
