# How to use `autofeat`

In [1]:
import warnings

# Hide FutureWarning messages so they do not clutter the cell outputs below.
warnings.simplefilter("ignore", FutureWarning)

## Initialize PySpark

In [2]:
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

In [3]:
# Build (or reuse) a Spark session running locally on all available cores.
session_builder = SparkSession.builder.master("local[*]")
for conf_key, conf_value in (
    ("spark.executor.memory", "6g"),
    ("spark.driver.memory", "6g"),
):
    session_builder = session_builder.config(conf_key, conf_value)
spark = session_builder.getOrCreate()

# With eager evaluation on, a DataFrame left as the last expression of a cell
# is rendered as a table instead of its schema-only repr.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/02/19 17:14:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Import the data

In [4]:
from examples.make_data import make_transactions

In [5]:
# Peek at the first rows of the generated example transactions.
make_transactions().head()

Unnamed: 0,transaction_id,consumer_id,paid_value,discount,product_type,buy_type,paymnent_date
0,0,29,1021.731939,10,h,f_4,2022-01-01 00:00:00.000000000
1,1,98,978.323923,10,c,f_7,2022-01-01 00:52:33.915391539
2,2,98,1017.947599,0,d,f_8,2022-01-01 01:45:07.830783078
3,3,38,1175.278194,25,a,f_2,2022-01-01 02:37:41.746174617
4,4,15,853.543907,10,a,f_1,2022-01-01 03:30:15.661566156


## Generate public

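The *public* is the set of (consumer, reference date) pairs for which features are generated. Here we use the monthly active users: each consumer is paired with the first day of every month in which they made at least one payment.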

In [6]:
# Load the generated example transactions into Spark.
transactions = spark.createDataFrame(make_transactions())

# Key columns of the public. "paymnent_date" (sic) is the column name as it
# appears in the generated data.
consumer_ref = F.col("consumer_id").alias("consumer_id_ref")
month_ref = F.to_date(F.date_trunc("month", "paymnent_date")).alias("date_ref")

# One row per distinct (consumer, first day of a month with a payment).
public = transactions.select(consumer_ref, month_ref).distinct()

## Generate Dataset

In [7]:
from autofeat.types import Dataset

In [8]:
# Bundle the transactions table, its feature columns, and the public it is
# joined against into a single Dataset description.
df = Dataset(
    # Source table and its row identifier.
    table=transactions,
    primary_key_col="transaction_id",
    # Columns the generated features are named after.
    numerical_cols=["paid_value", "discount"],
    categorical_cols=["product_type", "buy_type"],
    # Join columns on the table side ...
    table_join_key_col="consumer_id",
    table_join_date_col="paymnent_date",
    # ... and the matching columns on the public side.
    public=public,
    public_join_key_col="consumer_id_ref",
    public_join_date_col="date_ref",
)


In [9]:
from autofeat import make_features

## numerical_statistics example

In [10]:
# Run only the "numerical_statistics" suite, passing no extra options.
features = make_features.run(df=df, suites=["numerical_statistics"], options={})

# The column list (and therefore the count) includes the public's key columns.
feature_names = features.columns
print(f"Features created: {feature_names}")
print(f"Number of features: {len(feature_names)}")

Features created: ['consumer_id_ref', 'date_ref', 'sum___paid_value', 'sum___discount', 'mean___paid_value', 'mean___discount', 'stddev___paid_value', 'stddev___discount', 'min___paid_value', 'min___discount', 'max___paid_value', 'max___discount', 'kurtosis___paid_value', 'kurtosis___discount', 'skewness___paid_value', 'skewness___discount']
Number of features: 16


In [11]:
# Preview five rows of the feature table.
# NOTE(review): rows for a consumer's earliest date_ref come back empty, presumably
# because only transactions before date_ref are aggregated; confirm against autofeat.
features.limit(5)

23/02/19 17:15:02 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


                                                                                

consumer_id_ref,date_ref,sum___paid_value,sum___discount,mean___paid_value,mean___discount,stddev___paid_value,stddev___discount,min___paid_value,min___discount,max___paid_value,max___discount,kurtosis___paid_value,kurtosis___discount,skewness___paid_value,skewness___discount
29,2022-01-01,,,,,,,,,,,,,,
29,2022-02-01,12884.503346711668,205.0,991.1156420547436,15.76923076923077,102.66345300595484,10.963225241337865,831.0771002092232,0.0,1213.2540898678285,25.0,-0.0916291440820411,-1.5023466666666672,0.5059625417668084,-0.4498627938975761
26,2022-02-01,7508.382270897559,145.0,938.5477838621948,18.125,122.70809047827476,9.977653603356424,699.015702424566,0.0,1117.5160783905533,25.0,0.174557660800096,-0.8945685616038936,-0.5859000078847572,-0.8306056504415191
26,2022-01-01,,,,,,,,,,,,,,
29,2022-03-01,21210.539323911595,285.0,1010.0256820910284,13.571428571428571,103.5649985708696,10.856202966836188,831.0771002092232,0.0,1213.2540898678285,25.0,-0.4240963736640388,-1.6466942148760342,0.4773598404355856,-0.0735490759050809


## numerical_in_categorical_groups example

In [12]:
# Run only the "numerical_in_categorical_groups" suite, passing no extra options.
features = make_features.run(
    df=df, suites=["numerical_in_categorical_groups"], options={}
)

# The column list (and therefore the count) includes the public's key columns.
feature_names = features.columns
print(f"Features created: {feature_names}")
print(f"Number of features: {len(feature_names)}")

Features created: ['consumer_id_ref', 'date_ref', 'product_type=g__sum___paid_value', 'product_type=g__sum___discount', 'product_type=g__mean___paid_value', 'product_type=g__mean___discount', 'product_type=g__stddev___paid_value', 'product_type=g__stddev___discount', 'product_type=g__min___paid_value', 'product_type=g__min___discount', 'product_type=g__max___paid_value', 'product_type=g__max___discount', 'product_type=g__kurtosis___paid_value', 'product_type=g__kurtosis___discount', 'product_type=g__skewness___paid_value', 'product_type=g__skewness___discount', 'product_type=None__sum___paid_value', 'product_type=None__sum___discount', 'product_type=None__mean___paid_value', 'product_type=None__mean___discount', 'product_type=None__stddev___paid_value', 'product_type=None__stddev___discount', 'product_type=None__min___paid_value', 'product_type=None__min___discount', 'product_type=None__max___paid_value', 'product_type=None__max___discount', 'product_type=None__kurtosis___paid_value', 

In [13]:
# Preview five rows; this table is very wide (one column per category value,
# statistic and numerical column).
features.limit(5)


[Stage 60:>                                                         (0 + 1) / 1]

23/02/19 17:15:29 WARN DAGScheduler: Broadcasting large task binary with size 1279.3 KiB


[Stage 73:>                                                         (0 + 1) / 1]

23/02/19 17:15:35 WARN DAGScheduler: Broadcasting large task binary with size 1279.3 KiB



                                                                                

consumer_id_ref,date_ref,product_type=g__sum___paid_value,product_type=g__sum___discount,product_type=g__mean___paid_value,product_type=g__mean___discount,product_type=g__stddev___paid_value,product_type=g__stddev___discount,product_type=g__min___paid_value,product_type=g__min___discount,product_type=g__max___paid_value,product_type=g__max___discount,product_type=g__kurtosis___paid_value,product_type=g__kurtosis___discount,product_type=g__skewness___paid_value,product_type=g__skewness___discount,product_type=None__sum___paid_value,product_type=None__sum___discount,product_type=None__mean___paid_value,product_type=None__mean___discount,product_type=None__stddev___paid_value,product_type=None__stddev___discount,product_type=None__min___paid_value,product_type=None__min___discount,product_type=None__max___paid_value,product_type=None__max___discount,product_type=None__kurtosis___paid_value,product_type=None__kurtosis___discount,product_type=None__skewness___paid_value,product_type=None__skewness___discount,product_type=f__sum___paid_value,product_type=f__sum___discount,product_type=f__mean___paid_value,product_type=f__mean___discount,product_type=f__stddev___paid_value,product_type=f__stddev___discount,product_type=f__min___paid_value,product_type=f__min___discount,product_type=f__max___paid_value,product_type=f__max___discount,product_type=f__kurtosis___paid_value,product_type=f__kurtosis___discount,product_type=f__skewness___paid_value,product_type=f__skewness___discount,product_type=e__sum___paid_value,product_type=e__sum___discount,product_type=e__mean___paid_value,product_type=e__mean___discount,product_type=e__stddev___paid_value,product_type=e__stddev___discount,product_type=e__min___paid_value,product_type=e__min___discount,product_type=e__max___paid_value,product_type=e__max___discount,product_type=e__kurtosis___paid_value,product_type=e__kurtosis___discount,product_type=e__skewness___paid_value,product_type=e__skewness___discount,product_type=h__sum___paid_va
lue,product_type=h__sum___discount,product_type=h__mean___paid_value,product_type=h__mean___discount,product_type=h__stddev___paid_value,product_type=h__stddev___discount,product_type=h__min___paid_value,product_type=h__min___discount,product_type=h__max___paid_value,product_type=h__max___discount,product_type=h__kurtosis___paid_value,product_type=h__kurtosis___discount,product_type=h__skewness___paid_value,product_type=h__skewness___discount,product_type=d__sum___paid_value,product_type=d__sum___discount,product_type=d__mean___paid_value,product_type=d__mean___discount,product_type=d__stddev___paid_value,product_type=d__stddev___discount,product_type=d__min___paid_value,product_type=d__min___discount,product_type=d__max___paid_value,product_type=d__max___discount,product_type=d__kurtosis___paid_value,product_type=d__kurtosis___discount,product_type=d__skewness___paid_value,product_type=d__skewness___discount,product_type=c__sum___paid_value,product_type=c__sum___discount,product_type=c__mean___paid_value,product_type=c__mean___discount,product_type=c__stddev___paid_value,product_type=c__stddev___discount,product_type=c__min___paid_value,product_type=c__min___discount,product_type=c__max___paid_value,product_type=c__max___discount,product_type=c__kurtosis___paid_value,product_type=c__kurtosis___discount,product_type=c__skewness___paid_value,product_type=c__skewness___discount,product_type=b__sum___paid_value,product_type=b__sum___discount,product_type=b__mean___paid_value,product_type=b__mean___discount,product_type=b__stddev___paid_value,product_type=b__stddev___discount,product_type=b__min___paid_value,product_type=b__min___discount,product_type=b__max___paid_value,product_type=b__max___discount,product_type=b__kurtosis___paid_value,product_type=b__kurtosis___discount,product_type=b__skewness___paid_value,product_type=b__skewness___discount,product_type=a__sum___paid_value,product_type=a__sum___discount,product_type=a__mean___paid_value,product_type=a__mean___disc
ount,product_type=a__stddev___paid_value,product_type=a__stddev___discount,product_type=a__min___paid_value,product_type=a__min___discount,product_type=a__max___paid_value,product_type=a__max___discount,product_type=a__kurtosis___paid_value,product_type=a__kurtosis___discount,product_type=a__skewness___paid_value,product_type=a__skewness___discount,buy_type=f_3__sum___paid_value,buy_type=f_3__sum___discount,buy_type=f_3__mean___paid_value,buy_type=f_3__mean___discount,buy_type=f_3__stddev___paid_value,buy_type=f_3__stddev___discount,buy_type=f_3__min___paid_value,buy_type=f_3__min___discount,buy_type=f_3__max___paid_value,buy_type=f_3__max___discount,buy_type=f_3__kurtosis___paid_value,buy_type=f_3__kurtosis___discount,buy_type=f_3__skewness___paid_value,buy_type=f_3__skewness___discount,buy_type=f_5__sum___paid_value,buy_type=f_5__sum___discount,buy_type=f_5__mean___paid_value,buy_type=f_5__mean___discount,buy_type=f_5__stddev___paid_value,buy_type=f_5__stddev___discount,buy_type=f_5__min___paid_value,buy_type=f_5__min___discount,buy_type=f_5__max___paid_value,buy_type=f_5__max___discount,buy_type=f_5__kurtosis___paid_value,buy_type=f_5__kurtosis___discount,buy_type=f_5__skewness___paid_value,buy_type=f_5__skewness___discount,buy_type=None__sum___paid_value,buy_type=None__sum___discount,buy_type=None__mean___paid_value,buy_type=None__mean___discount,buy_type=None__stddev___paid_value,buy_type=None__stddev___discount,buy_type=None__min___paid_value,buy_type=None__min___discount,buy_type=None__max___paid_value,buy_type=None__max___discount,buy_type=None__kurtosis___paid_value,buy_type=None__kurtosis___discount,buy_type=None__skewness___paid_value,buy_type=None__skewness___discount,buy_type=f_8__sum___paid_value,buy_type=f_8__sum___discount,buy_type=f_8__mean___paid_value,buy_type=f_8__mean___discount,buy_type=f_8__stddev___paid_value,buy_type=f_8__stddev___discount,buy_type=f_8__min___paid_value,buy_type=f_8__min___discount,buy_type=f_8__max___paid_value,buy_type=f_8
__max___discount,buy_type=f_8__kurtosis___paid_value,buy_type=f_8__kurtosis___discount,buy_type=f_8__skewness___paid_value,buy_type=f_8__skewness___discount,buy_type=f_4__sum___paid_value,buy_type=f_4__sum___discount,buy_type=f_4__mean___paid_value,buy_type=f_4__mean___discount,buy_type=f_4__stddev___paid_value,buy_type=f_4__stddev___discount,buy_type=f_4__min___paid_value,buy_type=f_4__min___discount,buy_type=f_4__max___paid_value,buy_type=f_4__max___discount,buy_type=f_4__kurtosis___paid_value,buy_type=f_4__kurtosis___discount,buy_type=f_4__skewness___paid_value,buy_type=f_4__skewness___discount,buy_type=f_2__sum___paid_value,buy_type=f_2__sum___discount,buy_type=f_2__mean___paid_value,buy_type=f_2__mean___discount,buy_type=f_2__stddev___paid_value,buy_type=f_2__stddev___discount,buy_type=f_2__min___paid_value,buy_type=f_2__min___discount,buy_type=f_2__max___paid_value,buy_type=f_2__max___discount,buy_type=f_2__kurtosis___paid_value,buy_type=f_2__kurtosis___discount,buy_type=f_2__skewness___paid_value,buy_type=f_2__skewness___discount,buy_type=f_9__sum___paid_value,buy_type=f_9__sum___discount,buy_type=f_9__mean___paid_value,buy_type=f_9__mean___discount,buy_type=f_9__stddev___paid_value,buy_type=f_9__stddev___discount,buy_type=f_9__min___paid_value,buy_type=f_9__min___discount,buy_type=f_9__max___paid_value,buy_type=f_9__max___discount,buy_type=f_9__kurtosis___paid_value,buy_type=f_9__kurtosis___discount,buy_type=f_9__skewness___paid_value,buy_type=f_9__skewness___discount,buy_type=f_1__sum___paid_value,buy_type=f_1__sum___discount,buy_type=f_1__mean___paid_value,buy_type=f_1__mean___discount,buy_type=f_1__stddev___paid_value,buy_type=f_1__stddev___discount,buy_type=f_1__min___paid_value,buy_type=f_1__min___discount,buy_type=f_1__max___paid_value,buy_type=f_1__max___discount,buy_type=f_1__kurtosis___paid_value,buy_type=f_1__kurtosis___discount,buy_type=f_1__skewness___paid_value,buy_type=f_1__skewness___discount,buy_type=f_7__sum___paid_value,buy_type=f_7__sum___
discount,buy_type=f_7__mean___paid_value,buy_type=f_7__mean___discount,buy_type=f_7__stddev___paid_value,buy_type=f_7__stddev___discount,buy_type=f_7__min___paid_value,buy_type=f_7__min___discount,buy_type=f_7__max___paid_value,buy_type=f_7__max___discount,buy_type=f_7__kurtosis___paid_value,buy_type=f_7__kurtosis___discount,buy_type=f_7__skewness___paid_value,buy_type=f_7__skewness___discount,buy_type=f_6__sum___paid_value,buy_type=f_6__sum___discount,buy_type=f_6__mean___paid_value,buy_type=f_6__mean___discount,buy_type=f_6__stddev___paid_value,buy_type=f_6__stddev___discount,buy_type=f_6__min___paid_value,buy_type=f_6__min___discount,buy_type=f_6__max___paid_value,buy_type=f_6__max___discount,buy_type=f_6__kurtosis___paid_value,buy_type=f_6__kurtosis___discount,buy_type=f_6__skewness___paid_value,buy_type=f_6__skewness___discount
29,2022-01-01,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
29,2022-02-01,,,,,,,,,,,,,,,,,,,,,,,,,,,,,884.5944592908111,25.0,884.5944592908111,25.0,,,884.5944592908111,25.0,884.5944592908111,25.0,,,,,886.2905776828845,25.0,886.2905776828845,25.0,,,886.2905776828845,25.0,886.2905776828845,25.0,,,,,1213.2540898678285,10.0,1213.2540898678285,10.0,,,1213.2540898678285,10.0,1213.2540898678285,10.0,,,,,2083.8590821960192,25.0,1041.9295410980096,12.5,83.86297857953484,17.67766952966369,982.6294602539184,0.0,1101.229621942101,25.0,-2.000000000000001,-2.0,0.0,0.0,831.0771002092232,25.0,831.0771002092232,25.0,,,831.0771002092232,25.0,831.0771002092232,25.0,,,,,3909.098480634181,60.0,977.2746201585452,15.0,43.81758685407173,12.24744871391589,924.5760523472331,0.0,1017.3549481973808,25.0,-1.6199093756992955,-1.5925925925925926,-0.2604085257392011,-0.3142696805273544,3076.3295568307176,35.0,1025.4431856102392,11.666666666666666,58.84747546374615,12.583057392117915,964.2147974008212,0.0,1081.5787183634952,25.0,-1.4999999999999996,-1.4999999999999998,-0.1577994209484219,0.2390631469295449,,,,,,,,,,,,,,,3252.83718793302,45.0,1084.27906264434,15.0,112.21123983157496,8.660254037844387,1009.04705699879,10.0,1213.2540898678285,25.0,-1.4999999999999991,-1.5,0.6780444320237246,0.7071067811865476,,,,,,,,,,,,,,,1999.9844084512995,25.0,999.9922042256496,12.5,24.55462800483391,17.67766952966369,982.6294602539184,0.0,1017.3549481973808,25.0,-2.0000000000000018,-2.0,0.0,0.0,2872.114658915797,75.0,957.371552971932,25.0,124.58762864052495,0.0,884.5944592908111,25.0,1101.229621942101,25.0,-1.5000000000000009,,0.7069593492304863,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1755.6531525564565,35.0,877.8265762782282,17.5,66.11374309062336,10.606601717798211,831.0771002092232,10.0,924.5760523472331,25.0,-2.000000000000001,-2.0,0.0,0.0,1081.5787183634952,0.0,1081.5787183634952,0.0,,,1081.5787183634952,0.0,1081.5787183634952,0.0,,,,,1922.3352204915984,25.0,961.1676102457992,12.5,4.3093734017213015,17.67766952966369,958.1204230907772,0.0,964.2147974008212,25.0,-2.00000000000000
04,-2.0,0.0,0.0
26,2022-02-01,1117.5160783905533,25.0,1117.5160783905533,25.0,,,1117.5160783905533,25.0,1117.5160783905533,25.0,,,,,,,,,,,,,,,,,,,1849.5291491085488,25.0,924.7645745542744,12.5,22.252096881465388,17.67766952966369,909.0299659537702,0.0,940.4991831547786,25.0,-2.0000000000000004,-2.0,0.0,0.0,1927.3001443862136,35.0,963.6500721931068,17.5,91.88965002307609,10.606601717798211,898.674277540931,10.0,1028.6258668452824,25.0,-1.999999999999999,-2.0,4.243804258292301...,0.0,,,,,,,,,,,,,,,909.0471271961792,25.0,909.0471271961792,25.0,,,909.0471271961792,25.0,909.0471271961792,25.0,,,,,,,,,,,,,,,,,,,1704.989771816064,35.0,852.4948859080321,17.5,217.05234282426636,10.606601717798211,699.015702424566,10.0,1005.974069391498,25.0,-2.000000000000001,-2.0,-5.15206917548129...,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1005.974069391498,10.0,1005.974069391498,10.0,,,1005.974069391498,10.0,1005.974069391498,10.0,,,,,4486.217845574576,100.0,897.2435691149152,20.0,121.1454565484237,11.180339887498947,699.015702424566,0.0,1028.6258668452824,25.0,-0.3224142894162622,0.2500000000000004,-0.8547657407531712,-1.5000000000000002,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,898.674277540931,10.0,898.674277540931,10.0,,,898.674277540931,10.0,898.674277540931,10.0,,,,,1117.5160783905533,25.0,1117.5160783905533,25.0,,,1117.5160783905533,25.0,1117.5160783905533,25.0,,,,
26,2022-01-01,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
29,2022-03-01,922.5596334496768,10.0,922.5596334496768,10.0,,,922.5596334496768,10.0,922.5596334496768,10.0,,,,,,,,,,,,,,,,,,,884.5944592908111,25.0,884.5944592908111,25.0,,,884.5944592908111,25.0,884.5944592908111,25.0,,,,,4047.191603288681,75.0,1011.7979008221704,18.75,117.00593730173313,12.499999999999998,886.2905776828845,0.0,1169.1982542099925,25.0,-0.9556889516290736,-0.6666666666666665,0.4596043461170481,-1.1547005383792517,2420.253837353,10.0,1210.1269186764998,5.0,4.422487910639596,7.071067811865476,1206.999747485171,0.0,1213.2540898678285,10.0,-1.9999999999999984,-2.0,0.0,0.0,3176.752015585457,35.0,1058.9173385284855,11.666666666666666,66.19860551954564,12.583057392117915,982.6294602539184,0.0,1101.229621942101,25.0,-1.4999999999999991,-1.5000000000000009,-0.6945114883456114,0.2390631469295452,1827.969447590202,25.0,913.984723795101,12.5,117.24908569927176,17.67766952966369,831.0771002092232,0.0,996.8923473809788,25.0,-2.0,-2.0,0.0,0.0,3909.098480634181,60.0,977.2746201585452,15.0,43.81758685407173,12.24744871391589,924.5760523472331,0.0,1017.3549481973808,25.0,-1.6199093756992955,-1.5925925925925926,-0.2604085257392011,-0.3142696805273544,4022.1198467195895,45.0,1005.5299616798974,11.25,62.40856934885263,10.307764064044152,945.790289888872,0.0,1081.5787183634952,25.0,-1.550060685136163,-0.961937716262976,0.2719399956535309,0.4118470835376499,991.6308410556854,25.0,991.6308410556854,25.0,,,991.6308410556854,25.0,991.6308410556854,25.0,,,,,5198.6994081620105,55.0,1039.739881632402,11.0,101.89947592262668,8.944271909999157,945.790289888872,0.0,1213.2540898678285,25.0,-0.1371650441450804,-0.4091796875000009,1.136622856368236,0.5507812500000008,,,,,,,,,,,,,,,2922.544041900976,35.0,974.1813473003252,11.666666666666666,47.95900211520848,12.583057392117915,922.5596334496768,0.0,1017.3549481973808,25.0,-1.4999999999999998,-1.5000000000000009,-0.3135717124003845,0.2390631469295452,3869.007006296775,75.0,967.2517515741938,18.75,103.62685352356036,12.499999999999998,
884.5944592908111,0.0,1101.229621942101,25.0,-1.409700219854243,-0.6666666666666665,0.4619435984415551,-1.1547005383792517,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4131.85115425162,60.0,1032.962788562905,15.0,183.8064932843911,12.24744871391589,831.0771002092232,0.0,1206.999747485171,25.0,-1.8080670781329555,-1.5925925925925926,-0.1054803564093044,-0.3142696805273542,1081.5787183634952,0.0,1081.5787183634952,0.0,,,1081.5787183634952,0.0,1081.5787183634952,0.0,,,,,3015.228153881036,35.0,1005.0760512936786,11.666666666666666,76.1126726329002,12.583057392117915,958.1204230907772,0.0,1092.8929333894375,25.0,-1.4999999999999991,-1.5000000000000009,0.7020100516901682,0.2390631469295452


## correlation example

In [14]:
# Run only the "correlation" suite, passing no extra options.
features = make_features.run(df=df, suites=["correlation"], options={})

# The column list (and therefore the count) includes the public's key columns.
feature_names = features.columns
print(f"Features created: {feature_names}")
print(f"Number of features: {len(feature_names)}")

Features created: ['consumer_id_ref', 'date_ref', 'corr_between___paid_value_discount']
Number of features: 3


In [15]:
# Preview five rows of the correlation feature table.
features.limit(5)

consumer_id_ref,date_ref,corr_between___paid_value_discount
29,2022-01-01,
29,2022-02-01,-0.3072521522599351
26,2022-02-01,0.0378809425240437
26,2022-01-01,
29,2022-03-01,-0.2310407779936052


## categorical_statistics example

In [16]:
# Run only the "categorical_statistics" suite, passing no extra options.
features = make_features.run(df=df, suites=["categorical_statistics"], options={})

# The column list (and therefore the count) includes the public's key columns.
feature_names = features.columns
print(f"Features created: {feature_names}")
print(f"Number of features: {len(feature_names)}")

Features created: ['consumer_id_ref', 'date_ref', 'product_type=g__count___product_type', 'product_type=None__count___product_type', 'product_type=f__count___product_type', 'product_type=e__count___product_type', 'product_type=h__count___product_type', 'product_type=d__count___product_type', 'product_type=c__count___product_type', 'product_type=b__count___product_type', 'product_type=a__count___product_type', 'buy_type=f_3__count___buy_type', 'buy_type=f_5__count___buy_type', 'buy_type=None__count___buy_type', 'buy_type=f_8__count___buy_type', 'buy_type=f_4__count___buy_type', 'buy_type=f_2__count___buy_type', 'buy_type=f_9__count___buy_type', 'buy_type=f_1__count___buy_type', 'buy_type=f_7__count___buy_type', 'buy_type=f_6__count___buy_type', 'count___product_type', 'countDistinct___product_type', 'count___buy_type', 'countDistinct___buy_type']
Number of features: 25


In [17]:
# Preview five rows of the categorical count features.
features.limit(5)

consumer_id_ref,date_ref,product_type=g__count___product_type,product_type=None__count___product_type,product_type=f__count___product_type,product_type=e__count___product_type,product_type=h__count___product_type,product_type=d__count___product_type,product_type=c__count___product_type,product_type=b__count___product_type,product_type=a__count___product_type,buy_type=f_3__count___buy_type,buy_type=f_5__count___buy_type,buy_type=None__count___buy_type,buy_type=f_8__count___buy_type,buy_type=f_4__count___buy_type,buy_type=f_2__count___buy_type,buy_type=f_9__count___buy_type,buy_type=f_1__count___buy_type,buy_type=f_7__count___buy_type,buy_type=f_6__count___buy_type,count___product_type,countDistinct___product_type,count___buy_type,countDistinct___buy_type
70,2022-02-01,2,0,3,0,1,2,1,2,4,3,0,0,1,5,1,3,1,1,0,15,7,15,7
46,2022-03-01,1,0,1,0,1,1,5,1,1,0,1,0,2,1,1,2,2,1,1,11,7,11,8
55,2022-03-01,2,0,3,1,2,0,3,2,1,0,1,0,1,1,1,4,3,1,2,14,7,14,8
80,2022-11-01,2,0,1,3,0,1,2,3,4,3,1,0,3,3,0,2,1,1,2,16,7,16,8
1,2022-04-01,1,0,1,4,1,0,2,1,6,3,3,0,0,1,2,1,1,1,4,16,7,16,8


## first_observation_features example

In [18]:
# Run only the "first_observation_features" suite, passing no extra options.
features = make_features.run(
    df=df, suites=["first_observation_features"], options={}
)

# The column list (and therefore the count) includes the public's key columns.
feature_names = features.columns
print(f"Features created: {feature_names}")
print(f"Number of features: {len(feature_names)}")

Features created: ['consumer_id_ref', 'date_ref', 'first___paid_value', 'first___discount']
Number of features: 4


In [19]:
# Preview five rows of the first-observation features.
features.limit(5)

consumer_id_ref,date_ref,first___paid_value,first___discount
0,2022-01-01,,
0,2022-02-01,724.6474816548626,0.0
0,2022-03-01,724.6474816548626,0.0
0,2022-04-01,724.6474816548626,0.0
0,2022-05-01,989.642360404834,0.0


## last_observation_features example

In [20]:
# Run only the "last_observation_features" suite, passing no extra options.
features = make_features.run(
    df=df, suites=["last_observation_features"], options={}
)

# The column list (and therefore the count) includes the public's key columns.
feature_names = features.columns
print(f"Features created: {feature_names}")
print(f"Number of features: {len(feature_names)}")

Features created: ['consumer_id_ref', 'date_ref', 'last___paid_value', 'last___discount']
Number of features: 4


In [21]:
# Preview five rows of the last-observation features.
features.limit(5)

consumer_id_ref,date_ref,last___paid_value,last___discount
0,2022-01-01,,
0,2022-02-01,989.642360404834,0.0
0,2022-03-01,1068.3206046873418,25.0
0,2022-04-01,940.2798461945032,25.0
0,2022-05-01,1047.8739332006205,10.0


## lags example

In [22]:
# Combine the numerical statistics with the "lags" suite; "n_lags" lists the
# lag steps to generate (each yields a lag=<n>_* copy of every statistic).
lag_options = {"n_lags": [1, 3, 4, 5]}
features = make_features.run(
    df=df, suites=["numerical_statistics", "lags"], options=lag_options
)

# The column list (and therefore the count) includes the public's key columns.
feature_names = features.columns
print(f"Features created: {feature_names}")
print(f"Number of features: {len(feature_names)}")

Features created: ['consumer_id_ref', 'date_ref', 'sum___paid_value', 'sum___discount', 'mean___paid_value', 'mean___discount', 'stddev___paid_value', 'stddev___discount', 'min___paid_value', 'min___discount', 'max___paid_value', 'max___discount', 'kurtosis___paid_value', 'kurtosis___discount', 'skewness___paid_value', 'skewness___discount', 'lag=1_sum___paid_value', 'lag=1_sum___discount', 'lag=1_mean___paid_value', 'lag=1_mean___discount', 'lag=1_stddev___paid_value', 'lag=1_stddev___discount', 'lag=1_min___paid_value', 'lag=1_min___discount', 'lag=1_max___paid_value', 'lag=1_max___discount', 'lag=1_kurtosis___paid_value', 'lag=1_kurtosis___discount', 'lag=1_skewness___paid_value', 'lag=1_skewness___discount', 'lag=3_sum___paid_value', 'lag=3_sum___discount', 'lag=3_mean___paid_value', 'lag=3_mean___discount', 'lag=3_stddev___paid_value', 'lag=3_stddev___discount', 'lag=3_min___paid_value', 'lag=3_min___discount', 'lag=3_max___paid_value', 'lag=3_max___discount', 'lag=3_kurtosis___pa

In [23]:
# Preview five rows; lag columns stay empty until enough earlier reference dates exist.
features.limit(5)

consumer_id_ref,date_ref,sum___paid_value,sum___discount,mean___paid_value,mean___discount,stddev___paid_value,stddev___discount,min___paid_value,min___discount,max___paid_value,max___discount,kurtosis___paid_value,kurtosis___discount,skewness___paid_value,skewness___discount,lag=1_sum___paid_value,lag=1_sum___discount,lag=1_mean___paid_value,lag=1_mean___discount,lag=1_stddev___paid_value,lag=1_stddev___discount,lag=1_min___paid_value,lag=1_min___discount,lag=1_max___paid_value,lag=1_max___discount,lag=1_kurtosis___paid_value,lag=1_kurtosis___discount,lag=1_skewness___paid_value,lag=1_skewness___discount,lag=3_sum___paid_value,lag=3_sum___discount,lag=3_mean___paid_value,lag=3_mean___discount,lag=3_stddev___paid_value,lag=3_stddev___discount,lag=3_min___paid_value,lag=3_min___discount,lag=3_max___paid_value,lag=3_max___discount,lag=3_kurtosis___paid_value,lag=3_kurtosis___discount,lag=3_skewness___paid_value,lag=3_skewness___discount,lag=4_sum___paid_value,lag=4_sum___discount,lag=4_mean___paid_value,lag=4_mean___discount,lag=4_stddev___paid_value,lag=4_stddev___discount,lag=4_min___paid_value,lag=4_min___discount,lag=4_max___paid_value,lag=4_max___discount,lag=4_kurtosis___paid_value,lag=4_kurtosis___discount,lag=4_skewness___paid_value,lag=4_skewness___discount,lag=5_sum___paid_value,lag=5_sum___discount,lag=5_mean___paid_value,lag=5_mean___discount,lag=5_stddev___paid_value,lag=5_stddev___discount,lag=5_min___paid_value,lag=5_min___discount,lag=5_max___paid_value,lag=5_max___discount,lag=5_kurtosis___paid_value,lag=5_kurtosis___discount,lag=5_skewness___paid_value,lag=5_skewness___discount
0,2022-01-01,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
0,2022-02-01,8002.543578036849,55.0,1000.317947254606,6.875,118.5476910585547,8.838834764831844,724.6474816548626,0.0,1092.4934907204822,25.0,1.8855027332009808,0.1335510204081629,-1.764323227323885,1.0496613364713396,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
0,2022-03-01,18183.86450421856,180.0,1010.2146946788088,10.0,88.82691760886172,11.504474832710557,724.6474816548626,0.0,1114.2036899159368,25.0,4.00337819774453,-1.6,-1.8871738944593603,0.447213595499958,8002.543578036849,55.0,1000.317947254606,6.875,118.5476910585547,8.838834764831844,724.6474816548626,0.0,1092.4934907204822,25.0,1.8855027332009808,0.1335510204081629,-1.764323227323885,1.0496613364713396,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
0,2022-04-01,23073.719219865256,250.0,1003.2051834724024,10.869565217391305,91.68364350589783,11.246431142310668,724.6474816548626,0.0,1166.5396308621655,25.0,1.930602398031068,-1.6379455566406262,-1.00759236996,0.3177718469055919,18183.86450421856,180.0,1010.2146946788088,10.0,88.82691760886172,11.504474832710557,724.6474816548626,0.0,1114.2036899159368,25.0,4.00337819774453,-1.6,-1.8871738944593603,0.447213595499958,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
0,2022-05-01,27898.085780020647,335.0,996.3602064293088,11.964285714285714,89.04801902004334,11.331640863356228,783.539838218848,0.0,1169.5751292317634,25.0,0.0157659972115529,-1.7348126758682978,-0.04146256322646...,0.1421119415997587,23073.719219865256,250.0,1003.2051834724024,10.869565217391305,91.68364350589783,11.246431142310668,724.6474816548626,0.0,1166.5396308621655,25.0,1.930602398031068,-1.6379455566406262,-1.00759236996,0.3177718469055919,8002.543578036849,55.0,1000.317947254606,6.875,118.5476910585547,8.838834764831844,724.6474816548626,0.0,1092.4934907204822,25.0,1.8855027332009808,0.1335510204081629,-1.764323227323885,1.0496613364713396,,,,,,,,,,,,,,,,,,,,,,,,,,,,


## increase_rate example

In [24]:
# Combine the numerical statistics with the "increase_rate" suite, which adds
# an increase_rate_* column for every statistic.
features = make_features.run(
    df=df, suites=["numerical_statistics", "increase_rate"], options={}
)

# The column list (and therefore the count) includes the public's key columns.
feature_names = features.columns
print(f"Features created: {feature_names}")
print(f"Number of features: {len(feature_names)}")

Features created: ['consumer_id_ref', 'date_ref', 'sum___paid_value', 'sum___discount', 'mean___paid_value', 'mean___discount', 'stddev___paid_value', 'stddev___discount', 'min___paid_value', 'min___discount', 'max___paid_value', 'max___discount', 'kurtosis___paid_value', 'kurtosis___discount', 'skewness___paid_value', 'skewness___discount', 'increase_rate_sum___paid_value', 'increase_rate_sum___discount', 'increase_rate_mean___paid_value', 'increase_rate_mean___discount', 'increase_rate_stddev___paid_value', 'increase_rate_stddev___discount', 'increase_rate_min___paid_value', 'increase_rate_min___discount', 'increase_rate_max___paid_value', 'increase_rate_max___discount', 'increase_rate_kurtosis___paid_value', 'increase_rate_kurtosis___discount', 'increase_rate_skewness___paid_value', 'increase_rate_skewness___discount']
Number of features: 30


In [25]:
# Preview five rows.
# NOTE(review): increase_rate_* looks like the relative change versus the previous
# reference date (e.g. sum___paid_value 8002.54 -> 18183.86 gives 1.27); confirm.
features.limit(5)

consumer_id_ref,date_ref,sum___paid_value,sum___discount,mean___paid_value,mean___discount,stddev___paid_value,stddev___discount,min___paid_value,min___discount,max___paid_value,max___discount,kurtosis___paid_value,kurtosis___discount,skewness___paid_value,skewness___discount,increase_rate_sum___paid_value,increase_rate_sum___discount,increase_rate_mean___paid_value,increase_rate_mean___discount,increase_rate_stddev___paid_value,increase_rate_stddev___discount,increase_rate_min___paid_value,increase_rate_min___discount,increase_rate_max___paid_value,increase_rate_max___discount,increase_rate_kurtosis___paid_value,increase_rate_kurtosis___discount,increase_rate_skewness___paid_value,increase_rate_skewness___discount
0,2022-01-01,,,,,,,,,,,,,,,,,,,,,,,,,,,,
0,2022-02-01,8002.543578036849,55.0,1000.317947254606,6.875,118.5476910585547,8.838834764831844,724.6474816548626,0.0,1092.4934907204822,25.0,1.8855027332009808,0.1335510204081629,-1.764323227323885,1.0496613364713396,,,,,,,,,,,,,,
0,2022-03-01,18183.86450421856,180.0,1010.2146946788088,10.0,88.82691760886172,11.504474832710557,724.6474816548626,0.0,1114.2036899159368,25.0,4.00337819774453,-1.6,-1.8871738944593603,0.447213595499958,1.272260604006526,2.272727272727273,0.009893601780678211,0.4545454545454545,-0.2507073160540334,0.301582746911937,0.0,,0.019872154278134,0.0,1.123241789709885,-12.980440097799542,0.0696304765662551,-0.5739448715874762
0,2022-04-01,23073.719219865256,250.0,1003.2051834724024,10.869565217391305,91.68364350589783,11.246431142310668,724.6474816548626,0.0,1166.5396308621655,25.0,1.930602398031068,-1.6379455566406262,-1.00759236996,0.3177718469055919,0.2689117439536728,0.3888888888888889,-0.00693863516669...,0.0869565217391304,0.03216058796068272,-0.02242985396136...,0.0,,0.0469716097872351,0.0,-0.5177566788172165,0.0237159729003913,-0.466083982552834,-0.2894405489834404
0,2022-05-01,27898.085780020647,335.0,996.3602064293088,11.964285714285714,89.04801902004334,11.331640863356228,783.539838218848,0.0,1169.5751292317634,25.0,0.0157659972115529,-1.7348126758682978,-0.04146256322646...,0.1421119415997587,0.2090849123275222,0.34,-0.00682310773096...,0.1007142857142856,-0.02874694313042...,0.00757660096499315,0.0812703529024817,,0.0026021390866542,0.0,-0.9918336384396746,0.0591394010838448,-0.9588498638312356,-0.552786242759953
