# Feature Creation

In [1]:
# Standard imports
import numpy as np
import pandas as pd

# Built-in library
import itertools
import re
import json
from pathlib import Path
import typing as tp

import warnings

# Turn every warning into an exception so that it stops execution instead of
# being printed and overlooked.
warnings.filterwarnings("error")

# pandas display settings: raise the row/column limits to 1,000 and the
# per-cell text width to 600 characters before rendered output is truncated.
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

# Black code formatter (Optional)
%load_ext lab_black
# auto reload imports
%load_ext autoreload
%autoreload 2

In [2]:
# Helper function
def load_data(*, filename: Path) -> pd.DataFrame:
    """Load a CSV or Parquet file and return it as a Pandas DF.

    Files whose extension is `.csv` (case-insensitive) are read with
    `pd.read_csv`; everything else is read with `pd.read_parquet`.

    Params:
    -------
    filename (Path): The input filepath. A plain string is accepted as well.

    Returns:
    --------
    data (Pandas DF): The loaded DF.
    """
    # `Path` objects have no `endswith` method, so the reader is selected from
    # the file suffix. Wrapping in `Path` makes this work for both `Path` and
    # `str` inputs; the caller's original value is what gets handed to pandas.
    is_csv = Path(filename).suffix.lower() == ".csv"
    data = pd.read_csv(filename) if is_csv else pd.read_parquet(filename)

    print(f"Data shape: {data.shape}\n")
    return data


def generate_unique_IDs(*, num: int) -> tp.List[str]:
    """Return a list holding `num` freshly generated UUID4 strings."""
    import uuid

    IDs: tp.List[str] = []
    for _ in range(num):
        # Each call to uuid4() yields a new random identifier.
        IDs.append(str(uuid.uuid4()))

    return IDs

In [3]:
import featuretools as ft


# Load the demo data; the result is indexed by table name below.
data = ft.demo.load_mock_customer()
# No join keys are passed, so pandas joins on the column names the frames share.
transactions_df = data["transactions"].merge(data["sessions"]).merge(data["customers"])

# NOTE(review): `sample` is called without `random_state`, so the rows shown
# differ from run to run.
transactions_df.sample(10)

Unnamed: 0,transaction_id,session_id,transaction_time,product_id,amount,customer_id,device,session_start,zip_code,join_date,birthday
264,380,21,2014-01-01 05:14:10,5,57.09,4,desktop,2014-01-01 05:02:15,60091,2011-04-08 20:08:14,2006-08-15
19,244,10,2014-01-01 02:34:55,2,116.95,2,tablet,2014-01-01 02:31:40,13244,2012-04-15 23:31:04,1986-08-18
314,299,6,2014-01-01 01:32:05,4,64.99,1,tablet,2014-01-01 01:23:25,60091,2011-04-17 10:48:33,1994-07-18
290,78,4,2014-01-01 00:54:10,1,37.5,1,mobile,2014-01-01 00:44:25,60091,2011-04-17 10:48:33,1994-07-18
379,457,27,2014-01-01 06:37:35,1,19.16,1,mobile,2014-01-01 06:34:20,60091,2011-04-17 10:48:33,1994-07-18
335,477,9,2014-01-01 02:30:35,3,41.7,1,desktop,2014-01-01 02:15:25,60091,2011-04-17 10:48:33,1994-07-18
293,103,4,2014-01-01 00:57:25,5,20.79,1,mobile,2014-01-01 00:44:25,60091,2011-04-17 10:48:33,1994-07-18
271,390,22,2014-01-01 05:21:45,2,54.83,4,desktop,2014-01-01 05:21:45,60091,2011-04-08 20:08:14,2006-08-15
404,476,29,2014-01-01 07:24:10,4,121.59,1,mobile,2014-01-01 07:10:05,60091,2011-04-17 10:48:33,1994-07-18
179,90,3,2014-01-01 00:35:45,1,75.73,4,mobile,2014-01-01 00:28:10,60091,2011-04-08 20:08:14,2006-08-15


In [4]:
# Add another DF: 2nd DF is a list of products involved in those transactions.
products_df = data["products"]
products_df.head()

Unnamed: 0,product_id,brand
0,1,B
1,2,B
2,3,B
3,4,B
4,5,A


### Create an Entity Set

In [5]:
# Create an entity set
es = ft.EntitySet(id="customer_data")

es

Entityset: customer_data
  DataFrames:
  Relationships:
    No relationships

### Adding dataframes
To get started, we add the transactions dataframe to the EntitySet. In the call to add_dataframe, we specify three important parameters:

* The index parameter specifies the column that uniquely identifies rows in the dataframe.

* The time_index parameter tells Featuretools when the data was created.

* The logical_types parameter indicates that `product_id` should be interpreted as a Categorical column, even though it is just an integer in the underlying data, and that `zip_code` should be interpreted as a PostalCode column.

In [6]:
from woodwork.logical_types import Categorical, PostalCode

# Register the merged transactions table as the first dataframe of the EntitySet.
es = es.add_dataframe(
    dataframe_name="transactions",
    dataframe=transactions_df,
    index="transaction_id",  # column that uniquely identifies each row
    time_index="transaction_time",  # column telling when each row was created
    # Explicit logical types for these two columns, overriding type inference.
    logical_types={
        "product_id": Categorical,
        "zip_code": PostalCode,
    },
)

es

Entityset: customer_data
  DataFrames:
    transactions [Rows: 500, Columns: 11]
  Relationships:
    No relationships

You can also use a setter on the EntitySet object to add dataframes

```python
es["transactions"] = transactions_df

```

es.plot()

In [7]:
# Check the schema/datatypes
es["transactions"].ww.schema

Unnamed: 0_level_0,Logical Type,Semantic Tag(s)
Column,Unnamed: 1_level_1,Unnamed: 2_level_1
transaction_id,Integer,['index']
session_id,Integer,['numeric']
transaction_time,Datetime,['time_index']
product_id,Categorical,['category']
amount,Double,['numeric']
customer_id,Integer,['numeric']
device,Categorical,['category']
session_start,Datetime,[]
zip_code,PostalCode,['category']
join_date,Datetime,[]


In [8]:
# Add products to the EntitySet. `product_id` is the index column that
# uniquely identifies each product; no time index is supplied for this table.
es = es.add_dataframe(
    dataframe_name="products", dataframe=products_df, index="product_id"
)

es

Entityset: customer_data
  DataFrames:
    transactions [Rows: 500, Columns: 11]
    products [Rows: 5, Columns: 2]
  Relationships:
    No relationships

### Adding a Relationship
We want to relate these two dataframes by the columns called “product_id” in each dataframe. Each product has multiple transactions associated with it, so it is called the parent dataframe, while the transactions dataframe is known as the child dataframe. When specifying relationships, we need four parameters: the parent dataframe name, the parent column name, the child dataframe name, and the child column name. Note that each relationship must denote a one-to-many relationship rather than a relationship which is one-to-one or many-to-many.

In [9]:
# Argument order: parent dataframe, parent column, child dataframe, child column.
# `products` is the parent because one product has many transactions.
es = es.add_relationship("products", "product_id", "transactions", "product_id")
es

Entityset: customer_data
  DataFrames:
    transactions [Rows: 500, Columns: 11]
    products [Rows: 5, Columns: 2]
  Relationships:
    transactions.product_id -> products.product_id

### Creating a dataframe from an existing table
When working with raw data, it is common to have sufficient information to justify the creation of new dataframes. In order to create a new dataframe and relationship for sessions, we “normalize” the `transactions` dataframe.

In [10]:
# Split the per-session attributes out of `transactions` into a new `sessions`
# dataframe keyed by `session_id`; the relationship between the two dataframes
# is added as part of the same call.
es = es.normalize_dataframe(
    base_dataframe_name="transactions",
    new_dataframe_name="sessions",
    index="session_id",
    make_time_index="session_start",  # time index of the new `sessions` dataframe
    # Columns that end up in `sessions` rather than `transactions`, since they
    # do not change between the transactions of one session.
    additional_columns=[
        "device",
        "customer_id",
        "zip_code",
        "session_start",
        "join_date",
    ],
)
es

Entityset: customer_data
  DataFrames:
    transactions [Rows: 500, Columns: 6]
    products [Rows: 5, Columns: 2]
    sessions [Rows: 35, Columns: 6]
  Relationships:
    transactions.product_id -> products.product_id
    transactions.session_id -> sessions.session_id

Looking at the output above, we see this method did two operations:

1. It created a new dataframe called “sessions” based on the “session_id” and “session_start” columns in “transactions”

2. It added a relationship connecting “transactions” and “sessions”

If we look at the schema from the transactions dataframe and the new sessions dataframe, we see two more operations that were performed automatically:


In [11]:
es["transactions"].ww.schema

Unnamed: 0_level_0,Logical Type,Semantic Tag(s)
Column,Unnamed: 1_level_1,Unnamed: 2_level_1
transaction_id,Integer,['index']
session_id,Integer,"['foreign_key', 'numeric']"
transaction_time,Datetime,['time_index']
product_id,Categorical,"['category', 'foreign_key']"
amount,Double,['numeric']
birthday,Datetime,[]


In [12]:
es["sessions"].ww.schema

Unnamed: 0_level_0,Logical Type,Semantic Tag(s)
Column,Unnamed: 1_level_1,Unnamed: 2_level_1
session_id,Integer,['index']
device,Categorical,['category']
customer_id,Integer,['numeric']
zip_code,PostalCode,['category']
session_start,Datetime,['time_index']
join_date,Datetime,[]


1. It removed “device”, “customer_id”, “zip_code” and “join_date” from “transactions” and created new columns for them in the sessions dataframe. This reduces redundant information, as those properties of a session don’t change between transactions.

2. It copied and marked “session_start” as a time index column into the new sessions dataframe to indicate the beginning of a session. If the base dataframe has a time index and make_time_index is not set, normalize_dataframe will create a time index for the new dataframe. In this case it would create a new time index called “first_transactions_time” using the time of the first transaction of each session. If we don’t want this time index to be created, we can set make_time_index=False.

If we look at the dataframes, we can see what normalize_dataframe did to the actual data.



In [13]:
es["sessions"].head(5)

Unnamed: 0,session_id,device,customer_id,zip_code,session_start,join_date
1,1,desktop,2,13244,2014-01-01 00:00:00,2012-04-15 23:31:04
2,2,mobile,5,60091,2014-01-01 00:17:20,2010-07-17 05:27:50
3,3,mobile,4,60091,2014-01-01 00:28:10,2011-04-08 20:08:14
4,4,mobile,1,60091,2014-01-01 00:44:25,2011-04-17 10:48:33
5,5,mobile,4,60091,2014-01-01 01:11:30,2011-04-08 20:08:14


In [14]:
es["transactions"].head(5)

Unnamed: 0,transaction_id,session_id,transaction_time,product_id,amount,birthday
298,298,1,2014-01-01 00:00:00,5,127.64,1986-08-18
2,2,1,2014-01-01 00:01:05,2,109.48,1986-08-18
308,308,1,2014-01-01 00:02:10,3,95.06,1986-08-18
116,116,1,2014-01-01 00:03:15,4,78.92,1986-08-18
371,371,1,2014-01-01 00:04:20,3,31.54,1986-08-18


In [15]:
# Repeat the normalization one level up: split the per-customer attributes out
# of `sessions` into a new `customers` dataframe keyed by `customer_id`.
es = es.normalize_dataframe(
    base_dataframe_name="sessions",
    new_dataframe_name="customers",
    index="customer_id",
    make_time_index="join_date",  # time index of the new `customers` dataframe
    additional_columns=["zip_code", "join_date"],  # moved from `sessions` to `customers`
)

es

Entityset: customer_data
  DataFrames:
    transactions [Rows: 500, Columns: 6]
    products [Rows: 5, Columns: 2]
    sessions [Rows: 35, Columns: 4]
    customers [Rows: 5, Columns: 3]
  Relationships:
    transactions.product_id -> products.product_id
    transactions.session_id -> sessions.session_id
    sessions.customer_id -> customers.customer_id

### Using the EntitySet
Finally, we are ready to use this `EntitySet` with any functionality within Featuretools. For example, let’s build a feature matrix for each product in our dataset.

In [16]:
# Run Deep Feature Synthesis without naming any primitives, so the library's
# defaults are used; the resulting matrix has one row per product.
feature_matrix, feature_defs = ft.dfs(entityset=es, target_dataframe_name="products")

feature_matrix

Unnamed: 0_level_0,COUNT(transactions),MAX(transactions.amount),MEAN(transactions.amount),MIN(transactions.amount),SKEW(transactions.amount),STD(transactions.amount),SUM(transactions.amount),MODE(transactions.DAY(birthday)),MODE(transactions.DAY(transaction_time)),MODE(transactions.MONTH(birthday)),MODE(transactions.MONTH(transaction_time)),MODE(transactions.WEEKDAY(birthday)),MODE(transactions.WEEKDAY(transaction_time)),MODE(transactions.YEAR(birthday)),MODE(transactions.YEAR(transaction_time)),MODE(transactions.sessions.device),NUM_UNIQUE(transactions.DAY(birthday)),NUM_UNIQUE(transactions.DAY(transaction_time)),NUM_UNIQUE(transactions.MONTH(birthday)),NUM_UNIQUE(transactions.MONTH(transaction_time)),NUM_UNIQUE(transactions.WEEKDAY(birthday)),NUM_UNIQUE(transactions.WEEKDAY(transaction_time)),NUM_UNIQUE(transactions.YEAR(birthday)),NUM_UNIQUE(transactions.YEAR(transaction_time)),NUM_UNIQUE(transactions.sessions.device)
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
1,102,149.56,73.429314,6.84,0.125525,42.479989,7489.79,18,1,7,1,0,2,1994,2014,desktop,4,1,3,1,4,1,5,1,3
2,92,149.95,76.319891,5.73,0.151934,46.336308,7021.43,18,1,8,1,0,2,2006,2014,desktop,4,1,3,1,4,1,5,1,3
3,96,148.31,73.00125,5.89,0.223938,38.871405,7008.12,18,1,8,1,0,2,2006,2014,desktop,4,1,3,1,4,1,5,1,3
4,106,146.46,76.311038,5.81,-0.132077,42.492501,8088.97,18,1,7,1,0,2,1994,2014,desktop,4,1,3,1,4,1,5,1,3
5,104,149.02,76.264904,5.91,0.098248,42.131902,7931.55,18,1,7,1,0,2,1994,2014,mobile,4,1,3,1,4,1,5,1,3


### Running DFS
Typically, without automated feature engineering, a data scientist would write code to aggregate data for a customer, and apply different statistical functions resulting in features quantifying the customer’s behavior. 

An expert might be interested in features such as: total number of sessions or month the customer signed up.

These features can be generated by DFS when we specify `target_dataframe_name` as `"customers"` and pass `"count"` and `"month"` as primitives.

In [18]:
# One row per customer, restricted to a single aggregation ("count") and a
# single transform ("month"), with no stacking of primitives (max_depth=1).
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_dataframe_name="customers",
    agg_primitives=["count"],
    trans_primitives=["month"],
    max_depth=1,
)
feature_matrix

Unnamed: 0_level_0,zip_code,COUNT(sessions),MONTH(join_date)
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5,60091,6,7
4,60091,8,4
1,60091,8,4
3,13244,6,8
2,13244,7,4


In the example above, "count" is an aggregation primitive because it computes a single value based on many sessions related to one customer. "month" is called a transform primitive because it takes one value for a customer and transforms it into another.

### Creating “Deep Features”
The name Deep Feature Synthesis comes from the algorithm’s ability to stack primitives to generate more complex features. Each time we stack a primitive we increase the “depth” of a feature. The max_depth parameter controls the maximum depth of the features returned by DFS. Let us try running DFS with max_depth=2

In [19]:
# Same target as before, but max_depth=2 lets primitives stack, producing
# features such as MEAN(sessions.SUM(transactions.amount)).
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_dataframe_name="customers",
    agg_primitives=["mean", "sum", "mode"],
    trans_primitives=["month", "hour"],
    max_depth=2,
)
feature_matrix

Unnamed: 0_level_0,zip_code,MODE(sessions.device),MEAN(transactions.amount),MODE(transactions.product_id),SUM(transactions.amount),HOUR(join_date),MONTH(join_date),MEAN(sessions.MEAN(transactions.amount)),MEAN(sessions.SUM(transactions.amount)),MODE(sessions.HOUR(session_start)),MODE(sessions.MODE(transactions.product_id)),MODE(sessions.MONTH(session_start)),SUM(sessions.MEAN(transactions.amount)),MODE(transactions.sessions.device)
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
5,60091,mobile,80.375443,5,6349.66,5,7,78.705187,1058.276667,0,3,1,472.231119,mobile
4,60091,mobile,80.070459,2,8727.68,20,4,81.207189,1090.96,1,1,1,649.657515,mobile
1,60091,mobile,71.631905,4,9025.62,10,4,72.77414,1128.2025,6,4,1,582.193117,mobile
3,13244,desktop,67.06043,1,6236.62,15,8,67.539577,1039.436667,5,1,1,405.237462,desktop
2,13244,desktop,77.422366,4,7200.28,23,4,78.415122,1028.611429,3,3,1,548.905851,desktop


In [27]:
feature_matrix[["MEAN(sessions.SUM(transactions.amount))"]]

Unnamed: 0_level_0,MEAN(sessions.SUM(transactions.amount))
customer_id,Unnamed: 1_level_1
5,1058.276667
4,1090.96
1,1128.2025
3,1039.436667
2,1028.611429


With a depth of 2, a number of features are generated using the supplied primitives. The algorithm to synthesize these definitions is described in this [paper](https://www.jmaxkanter.com/papers/DSAA_DSM_2015.pdf). In the returned feature matrix, let us understand one of the depth 2 features.


### Explanation

For each customer this feature

1. calculates the `sum` of all transaction amounts per session to get total amount per session,

2. then applies the `mean` to the total amounts across multiple sessions to identify the `average` amount spent per session

We call this feature a `“deep feature”` with a depth of 2.

In [26]:
feature_matrix[["MODE(sessions.HOUR(session_start))"]]

Unnamed: 0_level_0,MODE(sessions.HOUR(session_start))
customer_id,Unnamed: 1_level_1
5,0
4,1
1,6
3,5
2,3


### Explanation

For each customer this feature calculates

1. The hour of the day each customer's session started, then

2. uses the statistical function `mode` to identify the most common hour he or she started a session

Stacking results in features that are more expressive than individual primitives themselves. This enables the automatic creation of complex patterns for machine learning.

In [None]:
##

### Changing Target DataFrame
DFS is powerful because we can create a feature matrix for any dataframe in our dataset. If we switch our target dataframe to `“sessions”`, we can synthesize features for each session instead of each customer. Now, we can use these features to predict the outcome of a session.

In [28]:
# Identical primitives and depth, but targeting `sessions`: the matrix now has
# one row per session instead of one row per customer.
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_dataframe_name="sessions",
    agg_primitives=["mean", "sum", "mode"],
    trans_primitives=["month", "hour"],
    max_depth=2,
)
feature_matrix.head(5)

Unnamed: 0_level_0,device,customer_id,MEAN(transactions.amount),MODE(transactions.product_id),SUM(transactions.amount),HOUR(session_start),MONTH(session_start),customers.zip_code,MODE(transactions.HOUR(birthday)),MODE(transactions.HOUR(transaction_time)),MODE(transactions.MONTH(birthday)),MODE(transactions.MONTH(transaction_time)),customers.MODE(sessions.device),customers.MEAN(transactions.amount),customers.MODE(transactions.product_id),customers.SUM(transactions.amount),customers.HOUR(join_date),customers.MONTH(join_date)
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,desktop,2,76.813125,3,1229.01,0,1,13244,0,0,8,1,desktop,77.422366,4,7200.28,23,4
2,mobile,5,74.696,5,746.96,0,1,60091,0,0,7,1,mobile,80.375443,5,6349.66,5,7
3,mobile,4,88.6,1,1329.0,0,1,60091,0,0,8,1,mobile,80.070459,2,8727.68,20,4
4,mobile,1,64.5572,5,1613.93,0,1,60091,0,0,7,1,mobile,71.631905,4,9025.62,10,4
5,mobile,4,70.638182,5,777.02,1,1,60091,0,1,8,1,mobile,80.070459,2,8727.68,20,4


In [29]:
# DFS also builds deep features based on a parent dataframe,
# in this case the `customer` of a particular `session`. For example,
# the feature below calculates the mean transaction amount of the customer of the session.
feature_matrix[["customers.MEAN(transactions.amount)"]].head(5)

Unnamed: 0_level_0,customers.MEAN(transactions.amount)
session_id,Unnamed: 1_level_1
1,77.422366
2,80.375443
3,80.070459
4,71.631905
5,80.070459


### Feature Primitives
Feature primitives are the building blocks of Featuretools. They define individual computations that can be applied to raw datasets to create new features. Because a primitive only constrains the input and output data types, they can be applied across datasets and can stack to create new calculations.

Why primitives?
The space of potential functions that humans use to create a feature is expansive. By breaking common feature engineering calculations down into primitive components, we are able to capture the underlying structure of the features humans create today.

A primitive only constrains the input and output data types. This means they can be used to transfer calculations known in one domain to another. Consider a feature which is often calculated by data scientists for transactional or event logs data: `average time between events`. This feature is incredibly valuable in predicting fraudulent behavior or future customer engagement.

DFS achieves the same feature by stacking two primitives `"time_since_previous"` and `"mean"`

In [30]:
# Rebind `es` to the ready-made mock-customer EntitySet, replacing the one
# assembled step by step earlier in the notebook.
es = ft.demo.load_mock_customer(return_entityset=True)

# With features_only=True only the feature definitions come back, hence the
# single assignment target.
feature_defs = ft.dfs(
    entityset=es,
    target_dataframe_name="customers",
    agg_primitives=["mean"],
    trans_primitives=["time_since_previous"],
    features_only=True,  # return the list of features without calculating the feature matrix
)

feature_defs

[<Feature: zip_code>,
 <Feature: MEAN(transactions.amount)>,
 <Feature: TIME_SINCE_PREVIOUS(join_date)>,
 <Feature: MEAN(sessions.MEAN(transactions.amount))>,
 <Feature: MEAN(sessions.TIME_SINCE_PREVIOUS(session_start))>]

### Note: 

The primitive arguments to DFS (eg. agg_primitives and trans_primitives in the example above) accept `snake_case`, `camelCase`, or `TitleCase` strings of included Featuretools primitives (ie. time_since_previous, timeSincePrevious, and TimeSincePrevious are all acceptable inputs).

In [31]:
# Compute the full matrix this time, stacking five aggregations on top of the
# `time_since_previous` transform.
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_dataframe_name="customers",
    agg_primitives=["mean", "max", "min", "std", "skew"],
    trans_primitives=["time_since_previous"],
)

# Display only the stacked features that summarise the gaps between a
# customer's consecutive session starts.
feature_matrix[
    [
        "MEAN(sessions.TIME_SINCE_PREVIOUS(session_start))",
        "MAX(sessions.TIME_SINCE_PREVIOUS(session_start))",
        "MIN(sessions.TIME_SINCE_PREVIOUS(session_start))",
        "STD(sessions.TIME_SINCE_PREVIOUS(session_start))",
        "SKEW(sessions.TIME_SINCE_PREVIOUS(session_start))",
    ]
]

Unnamed: 0_level_0,MEAN(sessions.TIME_SINCE_PREVIOUS(session_start)),MAX(sessions.TIME_SINCE_PREVIOUS(session_start)),MIN(sessions.TIME_SINCE_PREVIOUS(session_start)),STD(sessions.TIME_SINCE_PREVIOUS(session_start)),SKEW(sessions.TIME_SINCE_PREVIOUS(session_start))
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
5,1007.5,1170.0,715.0,157.884451,-1.507217
4,999.375,1625.0,650.0,308.688904,1.065177
1,966.875,1170.0,715.0,171.754341,-0.254557
3,888.333333,1170.0,650.0,177.613813,0.434581
2,725.833333,975.0,520.0,194.638554,0.162631


### Aggregation vs Transform Primitive
In the example above, we use two types of primitives.

**Aggregation primitives**: These primitives take `related instances` as an input and output a single value. They are applied across a parent-child relationship in an EntitySet. E.g: `"count"`, `"sum"`, `"avg_time_between"`.

**Transform primitives**: These primitives take `one or more columns` from a dataframe as an input and output a new column for that dataframe. They are applied to a single dataframe. E.g: `"hour"`, `"time_since_previous"`, `"absolute"`.

For a DataFrame that lists and describes each built-in primitive in Featuretools, call:


```python
# list and describe each built-in primitive in Featuretools
ft.list_primitives()
```

In [34]:
ft.list_primitives().head(10)

Unnamed: 0,name,type,dask_compatible,spark_compatible,description,valid_inputs,return_type
0,n_most_common,aggregation,False,False,Determines the `n` most common elements.,<ColumnSchema (Semantic Tags = ['category'])>,
1,time_since_first,aggregation,False,False,Calculates the time elapsed since the first datetime (in seconds).,<ColumnSchema (Logical Type = Datetime) (Semantic Tags = ['time_index'])>,<ColumnSchema (Logical Type = Double) (Semantic Tags = ['numeric'])>
2,mode,aggregation,False,False,Determines the most commonly repeated value.,<ColumnSchema (Semantic Tags = ['category'])>,
3,max,aggregation,True,True,"Calculates the highest value, ignoring `NaN` values.",<ColumnSchema (Semantic Tags = ['numeric'])>,<ColumnSchema (Semantic Tags = ['numeric'])>
4,median,aggregation,False,False,Determines the middlemost number in a list of values.,<ColumnSchema (Semantic Tags = ['numeric'])>,<ColumnSchema (Semantic Tags = ['numeric'])>
5,count_below_mean,aggregation,False,False,Determines the number of values that are below the mean.,<ColumnSchema (Semantic Tags = ['numeric'])>,<ColumnSchema (Logical Type = IntegerNullable) (Semantic Tags = ['numeric'])>
6,count_above_mean,aggregation,False,False,Calculates the number of values that are above the mean.,<ColumnSchema (Semantic Tags = ['numeric'])>,<ColumnSchema (Logical Type = IntegerNullable) (Semantic Tags = ['numeric'])>
7,mean,aggregation,True,True,Computes the average for a list of values.,<ColumnSchema (Semantic Tags = ['numeric'])>,<ColumnSchema (Semantic Tags = ['numeric'])>
8,min,aggregation,True,True,"Calculates the smallest value, ignoring `NaN` values.",<ColumnSchema (Semantic Tags = ['numeric'])>,<ColumnSchema (Semantic Tags = ['numeric'])>
9,percent_true,aggregation,True,False,Determines the percent of `True` values.,"<ColumnSchema (Logical Type = BooleanNullable)>, <ColumnSchema (Logical Type = Boolean)>",<ColumnSchema (Logical Type = Double) (Semantic Tags = ['numeric'])>


### Defining Custom Primitives

The library of primitives in Featuretools is constantly expanding. Users can define their own primitive using the APIs below. To define a primitive, a user will

1. Specify the type of primitive Aggregation or Transform

2. Define the input and output data types

3. Write a function in python to do the calculation

4. Annotate with attributes to constrain how it is applied

Once a primitive is defined, it can stack with existing primitives to generate complex patterns. This enables primitives known to be important for one domain to be automatically transferred to another.

In [35]:
from featuretools.primitives import AggregationPrimitive, TransformPrimitive
from featuretools.tests.testing_utils import make_ecommerce_entityset
from woodwork.column_schema import ColumnSchema
from woodwork.logical_types import Datetime, NaturalLanguage
import pandas as pd

### Simple Custom Primitives

In [48]:
class Absolute(TransformPrimitive):
    """Transform primitive that takes the absolute value of a numeric column."""

    name = "absolute"
    input_types = [ColumnSchema(semantic_tags={"numeric"})]
    return_type = ColumnSchema(semantic_tags={"numeric"})

    def get_function(self):
        """Return the callable that is applied to the input column."""

        def absolute(column):
            magnitudes = abs(column)
            return magnitudes

        return absolute


class Maximum(AggregationPrimitive):
    """Aggregation primitive that returns the largest value of a numeric
    column, ignoring missing values."""

    name = "maximum"
    input_types = [ColumnSchema(semantic_tags={"numeric"})]
    return_type = ColumnSchema(semantic_tags={"numeric"})

    def get_function(self):
        """Return the callable that reduces a group of values to its maximum."""

        def maximum(column):
            # The builtin `max` compares element by element, and any comparison
            # against NaN is False, so its result depends on where the NaNs
            # sit: max([nan, 1.0]) is nan while max([1.0, nan]) is 1.0.
            # The pandas reduction skips missing values instead.
            return pd.Series(column).max()

        return maximum


class WordCount(TransformPrimitive):
    """
    Counts the number of whitespace-separated words in each row of the column.

    Returns a list with one entry per row. Rows whose value is not a string
    (for example a missing value) yield NaN instead of raising.
    """

    name = "word_count"
    input_types = [ColumnSchema(logical_type=NaturalLanguage)]
    return_type = ColumnSchema(semantic_tags={"numeric"})

    def get_function(self):
        """Return the callable that is applied to the input column."""

        def word_count(column):
            missing = float("nan")
            # str.split() with no separator splits on runs of whitespace.
            # Non-string rows have no .split(), so they are reported as NaN
            # rather than raising AttributeError.
            return [
                len(value.split()) if isinstance(value, str) else missing
                for value in column
            ]

        return word_count

Because `Maximum` is an aggregation primitive, its function takes in a list of values but returns only one.

Now that we’ve defined these custom primitives, we can use them with the `dfs` function as if they were built-in primitives.

In [39]:
# Run Deep Feature Synthesis on the "sessions" dataframe, using only the
# custom Maximum (aggregation) and Absolute (transform) primitives.
feature_matrix, feature_defs = ft.dfs(
    target_dataframe_name="sessions",
    entityset=es,
    trans_primitives=[Absolute],
    agg_primitives=[Maximum],
    max_depth=2,
)

# Show the two features built from the custom primitives for the first 5 rows.
feature_matrix.loc[
    :,
    [
        "customers.MAXIMUM(transactions.amount)",
        "MAXIMUM(transactions.ABSOLUTE(amount))",
    ],
].head(5)

Unnamed: 0_level_0,customers.MAXIMUM(transactions.amount),MAXIMUM(transactions.ABSOLUTE(amount))
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,146.81,141.66
2,149.02,135.25
3,149.95,147.73
4,139.43,129.0
5,149.95,139.2


#### Word Count Example

In [53]:
# Word Count Example
# Rebinds `es` to the entityset returned by make_ecommerce_entityset();
# every later cell that references `es` uses this object.
es = make_ecommerce_entityset()
# Last expression: renders the entityset summary (dataframes and relationships).
es

Entityset: ecommerce
  DataFrames:
    régions [Rows: 2, Columns: 2]
    stores [Rows: 6, Columns: 3]
    products [Rows: 6, Columns: 4]
    customers [Rows: 3, Columns: 15]
    sessions [Rows: 6, Columns: 6]
    log [Rows: 17, Columns: 17]
    cohorts [Rows: 2, Columns: 3]
  Relationships:
    customers.cohort -> cohorts.cohort
    customers.région_id -> régions.id
    stores.région_id -> régions.id
    sessions.customer_id -> customers.id
    log.session_id -> sessions.id
    log.product_id -> products.id

In [54]:
# Display the "customers" dataframe of the entityset.
es["customers"]

Unnamed: 0,id,age,région_id,cohort,loves_ice_cream,favorite_quote,signup_date,upgrade_date,cancel_date,cancel_reason,engagement_level,full_name,email,phone_number,birthday
2,2,56,United States,0,True,All members of the working classes must seize the means of production.,2011-04-06,2011-04-07,2012-01-06,reason_1,2,James Brown,team@featuretools.com,1-(555)-555-5555,1993-04-20
0,0,33,United States,0,True,The proletariat have nothing to lose but their chains,2011-04-08,2011-04-10,2011-06-08,reason_1,1,Mr. John Doe,john.smith@example.com,555-555-5555,1993-03-08
1,1,25,United States,1,False,Capitalism deprives us all of self-determination,2011-04-09,2011-04-11,2011-10-09,reason_2,3,"Doe, Mrs. Jane",,555-555-5555,1926-08-02


In [47]:
# Run Deep Feature Synthesis on "sessions", combining the built-in
# sum/mean/std aggregations with the custom WordCount transform.
feature_matrix, features = ft.dfs(
    target_dataframe_name="sessions",
    entityset=es,
    trans_primitives=[WordCount],
    agg_primitives=["sum", "mean", "std"],
)

# Keep only the features that involve WORD_COUNT.
feature_matrix.loc[
    :,
    [
        "customers.WORD_COUNT(favorite_quote)",
        "STD(log.WORD_COUNT(comments))",
        "SUM(log.WORD_COUNT(comments))",
        "MEAN(log.WORD_COUNT(comments))",
    ],
]

Unnamed: 0_level_0,customers.WORD_COUNT(favorite_quote),STD(log.WORD_COUNT(comments)),SUM(log.WORD_COUNT(comments)),MEAN(log.WORD_COUNT(comments))
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,9.0,540.43686,2500.0,500.0
1,9.0,583.70255,1732.0,433.0
2,9.0,,246.0,246.0
3,6.0,883.883476,1256.0,628.0
4,6.0,0.0,9.0,3.0
5,12.0,19.79899,68.0,34.0


In [63]:
# Display every row and column of whatever `feature_matrix` is currently bound to.
feature_matrix

Unnamed: 0_level_0,customer_id,device_type,device_name,MEAN(log.value),MEAN(log.value_2),MEAN(log.value_many_nans),STD(log.value),STD(log.value_2),STD(log.value_many_nans),SUM(log.value),SUM(log.value_2),SUM(log.value_many_nans),customers.age,customers.région_id,customers.cohort,customers.loves_ice_cream,customers.cancel_reason,customers.engagement_level,MEAN(log.WORD_COUNT(comments)),MEAN(log.products.rating),STD(log.WORD_COUNT(comments)),STD(log.products.rating),SUM(log.WORD_COUNT(comments)),SUM(log.products.rating),customers.MEAN(log.value),customers.MEAN(log.value_2),customers.MEAN(log.value_many_nans),customers.STD(log.value),customers.STD(log.value_2),customers.STD(log.value_many_nans),customers.SUM(log.value),customers.SUM(log.value_2),customers.SUM(log.value_many_nans),customers.WORD_COUNT(favorite_quote),customers.cohorts.cohort_name,customers.régions.language
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1
0,0,0,PC,10.0,4.0,,7.905694,3.162278,,50.0,20.0,0.0,33,United States,0,True,reason_1,1,500.0,4.6,540.43686,0.547723,2500.0,23.0,5.6,2.6,1.2,7.07421,2.716207,1.30384,56.0,26.0,6.0,9.0,Early Adopters,en
1,0,1,Mobile,1.5,1.5,1.5,1.290994,1.290994,1.290994,6.0,6.0,6.0,33,United States,0,True,reason_1,1,433.0,3.75,583.70255,1.5,1732.0,15.0,5.6,2.6,1.2,7.07421,2.716207,1.30384,56.0,26.0,6.0,9.0,Early Adopters,en
2,0,1,Mobile,0.0,0.0,0.0,,,,0.0,0.0,0.0,33,United States,0,True,reason_1,1,246.0,1.5,,,246.0,1.5,5.6,2.6,1.2,7.07421,2.716207,1.30384,56.0,26.0,6.0,9.0,Early Adopters,en
3,1,0,PC,2.5,1.0,,3.535534,1.414214,,5.0,2.0,0.0,25,United States,1,False,reason_2,3,628.0,4.25,883.883476,1.06066,1256.0,8.5,5.2,2.2,3.0,5.80517,2.48998,3.0,26.0,11.0,9.0,6.0,Late Adopters,en
4,1,0,PC,7.0,3.0,3.0,7.0,3.0,3.0,21.0,9.0,9.0,25,United States,1,False,reason_2,3,3.0,5.0,0.0,0.0,9.0,15.0,5.2,2.2,3.0,5.80517,2.48998,3.0,26.0,11.0,9.0,6.0,Late Adopters,en
5,2,1,Mobile,,,,,,,0.0,0.0,0.0,56,United States,0,True,reason_1,2,34.0,5.0,19.79899,0.0,68.0,10.0,,,,,,,0.0,0.0,0.0,12.0,Early Adopters,en


In [55]:
# Preview the first rows of the "log" dataframe.
es["log"].head()

Unnamed: 0,id,session_id,product_id,datetime,value,value_2,latlong,latlong2,zipcode,countrycode,subregioncode,value_many_nans,priority_level,purchased,url,email_address,comments
0,0,0,coke zero,2011-04-09 10:30:00,0.0,0.0,"(0.0, 0.0)","(0.0, 0.0)",2116,US,US-AZ,,0,True,https://www.featuretools.com/,john.smith@example.com,"When it comes to Coca-Cola products, people tend to be die-hard fans. Many of us know someone who can't go a day without a Diet Coke (or two or three). And while Diet Coke has been a leading sugar-free soft drink since it was first released in 1982, it came to light that young adult males shied away from this beverage — identifying diet cola as a woman's drink. The company's answer to that predicament came in 2005 - in the form of a shiny black can - with the release of Coca-Cola Zero. While Diet Coke was created with its own flavor profile and not as a sugar-free version of the original..."
1,1,0,coke zero,2011-04-09 10:30:06,5.0,2.0,"(5.0, 2.0)","(2.0, -5.0)",2116,US,US-AZ,,0,True,https://www.featuretools.com/,,I loved it
2,2,0,coke zero,2011-04-09 10:30:12,10.0,4.0,"(10.0, 4.0)","(4.0, -10.0)",2116,US,US-AZ,,1,True,amazon.com,team@featuretools.com,I loved it
3,3,0,car,2011-04-09 10:30:18,15.0,6.0,"(15.0, 6.0)","(6.0, -15.0)",2116,US,US-AZ,,1,True,amazon.com,john.smith@example.com,"The full-size pickup truck and the V-8 engine were supposed to be inseparable, like the internet and cat videos. You can’t have one without the other—or so we thought. In America’s most popular vehicle, the Ford F-150, two turbocharged six-cylinder engines marketed under the EcoBoost name have dethroned the naturally aspirated V-8. Ford’s new 2.7-liter twin-turbo V-6 is the popular choice, while the 3.5-liter twin-turbo V-6 is the top performer. The larger six allows for greater hauling capacity, accelerates the truck more quickly, and swills less gas in EPA testing than the V-8 alternat..."
4,4,0,car,2011-04-09 10:30:24,20.0,8.0,"(20.0, 8.0)","(8.0, -20.0)",2116,US,US-AZ,,1,True,www.featuretools.com,,"THE GOOD The Tesla Model S 90D's electric drivetrain is substantially more efficient than any internal combustion engine, and gives the car smooth and quick acceleration. All-wheel drive comes courtesy of a smart dual motor system. The new Autopilot feature eases the stress of stop-and-go traffic and long road trips. THE BAD Even at Tesla's Supercharger stations, recharging the battery takes significantly longer than refilling an internal combustion engine car's gas tank, limiting where you can drive. Tesla hasn't improved its infotainment system much from the Model S' launch. THE BOTTO..."


In [64]:
# Re-run Deep Feature Synthesis with "log" as the target dataframe; the
# returned feature definitions are discarded into `_`.
feature_matrix, _ = ft.dfs(
    target_dataframe_name="log",
    entityset=es,
    trans_primitives=[WordCount],
    agg_primitives=["sum", "mean", "std"],
)

# Last expression: render the resulting feature matrix.
feature_matrix

Unnamed: 0_level_0,session_id,product_id,value,value_2,zipcode,countrycode,subregioncode,value_many_nans,priority_level,purchased,WORD_COUNT(comments),sessions.customer_id,sessions.device_type,sessions.device_name,products.department,products.rating,sessions.MEAN(log.value),sessions.MEAN(log.value_2),sessions.MEAN(log.value_many_nans),sessions.STD(log.value),sessions.STD(log.value_2),sessions.STD(log.value_many_nans),sessions.SUM(log.value),sessions.SUM(log.value_2),sessions.SUM(log.value_many_nans),sessions.customers.age,sessions.customers.région_id,sessions.customers.cohort,sessions.customers.loves_ice_cream,sessions.customers.cancel_reason,sessions.customers.engagement_level,products.MEAN(log.value),products.MEAN(log.value_2),products.MEAN(log.value_many_nans),products.STD(log.value),products.STD(log.value_2),products.STD(log.value_many_nans),products.SUM(log.value),products.SUM(log.value_2),products.SUM(log.value_many_nans)
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1
0,0,coke zero,0.0,0.0,02116,US,US-AZ,,0,True,535.0,0,0,PC,food,5.0,10.0,4.0,,7.905694,3.162278,,50.0,20.0,0.0,33,United States,0,True,reason_1,1,5.857143,2.428571,3.0,5.080307,2.149197,3.0,41.0,17.0,9.0
1,0,coke zero,5.0,2.0,02116,US,US-AZ,,0,True,3.0,0,0,PC,food,5.0,10.0,4.0,,7.905694,3.162278,,50.0,20.0,0.0,33,United States,0,True,reason_1,1,5.857143,2.428571,3.0,5.080307,2.149197,3.0,41.0,17.0,9.0
2,0,coke zero,10.0,4.0,02116,US,US-AZ,,1,True,3.0,0,0,PC,food,5.0,10.0,4.0,,7.905694,3.162278,,50.0,20.0,0.0,33,United States,0,True,reason_1,1,5.857143,2.428571,3.0,5.080307,2.149197,3.0,41.0,17.0,9.0
3,0,car,15.0,6.0,02116,US,US-AZ,,1,True,653.0,0,0,PC,electronics,4.0,10.0,4.0,,7.905694,3.162278,,50.0,20.0,0.0,33,United States,0,True,reason_1,1,17.5,7.0,,3.535534,1.414214,,35.0,14.0,0.0
4,0,car,20.0,8.0,02116,US,US-AZ,,1,True,1306.0,0,0,PC,electronics,4.0,10.0,4.0,,7.905694,3.162278,,50.0,20.0,0.0,33,United States,0,True,reason_1,1,17.5,7.0,,3.535534,1.414214,,35.0,14.0,0.0
5,1,toothpaste,0.0,0.0,02116-3899,AL,US-MT,0.0,1,True,1306.0,0,1,Mobile,health,4.5,1.5,1.5,1.5,1.290994,1.290994,1.290994,6.0,6.0,6.0,33,United States,0,True,reason_1,1,1.0,1.0,1.0,1.0,1.0,1.0,3.0,3.0,3.0
6,1,toothpaste,1.0,1.0,02116-3899,AL,US-MT,1.0,1,True,174.0,0,1,Mobile,health,4.5,1.5,1.5,1.5,1.290994,1.290994,1.290994,6.0,6.0,6.0,33,United States,0,True,reason_1,1,1.0,1.0,1.0,1.0,1.0,1.0,3.0,3.0,3.0
7,1,toothpaste,2.0,2.0,02116-3899,AL,US-MT,2.0,0,True,173.0,0,1,Mobile,health,4.5,1.5,1.5,1.5,1.290994,1.290994,1.290994,6.0,6.0,6.0,33,United States,0,True,reason_1,1,1.0,1.0,1.0,1.0,1.0,1.0,3.0,3.0,3.0
8,1,brown bag,3.0,3.0,02116-3899,AL,US-MT,3.0,0,True,79.0,0,1,Mobile,food,1.5,1.5,1.5,1.5,1.290994,1.290994,1.290994,6.0,6.0,6.0,33,United States,0,True,reason_1,1,1.5,1.5,1.5,2.12132,2.12132,2.12132,3.0,3.0,3.0
9,2,brown bag,0.0,0.0,0,,,0.0,0,True,246.0,0,1,Mobile,food,1.5,0.0,0.0,0.0,,,,0.0,0.0,0.0,33,United States,0,True,reason_1,1,1.5,1.5,1.5,2.12132,2.12132,2.12132,3.0,3.0,3.0
