# Featuretools の公式ドキュメント
- 公式ページ : https://docs.featuretools.com/index.html

## What is Featuretools?

In [1]:
import featuretools as ft

In [2]:
data = ft.demo.load_mock_customer()
customers_df = data["customers"]
sessions_df = data["sessions"]
transactions_df = data["transactions"]
print(customers_df.shape)
print(sessions_df.shape)
print(transactions_df.shape)

(5, 4)
(35, 4)
(500, 5)


featuretoolsでは、3つの表はそれぞれ**entity** と呼ばれる

In [3]:
customers_df.head(2)

Unnamed: 0,customer_id,zip_code,join_date,date_of_birth
0,1,60091,2011-04-17 10:48:33,1994-07-18
1,2,13244,2012-04-15 23:31:04,1986-08-18


In [4]:
sessions_df.head(2)

Unnamed: 0,session_id,customer_id,device,session_start
0,1,2,desktop,2014-01-01 00:00:00
1,2,5,mobile,2014-01-01 00:17:20


In [5]:
transactions_df.head(2)

Unnamed: 0,transaction_id,session_id,transaction_time,product_id,amount
0,298,1,2014-01-01 00:00:00,5,127.64
1,2,1,2014-01-01 00:01:05,2,109.48


entityの集合を登録する

In [6]:
entities = {
    "customers" : (customers_df, "customer_id"),
    "sessions" : (sessions_df, "session_id", "session_start"),
    "transactions" : (transactions_df, "transaction_id", "transaction_time")
}

entityの中で親と子の関係を作る　**(parent_entity, parent_variable, child_entity, child_variable)** の順

In [7]:
relationships = [
    ("sessions", "session_id", "transactions", "session_id"),
    ("customers", "customer_id", "sessions", "customer_id")
]

**entityの集合**、**関係**、**特徴を計算したいtarget = customers**の3つの引数をとって、特徴ベクトルを計算する

In [8]:
feature_matrix_customers, features_defs = ft.dfs(
    entities=entities, 
    relationships=relationships,
    target_entity="customers"
)
print(feature_matrix_customers.shape)

(5, 73)


In [9]:
feature_matrix_customers.head(2)

Unnamed: 0_level_0,zip_code,MONTH(join_date),DAY(join_date),STD(transactions.amount),MODE(sessions.device),MONTH(date_of_birth),SUM(transactions.amount),YEAR(join_date),YEAR(date_of_birth),WEEKDAY(join_date),...,MEAN(sessions.SUM(transactions.amount)),MEAN(sessions.MEAN(transactions.amount)),SKEW(sessions.COUNT(transactions)),SUM(sessions.MEAN(transactions.amount)),MAX(sessions.STD(transactions.amount)),MEAN(sessions.COUNT(transactions)),STD(sessions.MAX(transactions.amount)),SUM(sessions.NUM_UNIQUE(transactions.product_id)),MIN(sessions.NUM_UNIQUE(transactions.product_id)),NUM_UNIQUE(sessions.MONTH(session_start))
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60091,4,17,40.442059,mobile,7,9025.62,2011,1994,6,...,1128.2025,72.77414,1.946018,582.193117,46.905665,15.75,7.322191,40,5,1
2,13244,4,15,37.705178,desktop,8,7200.28,2012,1986,6,...,1028.611429,78.415122,-0.303276,548.905851,47.93592,13.285714,17.221593,35,5,1


In [10]:
features_defs[0:3]

[<Feature: zip_code>, <Feature: MONTH(join_date)>, <Feature: DAY(join_date)>]

targetを**sessions**に変えてみる

In [11]:
feature_matrix_sessions, features_defs = ft.dfs(
    entities=entities,
    relationships=relationships,
    target_entity="sessions"
)
print(feature_matrix_sessions.shape)

(35, 44)


In [12]:
feature_matrix_sessions.head(2)

Unnamed: 0_level_0,device,customer_id,MIN(transactions.amount),NUM_UNIQUE(transactions.product_id),MAX(transactions.amount),customers.zip_code,YEAR(session_start),WEEKDAY(session_start),MEAN(transactions.amount),DAY(session_start),...,customers.MIN(transactions.amount),customers.COUNT(sessions),customers.MODE(sessions.device),customers.SUM(transactions.amount),customers.YEAR(date_of_birth),customers.DAY(date_of_birth),customers.WEEKDAY(date_of_birth),customers.NUM_UNIQUE(transactions.product_id),customers.MONTH(date_of_birth),NUM_UNIQUE(transactions.WEEKDAY(transaction_time))
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,desktop,2,20.91,5,141.66,13244,2014,2,76.813125,1,...,8.73,7,desktop,7200.28,1986,18,0,5,8,1
2,mobile,5,9.32,5,135.25,60091,2014,2,74.696,1,...,7.55,6,mobile,6349.66,1984,28,5,5,7,1


## Representing Data with EntitySets


entityとrelationのcollectionである**EntitySet**を定義することで、より簡単にデータの特徴エンジニアリングができるので、featuretoolsではまずこれを定義することを推奨している

ここでは、先にマージ済みの**transactions**と**products**の2つのentityがある場合から考える

In [13]:
transactions_df = data["transactions"].merge(data["sessions"]).merge(data["customers"])
products_df = data["products"]
print(transactions_df.shape)
print(products_df.shape)

(500, 11)
(5, 2)


In [14]:
transactions_df.sample(3)

Unnamed: 0,transaction_id,session_id,transaction_time,product_id,amount,customer_id,device,session_start,zip_code,join_date,date_of_birth
479,267,34,2014-01-01 08:38:55,5,58.47,3,desktop,2014-01-01 08:24:50,13244,2011-08-13 15:42:34,2003-11-21
166,297,32,2014-01-01 08:04:15,5,20.65,5,mobile,2014-01-01 08:02:05,60091,2010-07-17 05:27:50,1984-07-28
145,357,28,2014-01-01 07:04:40,2,97.2,5,mobile,2014-01-01 06:50:35,60091,2010-07-17 05:27:50,1984-07-28


In [15]:
products_df.sample(3)

Unnamed: 0,product_id,brand
0,1,B
3,4,B
2,3,B


**EntitySet**をまず初期化

In [16]:
es = ft.EntitySet()

**EntitySetにtransactions_dfを追加**

In [17]:
es = es.entity_from_dataframe(
    entity_id="transactions", 
    dataframe= transactions_df, 
    index = "transaction_id",
    time_index = "transaction_time",
    variable_types = {"product_id" : ft.variable_types.Categorical}
)

In [18]:
es

Entityset: None
  Entities:
    transactions [Rows: 500, Columns: 11]
  Relationships:
    No relationships

In [19]:
es["transactions"].variables

[<Variable: transaction_id (dtype = index)>,
 <Variable: amount (dtype = numeric)>,
 <Variable: zip_code (dtype = categorical)>,
 <Variable: session_start (dtype: datetime, format: None)>,
 <Variable: date_of_birth (dtype: datetime, format: None)>,
 <Variable: join_date (dtype: datetime, format: None)>,
 <Variable: device (dtype = categorical)>,
 <Variable: customer_id (dtype = numeric)>,
 <Variable: transaction_time (dtype: datetime_time_index, format: None)>,
 <Variable: session_id (dtype = numeric)>,
 <Variable: product_id (dtype = categorical)>]

**products_df**も追加

In [20]:
es = es.entity_from_dataframe(
    entity_id = "products",
    dataframe = products_df,
    index = "product_id"
)

In [21]:
es

Entityset: None
  Entities:
    transactions [Rows: 500, Columns: 11]
    products [Rows: 5, Columns: 2]
  Relationships:
    No relationships

続いて、Entity間の関係を追加していく。
このデータだと親Entityが**productsのproduct_id**、子Entityが**transactionsのproduct_id**であるので、その順番で登録する

In [22]:
new_relationship = ft.Relationship(es["products"]["product_id"], es["transactions"]["product_id"])

In [23]:
es = es.add_relationship(new_relationship)

In [24]:
es

Entityset: None
  Entities:
    transactions [Rows: 500, Columns: 11]
    products [Rows: 5, Columns: 2]
  Relationships:
    transactions.product_id -> products.product_id

ここで、transactionsからsessionに関する列を切り出し、新しく**sessions**のEntityとして定義する。

以下の**normalize_entity**により、entityとrelationを同時に作成することができる

In [25]:
es = es.normalize_entity(
    base_entity_id = "transactions", 
    new_entity_id="sessions", 
    index = "session_id",
    make_time_index = "session_start",
    additional_variables = ["device", "customer_id", "zip_code", "session_start", "join_date"]
)

In [26]:
es

Entityset: None
  Entities:
    transactions [Rows: 500, Columns: 6]
    products [Rows: 5, Columns: 2]
    sessions [Rows: 35, Columns: 6]
  Relationships:
    transactions.product_id -> products.product_id
    transactions.session_id -> sessions.session_id

In [27]:
es["transactions"].df.head(3)

Unnamed: 0,transaction_id,amount,date_of_birth,transaction_time,session_id,product_id
298,298,127.64,1986-08-18,2014-01-01 00:00:00,1,5
2,2,109.48,1986-08-18,2014-01-01 00:01:05,1,2
308,308,95.06,1986-08-18,2014-01-01 00:02:10,1,3


In [28]:
es["transactions"].variables

[<Variable: transaction_id (dtype = index)>,
 <Variable: amount (dtype = numeric)>,
 <Variable: date_of_birth (dtype: datetime, format: None)>,
 <Variable: transaction_time (dtype: datetime_time_index, format: None)>,
 <Variable: session_id (dtype = id)>,
 <Variable: product_id (dtype = categorical)>]

In [29]:
es["sessions"].df.head(3)

Unnamed: 0,session_id,zip_code,session_start,join_date,device,customer_id
1,1,13244,2014-01-01 00:00:00,2012-04-15 23:31:04,desktop,2
2,2,60091,2014-01-01 00:17:20,2010-07-17 05:27:50,mobile,5
3,3,60091,2014-01-01 00:28:10,2011-04-08 20:08:14,mobile,4


In [30]:
es["sessions"].variables

[<Variable: session_id (dtype = index)>,
 <Variable: zip_code (dtype = categorical)>,
 <Variable: session_start (dtype: datetime_time_index, format: None)>,
 <Variable: join_date (dtype: datetime, format: None)>,
 <Variable: device (dtype = categorical)>,
 <Variable: customer_id (dtype = numeric)>]

**customers**も同様に作る

In [31]:
# Split the customer-level columns out of `sessions` into a new `customers`
# entity keyed by `customer_id`; `join_date` is passed as `make_time_index`.
# (The pasted IPython continuation prompts "....:" were removed so the cell is
# plain, valid Python.)
es = es.normalize_entity(
    base_entity_id="sessions",
    new_entity_id="customers",
    index="customer_id",
    make_time_index="join_date",
    additional_variables=["zip_code", "join_date"]
)

In [32]:
es

Entityset: None
  Entities:
    customers [Rows: 5, Columns: 3]
    transactions [Rows: 500, Columns: 6]
    products [Rows: 5, Columns: 2]
    sessions [Rows: 35, Columns: 4]
  Relationships:
    transactions.product_id -> products.product_id
    transactions.session_id -> sessions.session_id
    sessions.customer_id -> customers.customer_id

準備完了！**EntitySet**を使って特徴を作ってみる

In [33]:
feature_matrix, feature_defs = ft.dfs(entityset = es, target_entity = "products")
print(feature_matrix.shape)

(5, 30)


In [34]:
feature_matrix

Unnamed: 0_level_0,brand,MEAN(transactions.amount),SKEW(transactions.amount),MAX(transactions.amount),STD(transactions.amount),MODE(transactions.session_id),COUNT(transactions),NUM_UNIQUE(transactions.session_id),SUM(transactions.amount),MIN(transactions.amount),...,NUM_UNIQUE(transactions.DAY(date_of_birth)),NUM_UNIQUE(transactions.MONTH(transaction_time)),MODE(transactions.DAY(date_of_birth)),MODE(transactions.sessions.customer_id),NUM_UNIQUE(transactions.sessions.customer_id),NUM_UNIQUE(transactions.DAY(transaction_time)),MODE(transactions.MONTH(transaction_time)),MODE(transactions.DAY(transaction_time)),MODE(transactions.YEAR(date_of_birth)),NUM_UNIQUE(transactions.YEAR(transaction_time))
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,B,73.429314,0.125525,149.56,42.479989,3,102,34,7489.79,6.84,...,4,1,18,1,5,1,1,1,1994,1
2,B,76.319891,0.151934,149.95,46.336308,28,92,34,7021.43,5.73,...,4,1,18,4,5,1,1,1,2006,1
3,B,73.00125,0.223938,148.31,38.871405,1,96,35,7008.12,5.89,...,4,1,18,4,5,1,1,1,2006,1
4,B,76.311038,-0.132077,146.46,42.492501,29,106,34,8088.97,5.81,...,4,1,18,1,5,1,1,1,1994,1
5,A,76.264904,0.098248,149.02,42.131902,4,104,34,7931.55,5.91,...,4,1,18,1,5,1,1,1,1994,1


## Deep Feature Synthesis (DFS)

自動で特徴エンジニアリングを行う機能の詳細をみていく

まず、サンプルのEntitySetをロードする

In [35]:
es = ft.demo.load_mock_customer(return_entityset=True)
es

Entityset: transactions
  Entities:
    customers [Rows: 5, Columns: 4]
    transactions [Rows: 500, Columns: 5]
    products [Rows: 5, Columns: 2]
    sessions [Rows: 35, Columns: 4]
  Relationships:
    transactions.product_id -> products.product_id
    transactions.session_id -> sessions.session_id
    sessions.customer_id -> customers.customer_id

作る特徴として、**カスタマーがサインアップ(join_date)した月**と**カスタマーのセッションの合計数**をつくる

featuretoolsでは、**aggregation**と**transform**の基本コンポーネント(primitive)が揃っている

In [36]:
feature_matrix, feature_defs = ft.dfs(
    entityset = es,
    target_entity = "customers",
    agg_primitives = ["count"],
    trans_primitives = ["month"],
    max_depth = 1
)

In [37]:
feature_matrix

Unnamed: 0_level_0,zip_code,MONTH(join_date),COUNT(sessions),MONTH(date_of_birth)
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,60091,4,8,7
2,13244,4,7,8
3,13244,8,6,11
4,60091,4,8,8
5,60091,7,6,7


**max_depth**パラメータで特徴をどんどん深掘り(=deep)できる。やってみる

In [38]:
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_entity="customers",
    agg_primitives=["mean", "sum", "mode"],
    trans_primitives=["month", "hour"],
    max_depth=2
)
print(feature_matrix.shape)

(5, 15)


In [39]:
feature_matrix

Unnamed: 0_level_0,zip_code,HOUR(join_date),MONTH(date_of_birth),MODE(transactions.product_id),MEAN(transactions.amount),HOUR(date_of_birth),MODE(sessions.device),SUM(transactions.amount),MONTH(join_date),MODE(sessions.MONTH(session_start)),MODE(sessions.MODE(transactions.product_id)),MEAN(sessions.SUM(transactions.amount)),MEAN(sessions.MEAN(transactions.amount)),SUM(sessions.MEAN(transactions.amount)),MODE(sessions.HOUR(session_start))
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,60091,10,7,4,71.631905,0,mobile,9025.62,4,1,4,1128.2025,72.77414,582.193117,6
2,13244,23,8,4,77.422366,0,desktop,7200.28,4,1,3,1028.611429,78.415122,548.905851,3
3,13244,15,11,1,67.06043,0,desktop,6236.62,8,1,1,1039.436667,67.539577,405.237462,5
4,60091,20,8,2,80.070459,0,mobile,8727.68,4,1,1,1090.96,81.207189,649.657515,1
5,60091,5,7,5,80.375443,0,mobile,6349.66,7,1,3,1058.276667,78.705187,472.231119,0


- セッションごとのトランザクションの合計
- それの平均値

を**max_depth=2**の効果で返している

In [40]:
feature_matrix[['MEAN(sessions.SUM(transactions.amount))']]

Unnamed: 0_level_0,MEAN(sessions.SUM(transactions.amount))
customer_id,Unnamed: 1_level_1
1,1128.2025
2,1028.611429
3,1039.436667
4,1090.96
5,1058.276667


- セッションを開始した時間
- その最頻値

を返している

In [41]:
feature_matrix[['MODE(sessions.HOUR(session_start))']]

Unnamed: 0_level_0,MODE(sessions.HOUR(session_start))
customer_id,Unnamed: 1_level_1
1,6
2,3
3,5
4,1
5,0


target_entityを**sessions**に変更する。自動的に親Entityであるcustomersをもとに、各特徴を計算する

In [42]:
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_entity="sessions",
    agg_primitives=["mean", "sum", "mode"],
    trans_primitives=["month", "hour"],
    max_depth=2
)
print(feature_matrix.shape)

(35, 19)


In [43]:
feature_matrix.head(10)

Unnamed: 0_level_0,device,customer_id,MODE(transactions.product_id),MEAN(transactions.amount),SUM(transactions.amount),HOUR(session_start),customers.zip_code,MONTH(session_start),customers.MODE(sessions.device),MODE(transactions.products.brand),customers.MONTH(date_of_birth),customers.MODE(transactions.product_id),customers.SUM(transactions.amount),customers.MEAN(transactions.amount),MODE(transactions.MONTH(transaction_time)),customers.HOUR(join_date),MODE(transactions.HOUR(transaction_time)),customers.HOUR(date_of_birth),customers.MONTH(join_date)
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,desktop,2,3,76.813125,1229.01,0,13244,1,desktop,B,8,4,7200.28,77.422366,1,23,0,0,4
2,mobile,5,5,74.696,746.96,0,60091,1,mobile,A,7,5,6349.66,80.375443,1,5,0,0,7
3,mobile,4,1,88.6,1329.0,0,60091,1,mobile,B,8,2,8727.68,80.070459,1,20,0,0,4
4,mobile,1,5,64.5572,1613.93,0,60091,1,mobile,B,7,4,9025.62,71.631905,1,10,0,0,4
5,mobile,4,5,70.638182,777.02,1,60091,1,mobile,B,8,2,8727.68,80.070459,1,20,1,0,4
6,tablet,1,4,84.44,1266.6,1,60091,1,mobile,B,7,4,9025.62,71.631905,1,10,1,0,4
7,tablet,3,1,62.791333,941.87,1,13244,1,desktop,B,11,1,6236.62,67.06043,1,15,1,0,8
8,tablet,4,1,75.081111,1351.46,1,60091,1,mobile,B,8,2,8727.68,80.070459,1,20,2,0,4
9,desktop,1,1,70.135333,1052.03,2,60091,1,mobile,B,7,4,9025.62,71.631905,1,10,2,0,4
10,tablet,2,2,88.042667,1320.64,2,13244,1,desktop,B,8,4,7200.28,77.422366,1,23,2,0,4


customerごとのtransaction amountの平均値。同じcustomer_idをもつsession_idは同じ値になっていることがわかる

In [44]:
feature_matrix[['customers.MEAN(transactions.amount)']].head(10)

Unnamed: 0_level_0,customers.MEAN(transactions.amount)
session_id,Unnamed: 1_level_1
1,77.422366
2,80.375443
3,80.070459
4,71.631905
5,80.070459
6,71.631905
7,67.06043
8,80.070459
9,71.631905
10,77.422366


## Feature primitives

feature primitivesを詳細にみていく

**features_only = True**にすることで、特徴の定義のみを得る。すばやく特徴の検査ができる

ここでは、各customerごとに、各sessionごとの時間の差の特徴を作る

In [45]:
feature_defs = ft.dfs(
    entityset=es,
    target_entity="customers",
    agg_primitives=["mean"],
    trans_primitives=["time_since_previous"],
    features_only=True
)

In [46]:
feature_defs

[<Feature: zip_code>,
 <Feature: MEAN(transactions.amount)>,
 <Feature: MEAN(sessions.time_since_previous_by_customer_id)>,
 <Feature: MEAN(sessions.MEAN(transactions.amount))>]

各sessionごとの時間の差の特徴をいろいろ作ってみる

In [47]:
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_entity="customers",
    agg_primitives=["mean", "max", "min", "std", "skew"],
    trans_primitives=["time_since_previous"]
)
print(feature_matrix.shape)

(5, 32)


In [48]:
feature_matrix[[
    "MEAN(sessions.time_since_previous_by_customer_id)",
    "MAX(sessions.time_since_previous_by_customer_id)",
    "MIN(sessions.time_since_previous_by_customer_id)",
    "STD(sessions.time_since_previous_by_customer_id)",
    "SKEW(sessions.time_since_previous_by_customer_id)"
]]

Unnamed: 0_level_0,MEAN(sessions.time_since_previous_by_customer_id),MAX(sessions.time_since_previous_by_customer_id),MIN(sessions.time_since_previous_by_customer_id),STD(sessions.time_since_previous_by_customer_id),SKEW(sessions.time_since_previous_by_customer_id)
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,3305.714286,7345.0,1040.0,2046.357391,1.438363
2,4907.5,13325.0,520.0,5229.127795,0.996087
3,5096.0,10075.0,1170.0,4084.82466,0.40709
4,2516.428571,6435.0,650.0,1975.727111,1.445854
5,5577.0,16120.0,2080.0,5949.613013,2.132658


featuretoolsの中にある、primitives一覧 : **ft.list_primitives()**

In [49]:
print(ft.list_primitives().shape)

(62, 3)


In [50]:
import pandas as pd
pd.set_option("display.max_colwidth", 100)
pd.set_option("display.max_rows", 62)

In [51]:
ft.list_primitives()

Unnamed: 0,name,type,description
0,num_true,aggregation,Finds the number of 'True' values in a boolean.
1,mode,aggregation,Finds the most common element in a categorical feature.
2,n_most_common,aggregation,Finds the N most common elements in a categorical feature.
3,time_since_last,aggregation,Time since last related instance.
4,count,aggregation,Counts the number of non null values.
5,max,aggregation,Finds the maximum non-null value of a numeric feature.
6,last,aggregation,Returns the last value.
7,num_unique,aggregation,Returns the number of unique categorical variables.
8,median,aggregation,Finds the median value of any feature with well-ordered values.
9,trend,aggregation,Calculates the slope of the linear trend of variable overtime.


In [52]:
pd.set_option("display.max_rows", 50)

自分で定義するcustom featureの作り方を説明する

- Aggregation か Transform かを決める
- input と output のデータ形式を定義する
- 処理を書く
- 特徴名をアノテートする

例として、シンプルなtransformとaggregation primitivesを作ってみる

In [53]:
from featuretools.primitives import make_agg_primitive, make_trans_primitive
from featuretools.variable_types import Text, Numeric

In [54]:
def absolute(column):
    """Return the absolute value of ``column``.

    Delegates to the built-in ``abs``, so any object that supports it
    (a plain number, or an array-like that implements ``__abs__``) works.
    """
    magnitude = abs(column)
    return magnitude
# Wrap `absolute` with make_trans_primitive, declaring a single Numeric input
# and a Numeric return type. The result is later passed to ft.dfs through
# `trans_primitives`.
Absolute = make_trans_primitive(
    function=absolute,
    input_types=[Numeric],
    return_type=Numeric
)

In [55]:
def maximum(column):
    """Return the largest non-missing value in ``column``.

    The built-in ``max`` gives order-dependent results when NaN is present
    (``max([nan, 1, 3])`` is nan while ``max([1, nan, 3])`` is 3), so the
    reduction is delegated to pandas, which skips missing values.

    Args:
        column: Array-like of numeric values (a pandas Series or any
            sequence accepted by ``pd.Series``).

    Returns:
        The maximum of the non-missing values.
    """
    # pd.Series(...) is effectively a no-op wrapper when `column` already is a
    # Series, and lets plain lists/arrays use the same NaN-skipping reduction.
    return pd.Series(column).max()
# Wrap `maximum` with make_agg_primitive, declaring a single Numeric input and
# a Numeric return type. The result is later passed to ft.dfs through
# `agg_primitives`.
Maximum = make_agg_primitive(
    function=maximum,
    input_types=[Numeric],
    return_type=Numeric
)

In [56]:
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_entity="sessions",
    agg_primitives=[Maximum],
    trans_primitives=[Absolute],
    max_depth=2
)

In [57]:
feature_matrix[["customers.MAXIMUM(transactions.amount)", "MAXIMUM(transactions.ABSOLUTE(amount))"]].head(10)

Unnamed: 0_level_0,customers.MAXIMUM(transactions.amount),MAXIMUM(transactions.ABSOLUTE(amount))
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,146.81,141.66
2,149.02,135.25
3,149.95,147.73
4,139.43,129.0
5,149.95,139.2
6,139.43,139.23
7,149.15,146.31
8,149.95,143.85
9,139.43,132.72
10,146.81,142.44


別の例で、word count特徴を作ってみる

公式の実行例とはデータが異なり、そのままでは動かないようなので、ここでは定義のみ示す

In [58]:
def word_count(column):
    """Count the whitespace-separated words in every entry of ``column``.

    Returns a list holding one integer per entry, in input order.
    """
    return [len(text.split()) for text in column]
# Wrap `word_count` with make_trans_primitive, declaring a single Text input
# and a Numeric return type. This primitive is only defined here; it is not
# passed to ft.dfs in this notebook.
WordCount = make_trans_primitive(
    function=word_count,
    input_types=[Text],
    return_type=Numeric
)

複数のInput Typeに基づいた特徴も作れる

In [59]:
from featuretools.variable_types import Datetime, Timedelta, Variable

In [60]:
def mean_sunday(numeric, datetime):
    """Average the values of ``numeric`` whose timestamp falls on a Sunday.

    Args:
        numeric: Array-like of numeric values.
        datetime: Array-like of timestamps, one per value in ``numeric``;
            converted with ``pd.DatetimeIndex``.

    Returns:
        The mean of the values observed on Sundays (NaN when there are none).
    """
    # pandas encodes weekdays as Monday=0 ... Sunday=6.
    weekday_codes = pd.DatetimeIndex(datetime).weekday.values
    table = pd.DataFrame({'numeric': numeric, 'time': weekday_codes})
    on_sunday = table['time'] == 6
    return table.loc[on_sunday, 'numeric'].mean()
# Wrap `mean_sunday` with make_agg_primitive. It declares two inputs
# (Numeric, Datetime), matching the function's (numeric, datetime) parameters,
# and a Numeric return type. The result is later passed to ft.dfs through
# `agg_primitives`.
MeanSunday = make_agg_primitive(
    function=mean_sunday,
    input_types=[Numeric, Datetime],
    return_type=Numeric
)

In [61]:
feature_matrix, features = ft.dfs(
    entityset=es,
    target_entity="sessions",
    agg_primitives=[MeanSunday],
    trans_primitives=[],
    max_depth=1
)

In [62]:
feature_matrix.head(3)

Unnamed: 0_level_0,device,customer_id,"MEAN_SUNDAY(transactions.amount, transaction_time)",customers.zip_code
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,desktop,2,,13244
2,mobile,5,,60091
3,mobile,4,,60091


## Handling Time

featuretoolsは時間的な特徴を扱うのが得意

In [63]:
es_mc = ft.demo.load_mock_customer(return_entityset=True, random_seed=0)

In [64]:
es_mc

Entityset: transactions
  Entities:
    customers [Rows: 5, Columns: 4]
    transactions [Rows: 500, Columns: 5]
    products [Rows: 5, Columns: 2]
    sessions [Rows: 35, Columns: 4]
  Relationships:
    transactions.product_id -> products.product_id
    transactions.session_id -> sessions.session_id
    sessions.customer_id -> customers.customer_id

In [65]:
es_mc['transactions'].df.head(3)

Unnamed: 0,transaction_id,transaction_time,session_id,product_id,amount
298,298,2014-01-01 00:00:00,1,5,127.64
2,2,2014-01-01 00:01:05,1,2,109.48
308,308,2014-01-01 00:02:10,1,3,95.06


In [66]:
es_mc['customers'].df.head(3)

Unnamed: 0,customer_id,zip_code,date_of_birth,join_date
5,5,60091,1984-07-28,2010-07-17 05:27:50
4,4,60091,2006-08-15,2011-04-08 20:08:14
1,1,60091,1994-07-18,2011-04-17 10:48:33


In [67]:
# Cutoff-time table: one row per customer with the timestamp at which its
# features are computed, plus a label column.
ct = pd.DataFrame({
    'customer_id': [1, 2, 3],
    'time': pd.to_datetime(['2014-1-1 04:00', '2014-1-1 04:00', '2014-1-1 04:00']),
    'label': [True, True, False],
})

**cutoff_time** で指定した時刻までのデータだけを使って、各特徴量を計算してくれる

In [68]:
fm, features = ft.dfs(
    entityset=es_mc,
    target_entity='customers',
    cutoff_time=ct,
    cutoff_time_in_index=True
)

In [69]:
fm

Unnamed: 0_level_0,Unnamed: 1_level_0,zip_code,MONTH(join_date),DAY(join_date),STD(transactions.amount),MODE(sessions.device),MONTH(date_of_birth),SUM(transactions.amount),YEAR(join_date),YEAR(date_of_birth),WEEKDAY(join_date),...,MEAN(sessions.MEAN(transactions.amount)),SKEW(sessions.COUNT(transactions)),SUM(sessions.MEAN(transactions.amount)),MAX(sessions.STD(transactions.amount)),MEAN(sessions.COUNT(transactions)),STD(sessions.MAX(transactions.amount)),SUM(sessions.NUM_UNIQUE(transactions.product_id)),MIN(sessions.NUM_UNIQUE(transactions.product_id)),NUM_UNIQUE(sessions.MONTH(session_start)),label
customer_id,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,2014-01-01 04:00:00,60091,4,17,42.309717,tablet,7,4958.19,2011,1994,6,...,76.150425,1.614843,304.6017,46.905665,16.75,5.027226,20,5,1,True
2,2014-01-01 04:00:00,13244,4,15,39.289512,desktop,8,4150.3,2012,1986,6,...,85.197948,-0.169238,340.791792,47.93592,12.25,3.470527,20,5,1,True
3,2014-01-01 04:00:00,13244,8,13,47.264797,tablet,11,941.87,2011,2003,5,...,62.791333,,62.791333,47.264797,15.0,,5,5,1,False
