# Featuretools の公式ドキュメント
- 公式ページ : https://docs.featuretools.com/index.html

# What is Featuretools?

In [1]:
import featuretools as ft

In [2]:
print(ft.__version__)

0.5.1


In [3]:
# Load the mock-customer demo data and pull out three of its tables,
# then report each table's (rows, columns).
data = ft.demo.load_mock_customer()
customers_df, sessions_df, transactions_df = (
    data["customers"],
    data["sessions"],
    data["transactions"],
)
print(
    *(table.shape for table in (customers_df, sessions_df, transactions_df)),
    sep="\n",
)

(5, 4)
(35, 4)
(500, 5)


featuretoolsでは、3つの表はそれぞれ**entity** と呼ばれる

In [4]:
customers_df.head(2)

Unnamed: 0,customer_id,zip_code,join_date,date_of_birth
0,1,60091,2011-04-17 10:48:33,1994-07-18
1,2,13244,2012-04-15 23:31:04,1986-08-18


In [5]:
sessions_df.head(2)

Unnamed: 0,session_id,customer_id,device,session_start
0,1,2,desktop,2014-01-01 00:00:00
1,2,5,mobile,2014-01-01 00:17:20


In [6]:
transactions_df.head(2)

Unnamed: 0,transaction_id,session_id,transaction_time,product_id,amount
0,298,1,2014-01-01 00:00:00,5,127.64
1,2,1,2014-01-01 00:01:05,2,109.48


entityについての集合を登録する

In [7]:
# Map each entity name to its dataframe and index column; the sessions and
# transactions entries also name a third, datetime column.
# NOTE(review): the third element is presumably the time index — confirm against the ft.dfs docs.
entities = dict(
    customers=(customers_df, "customer_id"),
    sessions=(sessions_df, "session_id", "session_start"),
    transactions=(transactions_df, "transaction_id", "transaction_time"),
)

entityの中で親と子の関係を作る　**(parent_entity, parent_variable, child_entity, child_variable)** の順

In [8]:
# Each tuple is (parent_entity, parent_variable, child_entity, child_variable).
relationships = [
    # sessions is the parent of transactions, joined on session_id
    ("sessions", "session_id", "transactions", "session_id"),
    # customers is the parent of sessions, joined on customer_id
    ("customers", "customer_id", "sessions", "customer_id")
]

**entityの集合**、**関係**、**特徴を計算したいtarget = customers**の3つの引数をとって、特徴ベクトルを計算する

In [9]:
# Run Deep Feature Synthesis with "customers" as the target entity. The call
# returns the feature matrix and the list of feature definitions.
feature_matrix_customers, features_defs = ft.dfs(
    entities=entities, 
    relationships=relationships,
    target_entity="customers"
)
print(feature_matrix_customers.shape)

(5, 73)


In [10]:
feature_matrix_customers.head(2)

Unnamed: 0_level_0,zip_code,COUNT(sessions),NUM_UNIQUE(sessions.device),MODE(sessions.device),SUM(transactions.amount),STD(transactions.amount),MAX(transactions.amount),SKEW(transactions.amount),MIN(transactions.amount),MEAN(transactions.amount),...,NUM_UNIQUE(sessions.MODE(transactions.product_id)),NUM_UNIQUE(sessions.DAY(session_start)),NUM_UNIQUE(sessions.YEAR(session_start)),NUM_UNIQUE(sessions.MONTH(session_start)),NUM_UNIQUE(sessions.WEEKDAY(session_start)),MODE(sessions.MODE(transactions.product_id)),MODE(sessions.DAY(session_start)),MODE(sessions.YEAR(session_start)),MODE(sessions.MONTH(session_start)),MODE(sessions.WEEKDAY(session_start))
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60091,8,3,mobile,9025.62,40.442059,139.43,0.019698,5.81,71.631905,...,4,1,1,1,1,4,1,2014,1,2
2,13244,7,3,desktop,7200.28,37.705178,146.81,0.098259,8.73,77.422366,...,4,1,1,1,1,3,1,2014,1,2


In [11]:
features_defs[0:3]

[<Feature: zip_code>,
 <Feature: COUNT(sessions)>,
 <Feature: NUM_UNIQUE(sessions.device)>]

targetを**sessions**に変えてみる

In [12]:
# Same entities and relationships, but features are now built for "sessions".
# NOTE: this rebinds `features_defs`, replacing the customer feature definitions.
feature_matrix_sessions, features_defs = ft.dfs(
    entities=entities,
    relationships=relationships,
    target_entity="sessions"
)
print(feature_matrix_sessions.shape)

(35, 44)


In [13]:
feature_matrix_sessions.head(2)

Unnamed: 0_level_0,customer_id,device,SUM(transactions.amount),STD(transactions.amount),MAX(transactions.amount),SKEW(transactions.amount),MIN(transactions.amount),MEAN(transactions.amount),COUNT(transactions),NUM_UNIQUE(transactions.product_id),...,customers.NUM_UNIQUE(transactions.product_id),customers.MODE(transactions.product_id),customers.DAY(join_date),customers.DAY(date_of_birth),customers.YEAR(join_date),customers.YEAR(date_of_birth),customers.MONTH(join_date),customers.MONTH(date_of_birth),customers.WEEKDAY(join_date),customers.WEEKDAY(date_of_birth)
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2,desktop,1229.01,41.600976,141.66,0.295458,20.91,76.813125,16,5,...,5,4,15,18,2012,1986,4,8,6,0
2,5,mobile,746.96,45.893591,135.25,-0.16055,9.32,74.696,10,5,...,5,5,17,28,2010,1984,7,7,5,5


# Representing Data with EntitySets


entityとrelationのcollectionである**EntitySet**を定義することで、より簡単にデータの特徴エンジニアリングができるので、featuretoolsではまずこれを定義することを推奨している

ここでは、先にマージ済みの**transactions**と**products**の2つのentityがある場合から考える

In [14]:
# Denormalize the demo data: attach the session columns, then the customer
# columns, to every transaction.
# The join keys are spelled out so the merges cannot silently pick up any
# other column name the tables might come to share.
# NOTE: this rebinds `transactions_df` to the merged (wide) table.
transactions_df = (
    data["transactions"]
    .merge(data["sessions"], on="session_id")
    .merge(data["customers"], on="customer_id")
)
products_df = data["products"]
print(transactions_df.shape)
print(products_df.shape)

(500, 11)
(5, 2)


In [15]:
# Fixed seed so the sampled preview is identical on every Restart & Run All.
transactions_df.sample(3, random_state=0)

Unnamed: 0,transaction_id,session_id,transaction_time,product_id,amount,customer_id,device,session_start,zip_code,join_date,date_of_birth
304,422,4,2014-01-01 01:09:20,2,8.72,1,mobile,2014-01-01 00:44:25,60091,2011-04-17 10:48:33,1994-07-18
420,491,7,2014-01-01 01:53:45,2,146.31,3,tablet,2014-01-01 01:39:40,13244,2011-08-13 15:42:34,2003-11-21
447,242,23,2014-01-01 05:41:15,4,20.06,3,desktop,2014-01-01 05:32:35,13244,2011-08-13 15:42:34,2003-11-21


In [16]:
# Fixed seed so the sampled preview is identical on every Restart & Run All.
products_df.sample(3, random_state=0)

Unnamed: 0,product_id,brand
3,4,B
2,3,B
1,2,B


**EntitySet**をまず初期化

In [17]:
# Start from an empty EntitySet. No id is passed, which is why its repr is
# headed "Entityset: None" in the outputs that follow.
es = ft.EntitySet()

**EntitySetにtransactions_dfを追加**

In [18]:
# Register the merged transactions table as the "transactions" entity.
# product_id is declared Categorical explicitly rather than left to type inference.
es = es.entity_from_dataframe(
    entity_id="transactions",
    dataframe=transactions_df,
    index="transaction_id",
    time_index="transaction_time",
    variable_types={"product_id": ft.variable_types.Categorical},
)

In [19]:
es

Entityset: None
  Entities:
    transactions [Rows: 500, Columns: 11]
  Relationships:
    No relationships

In [20]:
es["transactions"].variables

[<Variable: transaction_id (dtype = index)>,
 <Variable: session_id (dtype = numeric)>,
 <Variable: transaction_time (dtype: datetime_time_index, format: None)>,
 <Variable: amount (dtype = numeric)>,
 <Variable: customer_id (dtype = numeric)>,
 <Variable: device (dtype = categorical)>,
 <Variable: session_start (dtype: datetime, format: None)>,
 <Variable: zip_code (dtype = categorical)>,
 <Variable: join_date (dtype: datetime, format: None)>,
 <Variable: date_of_birth (dtype: datetime, format: None)>,
 <Variable: product_id (dtype = categorical)>]

**products_df**も追加

In [21]:
# Register the products table as the "products" entity, indexed by product_id.
es = es.entity_from_dataframe(
    entity_id="products", dataframe=products_df, index="product_id"
)

In [22]:
es

Entityset: None
  Entities:
    transactions [Rows: 500, Columns: 11]
    products [Rows: 5, Columns: 2]
  Relationships:
    No relationships

続いて、Entity間の関係を追加していく。
このデータだと親Entityが**productsのproduct_id**、子Entityが**transactionsのproduct_id**であるので、その順番で登録する

In [23]:
# Parent variable first, child variable second: products.product_id is the
# parent key and transactions.product_id the child key.
new_relationship = ft.Relationship(es["products"]["product_id"], es["transactions"]["product_id"])

In [24]:
es = es.add_relationship(new_relationship)

In [25]:
es

Entityset: None
  Entities:
    transactions [Rows: 500, Columns: 11]
    products [Rows: 5, Columns: 2]
  Relationships:
    transactions.product_id -> products.product_id

ここで、transactionsから**sessions**に関する列を取り出し、新しく**sessions**のEntityとして定義する。

以下の**normalize_entity**により、entityとrelationを同時に作成することができる

In [26]:
# Split a new "sessions" entity out of "transactions", keyed by session_id.
# The columns listed in additional_variables move from transactions to
# sessions, and session_start becomes the time index of the new entity.
es = es.normalize_entity(
    base_entity_id="transactions",
    new_entity_id="sessions",
    index="session_id",
    make_time_index="session_start",
    additional_variables=[
        "device",
        "customer_id",
        "zip_code",
        "session_start",
        "join_date",
    ],
)

In [27]:
es

Entityset: None
  Entities:
    transactions [Rows: 500, Columns: 6]
    products [Rows: 5, Columns: 2]
    sessions [Rows: 35, Columns: 6]
  Relationships:
    transactions.product_id -> products.product_id
    transactions.session_id -> sessions.session_id

In [28]:
es["transactions"].df.head(3)

Unnamed: 0,transaction_id,session_id,transaction_time,amount,date_of_birth,product_id
298,298,1,2014-01-01 00:00:00,127.64,1986-08-18,5
2,2,1,2014-01-01 00:01:05,109.48,1986-08-18,2
308,308,1,2014-01-01 00:02:10,95.06,1986-08-18,3


In [29]:
es["transactions"].variables

[<Variable: transaction_id (dtype = index)>,
 <Variable: session_id (dtype = id)>,
 <Variable: transaction_time (dtype: datetime_time_index, format: None)>,
 <Variable: amount (dtype = numeric)>,
 <Variable: date_of_birth (dtype: datetime, format: None)>,
 <Variable: product_id (dtype = categorical)>]

In [30]:
es["sessions"].df.head(3)

Unnamed: 0,session_id,device,customer_id,zip_code,session_start,join_date
1,1,desktop,2,13244,2014-01-01 00:00:00,2012-04-15 23:31:04
2,2,mobile,5,60091,2014-01-01 00:17:20,2010-07-17 05:27:50
3,3,mobile,4,60091,2014-01-01 00:28:10,2011-04-08 20:08:14


In [31]:
es["sessions"].variables

[<Variable: session_id (dtype = index)>,
 <Variable: device (dtype = categorical)>,
 <Variable: customer_id (dtype = numeric)>,
 <Variable: zip_code (dtype = categorical)>,
 <Variable: session_start (dtype: datetime_time_index, format: None)>,
 <Variable: join_date (dtype: datetime, format: None)>]

**customers**も同様に作る

In [32]:
# Split a new "customers" entity out of "sessions", keyed by customer_id.
# zip_code and join_date move to the new entity, and join_date is requested
# as its time index.
# (The IPython "....:" continuation prompts pasted from the docs were removed:
# they are not valid Python outside IPython's prompt stripping.)
es = es.normalize_entity(
    base_entity_id="sessions",
    new_entity_id="customers",
    index="customer_id",
    make_time_index="join_date",
    additional_variables=["zip_code", "join_date"],
)

In [33]:
es

Entityset: None
  Entities:
    transactions [Rows: 500, Columns: 6]
    products [Rows: 5, Columns: 2]
    sessions [Rows: 35, Columns: 4]
    customers [Rows: 5, Columns: 3]
  Relationships:
    transactions.product_id -> products.product_id
    transactions.session_id -> sessions.session_id
    sessions.customer_id -> customers.customer_id

準備完了！**EntitySet**を使って特徴を作ってみる

In [34]:
# Run DFS directly on the EntitySet, building features for each product.
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_entity="products",
)
print(feature_matrix.shape)

(5, 30)


In [35]:
feature_matrix

Unnamed: 0_level_0,brand,SUM(transactions.amount),STD(transactions.amount),MAX(transactions.amount),SKEW(transactions.amount),MIN(transactions.amount),MEAN(transactions.amount),COUNT(transactions),NUM_UNIQUE(transactions.session_id),MODE(transactions.session_id),...,MODE(transactions.DAY(transaction_time)),MODE(transactions.DAY(date_of_birth)),MODE(transactions.YEAR(transaction_time)),MODE(transactions.YEAR(date_of_birth)),MODE(transactions.MONTH(transaction_time)),MODE(transactions.MONTH(date_of_birth)),MODE(transactions.WEEKDAY(transaction_time)),MODE(transactions.WEEKDAY(date_of_birth)),MODE(transactions.sessions.device),MODE(transactions.sessions.customer_id)
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,B,7489.79,42.479989,149.56,0.125525,6.84,73.429314,102,34,3,...,1,18,2014,1994,1,7,2,0,desktop,1
2,B,7021.43,46.336308,149.95,0.151934,5.73,76.319891,92,34,28,...,1,18,2014,2006,1,8,2,0,desktop,4
3,B,7008.12,38.871405,148.31,0.223938,5.89,73.00125,96,35,1,...,1,18,2014,2006,1,8,2,0,desktop,4
4,B,8088.97,42.492501,146.46,-0.132077,5.81,76.311038,106,34,29,...,1,18,2014,1994,1,7,2,0,desktop,1
5,A,7931.55,42.131902,149.02,0.098248,5.91,76.264904,104,34,4,...,1,18,2014,1994,1,7,2,0,mobile,1


# Deep Feature Synthesis (DFS)

自動で特徴エンジニアリングを行う機能の詳細をみていく

まず、サンプルのEntitySetをロードする

In [36]:
es = ft.demo.load_mock_customer(return_entityset=True)
es

Entityset: transactions
  Entities:
    transactions [Rows: 500, Columns: 5]
    products [Rows: 5, Columns: 2]
    sessions [Rows: 35, Columns: 4]
    customers [Rows: 5, Columns: 4]
  Relationships:
    transactions.product_id -> products.product_id
    transactions.session_id -> sessions.session_id
    sessions.customer_id -> customers.customer_id

作る特徴として、**カスタマーがサインアップ(join_date)した月**と**カスタマーのセッションの合計数**をつくる

featuretoolsでは、**aggregation**と**transform**の基本コンポーネント(primitive)が揃っている

In [37]:
# Restrict DFS to one aggregation (count) and one transform (month), with no
# stacking of primitives (max_depth=1).
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_entity="customers",
    agg_primitives=["count"],
    trans_primitives=["month"],
    max_depth=1,
)

In [38]:
feature_matrix

Unnamed: 0_level_0,zip_code,COUNT(sessions),MONTH(join_date),MONTH(date_of_birth)
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,60091,8,4,7
2,13244,7,4,8
3,13244,6,8,11
4,60091,8,4,8
5,60091,6,7,7


**max_depth**パラメータで特徴をどんどん深掘り(=deep)できる。やってみる

In [39]:
# max_depth=2 lets primitives be stacked on one another, which yields
# features such as MEAN(sessions.SUM(transactions.amount)).
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_entity="customers",
    agg_primitives=["mean", "sum", "mode"],
    trans_primitives=["month", "hour"],
    max_depth=2
)
print(feature_matrix.shape)

(5, 15)


In [40]:
feature_matrix

Unnamed: 0_level_0,zip_code,MODE(sessions.device),MEAN(transactions.amount),SUM(transactions.amount),MODE(transactions.product_id),MONTH(join_date),MONTH(date_of_birth),HOUR(join_date),HOUR(date_of_birth),MEAN(sessions.MEAN(transactions.amount)),MEAN(sessions.SUM(transactions.amount)),SUM(sessions.MEAN(transactions.amount)),MODE(sessions.MODE(transactions.product_id)),MODE(sessions.MONTH(session_start)),MODE(sessions.HOUR(session_start))
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,60091,mobile,71.631905,9025.62,4,4,7,10,0,72.77414,1128.2025,582.193117,4,1,6
2,13244,desktop,77.422366,7200.28,4,4,8,23,0,78.415122,1028.611429,548.905851,3,1,3
3,13244,desktop,67.06043,6236.62,1,8,11,15,0,67.539577,1039.436667,405.237462,1,1,5
4,60091,mobile,80.070459,8727.68,2,4,8,20,0,81.207189,1090.96,649.657515,1,1,1
5,60091,mobile,80.375443,6349.66,5,7,7,5,0,78.705187,1058.276667,472.231119,3,1,0


- セッションごとのトランザクションの合計
- それの平均値

を**max_depth=2**の効果で返している

In [41]:
feature_matrix[['MEAN(sessions.SUM(transactions.amount))']]

Unnamed: 0_level_0,MEAN(sessions.SUM(transactions.amount))
customer_id,Unnamed: 1_level_1
1,1128.2025
2,1028.611429
3,1039.436667
4,1090.96
5,1058.276667


- セッションを開始した時間
- その最頻値

を返している

In [42]:
feature_matrix[['MODE(sessions.HOUR(session_start))']]

Unnamed: 0_level_0,MODE(sessions.HOUR(session_start))
customer_id,Unnamed: 1_level_1
1,6
2,3
3,5
4,1
5,0


target_entityを**sessions**に変更する。自動的に親Entityであるcustomersをもとに、各特徴を計算する

In [43]:
# Same primitives, now targeting sessions. Features taken from the parent
# customers entity appear in the result with a "customers." prefix.
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_entity="sessions",
    agg_primitives=["mean", "sum", "mode"],
    trans_primitives=["month", "hour"],
    max_depth=2
)
print(feature_matrix.shape)

(35, 19)


In [44]:
feature_matrix.head(10)

Unnamed: 0_level_0,customer_id,device,MEAN(transactions.amount),SUM(transactions.amount),MODE(transactions.product_id),MONTH(session_start),HOUR(session_start),customers.zip_code,MODE(transactions.MONTH(transaction_time)),MODE(transactions.HOUR(transaction_time)),MODE(transactions.products.brand),customers.MODE(sessions.device),customers.MEAN(transactions.amount),customers.SUM(transactions.amount),customers.MODE(transactions.product_id),customers.MONTH(join_date),customers.MONTH(date_of_birth),customers.HOUR(join_date),customers.HOUR(date_of_birth)
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,2,desktop,76.813125,1229.01,3,1,0,13244,1,0,B,desktop,77.422366,7200.28,4,4,8,23,0
2,5,mobile,74.696,746.96,5,1,0,60091,1,0,A,mobile,80.375443,6349.66,5,7,7,5,0
3,4,mobile,88.6,1329.0,1,1,0,60091,1,0,B,mobile,80.070459,8727.68,2,4,8,20,0
4,1,mobile,64.5572,1613.93,5,1,0,60091,1,0,B,mobile,71.631905,9025.62,4,4,7,10,0
5,4,mobile,70.638182,777.02,5,1,1,60091,1,1,B,mobile,80.070459,8727.68,2,4,8,20,0
6,1,tablet,84.44,1266.6,4,1,1,60091,1,1,B,mobile,71.631905,9025.62,4,4,7,10,0
7,3,tablet,62.791333,941.87,1,1,1,13244,1,1,B,desktop,67.06043,6236.62,1,8,11,15,0
8,4,tablet,75.081111,1351.46,1,1,1,60091,1,2,B,mobile,80.070459,8727.68,2,4,8,20,0
9,1,desktop,70.135333,1052.03,1,1,2,60091,1,2,B,mobile,71.631905,9025.62,4,4,7,10,0
10,2,tablet,88.042667,1320.64,2,1,2,13244,1,2,B,desktop,77.422366,7200.28,4,4,8,23,0


customerごとのtransaction amountの平均値。同じcustomer_idをもつsession_idは同じ値になっていることがわかる

In [45]:
feature_matrix[['customers.MEAN(transactions.amount)']].head(10)

Unnamed: 0_level_0,customers.MEAN(transactions.amount)
session_id,Unnamed: 1_level_1
1,77.422366
2,80.375443
3,80.070459
4,71.631905
5,80.070459
6,71.631905
7,67.06043
8,80.070459
9,71.631905
10,77.422366


# Feature primitives

feature primitivesを詳細にみていく

**features_only = True**にすることで、特徴の定義のみを得る。すばやく特徴の検査ができる

ここでは、各customerごとに、各sessionごとの時間の差の特徴を作る

In [46]:
# With features_only=True the call is bound to a single name: only the list
# of feature definitions is returned, not a (matrix, definitions) pair.
feature_defs = ft.dfs(
    entityset=es,
    target_entity="customers",
    agg_primitives=["mean"],
    trans_primitives=["time_since_previous"],
    features_only=True
)

In [47]:
feature_defs

[<Feature: zip_code>,
 <Feature: MEAN(transactions.amount)>,
 <Feature: MEAN(sessions.MEAN(transactions.amount))>,
 <Feature: MEAN(sessions.time_since_previous_by_customer_id)>]

各sessionごとの時間の差の特徴をいろいろ作ってみる

In [48]:
# Combine the time_since_previous transform with several aggregations so that
# each customer gets summary statistics of the gaps between their sessions.
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_entity="customers",
    agg_primitives=["mean", "max", "min", "std", "skew"],
    trans_primitives=["time_since_previous"]
)
print(feature_matrix.shape)

(5, 32)


In [49]:
# Show only the five aggregates (mean, max, min, std, skew) of the
# per-customer time-since-previous-session feature.
feature_matrix[[
    "MEAN(sessions.time_since_previous_by_customer_id)",
    "MAX(sessions.time_since_previous_by_customer_id)",
    "MIN(sessions.time_since_previous_by_customer_id)",
    "STD(sessions.time_since_previous_by_customer_id)",
    "SKEW(sessions.time_since_previous_by_customer_id)"
]]

Unnamed: 0_level_0,MEAN(sessions.time_since_previous_by_customer_id),MAX(sessions.time_since_previous_by_customer_id),MIN(sessions.time_since_previous_by_customer_id),STD(sessions.time_since_previous_by_customer_id),SKEW(sessions.time_since_previous_by_customer_id)
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,3305.714286,7345.0,1040.0,2046.357391,1.438363
2,4907.5,13325.0,520.0,5229.127795,0.996087
3,5096.0,10075.0,1170.0,4084.82466,0.40709
4,2516.428571,6435.0,650.0,1975.727111,1.445854
5,5577.0,16120.0,2080.0,5949.613013,2.132658


featuretoolsの中にある、primitives一覧 : **ft.list_primitives()**

In [50]:
print(ft.list_primitives().shape)

(62, 3)


In [51]:
import pandas as pd
# Widen columns so the long primitive descriptions are shown in full, and
# raise the row limit to 62, the row count reported by
# ft.list_primitives().shape, so the whole table can be displayed.
pd.set_option("display.max_colwidth", 100)
pd.set_option("display.max_rows", 62)

In [52]:
ft.list_primitives()

Unnamed: 0,name,type,description
0,min,aggregation,Finds the minimum non-null value of a numeric feature.
1,sum,aggregation,Sums elements of a numeric or boolean feature.
2,std,aggregation,Finds the standard deviation of a numeric feature ignoring null values.
3,trend,aggregation,Calculates the slope of the linear trend of variable overtime.
4,skew,aggregation,Computes the skewness of a data set.
5,time_since_last,aggregation,Time since last related instance.
6,count,aggregation,Counts the number of non null values.
7,num_unique,aggregation,Returns the number of unique categorical variables.
8,n_most_common,aggregation,Finds the N most common elements in a categorical feature.
9,median,aggregation,Finds the median value of any feature with well-ordered values.


In [53]:
# Restore pandas' own default row limit instead of hard-coding a value
# (the previous literal, 50, is not the pandas default of 60).
pd.reset_option("display.max_rows")

自分で設定するcustom featureの作り方を説明する

- Aggregation か Transform かを決める
- input と output のデータ形式を定義する
- 処理を書く
- 特徴名をアノテートする

例として、シンプルなtransformとaggregation primitivesを作ってみる

In [54]:
from featuretools.primitives import make_agg_primitive, make_trans_primitive
from featuretools.variable_types import Text, Numeric

In [55]:
def absolute(column):
    """Return ``abs(column)``.

    Delegates to the built-in ``abs``, so it works on anything that supports
    it (a plain number, or a pandas Series element-wise).
    """
    magnitudes = abs(column)
    return magnitudes
# Wrap `absolute` as a transform primitive declared to take one Numeric input
# and return a Numeric.
Absolute = make_trans_primitive(
    function=absolute,
    input_types=[Numeric],
    return_type=Numeric
)

In [56]:
def maximum(column):
    """Return the largest non-missing value in ``column``.

    The built-in ``max`` compares elements pairwise, so a NaN in the data makes
    the answer depend on element order, and an empty input raises
    ``ValueError``. Reducing through pandas skips missing values instead.

    Args:
        column: values to aggregate (a pandas Series or any sequence of numbers).

    Returns:
        The maximum of the non-missing values, or NaN if there are none.
    """
    return pd.Series(column).max()
# Wrap `maximum` as an aggregation primitive declared to take one Numeric
# input and return a Numeric.
Maximum = make_agg_primitive(
    function=maximum,
    input_types=[Numeric],
    return_type=Numeric
)

In [57]:
# Build session-level features using only the two custom primitives.
# max_depth=2 allows them to stack, e.g. MAXIMUM(transactions.ABSOLUTE(amount)).
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_entity="sessions",
    agg_primitives=[Maximum],
    trans_primitives=[Absolute],
    max_depth=2
)

In [58]:
feature_matrix[["customers.MAXIMUM(transactions.amount)", "MAXIMUM(transactions.ABSOLUTE(amount))"]].head(10)

Unnamed: 0_level_0,customers.MAXIMUM(transactions.amount),MAXIMUM(transactions.ABSOLUTE(amount))
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,146.81,141.66
2,149.02,135.25
3,149.95,147.73
4,139.43,129.0
5,149.95,139.2
6,139.43,139.23
7,149.15,146.31
8,149.95,143.85
9,139.43,132.72
10,146.81,142.44


別の例で、word count特徴を作ってみる

公式の実行例とはデータが異なり、このデータでは動かないようなので、ここでは定義のみ示す

In [59]:
def word_count(column):
    """Count the whitespace-separated words in every entry of ``column``.

    Returns a list holding one count per entry, in the order of ``column``.
    """
    return [len(text.split()) for text in column]
# Wrap `word_count` as a transform primitive declared to take one Text input
# and return a Numeric.
WordCount = make_trans_primitive(
    function=word_count,
    input_types=[Text],
    return_type=Numeric
)

複数のInput Typeに基づいた特徴も作れる

In [60]:
from featuretools.variable_types import Datetime, Timedelta, Variable

In [61]:
def mean_sunday(numeric, datetime):
    """Average the values of ``numeric`` whose timestamp falls on a Sunday.

    ``numeric`` and ``datetime`` are paired position by position. Missing
    values are ignored by the mean; if no row falls on a Sunday the result
    is NaN.
    """
    sunday = 6  # pandas numbers weekdays Monday=0 ... Sunday=6
    weekday_codes = pd.DatetimeIndex(datetime).weekday.values
    paired = pd.DataFrame({'numeric': numeric, 'time': weekday_codes})
    on_sunday = paired['time'] == sunday
    return paired.loc[on_sunday, 'numeric'].mean()
# Wrap `mean_sunday` as an aggregation primitive with two inputs, a Numeric
# and a Datetime, in the same order as the function's parameters.
MeanSunday = make_agg_primitive(
    function=mean_sunday,
    input_types=[Numeric, Datetime],
    return_type=Numeric
)

In [62]:
# Use only the custom MeanSunday aggregation: no transform primitives, depth 1.
# NOTE(review): the rows displayed below are NaN, presumably because none of
# the demo transactions fall on a Sunday; confirm against the data.
feature_matrix, features = ft.dfs(
    entityset=es,
    target_entity="sessions",
    agg_primitives=[MeanSunday],
    trans_primitives=[],
    max_depth=1
)

In [63]:
feature_matrix.head(3)

Unnamed: 0_level_0,customer_id,device,"MEAN_SUNDAY(transactions.amount, transaction_time)",customers.zip_code
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2,desktop,,13244
2,5,mobile,,60091
3,4,mobile,,60091


# Handling Time

featuretoolsは時間的な特徴を扱うのが得意

In [64]:
es_mc = ft.demo.load_mock_customer(return_entityset=True, random_seed=0)

In [65]:
es_mc

Entityset: transactions
  Entities:
    transactions [Rows: 500, Columns: 5]
    products [Rows: 5, Columns: 2]
    sessions [Rows: 35, Columns: 4]
    customers [Rows: 5, Columns: 4]
  Relationships:
    transactions.product_id -> products.product_id
    transactions.session_id -> sessions.session_id
    sessions.customer_id -> customers.customer_id

In [66]:
es_mc['transactions'].df.head(3)

Unnamed: 0,transaction_id,session_id,transaction_time,amount,product_id
298,298,1,2014-01-01 00:00:00,127.64,5
2,2,1,2014-01-01 00:01:05,109.48,2
308,308,1,2014-01-01 00:02:10,95.06,3


In [67]:
es_mc['customers'].df.head(3)

Unnamed: 0,customer_id,join_date,date_of_birth,zip_code
5,5,2010-07-17 05:27:50,1984-07-28,60091
4,4,2011-04-08 20:08:14,2006-08-15,60091
1,1,2011-04-17 10:48:33,1994-07-18,60091


In [68]:
# Cutoff table: one row per customer, giving the timestamp to use as that
# customer's cutoff plus a label column.
ct = pd.DataFrame(
    {
        'customer_id': [1, 2, 3],
        'time': pd.to_datetime(['2014-1-1 04:00', '2014-1-1 04:00', '2014-1-1 04:00']),
        'label': [True, True, False],
    }
)

**cutoff** 時間より前についての各特徴量を計算してくれる

In [69]:
# Compute customer features as of each row's cutoff time in `ct`.
# cutoff_time_in_index=True puts the cutoff time into the result's index next
# to customer_id; the `label` column of `ct` is carried into the output.
fm, features = ft.dfs(
    entityset=es_mc,
    target_entity='customers',
    cutoff_time=ct,
    cutoff_time_in_index=True
)

In [70]:
fm

Unnamed: 0_level_0,Unnamed: 1_level_0,zip_code,COUNT(sessions),NUM_UNIQUE(sessions.device),MODE(sessions.device),SUM(transactions.amount),STD(transactions.amount),MAX(transactions.amount),SKEW(transactions.amount),MIN(transactions.amount),MEAN(transactions.amount),...,NUM_UNIQUE(sessions.DAY(session_start)),NUM_UNIQUE(sessions.YEAR(session_start)),NUM_UNIQUE(sessions.MONTH(session_start)),NUM_UNIQUE(sessions.WEEKDAY(session_start)),MODE(sessions.MODE(transactions.product_id)),MODE(sessions.DAY(session_start)),MODE(sessions.YEAR(session_start)),MODE(sessions.MONTH(session_start)),MODE(sessions.WEEKDAY(session_start)),label
customer_id,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,2014-01-01 04:00:00,60091,4,3,tablet,4958.19,42.309717,139.23,-0.006928,5.81,74.002836,...,1,1,1,1,4,1,2014,1,2,True
2,2014-01-01 04:00:00,13244,4,2,desktop,4150.3,39.289512,146.81,-0.134786,12.07,84.7,...,1,1,1,1,2,1,2014,1,2,True
3,2014-01-01 04:00:00,13244,1,1,tablet,941.87,47.264797,146.31,0.618455,8.19,62.791333,...,1,1,1,1,1,1,2014,1,2,False


**Flights**データでもやってみる。レコードが時間にわたって格納されているもの

In [72]:
es_flight = ft.demo.load_flight(nrows=100)

Downloading data from s3...


In [73]:
es_flight

Entityset: Flight Data
  Entities:
    trip_logs [Rows: 100, Columns: 22]
    flights [Rows: 13, Columns: 9]
    airlines [Rows: 1, Columns: 1]
    airports [Rows: 6, Columns: 3]
  Relationships:
    trip_logs.flight_id -> flights.flight_id
    flights.carrier -> airlines.carrier
    flights.dest -> airports.dest

In [88]:
es_flight['trip_logs'].df.head(3)

Unnamed: 0,trip_log_id,flight_date,scheduled_dep_time,dep_delay,taxi_out,taxi_in,arr_delay,scheduled_elapsed_time,air_time,distance,carrier_delay,weather_delay,national_airspace_delay,security_delay,late_aircraft_delay,dep_time,arr_time,scheduled_arr_time,time_index,flight_id,cancelled,diverted
82,82,2017-01-01,2017-01-01 06:38:00,-5.0,12.0,6.0,-6.0,6060000000000,82.0,507.0,0.0,0.0,0.0,0.0,0.0,2017-01-01 06:33:00,2017-01-01 08:13:00,2017-01-01 08:19:00,2016-09-03 06:38:00,AA-495:TPA->CLT,0.0,0.0
92,92,2017-01-01,2017-01-01 07:00:00,-6.0,28.0,15.0,5.0,12180000000000,171.0,1067.0,0.0,0.0,0.0,0.0,0.0,2017-01-01 06:54:00,2017-01-01 10:28:00,2017-01-01 10:23:00,2016-09-03 07:00:00,AA-496:PIT->DFW,0.0,0.0
46,46,2017-01-01,2017-01-01 09:25:00,-2.0,18.0,8.0,-3.0,4620000000000,50.0,226.0,0.0,0.0,0.0,0.0,0.0,2017-01-01 09:23:00,2017-01-01 10:39:00,2017-01-01 10:42:00,2016-09-03 09:25:00,AA-495:CLT->ATL,0.0,0.0


ややこしくなってきたのでスキップ

# Tuning Deep Feature Synthesis

In [119]:
es = ft.demo.load_mock_customer(return_entityset=True)

In [120]:
es

Entityset: transactions
  Entities:
    transactions [Rows: 500, Columns: 5]
    products [Rows: 5, Columns: 2]
    sessions [Rows: 35, Columns: 4]
    customers [Rows: 5, Columns: 4]
  Relationships:
    transactions.product_id -> products.product_id
    transactions.session_id -> sessions.session_id
    sessions.customer_id -> customers.customer_id

## Using “Seed Features”
ドメイン知識に応じた特徴設計をするために、**seed_features**を使う

In [121]:
# Seed feature: a boolean flag on transactions, true where amount exceeds 125.
expensive_purchase = ft.Feature(es["transactions"]["amount"]) > 125

In [122]:
 feature_matrix, feature_defs = ft.dfs(
     entityset=es, 
     target_entity="customers",
     agg_primitives=["percent_true"],
     seed_features=[expensive_purchase]
 )
print(feature_matrix.shape)

(5, 10)


In [123]:
feature_matrix[['PERCENT_TRUE(transactions.amount > 125)']]

Unnamed: 0_level_0,PERCENT_TRUE(transactions.amount > 125)
customer_id,Unnamed: 1_level_1
1,0.119048
2,0.129032
3,0.182796
4,0.220183
5,0.227848


## Add “interesting” values to variables

手がかり = interesting を持つ特徴量を使って条件付けした特徴設計をするときに便利な機能

ここでは、deviceごとに特徴量を計算したいという例を考える

In [124]:
import pprint

In [125]:
# Baseline without any conditions: count and avg_time_between aggregated per
# customer. The definitions are pretty-printed for comparison with the
# conditioned run further down.
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_entity="customers",
    agg_primitives=["count", "avg_time_between"],
    trans_primitives=[]
)
print(feature_matrix.shape)
pprint.pprint(feature_defs)

(5, 5)
[<Feature: zip_code>,
 <Feature: COUNT(sessions)>,
 <Feature: AVG_TIME_BETWEEN(sessions.session_start)>,
 <Feature: COUNT(transactions)>,
 <Feature: AVG_TIME_BETWEEN(transactions.transaction_time)>]


In [132]:
es["sessions"].df["device"].unique()

array(['desktop', 'mobile', 'tablet'], dtype=object)

In [133]:
es["sessions"]["device"].interesting_values = ["desktop", "mobile", "tablet"] # condition on each value found in device

In [134]:
# Same aggregations as the baseline, now also listed in where_primitives so
# they are additionally computed per interesting value of `device`. This
# produces the "... WHERE device = <value>" features in the output.
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_entity="customers",
    agg_primitives=["count", "avg_time_between"],
    where_primitives=["count", "avg_time_between"],
    trans_primitives=[]
)
print(feature_matrix.shape)
pprint.pprint(feature_defs)

(5, 11)
[<Feature: zip_code>,
 <Feature: COUNT(sessions)>,
 <Feature: AVG_TIME_BETWEEN(sessions.session_start)>,
 <Feature: COUNT(transactions)>,
 <Feature: AVG_TIME_BETWEEN(transactions.transaction_time)>,
 <Feature: COUNT(sessions WHERE device = mobile)>,
 <Feature: COUNT(sessions WHERE device = tablet)>,
 <Feature: COUNT(sessions WHERE device = desktop)>,
 <Feature: AVG_TIME_BETWEEN(sessions.session_start WHERE device = mobile)>,
 <Feature: AVG_TIME_BETWEEN(sessions.session_start WHERE device = tablet)>,
 <Feature: AVG_TIME_BETWEEN(sessions.session_start WHERE device = desktop)>]


条件づけされた値ごとに特徴が計算されている

In [136]:
feature_matrix

Unnamed: 0_level_0,zip_code,COUNT(sessions),AVG_TIME_BETWEEN(sessions.session_start),COUNT(transactions),AVG_TIME_BETWEEN(transactions.transaction_time),COUNT(sessions WHERE device = mobile),COUNT(sessions WHERE device = tablet),COUNT(sessions WHERE device = desktop),AVG_TIME_BETWEEN(sessions.session_start WHERE device = mobile),AVG_TIME_BETWEEN(sessions.session_start WHERE device = tablet),AVG_TIME_BETWEEN(sessions.session_start WHERE device = desktop)
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,60091,8,3305.714286,126,192.92,3,3,2,11570.0,8807.5,7150.0
2,13244,7,4907.5,93,328.532609,2,2,3,1690.0,5330.0,6890.0
3,13244,6,5096.0,93,287.554348,1,1,4,,,4745.0
4,60091,8,2516.428571,109,168.518519,4,1,3,3336.666667,,4127.5
5,60091,6,5577.0,79,363.333333,3,1,2,13942.5,,9685.0


## Encoding categorical features

one hot encodingをやってみる

他にどんなencodingがあるか

In [139]:
# Build a small customer feature matrix (mode aggregation only, depth 1) to
# serve as input for the encoding step.
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_entity="customers",
    agg_primitives=["mode"],
    max_depth=1,
)

In [140]:
feature_matrix

Unnamed: 0_level_0,zip_code,MODE(sessions.device),DAY(join_date),DAY(date_of_birth),YEAR(join_date),YEAR(date_of_birth),MONTH(join_date),MONTH(date_of_birth),WEEKDAY(join_date),WEEKDAY(date_of_birth)
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,60091,mobile,17,18,2011,1994,4,7,6,0
2,13244,desktop,15,18,2012,1986,4,8,6,0
3,13244,desktop,13,21,2011,2003,8,11,5,4
4,60091,mobile,8,15,2011,2006,4,8,4,1
5,60091,mobile,17,28,2010,1984,7,7,5,5


In [145]:
# One-hot encode the feature matrix. The call returns the encoded matrix
# together with the matching list of encoded feature definitions.
feature_matrix_enc, features_enc = ft.encode_features(feature_matrix, feature_defs)

In [146]:
feature_matrix_enc

Unnamed: 0_level_0,zip_code = 60091,zip_code = 13244,zip_code is unknown,MODE(sessions.device) = mobile,MODE(sessions.device) = desktop,MODE(sessions.device) is unknown,DAY(join_date) = 17,DAY(join_date) = 15,DAY(join_date) = 13,DAY(join_date) = 8,DAY(join_date) is unknown,DAY(date_of_birth) = 18,DAY(date_of_birth) = 28,DAY(date_of_birth) = 21,DAY(date_of_birth) = 15,DAY(date_of_birth) is unknown,YEAR(join_date) = 2011,YEAR(join_date) = 2012,YEAR(join_date) = 2010,YEAR(join_date) is unknown,YEAR(date_of_birth) = 2006,YEAR(date_of_birth) = 2003,YEAR(date_of_birth) = 1994,YEAR(date_of_birth) = 1986,YEAR(date_of_birth) = 1984,YEAR(date_of_birth) is unknown,MONTH(join_date) = 4,MONTH(join_date) = 8,MONTH(join_date) = 7,MONTH(join_date) is unknown,MONTH(date_of_birth) = 8,MONTH(date_of_birth) = 7,MONTH(date_of_birth) = 11,MONTH(date_of_birth) is unknown,WEEKDAY(join_date) = 6,WEEKDAY(join_date) = 5,WEEKDAY(join_date) = 4,WEEKDAY(join_date) is unknown,WEEKDAY(date_of_birth) = 0,WEEKDAY(date_of_birth) = 5,WEEKDAY(date_of_birth) = 4,WEEKDAY(date_of_birth) = 1,WEEKDAY(date_of_birth) is unknown
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1
1,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0
2,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0
3,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0
4,1,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0
5,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0


# Improving Computational Performance

計算時間短縮Tips。省略