# 目录

In [1]:
import pandas as pd
import numpy as np
import featuretools as ft

## 加载数据

In [2]:
%%time
clients = pd.read_csv('./clients.csv', parse_dates = ['joined'])
loans = pd.read_csv('./loans.csv', parse_dates = ['loan_start', 'loan_end'])
payments = pd.read_csv('./payments.csv', parse_dates = ['payment_date'])

Wall time: 1.1 s


In [3]:
%%time
clients.head()

Wall time: 17 ms


Unnamed: 0,client_id,joined,income,credit_score
0,46109,2002-04-16,172677,527
1,49545,2007-11-14,104564,770
2,41480,2013-03-11,122607,585
3,46180,2001-11-06,43851,562
4,25707,2006-10-06,211422,621


In [5]:
# Preview the first five rows of the loans table.
loans.head(n=5)

Unnamed: 0,client_id,loan_type,loan_amount,repaid,loan_id,loan_start,loan_end,rate
0,46109,home,13672,0,10243,2002-04-16,2003-12-20,2.15
1,46109,credit,9794,0,10984,2003-10-21,2005-07-17,1.25
2,46109,home,12734,1,10990,2006-02-01,2007-07-05,0.68
3,46109,cash,12518,1,10596,2010-12-08,2013-05-05,1.24
4,46109,credit,14049,1,11415,2010-07-07,2012-05-21,3.13


In [6]:
# Preview the first five rows of the payments table.
payments.head(n=5)

Unnamed: 0,loan_id,payment_amount,payment_date,missed
0,10243,2369,2002-05-31,1
1,10243,2439,2002-06-18,1
2,10243,2662,2002-06-29,0
3,10243,2268,2002-07-20,0
4,10243,2027,2002-07-31,1


In [4]:
# Describe every entity as the keyword arguments handed to
# EntitySet.entity_from_dataframe, in the order they are registered.
entity_specs = [
    # clients: this dataframe already has an index column and a time index column.
    dict(
        entity_id='clients',
        dataframe=clients,
        index='client_id',
        time_index='joined',
    ),
    # loans: 'repaid' is registered as a categorical variable.
    dict(
        entity_id='loans',
        dataframe=loans,
        variable_types={'repaid': ft.variable_types.Categorical},
        index='loan_id',
        time_index='loan_start',
    ),
    # payments: make_index=True requests a generated 'payment_id' index column;
    # 'missed' is registered as a categorical variable.
    dict(
        entity_id='payments',
        dataframe=payments,
        variable_types={'missed': ft.variable_types.Categorical},
        make_index=True,
        index='payment_id',
        time_index='payment_date',
    ),
]

# Create the entity set, then add the entities one by one.
es = ft.EntitySet(id='clients')
for spec in entity_specs:
    es = es.entity_from_dataframe(**spec)

# Display the entity set.
es

Entityset: clients
  Entities:
    clients [Rows: 25, Columns: 4]
    loans [Rows: 443, Columns: 8]
    payments [Rows: 3456, Columns: 5]
  Relationships:
    No relationships

In [8]:

# Add the relationships between entities (parent variable first, child variable second).

# clients -> loans, joined on client_id.
clients_key = es['clients']['client_id']
loans_client_key = es['loans']['client_id']
r_client_previous = ft.Relationship(clients_key, loans_client_key)
es = es.add_relationship(r_client_previous)

# loans -> payments, joined on loan_id.
loans_key = es['loans']['loan_id']
payments_loan_key = es['payments']['loan_id']
r_payments = ft.Relationship(loans_key, payments_loan_key)
es = es.add_relationship(r_payments)

# Display the entity set, which now lists both relationships.
es
# es.plot()  # left disabled, as in the original cell



Entityset: clients
  Entities:
    clients [Rows: 25, Columns: 4]
    loans [Rows: 443, Columns: 8]
    payments [Rows: 3456, Columns: 5]
  Relationships:
    loans.client_id -> clients.client_id
    payments.loan_id -> loans.loan_id

In [11]:
# Run deep feature synthesis for the 'clients' entity with the library's default
# primitives; ft.dfs returns the feature matrix and the list of feature definitions.
features, feature_names = ft.dfs(
    entityset=es,
    target_entity='clients',
)
features.head(n=5)

Unnamed: 0_level_0,income,credit_score,SUM(loans.loan_amount),SUM(loans.rate),STD(loans.loan_amount),STD(loans.rate),MAX(loans.loan_amount),MAX(loans.rate),SKEW(loans.loan_amount),SKEW(loans.rate),...,NUM_UNIQUE(loans.WEEKDAY(loan_end)),MODE(loans.MODE(payments.missed)),MODE(loans.DAY(loan_start)),MODE(loans.DAY(loan_end)),MODE(loans.YEAR(loan_start)),MODE(loans.YEAR(loan_end)),MODE(loans.MONTH(loan_start)),MODE(loans.MONTH(loan_end)),MODE(loans.WEEKDAY(loan_start)),MODE(loans.WEEKDAY(loan_end))
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
25707,211422,621,159279,69.54,4149.486062,2.484186,13913,9.44,-0.186352,0.73547,...,6,0,27,1,2010,2007,1,8,3,0
26326,227920,633,116321,40.28,4393.666631,2.057142,13464,6.73,0.149658,1.181651,...,5,0,6,6,2003,2005,4,7,5,2
26695,174532,680,140845,44.39,4196.462499,1.561659,14865,6.51,0.168879,0.896574,...,6,0,3,14,2003,2005,9,4,1,1
26945,214516,806,106889,42.83,4543.621769,1.619717,14593,5.65,0.174492,-0.002227,...,6,0,16,1,2002,2004,12,5,0,1
29841,38354,523,176634,62.01,4209.224171,2.122904,14837,6.76,-0.232215,0.055321,...,7,1,1,15,2005,2007,3,2,5,1


In [36]:
# Fetch the table of available primitives and widen pandas' column display so the
# description text is not cut off.
primitives = ft.list_primitives()
pd.set_option('display.max_colwidth', 100)

# Show the first ten aggregation primitives.
is_aggregation = primitives['type'] == 'aggregation'
primitives.loc[is_aggregation].head(10)

Unnamed: 0,name,type,description
0,skew,aggregation,Computes the extent to which a distribution differs from a normal distribution.
1,mean,aggregation,Computes the average for a list of values.
2,percent_true,aggregation,Determines the percent of `True` values.
3,num_unique,aggregation,"Determines the number of distinct values, ignoring `NaN` values."
4,trend,aggregation,Calculates the trend of a variable over time.
5,last,aggregation,Determines the last value in a list.
6,n_most_common,aggregation,Determines the `n` most common elements.
7,max,aggregation,"Calculates the highest value, ignoring `NaN` values."
8,time_since_first,aggregation,Calculates the time elapsed since the first datetime (in seconds).
9,median,aggregation,Determines the middlemost number in a list of values.


In [47]:
# Show the first twenty transform primitives.
is_transform = primitives['type'] == 'transform'
primitives.loc[is_transform].head(20)

Unnamed: 0,name,type,description
20,not_equal,transform,Determines if values in one list are not equal to another list.
21,month,transform,Determines the month value of a datetime.
22,divide_numeric_scalar,transform,Divide each element in the list by a scalar.
23,is_weekend,transform,Determines if a date falls on a weekend.
24,equal_scalar,transform,Determines if values in a list are equal to a given scalar.
25,modulo_numeric_scalar,transform,Return the modulo of each element in the list by a scalar.
26,absolute,transform,Computes the absolute value of a number.
27,add_numeric,transform,Element-wise addition of two lists.
28,equal,transform,Determines if values in one list are equal to another list.
29,hour,transform,Determines the hour value of a datetime.


In [15]:
# Run deep feature synthesis again, this time naming the aggregation primitives
# (agg_primitives) and the transform primitives (trans_primitives) to use.
# NOTE(review): 'repaid' and 'missed' were registered as Categorical when the entity
# set was built — confirm that 'percent_true' still has inputs it can be applied to.
features, feature_names = ft.dfs(
    entityset=es,
    target_entity='clients',
    agg_primitives=['mean', 'max', 'percent_true', 'last'],
    trans_primitives=['month'],
)
features.head(n=5)

Unnamed: 0_level_0,income,credit_score,MEAN(loans.loan_amount),MEAN(loans.rate),MAX(loans.loan_amount),MAX(loans.rate),LAST(loans.loan_id),LAST(loans.loan_type),LAST(loans.loan_amount),LAST(loans.rate),...,MAX(payments.loans.loan_amount),MAX(payments.loans.rate),LAST(payments.loans.client_id),LAST(payments.loans.loan_type),LAST(payments.loans.loan_amount),LAST(payments.loans.rate),LAST(payments.loans.repaid),MONTH(LAST(loans.loan_start)),MONTH(LAST(loans.loan_end)),MONTH(LAST(payments.payment_date))
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
25707,211422,621,7963.95,3.477,13913,9.44,10363,home,2203,7.4,...,13913,9.44,25707,home,2203,7.4,0,2,8,11
26326,227920,633,7270.0625,2.5175,13464,6.73,11072,credit,5275,1.45,...,13464,6.73,26326,credit,5275,1.45,0,11,7,8
26695,174532,680,7824.722222,2.466111,14865,6.51,10985,other,13918,0.9,...,14865,6.51,26695,other,13918,0.9,1,10,10,3
26945,214516,806,7125.933333,2.855333,14593,5.65,11482,cash,9249,2.86,...,14593,5.65,26945,cash,9249,2.86,1,12,5,7
29841,38354,523,9813.0,3.445,14837,6.76,11188,home,7223,5.09,...,14837,6.76,29841,home,7223,5.09,1,6,2,1
