Tutorial from https://towardsdatascience.com/automated-feature-engineering-in-python-99baf11cc219

There are two main approaches for feature engineering:
* Transformation: derive a new value from an existing feature, e.g. take a substring or compute the log of a number
* Aggregation: compute some kind of statistic (min, max, avg, ...) over a group of related rows

In [5]:
import pandas as pd
import featuretools as ft

In [2]:
# Read the three raw tables from the local ./data directory, in the order
# clients, loans, payments.
clients, loans, payments = map(
    pd.read_csv,
    ('./data/clients.csv', './data/loans.csv', './data/payments.csv'),
)

In [7]:
# Preview the first five rows of the clients table.
clients.head(n=5)

Unnamed: 0,client_id,joined,income,credit_score
0,46109,2002-04-16,172677,527
1,49545,2007-11-14,104564,770
2,41480,2013-03-11,122607,585
3,46180,2001-11-06,43851,562
4,25707,2006-10-06,211422,621


In [8]:
# Preview the first five rows of the loans table.
loans.head(n=5)

Unnamed: 0,client_id,loan_type,loan_amount,repaid,loan_id,loan_start,loan_end,rate
0,46109,home,13672,0,10243,2002-04-16,2003-12-20,2.15
1,46109,credit,9794,0,10984,2003-10-21,2005-07-17,1.25
2,46109,home,12734,1,10990,2006-02-01,2007-07-05,0.68
3,46109,cash,12518,1,10596,2010-12-08,2013-05-05,1.24
4,46109,credit,14049,1,11415,2010-07-07,2012-05-21,3.13


In [9]:
# Preview the first five rows of the payments table.
payments.head(n=5)

Unnamed: 0,loan_id,payment_amount,payment_date,missed
0,10243,2369,2002-05-31,1
1,10243,2439,2002-06-18,1
2,10243,2662,2002-06-29,0
3,10243,2268,2002-07-20,0
4,10243,2027,2002-07-31,1


In [4]:
# Example of a manual aggregation with pandas: compute the mean, max and min
# loan amount per client, then left-join those statistics onto the clients
# table so that every client row is kept.
stats = clients.merge(
    (
        loans.groupby('client_id')['loan_amount']
        .agg(['mean', 'max', 'min'])
        # 'mean' -> 'mean_loan_amount', 'max' -> 'max_loan_amount', ...
        .add_suffix('_loan_amount')
    ),
    left_on='client_id',
    right_index=True,
    how='left',
)

stats.head(10)

Unnamed: 0,client_id,joined,income,credit_score,mean_loan_amount,max_loan_amount,min_loan_amount
0,46109,2002-04-16,172677,527,8951.6,14049,559
1,49545,2007-11-14,104564,770,10289.3,14971,3851
2,41480,2013-03-11,122607,585,7894.85,14399,811
3,46180,2001-11-06,43851,562,7700.85,14081,1607
4,25707,2006-10-06,211422,621,7963.95,13913,1212
5,39505,2011-10-14,153873,610,7424.05,14575,904
6,32726,2006-05-01,235705,730,6633.263158,14802,851
7,35089,2010-03-01,131176,771,6939.2,13194,773
8,35214,2003-08-08,95849,696,7173.555556,14767,667
9,48177,2008-06-09,190632,769,7424.368421,14740,659


An Entity is just a table (a pandas DataFrame).
An EntitySet is a collection of Entities together with the relationships between them.

In [10]:
# Build the entity set by starting from an empty EntitySet and registering
# the three DataFrames one after another. Each call's return value feeds the
# next call, exactly as the step-by-step reassignment of `es` would.
es = (
    ft.EntitySet(id='clients')
    # clients: keyed by client_id, with the join date as the time index
    .entity_from_dataframe(
        entity_id='clients',
        dataframe=clients,
        index='client_id',
        time_index='joined',
    )
    # loans: keyed by loan_id, with the loan start date as the time index
    .entity_from_dataframe(
        entity_id='loans',
        dataframe=loans,
        index='loan_id',
        time_index='loan_start',
    )
    # payments: the table has no id column of its own, so make_index=True
    # requests a new index named payments_id; `missed` is declared
    # categorical instead of being left to type inference
    .entity_from_dataframe(
        entity_id='payments',
        dataframe=payments,
        variable_types={'missed': ft.variable_types.Categorical},
        make_index=True,
        index='payments_id',
        time_index='payment_date',
    )
)

In [11]:
# Display the payments entity: its variables with their types, and its shape.
es['payments']

Entity: payments
  Variables:
    payments_id (dtype: index)
    loan_id (dtype: numeric)
    payment_amount (dtype: numeric)
    payment_date (dtype: datetime_time_index)
    missed (dtype: categorical)
  Shape:
    (Rows: 3456, Columns: 5)

In [13]:
# clients -> loans: one client has many loans, joined on client_id.
r_client_previous = ft.Relationship(
    es['clients']['client_id'],  # parent variable
    es['loans']['client_id'],    # child variable
)
es = es.add_relationship(r_client_previous)

# loans -> payments: one loan has many payments, joined on loan_id.
r_payments = ft.Relationship(
    es['loans']['loan_id'],      # parent variable
    es['payments']['loan_id'],   # child variable
)
es = es.add_relationship(r_payments)

# Show the entity set summary (entities and registered relationships).
es

Entityset: clients
  Entities:
    clients [Rows: 25, Columns: 4]
    loans [Rows: 443, Columns: 8]
    payments [Rows: 3456, Columns: 5]
  Relationships:
    loans.client_id -> clients.client_id
    payments.loan_id -> loans.loan_id

In [15]:
# Run deep feature synthesis on the entity set with `clients` as the target
# entity, using the listed aggregation and transformation primitives.
# The call returns a pair: the feature matrix (`features`) and a second value
# bound to `feature_names`.
# NOTE(review): the second value is presumably the list of generated feature
# definitions rather than plain strings - confirm against the featuretools docs.
features, feature_names = ft.dfs(
    entityset=es,
    target_entity='clients',
    agg_primitives=['mean', 'max', 'percent_true', 'last'],
    trans_primitives=['years', 'month', 'subtract', 'divide'],
)

features

Unnamed: 0_level_0,income,credit_score,MEAN(loans.loan_amount),MEAN(loans.repaid),MEAN(loans.rate),MAX(loans.loan_amount),MAX(loans.repaid),MAX(loans.rate),LAST(loans.loan_type),LAST(loans.loan_amount),...,MAX(loans.rate) / income - credit_score,MEAN(loans.loan_amount) / LAST(loans.rate),income / MEAN(loans.loan_amount),MAX(loans.rate) / MEAN(loans.loan_amount),MEAN(loans.loan_amount) / MAX(payments.payment_amount),MAX(loans.loan_amount) / MAX(payments.payment_amount),LAST(loans.repaid) / MAX(loans.rate),income / income - credit_score,MAX(loans.repaid) / LAST(payments.payment_amount),income / MEAN(payments.payment_amount)
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
25707,211422,621,7963.95,0.4,3.477,13913,1,9.44,home,2203,...,4.5e-05,1076.209459,26.547379,0.001185,2.945248,5.14534,0.0,1.002946,0.004149,179.3912
26326,227920,633,7270.0625,0.5,2.5175,13464,1,6.73,credit,5275,...,3e-05,5013.836207,31.350487,0.000926,2.735163,5.065463,0.0,1.002785,0.001073,195.34825
26695,174532,680,7824.722222,0.388889,2.466111,14865,1,6.51,other,13918,...,3.7e-05,8694.135802,22.305201,0.000832,2.668732,5.069918,0.15361,1.003911,0.00048,144.54788
26945,214516,806,7125.933333,0.4,2.855333,14593,1,5.65,cash,9249,...,2.6e-05,2491.585082,30.103565,0.000793,2.574398,5.272038,0.176991,1.003771,0.000626,193.349418
29841,38354,523,9813.0,0.555556,3.445,14837,1,6.76,home,7223,...,0.000179,1927.897839,3.908489,0.000689,3.386128,5.119738,0.147929,1.013825,0.00125,26.645208
32726,235705,730,6633.263158,0.578947,3.058947,14802,1,9.1,other,5325,...,3.9e-05,2343.909243,35.533793,0.001372,2.461322,5.492393,0.10989,1.003107,0.000987,249.608701
32885,58955,642,9920.4,0.533333,2.436,14162,1,9.11,other,11886,...,0.000156,1088.95719,5.942805,0.000918,4.095954,5.847234,0.0,1.01101,0.000524,42.216386
32961,230341,714,7882.235294,0.529412,3.930588,14784,1,9.14,cash,1693,...,4e-05,5398.791297,29.222802,0.00116,2.824162,5.297026,0.109409,1.003109,0.004484,221.683498
35089,131176,771,6939.2,0.6,3.5135,13194,1,7.63,other,773,...,5.9e-05,909.462647,18.90362,0.0011,2.707452,5.147874,0.131062,1.005912,0.009434,119.822963
35214,95849,696,7173.555556,0.444444,3.108333,14767,1,8.44,home,9389,...,8.9e-05,5123.968254,13.361436,0.001177,2.496018,5.138135,0.0,1.007315,0.000619,88.997282
