In [1]:
# Load the IPython autoreload extension so edited modules can be re-imported.
%load_ext autoreload
# NOTE(review): a bare `%autoreload` reloads modules once, right now;
# `%autoreload 2` is the mode that reloads before every cell execution —
# confirm which one was intended.
%autoreload

import pandas as pd
import numpy as np

# Display options only: two decimals for floats in DataFrame output,
# four digits of precision for NumPy array output.
pd.options.display.float_format = "{:.2f}".format
np.set_printoptions(precision=4)

import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

import featuretools as ft

In [2]:
# Load the three raw tables (train labels, member profiles, transactions).
# The directory is named once so it can be changed in a single place.
DATA_DIR = '/home/dissertation/data'

train = pd.read_csv(f'{DATA_DIR}/train_v2.csv',
                    dtype={'msno': 'category'})

# 'registered_via' used to appear twice in this dict (np.uint8, then
# 'category'); only the last entry of a duplicated key survives, so the
# effective dtype was already 'category'. The dead entry is removed.
members = pd.read_csv(f'{DATA_DIR}/members_v3.csv',
                      dtype={'gender': str,
                             'city': 'category',
                             'registered_via': 'category'})

# Parse YYYYMMDD integers into timestamps. Unparseable values become NaT
# instead of the whole call silently returning the unparsed input
# (errors='ignore' did the latter and is deprecated in recent pandas).
members['registration_init_time_dt'] = pd.to_datetime(
    members['registration_init_time'], format='%Y%m%d', errors='coerce')

# Next load in the transactions data.
# * The categorical column is 'payment_method_id' (the name used when the
#   entity is created); the previous key 'payment_method' matched no column.
# * Money / plan-length columns use a signed 32-bit integer: uint8 cannot
#   represent values above 255 and wraps on subtraction
#   (NOTE(review): actual value ranges are not visible here — confirm).
# * `np.bool` was only an alias of the builtin `bool` and has been removed
#   from NumPy, so the builtin is used directly.
transactions = pd.read_csv(f'{DATA_DIR}/transactions.csv',
                           dtype={'payment_method_id': 'category',
                                  'payment_plan_days': np.int32,
                                  'plan_list_price': np.int32,
                                  'actual_amount_paid': np.int32,
                                  'is_auto_renew': bool,
                                  'is_cancel': bool})


In [6]:
# Inner-join the training rows with the member profiles on `msno`, then
# drop the `is_churn` column so it is not part of the entity data.
merged_input = (
    pd.merge(left=train, right=members, how='inner', on=['msno'])
    .drop(columns=['is_churn'])
)
merged_input.head()

Unnamed: 0,msno,city,bd,gender,registered_via,registration_init_time,registration_init_time_dt
0,ugx0CjOMzazClkFzU2xasmDZaoIqOUAZPsH1q0teWCg=,5,28,male,3,20131223,2013-12-23
1,f/NmvEzHfhINFEYZTR05prUdr+E+3+oewvweYz9cCQE=,13,20,male,3,20131223,2013-12-23
2,zLo9f73nGGT1p21ltZC3ChiRnAVvgibMyazbCxvWPcg=,13,18,male,3,20131227,2013-12-27
3,8iF/+8HY8lJKFrTc7iR9ZYGCG2Ecrogbc2Vy5YhsfhQ=,1,0,,7,20140109,2014-01-09
4,K6fja4+jmoZ5xG6BypqX80Uw/XKpMgrEMdG2edFOxnA=,13,35,female,7,20140125,2014-01-25


In [7]:
# `vtypes` provides the variable-type classes passed to `variable_types=...`.
from featuretools import variable_types as vtypes

# Start from an empty EntitySet named 'customers'.
es = ft.EntitySet(id='customers')

In [8]:
# Register the merged train/members frame as the 'members' entity, keyed by
# `msno`, with `city` and `registered_via` forced to categorical.
# NOTE(review): the time index is the unparsed `registration_init_time`
# column rather than `registration_init_time_dt` — confirm this is intended.
member_variable_types = {
    'city': vtypes.Categorical,
    'registered_via': vtypes.Categorical,
}

es.entity_from_dataframe(
    entity_id='members',
    dataframe=merged_input,
    index='msno',
    time_index='registration_init_time',
    variable_types=member_variable_types,
)

Entityset: customers
  Entities:
    members [Rows: 860967, Columns: 7]
  Relationships:
    No relationships

In [9]:
# Derived per-transaction price features.

# Difference between listing price and price paid.
# Cast to a signed integer first: if the columns are stored as unsigned
# integers, `list - paid` wraps around to a large positive number whenever
# the amount paid exceeds the list price.
list_price = transactions['plan_list_price'].astype(np.int32)
amount_paid = transactions['actual_amount_paid'].astype(np.int32)
transactions['price_difference'] = list_price - amount_paid

# A plan length of 0 days would turn the per-day prices into +/-inf, which
# then poisons every SUM/MEAN/STD aggregate built on them. Treat such rows
# as missing (NaN) instead.
plan_days = transactions['payment_plan_days'].astype(np.float64).replace(0, np.nan)

# Planned price per day
transactions['planned_daily_price'] = transactions['plan_list_price'] / plan_days

# Actual price per day
transactions['daily_price'] = transactions['actual_amount_paid'] / plan_days

In [10]:
# Register the transactions frame as the 'transactions' entity.
# `make_index=True` has featuretools create the `transactions_index` column,
# since the frame has no unique row identifier of its own.
transaction_variable_types = {
    'payment_method_id': vtypes.Categorical,
    'is_auto_renew': vtypes.Boolean,
    'is_cancel': vtypes.Boolean,
}

es.entity_from_dataframe(
    entity_id='transactions',
    dataframe=transactions,
    index='transactions_index',
    make_index=True,
    time_index='transaction_date',
    variable_types=transaction_variable_types,
)

Entityset: customers
  Entities:
    members [Rows: 860967, Columns: 7]
    transactions [Rows: 21547746, Columns: 13]
  Relationships:
    No relationships

In [11]:
# Link the two entities on `msno`: members is the parent (one row per
# member), transactions is the child (many rows per member).
parent_key = es['members']['msno']
child_key = es['transactions']['msno']
r_member_transactions = ft.Relationship(parent_key, child_key)

es.add_relationships([r_member_transactions])

Entityset: customers
  Entities:
    members [Rows: 860967, Columns: 7]
    transactions [Rows: 21547746, Columns: 13]
  Relationships:
    transactions.msno -> members.msno

In [35]:
# Preview run: `features_only=True` returns the feature definitions without
# computing the feature matrix.
# NOTE(review): featuretools applies `where_primitives` only to variables
# that have interesting values set; none are set in the visible cells —
# confirm whether this argument has any effect here.
feature_defs = ft.dfs(
    entityset=es,
    target_entity='members',
    max_depth=2,
    where_primitives=['sum', 'mean'],
    features_only=True,
)

In [36]:
# Report how many feature definitions the last dfs call produced.
n_features = len(feature_defs)
print(f'This will generate {n_features} features.')

This will generate 62 features.


In [32]:
import random

# Seed the global generator so the displayed sample is reproducible.
random.seed(42)

# Show ten randomly chosen feature definitions.
random.sample(feature_defs, k=10)

[<Feature: MIN(transactions.transaction_date)>,
 <Feature: SUM(transactions.actual_amount_paid)>,
 <Feature: gender>,
 <Feature: MEAN(transactions.actual_amount_paid)>,
 <Feature: STD(transactions.membership_expire_date)>,
 <Feature: STD(transactions.actual_amount_paid)>,
 <Feature: STD(transactions.plan_list_price)>,
 <Feature: SUM(transactions.transaction_date)>,
 <Feature: DAY(registration_init_time_dt)>,
 <Feature: SUM(transactions.plan_list_price)>]

In [15]:
# Catalogue of available primitives, split by kind.
all_p = ft.list_primitives()
trans_p = all_p[all_p['type'].eq('transform')].copy()
agg_p = all_p[all_p['type'].eq('aggregation')].copy()

# Widen text columns so primitive descriptions are not truncated.
pd.set_option('display.max_colwidth', 100)
agg_p.head()

Unnamed: 0,name,type,description
0,median,aggregation,Determines the middlemost number in a list of values.
1,n_most_common,aggregation,Determines the `n` most common elements.
2,num_true,aggregation,Counts the number of `True` values.
3,mean,aggregation,Computes the average for a list of values.
4,max,aggregation,"Calculates the highest value, ignoring `NaN` values."


In [16]:
# Aggregation primitives to pass to ft.dfs (order preserved).
agg_primitives = [
    'sum',
    'time_since_last',
    'avg_time_between',
    'all',
    'mode',
    'num_unique',
    'min',
    'last',
    'mean',
    'percent_true',
    'max',
    'std',
    'count',
]

In [21]:
# Transform primitives to pass to ft.dfs (order preserved).
trans_primitives = [
    'cum_sum',
    'day',
    'month',
    'diff',
    'time_since_previous',
]

In [19]:
# Primitives to use for conditional ("where") features (order preserved).
where_primitives = [
    'sum',
    'mean',
    'percent_true',
    'all',
    'any',
]

In [23]:
# Preview run with the hand-picked primitive lists; `features_only=True`
# returns the feature definitions without computing the feature matrix.
feature_defs = ft.dfs(
    entityset=es,
    target_entity='members',
    max_depth=2,
    agg_primitives=agg_primitives,
    trans_primitives=trans_primitives,
    where_primitives=where_primitives,
    cutoff_time_in_index=True,
    features_only=True,
)

In [37]:
# Report how many feature definitions are currently held in `feature_defs`.
n_features = len(feature_defs)
print(f'This will generate {n_features} features.')

This will generate 62 features.


In [33]:
# Show fifteen randomly chosen feature definitions (draws from the global
# `random` generator, so the result depends on its current state).
random.sample(feature_defs, k=15)

[<Feature: MIN(transactions.planned_daily_price)>,
 <Feature: MEAN(transactions.actual_amount_paid)>,
 <Feature: MODE(transactions.payment_method_id)>,
 <Feature: SKEW(transactions.price_difference)>,
 <Feature: SUM(transactions.payment_plan_days)>,
 <Feature: MIN(transactions.payment_plan_days)>,
 <Feature: MAX(transactions.planned_daily_price)>,
 <Feature: registration_init_time>,
 <Feature: gender>,
 <Feature: YEAR(registration_init_time_dt)>,
 <Feature: STD(transactions.payment_plan_days)>,
 <Feature: STD(transactions.plan_list_price)>,
 <Feature: SKEW(transactions.transaction_date)>,
 <Feature: MIN(transactions.plan_list_price)>,
 <Feature: COUNT(transactions)>]

In [None]:
from timeit import default_timer as timer

# Full DFS run: computes the feature matrix as well as the definitions and
# reports the wall-clock time taken.
# This call does not pass the `agg_primitives` / `trans_primitives` /
# `where_primitives` lists; it uses where_primitives=['sum', 'mean'] only.
start = timer()
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_entity='members',
    where_primitives=['sum', 'mean'],
    max_depth=2,
    features_only=False,
    cutoff_time_in_index=True,
    chunk_size=1000,
    n_jobs=1,
    verbose=1,
)
end = timer()

elapsed_seconds = round(end - start)
print(f'{elapsed_seconds} seconds elapsed.')

Built 62 features



Elapsed: 00:00 | Remaining: ? | Progress:   0%|          | Calculated: 0/861 chunks[A[A[A


Elapsed: 00:05 | Remaining: 1:13:14 | Progress:   0%|          | Calculated: 1/861 chunks[A[A[A


Elapsed: 00:08 | Remaining: 1:04:50 | Progress:   0%|          | Calculated: 2/861 chunks[A[A[A


Elapsed: 00:11 | Remaining: 58:56 | Progress:   0%|          | Calculated: 3/861 chunks  [A[A[A


Elapsed: 00:14 | Remaining: 54:43 | Progress:   0%|          | Calculated: 4/861 chunks[A[A[A


Elapsed: 00:17 | Remaining: 51:46 | Progress:   1%|          | Calculated: 5/861 chunks[A[A[A


Elapsed: 00:20 | Remaining: 49:39 | Progress:   1%|          | Calculated: 6/861 chunks[A[A[A


Elapsed: 00:24 | Remaining: 48:09 | Progress:   1%|          | Calculated: 7/861 chunks[A[A[A


Elapsed: 00:27 | Remaining: 47:03 | Progress:   1%|          | Calculated: 8/861 chunks[A[A[A


Elapsed: 00:30 | Remaining: 46:20 | Progress:   1%|          | Calculated: 9/861 chunk

Elapsed: 04:20 | Remaining: 41:53 | Progress:   9%|▉         | Calculated: 81/861 chunks[A[A[A


Elapsed: 04:23 | Remaining: 41:39 | Progress:  10%|▉         | Calculated: 82/861 chunks[A[A[A


Elapsed: 04:26 | Remaining: 41:31 | Progress:  10%|▉         | Calculated: 83/861 chunks[A[A[A


Elapsed: 04:29 | Remaining: 41:17 | Progress:  10%|▉         | Calculated: 84/861 chunks[A[A[A


Elapsed: 04:33 | Remaining: 41:18 | Progress:  10%|▉         | Calculated: 85/861 chunks[A[A[A


Elapsed: 04:36 | Remaining: 41:13 | Progress:  10%|▉         | Calculated: 86/861 chunks[A[A[A


Elapsed: 04:39 | Remaining: 41:07 | Progress:  10%|█         | Calculated: 87/861 chunks[A[A[A


Elapsed: 04:42 | Remaining: 40:54 | Progress:  10%|█         | Calculated: 88/861 chunks[A[A[A


Elapsed: 04:45 | Remaining: 40:59 | Progress:  10%|█         | Calculated: 89/861 chunks[A[A[A


Elapsed: 04:48 | Remaining: 40:58 | Progress:  10%|█         | Calculated: 90/861 chunks[A[A[A




Elapsed: 08:39 | Remaining: 37:27 | Progress:  19%|█▉        | Calculated: 162/861 chunks[A[A[A


Elapsed: 08:42 | Remaining: 37:25 | Progress:  19%|█▉        | Calculated: 163/861 chunks[A[A[A


Elapsed: 08:45 | Remaining: 37:21 | Progress:  19%|█▉        | Calculated: 164/861 chunks[A[A[A


Elapsed: 08:48 | Remaining: 37:23 | Progress:  19%|█▉        | Calculated: 165/861 chunks[A[A[A


Elapsed: 08:51 | Remaining: 37:20 | Progress:  19%|█▉        | Calculated: 166/861 chunks[A[A[A


Elapsed: 08:55 | Remaining: 37:12 | Progress:  19%|█▉        | Calculated: 167/861 chunks[A[A[A


Elapsed: 08:58 | Remaining: 37:02 | Progress:  20%|█▉        | Calculated: 168/861 chunks[A[A[A


Elapsed: 09:01 | Remaining: 36:57 | Progress:  20%|█▉        | Calculated: 169/861 chunks[A[A[A


Elapsed: 09:04 | Remaining: 36:58 | Progress:  20%|█▉        | Calculated: 170/861 chunks[A[A[A


Elapsed: 09:07 | Remaining: 36:56 | Progress:  20%|█▉        | Calculated: 171/861 chunks[

In [None]:
# Peek at the first five rows of the computed feature matrix.
feature_matrix.head(n=5)