# Automated Feature Engineering (featuretools)

In [1]:
import featuretools as ft
import pandas as pd
from IPython.display import display
from feature_engine.creation import CyclicalFeatures
from feature_engine.imputation import AddMissingIndicator, ArbitraryNumberImputer
# from dask.distributed import LocalCluster
from featuretools.primitives import TimeSinceFirst, TimeSinceLast
from woodwork.logical_types import Boolean, BooleanNullable, Categorical, Unknown

## Create EntitySet

In [2]:
es = ft.EntitySet(id='client_data')

In [3]:
"""Clients"""

from utils import read_clients

CLIENTS_PATH = '../data/initial/CLIENTS.csv'
# CLIENTS_PATH = '../data/samples/CLIENTS_SAMPLE.csv'

clients_df = read_clients(CLIENTS_PATH, encode_bool=False)
clients_df = clients_df.astype({'client_id': str})
display(clients_df.info())

es.add_dataframe(
    clients_df,
    dataframe_name='clients',
    index='client_id',
    time_index='communication_month',
    # make_index=True,
    logical_types={
        'client_id': Unknown,
        'target': Boolean,
        'is_train': Boolean,
    }
)
display(es['clients'].ww.schema)
display(es)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36337 entries, 0 to 36336
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   client_id            36337 non-null  object        
 1   target               36337 non-null  bool          
 2   is_train             36337 non-null  bool          
 3   communication_month  36337 non-null  datetime64[ns]
dtypes: bool(2), datetime64[ns](1), object(1)
memory usage: 638.9+ KB


None

Unnamed: 0_level_0,Logical Type,Semantic Tag(s)
Column,Unnamed: 1_level_1,Unnamed: 2_level_1
client_id,Unknown,['index']
target,Boolean,[]
is_train,Boolean,[]
communication_month,Datetime,['time_index']


Entityset: client_data
  DataFrames:
    clients [Rows: 36337, Columns: 4]
  Relationships:
    No relationships

In [4]:
"""Transactions"""

from utils import read_transactions

TRANSACTIONS_PATH = '../data/initial/TRANSACTIONS.csv'
# TRANSACTIONS_PATH = '../data/samples/TRANSACTIONS_SAMPLE.csv'

transactions_df = read_transactions(TRANSACTIONS_PATH, encode_bool=False)
transactions_df = transactions_df.astype({'client_id': str})
transactions_df['tran_date_str'] = transactions_df['tran_date'].dt.date.astype(str)  # used later for feature generation
display(transactions_df.info())

es.add_dataframe(
    transactions_df,
    dataframe_name='transactions',
    index='transaction_id',
    time_index='tran_date',
    make_index=True,
    logical_types={
        'client_id': Unknown,
        'cat_c2': Categorical,
        'cat_c3': Categorical,
        'cat_c4': Categorical,
        'tran_date_str': Categorical,
        'fl_c6': Boolean,
        'fl_c7': Boolean,
        'fl_c8': Boolean,
        'fl_c9': Boolean,
        'fl_c10': Boolean,
        'fl_c11': Boolean,
        'fl_c12': Boolean,
        'fl_c13': Boolean,
        'fl_c14': Boolean,
        'fl_c15': Boolean,
    }
)
display(es['transactions'].ww.schema)

es.add_relationship('clients', 'client_id', 'transactions', 'client_id')
display(es)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15654626 entries, 0 to 15654625
Data columns (total 22 columns):
 #   Column         Dtype         
---  ------         -----         
 0   client_id      object        
 1   tran_date      datetime64[ns]
 2   cat_c2         int32         
 3   cat_c3         int32         
 4   cat_c4         int32         
 5   fl_c6          bool          
 6   fl_c7          bool          
 7   fl_c8          bool          
 8   fl_c9          bool          
 9   fl_c10         bool          
 10  fl_c11         bool          
 11  fl_c12         bool          
 12  fl_c13         bool          
 13  fl_c14         bool          
 14  fl_c15         bool          
 15  float_c16      float32       
 16  float_c17      float32       
 17  float_c18      float32       
 18  int_c19        int32         
 19  float_c20      float32       
 20  float_c21      float32       
 21  tran_date_str  object        
dtypes: bool(10), datetime64[ns](1), float32(

None

Unnamed: 0_level_0,Logical Type,Semantic Tag(s)
Column,Unnamed: 1_level_1,Unnamed: 2_level_1
transaction_id,Integer,['index']
client_id,Unknown,[]
tran_date,Datetime,['time_index']
cat_c2,Categorical,['category']
cat_c3,Categorical,['category']
cat_c4,Categorical,['category']
fl_c6,Boolean,[]
fl_c7,Boolean,[]
fl_c8,Boolean,[]
fl_c9,Boolean,[]


Entityset: client_data
  DataFrames:
    clients [Rows: 36337, Columns: 4]
    transactions [Rows: 15654626, Columns: 23]
  Relationships:
    transactions.client_id -> clients.client_id

In [5]:
"""App activity"""

from utils import preprocess_app_activity_data, read_app_activity

ACTIVITY_PATH = '../data/initial/APP_ACTIVITY.csv'
# ACTIVITY_PATH = '../data/samples/APP_ACTIVITY_SAMPLE.csv'

activities_df = read_app_activity(ACTIVITY_PATH, encode_bool=False)
activities_df = activities_df.astype({'client_id': str})
activities_df = preprocess_app_activity_data(activities_df)
activities_df['activity_date_str'] = activities_df['activity_date'].dt.date.astype(str)  # used later for feature generation
display(activities_df.info(show_counts=True))

es.add_dataframe(
    activities_df,
    dataframe_name='activities',
    index='activity_id',
    time_index='activity_date',
    make_index=True,
    logical_types={
        'client_id': Unknown,
        'cat_c3': Categorical,
        'cat_c4': Categorical,
        'cat_c5': Categorical,
        'cat_c6': Categorical,
        'cat_c9': Categorical,
        'activity_date_str': Categorical,
        'cat_c8': BooleanNullable,
        'cat_c10': BooleanNullable,
    }
)
display(es['activities'].ww.schema)

es.add_relationship('clients', 'client_id', 'activities', 'client_id')
display(es)

<class 'pandas.core.frame.DataFrame'>
Index: 17665187 entries, 0 to 17738591
Data columns (total 14 columns):
 #   Column             Non-Null Count     Dtype         
---  ------             --------------     -----         
 0   client_id          17665187 non-null  object        
 1   device_id          17665187 non-null  uint64        
 2   activity_date      17665187 non-null  datetime64[ns]
 3   cat_c3             17665187 non-null  int32         
 4   cat_c4             17665187 non-null  int32         
 5   cat_c5             17665187 non-null  int32         
 6   cat_c6             17665187 non-null  int32         
 7   cat_c8             17665187 non-null  boolean       
 8   cat_c9             17665187 non-null  int32         
 9   cat_c10            17665187 non-null  boolean       
 10  float_c11          17665187 non-null  float32       
 11  float_c12          17665187 non-null  float32       
 12  float_c14          17665187 non-null  float32       
 13  activity_date_s

None

Unnamed: 0_level_0,Logical Type,Semantic Tag(s)
Column,Unnamed: 1_level_1,Unnamed: 2_level_1
activity_id,Integer,['index']
client_id,Unknown,[]
device_id,Integer,['numeric']
activity_date,Datetime,['time_index']
cat_c3,Categorical,['category']
cat_c4,Categorical,['category']
cat_c5,Categorical,['category']
cat_c6,Categorical,['category']
cat_c8,BooleanNullable,[]
cat_c9,Categorical,['category']


Entityset: client_data
  DataFrames:
    clients [Rows: 36337, Columns: 4]
    transactions [Rows: 15654626, Columns: 23]
    activities [Rows: 17665187, Columns: 15]
  Relationships:
    transactions.client_id -> clients.client_id
    activities.client_id -> clients.client_id

In [6]:
"""Communications"""

from utils import read_communications

COMMS_PATH = '../data/initial/COMMUNICATIONS.csv'
# COMMS_PATH = '../data/samples/COMMUNICATIONS_SAMPLE.csv'

comms_df = read_communications(COMMS_PATH)
comms_df = comms_df.astype({'client_id': str})
# comms_df = preprocess_comm_data(comms)
# comms_df = encode_comm_categories(comms)
comms_df['contact_date_str'] = comms_df['contact_date'].dt.date.astype(str)  # used later for feature generation
display(comms_df.info(show_counts=True))

es.add_dataframe(
    comms_df,
    dataframe_name='comms',
    index='comm_id',
    time_index='contact_date',
    make_index=True,
    logical_types={
        'client_id': Unknown,
        'cat_c2': Categorical,
        'cat_c3': Categorical,
        'cat_c4': Categorical,
        'cat_c5': Categorical,
        'contact_date_str': Categorical,
    }
)
display(es['comms'].ww.schema)

es.add_relationship('clients', 'client_id', 'comms', 'client_id')
display(es)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19586922 entries, 0 to 19586921
Data columns (total 7 columns):
 #   Column            Non-Null Count     Dtype         
---  ------            --------------     -----         
 0   client_id         19586922 non-null  object        
 1   contact_date      19586922 non-null  datetime64[ns]
 2   cat_c2            19586922 non-null  category      
 3   cat_c3            19586922 non-null  int32         
 4   cat_c4            19586922 non-null  int32         
 5   cat_c5            19586917 non-null  category      
 6   contact_date_str  19586922 non-null  object        
dtypes: category(2), datetime64[ns](1), int32(2), object(2)
memory usage: 653.8+ MB


None

Unnamed: 0_level_0,Logical Type,Semantic Tag(s)
Column,Unnamed: 1_level_1,Unnamed: 2_level_1
comm_id,Integer,['index']
client_id,Unknown,[]
contact_date,Datetime,['time_index']
cat_c2,Categorical,['category']
cat_c3,Categorical,['category']
cat_c4,Categorical,['category']
cat_c5,Categorical,['category']
contact_date_str,Categorical,['category']


Entityset: client_data
  DataFrames:
    clients [Rows: 36337, Columns: 4]
    transactions [Rows: 15654626, Columns: 23]
    activities [Rows: 17665187, Columns: 15]
    comms [Rows: 19586922, Columns: 8]
  Relationships:
    transactions.client_id -> clients.client_id
    activities.client_id -> clients.client_id
    comms.client_id -> clients.client_id

## Deep Feature Synthesis

In [7]:
MIN_DATE_STR = '2024-12-01 00:00:00'
MAX_DATE_STR = '2025-09-01 00:00:00'

agg_primitives = [
    'count',
    'first', 'last', TimeSinceFirst(unit='days'), TimeSinceLast(unit='days'),  # for date
    'mean', 'std', 'min', 'max', 'sum',   # for numeric
    'num_unique', 'mode',  # for categorical
    'percent_true',  # for boolean
]

primitive_options = {}
for agg in agg_primitives:
    primitive_options[agg]= {'include_columns': {}}
    for df_name in ('transactions', 'activities', 'comms'):
         if agg not in ('count',) and isinstance(agg, str):
             primitive_options[agg]['include_columns'][df_name] = []

In [8]:
"""Transactions"""

df_name = 'transactions'

# Date
for prim in ['first', 'last']:
    primitive_options[prim]['include_columns'][df_name] += ['tran_date']

# Numeric
num_cols = ['float_c16', 'float_c17', 'float_c18', 'int_c19', 'float_c20', 'float_c21']
for prim in ['mean', 'std', 'min', 'max', 'sum']:
    primitive_options[prim]['include_columns'][df_name] += num_cols
    # primitive_options[prim] = {'include_columns': {df_name: num_cols}}

# Categorical
cat_cols = list(es[df_name].ww.select(Categorical).columns)
for prim in ['num_unique', 'mode']:
    # primitive_options['mode'] = {'include_columns': {df_name: cat_cols}}
    primitive_options[prim]['include_columns'][df_name] += cat_cols
primitive_options['mode']['include_columns'][df_name].remove('tran_date_str')  # delete redundant primitive

# Boolean
bool_cols = list(es[df_name].ww.select([Boolean, BooleanNullable]).columns)
# primitive_options['percent_true'] = {'include_columns': {df_name: bool_cols}}
primitive_options['percent_true']['include_columns'][df_name] += bool_cols


# Interesting values
interesting_values = {
    'int_c19': [-1, 1],
}
es.add_interesting_values(dataframe_name=df_name, values=interesting_values)

In [9]:
"""Activities"""

df_name = 'activities'

# Date
for prim in ['first', 'last']:
    primitive_options[prim]['include_columns'][df_name] += ['activity_date']

# Numeric
num_cols = ['float_c11', 'float_c12', 'float_c13', 'float_c14', 'float_c15', 'float_c16', 'float_c17']
for prim in ['mean', 'std', 'min', 'max', 'sum']:
    primitive_options[prim]['include_columns'][df_name] += num_cols

# Categorical
cat_cols = list(es[df_name].ww.select(Categorical).columns)
for prim in ['num_unique', 'mode']:
    primitive_options[prim]['include_columns'][df_name] += cat_cols
primitive_options['mode']['include_columns'][df_name].remove('activity_date_str')  # delete redundant primitive

# Boolean
bool_cols = list(es[df_name].ww.select([Boolean, BooleanNullable]).columns)
primitive_options['percent_true']['include_columns'][df_name] += bool_cols


# Interesting values
interesting_values = {
    'cat_c9': [1, 2],
}
es.add_interesting_values(dataframe_name=df_name, values=interesting_values)

In [10]:
"""Communications"""

df_name = 'comms'

# Date
for prim in ['first', 'last']:
    primitive_options[prim]['include_columns'][df_name] += ['contact_date']

# Categorical
cat_cols = list(es[df_name].ww.select(Categorical).columns)
for prim in ['num_unique', 'mode']:
    primitive_options[prim]['include_columns'][df_name] += cat_cols
primitive_options['mode']['include_columns'][df_name].remove('contact_date_str')  # delete redundant primitive

In [11]:
# cluster = LocalCluster()
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_dataframe_name='clients',
    # ignore_dataframes=['transactions', 'comms'],
    # cutoff_time=transactions_df['tran_date'].max(),
    cutoff_time=MAX_DATE_STR,
    agg_primitives=agg_primitives,
    trans_primitives=['day', 'month', 'weekday', 'is_weekend'],
    where_primitives=['mean', 'sum', 'count'],
    primitive_options=primitive_options,
    max_depth=1,
    verbose=True,
    features_only=False,
    n_jobs=1,
    # chunk_size=.05,
    # dask_kwargs={'cluster': cluster.scheduler.address},
    return_types='all',
)
feature_matrix_enc, features_enc = ft.encode_features(
    feature_matrix,
    feature_defs,
    top_n={
        # Transactions
        'MODE(transactions.cat_c2)': 10,
        'MODE(transactions.cat_c3)': 10,
        'MODE(transactions.cat_c4)': 5,

        # Activities
        'MODE(activities.cat_c3)': 3,
        'MODE(activities.cat_c4)': 2,
        'MODE(activities.cat_c5)': 1,
        'MODE(activities.cat_c6)': 7,
        'MODE(activities.cat_c9)': 2,

        # Communications
        'MODE(comms.cat_c2)': 10,
        'MODE(comms.cat_c3)': 3,
        'MODE(comms.cat_c4)': 3,
        'MODE(comms.cat_c5)': 3,
    },
    to_encode=[
        'MODE(transactions.cat_c2)', 'MODE(transactions.cat_c3)', 'MODE(transactions.cat_c4)',  # transactions
        'MODE(activities.cat_c3)', 'MODE(activities.cat_c4)', 'MODE(activities.cat_c5)', 'MODE(activities.cat_c6)', 'MODE(activities.cat_c9)',  # activities
        'MODE(comms.cat_c2)', 'MODE(comms.cat_c3)', 'MODE(comms.cat_c4)', 'MODE(comms.cat_c5)',  # communications
    ],
)
ft.save_features(features_enc, '../data/features/feature_definitions_v1.json')
feature_matrix_enc.to_csv('../data/features/features_auto_v1_raw.csv', index=False)

display(features_enc)
display(feature_matrix_enc.head())
display(feature_matrix_enc.info(verbose=True, show_counts=True))
display(feature_matrix_enc.describe())



Built 142 features
Elapsed: 00:23 | Progress:  28%|██▊       

  ).apply(wrap)
  ).agg(to_agg)
  ).agg(to_agg)
  ).agg(to_agg)
  ).agg(to_agg)
  ).agg(to_agg)


Elapsed: 00:43 | Progress:  43%|████▎     

  ).agg(to_agg)
  ).agg(to_agg)


Elapsed: 00:46 | Progress:  47%|████▋     

  ).agg(to_agg)


Elapsed: 00:47 | Progress:  51%|█████     

  ).agg(to_agg)


Elapsed: 00:56 | Progress:  52%|█████▏    

  ).apply(wrap)


Elapsed: 01:20 | Progress:  59%|█████▊    

  ).apply(wrap)
  ).agg(to_agg)
  ).agg(to_agg)
  ).agg(to_agg)
  ).agg(to_agg)
  ).agg(to_agg)


Elapsed: 01:43 | Progress:  84%|████████▍ 

  ).agg(to_agg)
  ).agg(to_agg)


Elapsed: 01:47 | Progress:  95%|█████████▌

  ).agg(to_agg)
  ).agg(to_agg)


Elapsed: 01:49 | Progress: 100%|██████████


[<Feature: target>,
 <Feature: is_train>,
 <Feature: communication_month>,
 <Feature: COUNT(transactions)>,
 <Feature: FIRST(transactions.tran_date)>,
 <Feature: LAST(transactions.tran_date)>,
 <Feature: MAX(transactions.float_c16)>,
 <Feature: MAX(transactions.float_c17)>,
 <Feature: MAX(transactions.float_c18)>,
 <Feature: MAX(transactions.float_c20)>,
 <Feature: MAX(transactions.float_c21)>,
 <Feature: MAX(transactions.int_c19)>,
 <Feature: MEAN(transactions.float_c16)>,
 <Feature: MEAN(transactions.float_c17)>,
 <Feature: MEAN(transactions.float_c18)>,
 <Feature: MEAN(transactions.float_c20)>,
 <Feature: MEAN(transactions.float_c21)>,
 <Feature: MEAN(transactions.int_c19)>,
 <Feature: MIN(transactions.float_c16)>,
 <Feature: MIN(transactions.float_c17)>,
 <Feature: MIN(transactions.float_c18)>,
 <Feature: MIN(transactions.float_c20)>,
 <Feature: MIN(transactions.float_c21)>,
 <Feature: MIN(transactions.int_c19)>,
 <Feature: MODE(transactions.cat_c2) = 14>,
 <Feature: MODE(transacti

Unnamed: 0_level_0,target,is_train,communication_month,COUNT(transactions),FIRST(transactions.tran_date),LAST(transactions.tran_date),MAX(transactions.float_c16),MAX(transactions.float_c17),MAX(transactions.float_c18),MAX(transactions.float_c20),...,MODE(comms.cat_c3) = 3,MODE(comms.cat_c3) is unknown,MODE(comms.cat_c4) = 1,MODE(comms.cat_c4) = 2,MODE(comms.cat_c4) = 3,MODE(comms.cat_c4) is unknown,MODE(comms.cat_c5) = 4,MODE(comms.cat_c5) = 0,MODE(comms.cat_c5) = 7,MODE(comms.cat_c5) is unknown
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10000196835799192770,True,True,2025-06-01,348,2024-12-01,2025-05-31,2.15,0.0,600.0,0.0,...,False,False,True,False,False,False,True,False,False,False
1000332954451581031,False,True,2025-06-01,832,2024-12-02,2025-05-31,0.11,3.5,850.0,0.0,...,False,False,True,False,False,False,True,False,False,False
10003496907835962037,True,True,2025-06-01,235,2024-12-02,2025-05-31,2.24,0.0,600.0,0.0,...,False,False,True,False,False,False,True,False,False,False
10004128521937951167,True,True,2025-06-01,1155,2024-12-01,2025-05-31,46.09,3.95,2000.0,0.0,...,False,False,True,False,False,False,True,False,False,False
1000774139179871611,False,True,2025-06-01,182,2024-12-01,2025-05-27,0.0,0.0,740.0,0.0,...,False,False,True,False,False,False,True,False,False,False


<class 'pandas.core.frame.DataFrame'>
Index: 36337 entries, 10000196835799192770 to 9997098291014878881
Data columns (total 201 columns):
 #    Column                                                 Non-Null Count  Dtype         
---   ------                                                 --------------  -----         
 0    target                                                 36337 non-null  bool          
 1    is_train                                               36337 non-null  bool          
 2    communication_month                                    36337 non-null  datetime64[ns]
 3    COUNT(transactions)                                    36337 non-null  Int64         
 4    FIRST(transactions.tran_date)                          35070 non-null  datetime64[ns]
 5    LAST(transactions.tran_date)                           35070 non-null  datetime64[ns]
 6    MAX(transactions.float_c16)                            35070 non-null  float64       
 7    MAX(transactions.float_c17) 

None

Unnamed: 0,communication_month,COUNT(transactions),FIRST(transactions.tran_date),LAST(transactions.tran_date),MAX(transactions.float_c16),MAX(transactions.float_c17),MAX(transactions.float_c18),MAX(transactions.float_c20),MAX(transactions.float_c21),MAX(transactions.int_c19),...,MEAN(activities.float_c12 WHERE cat_c9 = 2),MEAN(activities.float_c12 WHERE cat_c9 = 1),MEAN(activities.float_c14 WHERE cat_c9 = 2),MEAN(activities.float_c14 WHERE cat_c9 = 1),SUM(activities.float_c11 WHERE cat_c9 = 2),SUM(activities.float_c11 WHERE cat_c9 = 1),SUM(activities.float_c12 WHERE cat_c9 = 2),SUM(activities.float_c12 WHERE cat_c9 = 1),SUM(activities.float_c14 WHERE cat_c9 = 2),SUM(activities.float_c14 WHERE cat_c9 = 1)
count,36337,36337.0,35070,35070,35070.0,35070.0,35070.0,35070.0,35070.0,35070.0,...,15687.0,22449.0,15687.0,22449.0,36337.0,36337.0,36337.0,36337.0,36337.0,36337.0
mean,2025-07-14 05:17:23.333241344,430.817789,2025-01-25 19:28:37.775876864,2025-07-08 21:36:49.272882688,1.615024,4.5646,1438.993483,80.61292,1409.988739,0.996863,...,53.595697,58.666043,0.567413,1.095978,5553.1739,13199.61835,10734.255489,16171.626909,117.625678,303.425463
min,2025-06-01 00:00:00,0.0,2024-12-01 00:00:00,2024-12-01 00:00:00,0.0,0.0,0.01,0.0,0.0,-1.0,...,4.2,1.0,0.0,0.003922,0.0,-8.0,0.0,0.0,0.0,0.0
25%,2025-06-01 00:00:00,105.0,2025-01-01 00:00:00,2025-05-31 00:00:00,0.0,0.0,500.0,0.0,500.0,1.0,...,45.96663,50.090909,0.480801,0.32508,0.0,0.0,0.0,0.0,0.0,0.0
50%,2025-07-01 00:00:00,292.0,2025-02-01 00:00:00,2025-06-30 00:00:00,0.46,1.0,1000.0,0.0,970.0,1.0,...,53.357025,58.935,0.569646,0.483541,0.0,1991.0,0.0,5575.0,0.0,40.474511
75%,2025-08-01 00:00:00,598.0,2025-03-01 00:00:00,2025-07-31 00:00:00,1.66,4.0,1950.0,0.0,1900.0,1.0,...,60.921744,67.642241,0.65,0.732535,3378.0,16529.0,15351.000215,23826.0,157.100001,219.372552
max,2025-09-01 00:00:00,8721.0,2025-08-31 00:00:00,2025-08-31 00:00:00,413.130005,2294.719971,55600.0,13500.0,55600.0,1.0,...,100.0,100.0,1.0,26.118857,310892.0,643692.0,536858.003653,585864.0,6258.350019,45994.304443
std,,472.842999,,,4.861283,19.187926,1646.345874,486.548652,1632.86202,0.079142,...,11.469829,13.038848,0.140197,2.11245,14824.783846,25280.738854,19954.757006,25467.422529,227.074799,1190.763807


## Postprocessing

In [12]:
df = feature_matrix_enc
df = df.reset_index()
# df = df.drop(columns=['client_id'])
df = df.dropna(subset=[
    # Transactions
    'NUM_UNIQUE(transactions.tran_date_str)', 'MEAN(transactions.float_c16 WHERE int_c19 = -1)', 'MEAN(transactions.float_c16 WHERE int_c19 = 1)',

    # Activities
    'NUM_UNIQUE(activities.activity_date_str)', 'STD(activities.float_c11)',

    # Communications
    'NUM_UNIQUE(comms.contact_date_str)',
])

# Convert dates to days from initial point
for date_col in df.select_dtypes(include=['datetime64[ns]']).columns:
    df[date_col + '_days'] = (df[date_col] - pd.to_datetime(MIN_DATE_STR)).dt.days
    df = df.drop(columns=[date_col])

# Encode cyclical features (e.g. day of week, month)
time_cols = ['DAY(communication_month)', 'MONTH(communication_month)', 'WEEKDAY(communication_month)']
df = df.astype({col: 'int8' for col in time_cols})
cyclical = CyclicalFeatures(variables=time_cols, drop_original=True)
df = cyclical.fit_transform(df)

# Fill activities missing data
cols_with_blanks = [
    'MEAN(activities.float_c11 WHERE cat_c9 = 2)', 'MEAN(activities.float_c11 WHERE cat_c9 = 1)',
    'MEAN(activities.float_c12 WHERE cat_c9 = 2)', 'MEAN(activities.float_c12 WHERE cat_c9 = 1)',
    'MEAN(activities.float_c14 WHERE cat_c9 = 2)', 'MEAN(activities.float_c14 WHERE cat_c9 = 1)',
]
ami = AddMissingIndicator(variables=cols_with_blanks)
df = ami.fit_transform(df)
ani = ArbitraryNumberImputer(arbitrary_number=0, variables=cols_with_blanks)
df = ani.fit_transform(df)

# Convert boolean values to 0 and 1
df = df.astype({col: 'int8' for col in df.select_dtypes(include=['bool']).columns})
df = df.astype({col: 'Int8' for col in df.select_dtypes(include=['boolean']).columns})

df.to_csv('../data/features/features_auto_v1.csv', index=False)
display(df.info(verbose=True, show_counts=True))
display(df.describe())


<class 'pandas.core.frame.DataFrame'>
Index: 34837 entries, 0 to 36336
Data columns (total 211 columns):
 #    Column                                                 Non-Null Count  Dtype  
---   ------                                                 --------------  -----  
 0    client_id                                              34837 non-null  string 
 1    target                                                 34837 non-null  int8   
 2    is_train                                               34837 non-null  int8   
 3    COUNT(transactions)                                    34837 non-null  Int64  
 4    MAX(transactions.float_c16)                            34837 non-null  float64
 5    MAX(transactions.float_c17)                            34837 non-null  float64
 6    MAX(transactions.float_c18)                            34837 non-null  float64
 7    MAX(transactions.float_c20)                            34837 non-null  float64
 8    MAX(transactions.float_c21)            

None

Unnamed: 0,target,is_train,COUNT(transactions),MAX(transactions.float_c16),MAX(transactions.float_c17),MAX(transactions.float_c18),MAX(transactions.float_c20),MAX(transactions.float_c21),MAX(transactions.int_c19),MEAN(transactions.float_c16),...,MONTH(communication_month)_sin,MONTH(communication_month)_cos,WEEKDAY(communication_month)_sin,WEEKDAY(communication_month)_cos,MEAN(activities.float_c11 WHERE cat_c9 = 2)_na,MEAN(activities.float_c11 WHERE cat_c9 = 1)_na,MEAN(activities.float_c12 WHERE cat_c9 = 2)_na,MEAN(activities.float_c12 WHERE cat_c9 = 1)_na,MEAN(activities.float_c14 WHERE cat_c9 = 2)_na,MEAN(activities.float_c14 WHERE cat_c9 = 1)_na
count,34837.0,34837.0,34837.0,34837.0,34837.0,34837.0,34837.0,34837.0,34837.0,34837.0,...,34837.0,34837.0,34837.0,34837.0,34837.0,34837.0,34837.0,34837.0,34837.0,34837.0
mean,0.244912,0.792778,449.322559,1.625743,4.583724,1446.550533,80.503727,1418.000101,1.0,0.02968,...,-0.658584,0.323086,-0.002212483,0.471324,0.558257,0.371444,0.558257,0.371444,0.558257,0.371444
std,0.430041,0.405322,474.247815,4.875727,19.238116,1648.495912,486.876171,1634.915812,0.0,0.08637,...,0.3595777,0.576715,0.6289322,0.618317,0.496602,0.483198,0.496602,0.483198,0.496602,0.483198
min,0.0,0.0,2.0,0.0,0.0,0.01,0.0,0.0,1.0,-0.03459,...,-0.9848078,-0.5,-0.8660254,-0.5,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,1.0,124.0,0.0,0.0,510.0,0.0,500.0,1.0,0.0,...,-0.9848078,-0.5,-0.8660254,-0.5,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,1.0,311.0,0.48,1.0,1000.0,0.0,980.0,1.0,0.0075,...,-0.8660254,0.173648,-2.449294e-16,0.5,1.0,0.0,1.0,0.0,1.0,0.0
75%,0.0,1.0,618.0,1.67,4.03,1963.430054,0.0,1900.0,1.0,0.035062,...,-0.6427876,0.766044,0.8660254,1.0,1.0,1.0,1.0,1.0,1.0,1.0
max,1.0,1.0,8721.0,413.130005,2294.719971,55600.0,13500.0,55600.0,1.0,4.931538,...,-2.449294e-16,1.0,0.8660254,1.0,1.0,1.0,1.0,1.0,1.0,1.0
