# Automated Feature Engineering (featuretools)

In [1]:
import featuretools as ft
import pandas as pd
from IPython.display import display
from feature_engine.creation import CyclicalFeatures
from feature_engine.imputation import AddMissingIndicator, ArbitraryNumberImputer, MeanMedianImputer
# from dask.distributed import LocalCluster
from featuretools.primitives import TimeSinceFirst, TimeSinceLast
from tsfresh import extract_features, extract_relevant_features
from tsfresh.feature_extraction import EfficientFCParameters
from woodwork.logical_types import Boolean, BooleanNullable, Categorical, Unknown

from utils import add_calendar_values

## Create EntitySet

In [2]:
es = ft.EntitySet(id='client_data')

In [3]:
"""Clients"""

from utils import read_clients

CLIENTS_PATH = '../data/initial/CLIENTS.csv'
# CLIENTS_PATH = '../data/samples/CLIENTS_SAMPLE.csv'

clients_df = read_clients(CLIENTS_PATH, encode_bool=False)
clients_df = clients_df.astype({'client_id': str})
display(clients_df.info())

es.add_dataframe(
    clients_df,
    dataframe_name='clients',
    index='client_id',
    time_index='communication_month',
    # make_index=True,
    logical_types={
        'client_id': Unknown,
        'target': Boolean,
        'is_train': Boolean,
    }
)
display(es['clients'].ww.schema)
display(es)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36337 entries, 0 to 36336
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   client_id            36337 non-null  object        
 1   target               36337 non-null  bool          
 2   is_train             36337 non-null  bool          
 3   communication_month  36337 non-null  datetime64[ns]
dtypes: bool(2), datetime64[ns](1), object(1)
memory usage: 638.9+ KB


None

Unnamed: 0_level_0,Logical Type,Semantic Tag(s)
Column,Unnamed: 1_level_1,Unnamed: 2_level_1
client_id,Unknown,['index']
target,Boolean,[]
is_train,Boolean,[]
communication_month,Datetime,['time_index']


Entityset: client_data
  DataFrames:
    clients [Rows: 36337, Columns: 4]
  Relationships:
    No relationships

In [4]:
"""Transactions"""

from utils import read_transactions

TRANSACTIONS_PATH = '../data/initial/TRANSACTIONS.csv'
# TRANSACTIONS_PATH = '../data/samples/TRANSACTIONS_SAMPLE.csv'

transactions_df = read_transactions(TRANSACTIONS_PATH, encode_bool=False)
transactions_df = transactions_df.astype({'client_id': str})
transactions_df['tran_date_str'] = transactions_df['tran_date'].dt.date.astype(str)  # used later for feature generation
transactions_df = add_calendar_values(transactions_df, 'tran_date', prefix='tran_date_')
display(transactions_df.info())

es.add_dataframe(
    transactions_df,
    dataframe_name='transactions',
    index='transaction_id',
    time_index='tran_date',
    make_index=True,
    logical_types={
        'client_id': Unknown,
        'cat_c2': Categorical,
        'cat_c3': Categorical,
        'cat_c4': Categorical,
        'tran_date_str': Categorical,
        'fl_c6': Boolean,
        'fl_c7': Boolean,
        'fl_c8': Boolean,
        'fl_c9': Boolean,
        'fl_c10': Boolean,
        'fl_c11': Boolean,
        'fl_c12': Boolean,
        'fl_c13': Boolean,
        'fl_c14': Boolean,
        'fl_c15': Boolean,
    }
)
display(es['transactions'].ww.schema)

es.add_relationship('clients', 'client_id', 'transactions', 'client_id')
display(es)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15654626 entries, 0 to 15654625
Data columns (total 27 columns):
 #   Column                      Dtype         
---  ------                      -----         
 0   client_id                   object        
 1   tran_date                   datetime64[ns]
 2   cat_c2                      int32         
 3   cat_c3                      int32         
 4   cat_c4                      int32         
 5   fl_c6                       bool          
 6   fl_c7                       bool          
 7   fl_c8                       bool          
 8   fl_c9                       bool          
 9   fl_c10                      bool          
 10  fl_c11                      bool          
 11  fl_c12                      bool          
 12  fl_c13                      bool          
 13  fl_c14                      bool          
 14  fl_c15                      bool          
 15  float_c16                   float32       
 16  float_c17       

None

Unnamed: 0_level_0,Logical Type,Semantic Tag(s)
Column,Unnamed: 1_level_1,Unnamed: 2_level_1
transaction_id,Integer,['index']
client_id,Unknown,[]
tran_date,Datetime,['time_index']
cat_c2,Categorical,['category']
cat_c3,Categorical,['category']
cat_c4,Categorical,['category']
fl_c6,Boolean,[]
fl_c7,Boolean,[]
fl_c8,Boolean,[]
fl_c9,Boolean,[]


Entityset: client_data
  DataFrames:
    clients [Rows: 36337, Columns: 4]
    transactions [Rows: 15654626, Columns: 28]
  Relationships:
    transactions.client_id -> clients.client_id

In [5]:
"""App activity"""

from utils import preprocess_app_activity_data, read_app_activity

ACTIVITY_PATH = '../data/initial/APP_ACTIVITY.csv'
# ACTIVITY_PATH = '../data/samples/APP_ACTIVITY_SAMPLE.csv'

activities_df = read_app_activity(ACTIVITY_PATH, encode_bool=False)
activities_df = activities_df.astype({'client_id': str})
activities_df = preprocess_app_activity_data(activities_df)
activities_df['activity_date_str'] = activities_df['activity_date'].dt.date.astype(str)  # used later for feature generation
activities_df = add_calendar_values(activities_df, 'activity_date', prefix='activity_date_')
display(activities_df.info(show_counts=True))

es.add_dataframe(
    activities_df,
    dataframe_name='activities',
    index='activity_id',
    time_index='activity_date',
    make_index=True,
    logical_types={
        'client_id': Unknown,
        'cat_c3': Categorical,
        'cat_c4': Categorical,
        'cat_c5': Categorical,
        'cat_c6': Categorical,
        'cat_c9': Categorical,
        'activity_date_str': Categorical,
        'cat_c8': BooleanNullable,
        'cat_c10': BooleanNullable,
    }
)
display(es['activities'].ww.schema)

es.add_relationship('clients', 'client_id', 'activities', 'client_id')
display(es)

<class 'pandas.core.frame.DataFrame'>
Index: 17665187 entries, 0 to 17738591
Data columns (total 19 columns):
 #   Column                          Non-Null Count     Dtype         
---  ------                          --------------     -----         
 0   client_id                       17665187 non-null  object        
 1   device_id                       17665187 non-null  uint64        
 2   activity_date                   17665187 non-null  datetime64[ns]
 3   cat_c3                          17665187 non-null  int32         
 4   cat_c4                          17665187 non-null  int32         
 5   cat_c5                          17665187 non-null  int32         
 6   cat_c6                          17665187 non-null  int32         
 7   cat_c8                          17665187 non-null  boolean       
 8   cat_c9                          17665187 non-null  int32         
 9   cat_c10                         17665187 non-null  boolean       
 10  float_c11                       1

None

Unnamed: 0_level_0,Logical Type,Semantic Tag(s)
Column,Unnamed: 1_level_1,Unnamed: 2_level_1
activity_id,Integer,['index']
client_id,Unknown,[]
device_id,Integer,['numeric']
activity_date,Datetime,['time_index']
cat_c3,Categorical,['category']
cat_c4,Categorical,['category']
cat_c5,Categorical,['category']
cat_c6,Categorical,['category']
cat_c8,BooleanNullable,[]
cat_c9,Categorical,['category']


Entityset: client_data
  DataFrames:
    clients [Rows: 36337, Columns: 4]
    transactions [Rows: 15654626, Columns: 28]
    activities [Rows: 17665187, Columns: 20]
  Relationships:
    transactions.client_id -> clients.client_id
    activities.client_id -> clients.client_id

In [6]:
"""Communications"""

from utils import read_communications

COMMS_PATH = '../data/initial/COMMUNICATIONS.csv'
# COMMS_PATH = '../data/samples/COMMUNICATIONS_SAMPLE.csv'

comms_df = read_communications(COMMS_PATH)
comms_df = comms_df.astype({'client_id': str})
# comms_df = preprocess_comm_data(comms)
# comms_df = encode_comm_categories(comms)
comms_df['contact_date_str'] = comms_df['contact_date'].dt.date.astype(str)  # used later for feature generation
comms_df = add_calendar_values(comms_df, 'contact_date', prefix='contact_date_')
display(comms_df.info(show_counts=True))

es.add_dataframe(
    comms_df,
    dataframe_name='comms',
    index='comm_id',
    time_index='contact_date',
    make_index=True,
    logical_types={
        'client_id': Unknown,
        'cat_c2': Categorical,
        'cat_c3': Categorical,
        'cat_c4': Categorical,
        'cat_c5': Categorical,
        'contact_date_str': Categorical,
    }
)
display(es['comms'].ww.schema)

es.add_relationship('clients', 'client_id', 'comms', 'client_id')
display(es)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19586922 entries, 0 to 19586921
Data columns (total 12 columns):
 #   Column                         Non-Null Count     Dtype         
---  ------                         --------------     -----         
 0   client_id                      19586922 non-null  object        
 1   contact_date                   19586922 non-null  datetime64[ns]
 2   cat_c2                         19586922 non-null  category      
 3   cat_c3                         19586922 non-null  int32         
 4   cat_c4                         19586922 non-null  int32         
 5   cat_c5                         19586917 non-null  category      
 6   contact_date_str               19586922 non-null  object        
 7   contact_date_is_weekend        19586922 non-null  bool          
 8   contact_date_day_of_week_sin   19586922 non-null  float64       
 9   contact_date_day_of_week_cos   19586922 non-null  float64       
 10  contact_date_day_of_month_sin  19586922 

None

Unnamed: 0_level_0,Logical Type,Semantic Tag(s)
Column,Unnamed: 1_level_1,Unnamed: 2_level_1
comm_id,Integer,['index']
client_id,Unknown,[]
contact_date,Datetime,['time_index']
cat_c2,Categorical,['category']
cat_c3,Categorical,['category']
cat_c4,Categorical,['category']
cat_c5,Categorical,['category']
contact_date_str,Categorical,['category']
contact_date_is_weekend,Boolean,[]
contact_date_day_of_week_sin,Double,['numeric']


Entityset: client_data
  DataFrames:
    clients [Rows: 36337, Columns: 4]
    transactions [Rows: 15654626, Columns: 28]
    activities [Rows: 17665187, Columns: 20]
    comms [Rows: 19586922, Columns: 13]
  Relationships:
    transactions.client_id -> clients.client_id
    activities.client_id -> clients.client_id
    comms.client_id -> clients.client_id

## Deep Feature Synthesis

In [7]:
MIN_DATE_STR = '2024-12-01 00:00:00'
MAX_DATE_STR = '2025-09-01 00:00:00'

agg_primitives = [
    'count',
    'first', 'last', TimeSinceFirst(unit='days'), TimeSinceLast(unit='days'),  # for date
    'mean', 'std', 'min', 'max', 'sum',   # for numeric
    'num_unique', 'mode',  # for categorical
    'percent_true',  # for boolean
]

primitive_options = {}
for agg in agg_primitives:
    primitive_options[agg]= {'include_columns': {}}
    for df_name in ('transactions', 'activities', 'comms'):
         if agg not in ('count',) and isinstance(agg, str):
             primitive_options[agg]['include_columns'][df_name] = []

In [15]:
"""Transactions"""

df_name = 'transactions'

# Date
for prim in ['first', 'last']:
    primitive_options[prim]['include_columns'][df_name] += ['tran_date']
primitive_options['mean']['include_columns'][df_name] += [
    'tran_date_day_of_week_sin', 'tran_date_day_of_week_cos',
    'tran_date_day_of_month_sin', 'tran_date_day_of_month_cos',
]

# Numeric
num_cols = ['float_c16', 'float_c17', 'float_c18', 'int_c19', 'float_c20', 'float_c21']
for prim in ['mean', 'std', 'min', 'max', 'sum']:
    primitive_options[prim]['include_columns'][df_name] += num_cols
    # primitive_options[prim] = {'include_columns': {df_name: num_cols}}

# Categorical
cat_cols = list(es[df_name].ww.select(Categorical).columns)
for prim in ['num_unique', 'mode']:
    # primitive_options['mode'] = {'include_columns': {df_name: cat_cols}}
    primitive_options[prim]['include_columns'][df_name] += cat_cols
primitive_options['mode']['include_columns'][df_name].remove('tran_date_str')  # delete redundant primitive

# Boolean
bool_cols = list(es[df_name].ww.select([Boolean, BooleanNullable]).columns)
# primitive_options['percent_true'] = {'include_columns': {df_name: bool_cols}}
primitive_options['percent_true']['include_columns'][df_name] += bool_cols


# Interesting values
interesting_values = {
    'cat_c2': [4, 14, 15],
    'cat_c3': [209, 303, 305, 314],
    'int_c19': [-1, 1],
    'tran_date_is_weekend': [True, False],
}
es.add_interesting_values(dataframe_name=df_name, values=interesting_values)

In [16]:
"""Activities"""

df_name = 'activities'

# Date
for prim in ['first', 'last']:
    primitive_options[prim]['include_columns'][df_name] += ['activity_date']
primitive_options['mean']['include_columns'][df_name] += [
    'activity_date_day_of_week_sin', 'activity_date_day_of_week_cos',
    'activity_date_day_of_month_sin', 'activity_date_day_of_month_cos',
]

# Numeric
num_cols = ['float_c11', 'float_c12', 'float_c13', 'float_c14', 'float_c15', 'float_c16', 'float_c17']
for prim in ['mean', 'std', 'min', 'max', 'sum']:
    primitive_options[prim]['include_columns'][df_name] += num_cols

# Categorical
cat_cols = list(es[df_name].ww.select(Categorical).columns)
for prim in ['num_unique', 'mode']:
    primitive_options[prim]['include_columns'][df_name] += cat_cols
primitive_options['mode']['include_columns'][df_name].remove('activity_date_str')  # delete redundant primitive

# Boolean
bool_cols = list(es[df_name].ww.select([Boolean, BooleanNullable]).columns)
primitive_options['percent_true']['include_columns'][df_name] += bool_cols


# Interesting values
interesting_values = {
    'cat_c4': [1, 2],
    'cat_c6': [1, 2, 3],
    'cat_c9': [1, 2],
    'activity_date_is_weekend': [True, False],
}
es.add_interesting_values(dataframe_name=df_name, values=interesting_values)

In [17]:
"""Communications"""

df_name = 'comms'

# Date
for prim in ['first', 'last']:
    primitive_options[prim]['include_columns'][df_name] += ['contact_date']
primitive_options['mean']['include_columns'][df_name] += [
    'contact_date_day_of_week_sin', 'contact_date_day_of_week_cos',
    'contact_date_day_of_month_sin', 'contact_date_day_of_month_cos',
]

# Categorical
cat_cols = list(es[df_name].ww.select(Categorical).columns)
for prim in ['num_unique', 'mode']:
    primitive_options[prim]['include_columns'][df_name] += cat_cols
primitive_options['mode']['include_columns'][df_name].remove('contact_date_str')  # delete redundant primitive


# Boolean
bool_cols = list(es[df_name].ww.select([Boolean, BooleanNullable]).columns)
primitive_options['percent_true']['include_columns'][df_name] += bool_cols

# Interesting values
interesting_values = {
    'cat_c2': ['S3564', 'S3565', 'S3677', 'S3769'],
    'cat_c3': [3, 4, 7],
    'cat_c4': [1, 2],
    'cat_c5': ['4', '7'],
    'contact_date_is_weekend': [True, False],
}
es.add_interesting_values(dataframe_name=df_name, values=interesting_values)

In [18]:
# cluster = LocalCluster()
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_dataframe_name='clients',
    # ignore_dataframes=['transactions', 'comms'],
    # cutoff_time=transactions_df['tran_date'].max(),
    cutoff_time=MAX_DATE_STR,
    agg_primitives=agg_primitives,
    trans_primitives=['day', 'month', 'weekday', 'is_weekend'],
    where_primitives=['mean', 'sum', 'count'],
    primitive_options=primitive_options,
    max_depth=1,
    verbose=True,
    features_only=False,
    n_jobs=1,
    # chunk_size=.05,
    # dask_kwargs={'cluster': cluster.scheduler.address},
    return_types='all',
)
feature_matrix_enc, features_enc = ft.encode_features(
    feature_matrix,
    feature_defs,
    top_n={
        # Transactions
        'MODE(transactions.cat_c2)': 10,
        'MODE(transactions.cat_c3)': 10,
        'MODE(transactions.cat_c4)': 5,

        # Activities
        'MODE(activities.cat_c3)': 3,
        'MODE(activities.cat_c4)': 2,
        'MODE(activities.cat_c5)': 1,
        'MODE(activities.cat_c6)': 7,
        'MODE(activities.cat_c9)': 2,

        # Communications
        'MODE(comms.cat_c2)': 10,
        'MODE(comms.cat_c3)': 3,
        'MODE(comms.cat_c4)': 3,
        'MODE(comms.cat_c5)': 3,
    },
    to_encode=[
        'MODE(transactions.cat_c2)', 'MODE(transactions.cat_c3)', 'MODE(transactions.cat_c4)',  # transactions
        'MODE(activities.cat_c3)', 'MODE(activities.cat_c4)', 'MODE(activities.cat_c5)', 'MODE(activities.cat_c6)', 'MODE(activities.cat_c9)',  # activities
        'MODE(comms.cat_c2)', 'MODE(comms.cat_c3)', 'MODE(comms.cat_c4)', 'MODE(comms.cat_c5)',  # communications
    ],
)
ft.save_features(features_enc, '../data/features/feature_definitions_v2.json')
feature_matrix_enc.to_csv('../data/features/features_auto_v2_raw.csv', index=False)

display(features_enc)
display(feature_matrix_enc.head())
display(feature_matrix_enc.info(verbose=True, show_counts=True))
display(feature_matrix_enc.describe())



Built 468 features
Elapsed: 01:25 | Progress:  17%|█▋        

  ).apply(wrap)
  ).agg(to_agg)
  ).agg(to_agg)
  ).agg(to_agg)
  ).agg(to_agg)
  ).agg(to_agg)


Elapsed: 01:46 | Progress:  23%|██▎       

  ).agg(to_agg)
  ).agg(to_agg)


Elapsed: 01:49 | Progress:  25%|██▍       

  ).agg(to_agg)


Elapsed: 01:50 | Progress:  27%|██▋       

  ).agg(to_agg)
  ).agg(to_agg)
  ).agg(to_agg)


Elapsed: 01:55 | Progress:  30%|███       

  ).agg(to_agg)
  ).agg(to_agg)
  ).agg(to_agg)
  ).agg(to_agg)


Elapsed: 01:57 | Progress:  32%|███▏      

  ).agg(to_agg)


Elapsed: 01:58 | Progress:  34%|███▍      

  ).agg(to_agg)
  ).agg(to_agg)


Elapsed: 01:59 | Progress:  36%|███▌      

  ).agg(to_agg)
  ).agg(to_agg)
  ).agg(to_agg)


Elapsed: 02:02 | Progress:  38%|███▊      

  ).agg(to_agg)
  ).agg(to_agg)


Elapsed: 02:13 | Progress:  40%|████      

  ).apply(wrap)
  ).agg(to_agg)


Elapsed: 02:28 | Progress:  43%|████▎     

  ).agg(to_agg)


Elapsed: 02:28 | Progress:  44%|████▍     

  ).agg(to_agg)
  ).agg(to_agg)


Elapsed: 02:30 | Progress:  45%|████▍     

  ).agg(to_agg)


Elapsed: 02:31 | Progress:  46%|████▌     

  ).agg(to_agg)


Elapsed: 02:32 | Progress:  47%|████▋     

  ).agg(to_agg)
  ).agg(to_agg)


Elapsed: 02:36 | Progress:  49%|████▉     

  ).agg(to_agg)
  ).agg(to_agg)


Elapsed: 02:39 | Progress:  50%|████▉     

  ).agg(to_agg)


Elapsed: 02:40 | Progress:  51%|█████     

  ).agg(to_agg)


Elapsed: 02:44 | Progress:  52%|█████▏    

  ).agg(to_agg)
  ).agg(to_agg)


Elapsed: 02:47 | Progress:  53%|█████▎    

  ).agg(to_agg)


Elapsed: 02:58 | Progress:  54%|█████▍    

  ).apply(wrap)
  ).agg(to_agg)
  ).agg(to_agg)
  ).agg(to_agg)
  ).agg(to_agg)
  ).agg(to_agg)


Elapsed: 03:21 | Progress:  64%|██████▍   

  ).agg(to_agg)
  ).agg(to_agg)


Elapsed: 03:23 | Progress:  67%|██████▋   

  ).agg(to_agg)
  ).agg(to_agg)


Elapsed: 03:25 | Progress:  70%|██████▉   

  ).agg(to_agg)


Elapsed: 03:26 | Progress:  73%|███████▎  

  ).agg(to_agg)
  ).agg(to_agg)


Elapsed: 03:27 | Progress:  75%|███████▌  

  ).agg(to_agg)
  ).agg(to_agg)


Elapsed: 03:27 | Progress:  78%|███████▊  

  ).agg(to_agg)
  ).agg(to_agg)
  ).agg(to_agg)


Elapsed: 03:29 | Progress:  81%|████████  

  ).agg(to_agg)
  ).agg(to_agg)


Elapsed: 03:31 | Progress:  84%|████████▍ 

  ).agg(to_agg)
  ).agg(to_agg)


Elapsed: 03:34 | Progress:  87%|████████▋ 

  ).agg(to_agg)
  ).agg(to_agg)


Elapsed: 03:36 | Progress:  89%|████████▉ 

  ).agg(to_agg)
  ).agg(to_agg)


Elapsed: 03:40 | Progress:  92%|█████████▏

  ).agg(to_agg)
  ).agg(to_agg)


Elapsed: 03:43 | Progress: 100%|██████████


[<Feature: target>,
 <Feature: is_train>,
 <Feature: communication_month>,
 <Feature: COUNT(transactions)>,
 <Feature: FIRST(transactions.tran_date)>,
 <Feature: LAST(transactions.tran_date)>,
 <Feature: MAX(transactions.float_c16)>,
 <Feature: MAX(transactions.float_c17)>,
 <Feature: MAX(transactions.float_c18)>,
 <Feature: MAX(transactions.float_c20)>,
 <Feature: MAX(transactions.float_c21)>,
 <Feature: MAX(transactions.int_c19)>,
 <Feature: MEAN(transactions.float_c16)>,
 <Feature: MEAN(transactions.float_c17)>,
 <Feature: MEAN(transactions.float_c18)>,
 <Feature: MEAN(transactions.float_c20)>,
 <Feature: MEAN(transactions.float_c21)>,
 <Feature: MEAN(transactions.int_c19)>,
 <Feature: MEAN(transactions.tran_date_day_of_month_cos)>,
 <Feature: MEAN(transactions.tran_date_day_of_month_sin)>,
 <Feature: MEAN(transactions.tran_date_day_of_week_cos)>,
 <Feature: MEAN(transactions.tran_date_day_of_week_sin)>,
 <Feature: MIN(transactions.float_c16)>,
 <Feature: MIN(transactions.float_c17)

Unnamed: 0_level_0,target,is_train,communication_month,COUNT(transactions),FIRST(transactions.tran_date),LAST(transactions.tran_date),MAX(transactions.float_c16),MAX(transactions.float_c17),MAX(transactions.float_c18),MAX(transactions.float_c20),...,MODE(comms.cat_c3) = 3,MODE(comms.cat_c3) is unknown,MODE(comms.cat_c4) = 1,MODE(comms.cat_c4) = 2,MODE(comms.cat_c4) = 3,MODE(comms.cat_c4) is unknown,MODE(comms.cat_c5) = 4,MODE(comms.cat_c5) = 0,MODE(comms.cat_c5) = 7,MODE(comms.cat_c5) is unknown
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10000196835799192770,True,True,2025-06-01,348,2024-12-01,2025-05-31,2.15,0.0,600.0,0.0,...,False,False,True,False,False,False,True,False,False,False
1000332954451581031,False,True,2025-06-01,832,2024-12-02,2025-05-31,0.11,3.5,850.0,0.0,...,False,False,True,False,False,False,True,False,False,False
10003496907835962037,True,True,2025-06-01,235,2024-12-02,2025-05-31,2.24,0.0,600.0,0.0,...,False,False,True,False,False,False,True,False,False,False
10004128521937951167,True,True,2025-06-01,1155,2024-12-01,2025-05-31,46.09,3.95,2000.0,0.0,...,False,False,True,False,False,False,True,False,False,False
1000774139179871611,False,True,2025-06-01,182,2024-12-01,2025-05-27,0.0,0.0,740.0,0.0,...,False,False,True,False,False,False,True,False,False,False


<class 'pandas.core.frame.DataFrame'>
Index: 36337 entries, 10000196835799192770 to 9997098291014878881
Data columns (total 527 columns):
 #    Column                                                                                  Non-Null Count  Dtype         
---   ------                                                                                  --------------  -----         
 0    target                                                                                  36337 non-null  bool          
 1    is_train                                                                                36337 non-null  bool          
 2    communication_month                                                                     36337 non-null  datetime64[ns]
 3    COUNT(transactions)                                                                     36337 non-null  Int64         
 4    FIRST(transactions.tran_date)                                                           35070 non-null  da

None

Unnamed: 0,communication_month,COUNT(transactions),FIRST(transactions.tran_date),LAST(transactions.tran_date),MAX(transactions.float_c16),MAX(transactions.float_c17),MAX(transactions.float_c18),MAX(transactions.float_c20),MAX(transactions.float_c21),MAX(transactions.int_c19),...,MEAN(comms.contact_date_day_of_week_sin WHERE cat_c2 = S3769),MEAN(comms.contact_date_day_of_week_sin WHERE cat_c4 = 1),MEAN(comms.contact_date_day_of_week_sin WHERE cat_c3 = 3),MEAN(comms.contact_date_day_of_week_sin WHERE cat_c5 = 7),MEAN(comms.contact_date_day_of_week_sin WHERE cat_c2 = S3677),MEAN(comms.contact_date_day_of_week_sin WHERE contact_date_is_weekend = False),MEAN(comms.contact_date_day_of_week_sin WHERE cat_c5 = 4),MEAN(comms.contact_date_day_of_week_sin WHERE cat_c4 = 2),MEAN(comms.contact_date_day_of_week_sin WHERE cat_c3 = 7),MEAN(comms.contact_date_day_of_week_sin WHERE cat_c3 = 4)
count,36337,36337.0,35070,35070,35070.0,35070.0,35070.0,35070.0,35070.0,35070.0,...,31635.0,35578.0,35500.0,22792.0,34719.0,35791.0,35789.0,35747.0,34992.0,35802.0
mean,2025-07-14 05:17:23.333241344,430.817789,2025-01-25 19:28:37.775876864,2025-07-08 21:36:49.272882688,1.615024,4.5646,1438.993483,80.61292,1409.988739,0.996863,...,0.007921,0.004061,-0.052049,-0.003293852,0.004801,0.163782,-0.002479895,0.004513229,0.1009251,-0.00317072
min,2025-06-01 00:00:00,0.0,2024-12-01 00:00:00,2024-12-01 00:00:00,0.0,0.0,0.01,0.0,0.0,-1.0,...,-0.866025,-0.866025,-0.866025,-0.8660254,-0.866025,-0.866025,-0.8660254,-0.8660254,-0.8660254,-0.8660254
25%,2025-06-01 00:00:00,105.0,2025-01-01 00:00:00,2025-05-31 00:00:00,0.0,0.0,500.0,0.0,500.0,1.0,...,-0.086603,-0.03958,-0.173205,-0.1732051,-0.057735,0.126735,-0.04037414,-0.07021828,-1.659665e-17,-0.04207411
50%,2025-07-01 00:00:00,292.0,2025-02-01 00:00:00,2025-06-30 00:00:00,0.46,1.0,1000.0,0.0,970.0,1.0,...,0.0,0.002366,-0.050943,-7.628252e-18,0.0,0.16884,-2.739511e-17,8.123583e-18,0.1082532,-2.860847e-17
75%,2025-08-01 00:00:00,598.0,2025-03-01 00:00:00,2025-07-31 00:00:00,1.66,4.0,1950.0,0.0,1900.0,1.0,...,0.103923,0.047522,0.072169,0.1732051,0.068856,0.207278,0.03901705,0.0805605,0.2165064,0.04028025
max,2025-09-01 00:00:00,8721.0,2025-08-31 00:00:00,2025-08-31 00:00:00,413.130005,2294.719971,55600.0,13500.0,55600.0,1.0,...,0.866025,0.866025,0.866025,0.8660254,0.866025,0.866025,0.8660254,0.8660254,0.8660254,0.8660254
std,,472.842999,,,4.861283,19.187926,1646.345874,486.548652,1632.86202,0.079142,...,0.254059,0.123998,0.212456,0.4095745,0.171682,0.087253,0.08638733,0.1295948,0.1990566,0.09254468


## Add more date features

In [12]:
def prep_time_gaps(df, date_col, prefix):
    df = df.sort_values(by=['client_id', date_col])

    gap_col_name = f'{prefix}_gap_days'
    df[gap_col_name] = df.groupby('client_id')[date_col].diff().dt.total_seconds() / (3600 * 24)
    df[gap_col_name] = df[gap_col_name].fillna(0)

    return df[['client_id', date_col, gap_col_name]]

df_transactions_gap = prep_time_gaps(transactions_df, 'tran_date', 'tran_date')
df_activities_gap = prep_time_gaps(activities_df, 'activity_date', 'activity_date')
df_comms_gap  = prep_time_gaps(comms_df, 'contact_date', 'contact_date')

display(df_transactions_gap.head(10))

Unnamed: 0,client_id,tran_date,tran_date_gap_days
461179,10000196835799192770,2024-12-01,0.0
461290,10000196835799192770,2024-12-01,0.0
461139,10000196835799192770,2024-12-03,2.0
461189,10000196835799192770,2024-12-03,0.0
461143,10000196835799192770,2024-12-05,2.0
461195,10000196835799192770,2024-12-05,0.0
461117,10000196835799192770,2024-12-07,2.0
461146,10000196835799192770,2024-12-07,0.0
461201,10000196835799192770,2024-12-07,0.0
461202,10000196835799192770,2024-12-07,0.0


In [19]:
datasets = {
    'transactions': (df_transactions_gap, 'tran_date', 'tran_date_gap_days'),
    'activities': (df_activities_gap, 'activity_date', 'activity_date_gap_days'),
    'communications': (df_comms_gap, 'contact_date', 'contact_date_gap_days')
}
all_features = []
for name, (df, date_col, val_col) in datasets.items():
    print(f'Processing {name}...')

    fc_parameters = {
        'median': None,
        'mean': None,
        'maximum': None,
        'minimum': None,
        'standard_deviation': None,
        'skewness': None,
        'linear_trend': [{'attr': 'slope'}],
        'approximate_entropy': [{'m': 2, 'r': 0.25}]
    }
    features = extract_features(
        timeseries_container=df,
        column_id='client_id',
        column_sort=date_col,
        column_value=val_col,
        default_fc_parameters=fc_parameters,
        n_jobs=14,
    )
    features = features.reset_index(names='client_id')
    all_features.append(features)

final_features = feature_matrix_enc
for features in all_features:
    final_features = final_features.merge(features, on='client_id', how='left')

display(final_features.info(verbose=True, show_counts=True))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36337 entries, 0 to 36336
Data columns (total 552 columns):
 #    Column                                                                                  Non-Null Count  Dtype         
---   ------                                                                                  --------------  -----         
 0    client_id                                                                               36337 non-null  string        
 1    target                                                                                  36337 non-null  bool          
 2    is_train                                                                                36337 non-null  bool          
 3    communication_month                                                                     36337 non-null  datetime64[ns]
 4    COUNT(transactions)                                                                     36337 non-null  Int64         
 5    FIRST(tra

None

## Postprocessing

In [20]:
df = final_features
df = df.reset_index()
# df = df.drop(columns=['client_id'])
df = df.dropna(subset=[
    # Transactions
    'NUM_UNIQUE(transactions.tran_date_str)', 'MEAN(transactions.float_c16 WHERE int_c19 = -1)', 'MEAN(transactions.float_c16 WHERE int_c19 = 1)',

    # Activities
    'NUM_UNIQUE(activities.activity_date_str)', 'STD(activities.float_c11)',

    # Communications
    'NUM_UNIQUE(comms.contact_date_str)',
])

# Convert dates to days from initial point
for date_col in df.select_dtypes(include=['datetime64[ns]']).columns:
    df[date_col + '_days'] = (df[date_col] - pd.to_datetime(MIN_DATE_STR)).dt.days
    df = df.drop(columns=[date_col])

# Encode cyclical features (e.g. day of week, month)
time_cols = ['DAY(communication_month)', 'MONTH(communication_month)', 'WEEKDAY(communication_month)']
df = df.astype({col: 'int8' for col in time_cols})
cyclical = CyclicalFeatures(variables=time_cols, drop_original=True)
df = cyclical.fit_transform(df)

# Fill  missing data
cols_to_fill_with_mean = [
    c for c in df.columns
    if (' WHERE ' in c and c.startswith('MEAN(')) or c.endswith('__skewness')
]
ami = AddMissingIndicator(variables=cols_to_fill_with_mean)
df = ami.fit_transform(df)
mmi = MeanMedianImputer(imputation_method='mean', variables=cols_to_fill_with_mean)
df = mmi.fit_transform(df)

# Convert boolean values to 0 and 1
df = df.astype({col: 'int8' for col in df.select_dtypes(include=['bool']).columns})
df = df.astype({col: 'Int8' for col in df.select_dtypes(include=['boolean']).columns})

df.to_csv('../data/features/features_auto_v2.csv', index=False)
display(df.info(verbose=True, show_counts=True))
display(df.describe())


  X[indicator_names] = X[self.variables_].isna().astype(int)
  X[indicator_names] = X[self.variables_].isna().astype(int)
  X[indicator_names] = X[self.variables_].isna().astype(int)
  X[indicator_names] = X[self.variables_].isna().astype(int)
  X[indicator_names] = X[self.variables_].isna().astype(int)
  X[indicator_names] = X[self.variables_].isna().astype(int)
  X[indicator_names] = X[self.variables_].isna().astype(int)
  X[indicator_names] = X[self.variables_].isna().astype(int)
  X[indicator_names] = X[self.variables_].isna().astype(int)
  X[indicator_names] = X[self.variables_].isna().astype(int)
  X[indicator_names] = X[self.variables_].isna().astype(int)
  X[indicator_names] = X[self.variables_].isna().astype(int)
  X[indicator_names] = X[self.variables_].isna().astype(int)
  X[indicator_names] = X[self.variables_].isna().astype(int)
  X[indicator_names] = X[self.variables_].isna().astype(int)
  X[indicator_names] = X[self.variables_].isna().astype(int)
  X[indicator_names] = X

<class 'pandas.core.frame.DataFrame'>
Index: 34837 entries, 0 to 36336
Data columns (total 747 columns):
 #    Column                                                                                     Non-Null Count  Dtype  
---   ------                                                                                     --------------  -----  
 0    index                                                                                      34837 non-null  int64  
 1    client_id                                                                                  34837 non-null  string 
 2    target                                                                                     34837 non-null  int8   
 3    is_train                                                                                   34837 non-null  int8   
 4    COUNT(transactions)                                                                        34837 non-null  Int64  
 5    MAX(transactions.float_c16)               

None

Unnamed: 0,index,target,is_train,COUNT(transactions),MAX(transactions.float_c16),MAX(transactions.float_c17),MAX(transactions.float_c18),MAX(transactions.float_c20),MAX(transactions.float_c21),MAX(transactions.int_c19),...,MEAN(comms.contact_date_day_of_week_sin WHERE contact_date_is_weekend = True)_na,MEAN(comms.contact_date_day_of_week_sin WHERE cat_c2 = S3565)_na,MEAN(comms.contact_date_day_of_week_sin WHERE cat_c2 = S3769)_na,MEAN(comms.contact_date_day_of_week_sin WHERE cat_c3 = 3)_na,MEAN(comms.contact_date_day_of_week_sin WHERE cat_c5 = 7)_na,MEAN(comms.contact_date_day_of_week_sin WHERE cat_c2 = S3677)_na,MEAN(comms.contact_date_day_of_week_sin WHERE cat_c4 = 2)_na,MEAN(comms.contact_date_day_of_week_sin WHERE cat_c3 = 7)_na,tran_date_gap_days__skewness_na,activity_date_gap_days__skewness_na
count,34837.0,34837.0,34837.0,34837.0,34837.0,34837.0,34837.0,34837.0,34837.0,34837.0,...,34837.0,34837.0,34837.0,34837.0,34837.0,34837.0,34837.0,34837.0,34837.0,34837.0
mean,18126.682521,0.244912,0.792778,449.322559,1.625743,4.583724,1446.550533,80.503727,1418.000101,1.0,...,0.000545,0.014439,0.092689,0.005798,0.35115,0.007205,0.000689,0.014955,0.004248,0.000144
std,10500.225961,0.430041,0.405322,474.247815,4.875727,19.238116,1648.495912,486.876171,1634.915812,0.0,...,0.023348,0.119292,0.29,0.075927,0.477336,0.084577,0.026239,0.121376,0.065042,0.01198
min,0.0,0.0,0.0,2.0,0.0,0.0,0.01,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,9029.0,0.0,1.0,124.0,0.0,0.0,510.0,0.0,500.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,18098.0,0.0,1.0,311.0,0.48,1.0,1000.0,0.0,980.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,27227.0,0.0,1.0,618.0,1.67,4.03,1963.430054,0.0,1900.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
max,36336.0,1.0,1.0,8721.0,413.130005,2294.719971,55600.0,13500.0,55600.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
