# Automated Feature Engineering (featuretools)

In [None]:
import warnings

import featuretools as ft
import numpy as np
import pandas as pd
from IPython.display import display
from feature_engine.creation import CyclicalFeatures
from feature_engine.imputation import AddMissingIndicator, ArbitraryNumberImputer, MeanMedianImputer
# from dask.distributed import LocalCluster
from featuretools.primitives import TimeSinceFirst, TimeSinceLast
from tsfresh import extract_features, extract_relevant_features
from tsfresh.feature_extraction import EfficientFCParameters
from woodwork.logical_types import Boolean, BooleanNullable, Categorical, Unknown

from utils import add_calendar_values

In [None]:
# Notebook-wide pandas display settings: floats are rendered with three
# decimals and the column limit is lifted so wide frames are not truncated.
for option_name, option_value in (
    ('display.float_format', '{:.3f}'.format),
    ('display.max_columns', None),
):
    pd.set_option(option_name, option_value)

In [None]:
# Suppress FutureWarning messages for the rest of the notebook session.
warnings.simplefilter(action='ignore', category=FutureWarning)

## Create EntitySet

In [None]:
# Empty EntitySet (id 'client_data'); the following cells add the dataframes
# and the relationships between them.
es = ft.EntitySet(id='client_data')

In [None]:
"""Clients"""

from utils import read_clients

# Alternative input: the non-sample file.
# CLIENTS_PATH = '../data/initial/CLIENTS.csv'
CLIENTS_PATH = '../data/samples/CLIENTS_SAMPLE.csv'

clients_df = read_clients(CLIENTS_PATH, encode_bool=False)
# client_id is cast to str here and in every child table so the join keys
# share one dtype.
clients_df = clients_df.astype({'client_id': str})
# DataFrame.info() prints its report and returns None, so it is called
# directly; wrapping it in display() would additionally echo "None".
clients_df.info()

client_logical_types = {
    'client_id': Unknown,
    'target': Boolean,
    'is_train': Boolean,
}
es.add_dataframe(
    clients_df,
    dataframe_name='clients',
    index='client_id',
    time_index='communication_month',
    logical_types=client_logical_types,
)
display(es['clients'].ww.schema)
display(es)

In [None]:
"""Transactions"""

from utils import read_transactions

# Alternative input: the non-sample file.
# TRANSACTIONS_PATH = '../data/initial/TRANSACTIONS.csv'
TRANSACTIONS_PATH = '../data/samples/TRANSACTIONS_SAMPLE.csv'

transactions_df = read_transactions(TRANSACTIONS_PATH, encode_bool=False, encode_category=False)
transactions_df = transactions_df.astype({'client_id': str})
transactions_df = add_calendar_values(transactions_df, 'tran_date', prefix='tran_date_')
# DataFrame.info() prints its report and returns None, so it is called
# directly; wrapping it in display() would additionally echo "None".
transactions_df.info()

transaction_logical_types = {
    'client_id': Unknown,
    'cat_c2': Categorical,
    'cat_c3': Categorical,
    'cat_c4': Categorical,
    # Flag columns fl_c6 .. fl_c15 are all typed as Boolean.
    **{f'fl_c{i}': Boolean for i in range(6, 16)},
}
es.add_dataframe(
    transactions_df,
    dataframe_name='transactions',
    index='transaction_id',
    time_index='tran_date',
    make_index=True,  # 'transaction_id' is generated, not read from the file
    logical_types=transaction_logical_types,
)
display(es['transactions'].ww.schema)

# One client (parent) has many transactions (children).
es.add_relationship('clients', 'client_id', 'transactions', 'client_id')
display(es)

In [None]:
"""App activity"""

from utils import preprocess_app_activity_data, read_app_activity

# Alternative input: the non-sample file.
# ACTIVITY_PATH = '../data/initial/APP_ACTIVITY.csv'
ACTIVITY_PATH = '../data/samples/APP_ACTIVITY_SAMPLE.csv'

activities_df = read_app_activity(ACTIVITY_PATH, encode_bool=False, encode_category=False)
activities_df = activities_df.astype({'client_id': str})
activities_df = preprocess_app_activity_data(activities_df)
activities_df = add_calendar_values(activities_df, 'activity_date', prefix='activity_date_')
# DataFrame.info() prints its report and returns None, so it is called
# directly; wrapping it in display() would additionally echo "None".
activities_df.info()

activity_logical_types = {
    'client_id': Unknown,
    **{f'cat_c{i}': Categorical for i in (3, 4, 5, 6, 9)},
    # cat_c8 / cat_c10 are registered as nullable booleans.
    'cat_c8': BooleanNullable,
    'cat_c10': BooleanNullable,
}
es.add_dataframe(
    activities_df,
    dataframe_name='activities',
    index='activity_id',
    time_index='activity_date',
    make_index=True,  # 'activity_id' is generated, not read from the file
    logical_types=activity_logical_types,
)
display(es['activities'].ww.schema)

# One client (parent) has many app-activity rows (children).
es.add_relationship('clients', 'client_id', 'activities', 'client_id')
display(es)

In [None]:
"""Communications"""

from utils import read_communications

# Alternative input: the non-sample file.
# COMMS_PATH = '../data/initial/COMMUNICATIONS.csv'
COMMS_PATH = '../data/samples/COMMUNICATIONS_SAMPLE.csv'

comms_df = read_communications(COMMS_PATH, encode_category=False)
comms_df = comms_df.astype({'client_id': str})
comms_df = add_calendar_values(comms_df, 'contact_date', prefix='contact_date_')
# DataFrame.info() prints its report and returns None, so it is called
# directly; wrapping it in display() would additionally echo "None".
comms_df.info(show_counts=True)

comm_logical_types = {
    'client_id': Unknown,
    **{f'cat_c{i}': Categorical for i in range(2, 6)},  # cat_c2 .. cat_c5
}
es.add_dataframe(
    comms_df,
    dataframe_name='comms',
    index='comm_id',
    time_index='contact_date',
    make_index=True,  # 'comm_id' is generated, not read from the file
    logical_types=comm_logical_types,
)
display(es['comms'].ww.schema)

# One client (parent) has many communications (children).
es.add_relationship('clients', 'client_id', 'comms', 'client_id')
display(es)

In [None]:
# Compute last-time indexes for the EntitySet; run after all dataframes and
# relationships have been added.
es.add_last_time_indexes()

## Deep Feature Synthesis

In [None]:
MIN_DATE_STR = '2024-12-01 00:00:00'
MAX_DATE_STR = '2025-09-01 00:00:00'

# Aggregation primitives handed to DFS, grouped by the column kind they target.
agg_primitives = [
    'count',
    # date
    'first', 'last', TimeSinceFirst(unit='days'), TimeSinceLast(unit='days'),
    # numeric
    'mean', 'std', 'skew', 'kurtosis', 'min', 'max', 'sum',
    # categorical
    'num_unique', 'mode', 'entropy',
    # boolean
    'percent_true',
]

# Every primitive gets an 'include_columns' mapping. String-named primitives
# other than 'count' start with an empty column list per child dataframe
# (filled in by the following cells); 'count' and the primitive instances keep
# an empty mapping.
child_dataframes = ('transactions', 'activities', 'comms')
primitive_options = {
    agg: {
        'include_columns': (
            {child: [] for child in child_dataframes}
            if isinstance(agg, str) and agg != 'count'
            else {}
        )
    }
    for agg in agg_primitives
}

In [None]:
"""Transactions"""

df_name = 'transactions'

num_cols = ['float_c16', 'float_c17', 'float_c18', 'int_c19', 'float_c20', 'float_c21']
cat_cols = list(es[df_name].ww.select(Categorical).columns)
bool_cols = list(es[df_name].ww.select([Boolean, BooleanNullable]).columns)

# (primitives, columns) pairs, applied in order so that each primitive's
# column list is extended in the same sequence as before.
column_plan = [
    # Date
    (('first', 'last'), ['tran_date']),
    (('mean',), [
        'tran_date_day_of_week_sin', 'tran_date_day_of_week_cos',
        'tran_date_day_of_month_sin', 'tran_date_day_of_month_cos',
    ]),
    # Numeric
    (('mean', 'std', 'skew', 'kurtosis', 'min', 'max', 'sum'), num_cols),
    # Categorical
    (('num_unique', 'mode', 'entropy'), cat_cols),
    # Boolean
    (('percent_true',), bool_cols),
]
for prims, cols in column_plan:
    for prim in prims:
        primitive_options[prim]['include_columns'][df_name].extend(cols)

# Interesting values: these drive the "WHERE <column> = <value>" features.
interesting_values = {
    'cat_c2': [4, 14, 15],
    'cat_c3': [209, 303, 305, 314],
    'int_c19': [-1, 1],
    'fl_c12': [True, False],
    'fl_c13': [True, False],
    'fl_c14': [True, False],
    'tran_date_is_weekend': [True, False],
}
es.add_interesting_values(dataframe_name=df_name, values=interesting_values)

In [None]:
"""Activities"""

df_name = 'activities'

num_cols = ['float_c11', 'float_c12', 'float_c13', 'float_c14', 'float_c15', 'float_c16', 'float_c17']
cat_cols = list(es[df_name].ww.select(Categorical).columns)
bool_cols = list(es[df_name].ww.select([Boolean, BooleanNullable]).columns)

# (primitives, columns) pairs, applied in order so that each primitive's
# column list is extended in the same sequence as before.
column_plan = [
    # Date
    (('first', 'last'), ['activity_date']),
    (('mean',), [
        'activity_date_day_of_week_sin', 'activity_date_day_of_week_cos',
        'activity_date_day_of_month_sin', 'activity_date_day_of_month_cos',
    ]),
    # Numeric
    (('mean', 'std', 'skew', 'kurtosis', 'min', 'max', 'sum'), num_cols),
    # Categorical
    (('num_unique', 'mode', 'entropy'), cat_cols),
    # Boolean
    (('percent_true',), bool_cols),
]
for prims, cols in column_plan:
    for prim in prims:
        primitive_options[prim]['include_columns'][df_name].extend(cols)

# Interesting values: these drive the "WHERE <column> = <value>" features.
interesting_values = {
    'cat_c4': [1, 2],
    'cat_c6': [1, 2, 3],
    'cat_c9': [1, 2],
    'cat_c8': [True, False],
    'cat_c10': [True, False],
    'activity_date_is_weekend': [True, False],
}
es.add_interesting_values(dataframe_name=df_name, values=interesting_values)

In [None]:
"""Communications"""

df_name = 'comms'

cat_cols = list(es[df_name].ww.select(Categorical).columns)
bool_cols = list(es[df_name].ww.select([Boolean, BooleanNullable]).columns)

# (primitives, columns) pairs, applied in order so that each primitive's
# column list is extended in the same sequence as before. This table has no
# dedicated numeric section.
column_plan = [
    # Date
    (('first', 'last'), ['contact_date']),
    (('mean',), [
        'contact_date_day_of_week_sin', 'contact_date_day_of_week_cos',
        'contact_date_day_of_month_sin', 'contact_date_day_of_month_cos',
    ]),
    # Categorical
    (('num_unique', 'mode', 'entropy'), cat_cols),
    # Boolean
    (('percent_true',), bool_cols),
]
for prims, cols in column_plan:
    for prim in prims:
        primitive_options[prim]['include_columns'][df_name].extend(cols)

# Interesting values: these drive the "WHERE <column> = <value>" features.
interesting_values = {
    'cat_c2': ['S3564', 'S3565', 'S3677', 'S3769'],
    'cat_c3': [3, 4, 7],
    'cat_c4': [1, 2],
    'cat_c5': ['4', '7'],
    'contact_date_is_weekend': [True, False],
}
es.add_interesting_values(dataframe_name=df_name, values=interesting_values)

In [None]:
# Cut-off times for DFS: one row per client, with the client's
# communication_month exposed under the column name 'time'.
cutoff_df = (
    clients_df[['client_id', 'communication_month']]
    .rename(columns={'communication_month': 'time'})
)

In [None]:
# Depth-1 Deep Feature Synthesis on the clients table, evaluated at each
# client's own cut-off time.
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_dataframe_name='clients',
    cutoff_time=cutoff_df,
    agg_primitives=agg_primitives,
    trans_primitives=['day', 'month', 'weekday', 'is_weekend'],
    where_primitives=['mean', 'sum', 'count'],
    primitive_options=primitive_options,
    max_depth=1,
    verbose=True,
    features_only=False,
    n_jobs=1,
    # A chunk as large as the clients table.
    chunk_size=es.dataframe_dict['clients'].shape[0],
    return_types='all',
)

# Number of most frequent categories kept for each MODE feature that is
# encoded. The keys double as the list of features to encode, so the two can
# no longer drift apart.
top_n_per_feature = {
    # Transactions
    'MODE(transactions.cat_c2)': 10,
    'MODE(transactions.cat_c3)': 10,
    'MODE(transactions.cat_c4)': 5,

    # Activities
    'MODE(activities.cat_c3)': 3,
    'MODE(activities.cat_c4)': 2,
    'MODE(activities.cat_c5)': 1,
    'MODE(activities.cat_c6)': 7,
    'MODE(activities.cat_c9)': 2,

    # Communications
    'MODE(comms.cat_c2)': 10,
    'MODE(comms.cat_c3)': 3,
    'MODE(comms.cat_c4)': 3,
    'MODE(comms.cat_c5)': 3,
}
feature_matrix_enc, features_enc = ft.encode_features(
    feature_matrix,
    feature_defs,
    top_n=top_n_per_feature,
    to_encode=list(top_n_per_feature),
)
ft.save_features(features_enc, '../data/features/feature_definitions_v5_single.json')
# NOTE(review): index=False leaves the feature-matrix index out of the raw
# dump — confirm the row identifiers are not needed in this file.
feature_matrix_enc.to_csv('../data/features/features_auto_v5_single_raw.csv', index=False)

display(features_enc)
display(feature_matrix_enc.head())
# DataFrame.info() prints its report and returns None, so it is called
# directly; wrapping it in display() would additionally echo "None".
feature_matrix_enc.info(verbose=True, show_counts=True)
display(feature_matrix_enc.describe())

## Ratios between the last 45-day and 90-day periods

In [None]:
def _windowed_feature_matrix(training_window):
    """Run the reduced DFS configuration with the given training window.

    Args:
        training_window: Window string passed to ``ft.dfs`` (e.g. '45 days').

    Returns:
        The feature matrix computed for the ``clients`` dataframe.
    """
    matrix, _ = ft.dfs(
        entityset=es,
        target_dataframe_name='clients',
        cutoff_time=cutoff_df,
        training_window=training_window,
        agg_primitives=['count', 'sum', 'mean', 'std', 'percent_true'],
        trans_primitives=[],
        where_primitives=['count'],
        primitive_options=primitive_options,
        max_depth=1,
        verbose=True,
        features_only=False,
        n_jobs=1,
        chunk_size=es.dataframe_dict['clients'].shape[0],
        return_types='all',
    )
    return matrix


# Rename to make columns unique
fm_45d = _windowed_feature_matrix('45 days').add_prefix('D45_')
fm_90d = _windowed_feature_matrix('90 days').add_prefix('D90_')

# Columns copied from the clients table that must not be turned into ratios
# (the boolean ones would otherwise pass the numeric-dtype check).
non_feature_cols = {'client_id', 'target', 'is_train'}

# Pair the two windows by feature name rather than by column position, and
# collect the ratios first so the frame is built in a single step instead of
# being grown one column at a time.
trend_columns = {}
for col_45 in fm_45d.columns:
    col = col_45[len('D45_'):]
    col_90 = f'D90_{col}'
    if col in non_feature_cols or col_90 not in fm_90d.columns:
        continue

    if pd.api.types.is_numeric_dtype(fm_45d[col_45]):
        # A zero 90-day value becomes NaN so the division cannot yield inf.
        trend_columns[f'TREND_45_90_{col}'] = fm_45d[col_45] / fm_90d[col_90].replace(0, np.nan)

# Undefined ratios (missing values or a zero denominator) are encoded as 0.
trend_features = pd.DataFrame(trend_columns, index=fm_45d.index).fillna(0)

display(trend_features.head())
# DataFrame.info() prints its report and returns None, so it is called
# directly; wrapping it in display() would additionally echo "None".
trend_features.info(verbose=True, show_counts=True)

feature_matrix_enc = feature_matrix_enc.merge(trend_features, on='client_id', how='left')

## Other features

Extra unique features

In [None]:
# Add a categorical calendar-day column to each event table.
for events, date_col, day_col in (
    (transactions_df, 'tran_date', 'tran_date_str'),
    (activities_df, 'activity_date', 'activity_date_str'),
    (comms_df, 'contact_date', 'contact_date_str'),
):
    events[day_col] = events[date_col].dt.date.astype('category')

# Per-client distinct counts (active days per table, plus devices).
tx_by_client = transactions_df.groupby('client_id')
act_by_client = activities_df.groupby('client_id')
comms_by_client = comms_df.groupby('client_id')

tx_dates_unique = (
    tx_by_client
    .agg(tran_date_str_nunique=('tran_date_str', 'nunique'))
    .reset_index(names='client_id')
)
act_dates_unique = (
    act_by_client
    .agg(activity_date_str_nunique=('activity_date_str', 'nunique'))
    .reset_index(names='client_id')
)
act_devices_unique = (
    act_by_client
    .agg(device_id_nunique=('device_id', 'nunique'))
    .reset_index(names='client_id')
)
comms_dates_unique = (
    comms_by_client
    .agg(contact_date_str_nunique=('contact_date_str', 'nunique'))
    .reset_index(names='client_id')
)

# Left-join the counts onto the feature matrix; the order fixes the order of
# the appended columns.
for extra_counts in (comms_dates_unique, act_dates_unique, act_devices_unique, tx_dates_unique):
    feature_matrix_enc = feature_matrix_enc.merge(extra_counts, on='client_id', how='left')

Cross-table interaction features

In [None]:
# Event counts per client; a zero denominator is replaced by 1 so the ratio
# stays finite.
n_activities = feature_matrix_enc['COUNT(activities)']
n_comms = feature_matrix_enc['COUNT(comms)']
safe_n_transactions = feature_matrix_enc['COUNT(transactions)'].replace(0, 1)
safe_n_activities = n_activities.replace(0, 1)

feature_matrix_enc['INTERACTION_activity_per_trans'] = n_activities / safe_n_transactions
feature_matrix_enc['INTERACTION_contact_per_trans'] = n_comms / safe_n_transactions
feature_matrix_enc['INTERACTION_contact_per_activity'] = n_comms / safe_n_activities

Financial balance

In [None]:
# Sums of float_c18 over transactions with int_c19 == 1 and int_c19 == -1.
pos_sum = feature_matrix_enc['SUM(transactions.float_c18 WHERE int_c19 = 1)']
neg_sum = feature_matrix_enc['SUM(transactions.float_c18 WHERE int_c19 = -1)']
feature_matrix_enc['BALANCE_net_flow'] = pos_sum - neg_sum
# A zero denominator is mapped to NaN so the ratio is marked as missing
# instead of becoming +/-inf (same guard as the 45/90-day trend ratios).
feature_matrix_enc['BALANCE_savings_potential'] = pos_sum / neg_sum.replace(0, np.nan)

## Add more time-series features

In [None]:
def prep_time_gaps(df, date_col, prefix):
    df = df.sort_values(by=['client_id', date_col])

    gap_col_name = f'{prefix}_gap_days'
    df[gap_col_name] = df.groupby('client_id')[date_col].diff().dt.total_seconds() / (3600 * 24)
    df[gap_col_name] = df[gap_col_name].fillna(0)

    return df[['client_id', date_col, gap_col_name]]

df_transactions_ts = prep_time_gaps(transactions_df, 'tran_date', 'tran_date')
# Difference between consecutive transaction amounts of the same client.
# The frame is sorted by client and date first, so "consecutive" means
# chronological (the same ordering used for the gap column) rather than
# whatever row order transactions_df currently has. The assignment aligns on
# the index labels.
df_transactions_ts['float_c18_diff'] = (
    transactions_df
    .sort_values(by=['client_id', 'tran_date'])
    .groupby('client_id')['float_c18']
    .diff()
    .fillna(0)
)
df_activities_ts = prep_time_gaps(activities_df, 'activity_date', 'activity_date')
df_comms_ts = prep_time_gaps(comms_df, 'contact_date', 'contact_date')

display(df_transactions_ts.head(10))

In [None]:
# name -> (time-series frame, sort column, value columns to summarise)
datasets = {
    'transactions': (df_transactions_ts, 'tran_date', ['tran_date_gap_days', 'float_c18_diff']),
    'activities': (df_activities_ts, 'activity_date', ['activity_date_gap_days']),
    'communications': (df_comms_ts, 'contact_date', ['contact_date_gap_days'])
}

# tsfresh calculators applied to every value column. The settings do not
# depend on the dataset or column, so they are defined once outside the loops.
fc_parameters = {
    'median': None,
    'mean': None,
    'maximum': None,
    'minimum': None,
    'standard_deviation': None,
    'skewness': None,
    'kurtosis': None,
    'quantile': [
        {'q': 0.1},
        {'q': 0.25},
        {'q': 0.75},
        {'q': 0.9}
    ],
    'linear_trend': [{'attr': 'slope'}],
    'approximate_entropy': [{'m': 2, 'r': 0.25}],
    'ratio_beyond_r_sigma': [{'r': 2}],
    'autocorrelation': [{'lag': 1}],
    'fft_aggregated': [{'aggtype': 'centroid'}, {'aggtype': 'variance'}],
    'longest_strike_above_mean': None,
    'longest_strike_below_mean': None,
    'count_above_mean': None,
}

all_features = []
for name, (df, date_col, val_cols) in datasets.items():
    print(f'Processing {name}...')

    for val_col in val_cols:
        features = extract_features(
            timeseries_container=df,
            column_id='client_id',
            column_sort=date_col,
            column_value=val_col,
            default_fc_parameters=fc_parameters,
            n_jobs=14,
        )
        # Turn the id index into a regular 'client_id' column for merging.
        all_features.append(features.reset_index(names='client_id'))

# Left-join every tsfresh block onto the DFS feature matrix.
final_features = feature_matrix_enc
for features in all_features:
    final_features = final_features.merge(features, on='client_id', how='left')

# DataFrame.info() prints its report and returns None, so it is called
# directly; wrapping it in display() would additionally echo "None".
final_features.info(verbose=True, show_counts=True)

## Postprocessing

In [None]:
df = final_features
# Merging on 'client_id' against frames that hold it as a regular column
# leaves client_id as a column with a positional index. In that case the
# index is discarded; a plain reset_index() would add a meaningless 'index'
# column to the exported features. If client_id is still the index, it is
# moved into a column as before.
if 'client_id' in df.columns:
    df = df.reset_index(drop=True)
else:
    df = df.reset_index()

# Drop clients that lack data in any of the three event tables. The
# distinct-day counts are the '*_date_str_nunique' columns merged in by the
# "Extra unique features" cell; no 'NUM_UNIQUE(...*_date_str)' features are
# generated, so naming those here would raise a KeyError.
df = df.dropna(subset=[
    # Transactions
    'tran_date_str_nunique',
    'MEAN(transactions.float_c16 WHERE int_c19 = -1)',
    'MEAN(transactions.float_c16 WHERE int_c19 = 1)',

    # Activities
    'activity_date_str_nunique',
    'STD(activities.float_c11)',

    # Communications
    'contact_date_str_nunique',
])

# Convert dates to days from initial point
origin = pd.to_datetime(MIN_DATE_STR)
for date_col in df.select_dtypes(include=['datetime64[ns]']).columns:
    df[date_col + '_days'] = (df[date_col] - origin).dt.days
    df = df.drop(columns=[date_col])

# Encode cyclical features (e.g. day of week, month)
time_cols = ['DAY(communication_month)', 'MONTH(communication_month)', 'WEEKDAY(communication_month)']
df = df.astype({col: 'int8' for col in time_cols})
cyclical = CyclicalFeatures(variables=time_cols, drop_original=True)
df = cyclical.fit_transform(df)

# Fill missing data: conditional means, skewness/kurtosis, autocorrelation and
# FFT aggregates get a missing-indicator column and are then mean-imputed.
cols_to_fill_with_mean = [
    c for c in df.columns
    if (
        # Parenthesised to make the and/or precedence explicit.
        (' WHERE ' in c and c.startswith('MEAN('))
        or c.startswith('SKEW(')
        or c.endswith(('__skewness', '__kurtosis'))
        or '__autocorrelation' in c
        or '__fft_aggregated__aggtype' in c
    )
]
ami = AddMissingIndicator(variables=cols_to_fill_with_mean)
df = ami.fit_transform(df)
mmi = MeanMedianImputer(imputation_method='mean', variables=cols_to_fill_with_mean)
df = mmi.fit_transform(df)

# Convert boolean values to 0 and 1 (nullable booleans become nullable Int8).
df = df.astype({col: 'int8' for col in df.select_dtypes(include=['bool']).columns})
df = df.astype({col: 'Int8' for col in df.select_dtypes(include=['boolean']).columns})

df.to_csv('../data/features/features_auto_v5_single.csv', index=False)
# DataFrame.info() prints its report and returns None, so it is called
# directly; wrapping it in display() would additionally echo "None".
df.info(verbose=True, show_counts=True)
display(df.describe())
