In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

from utils import get_series_first_mode_or_nan

# Transactions

In [2]:
from utils import read_transactions

# Load the transactions sample through the project reader and print its
# schema (column names, dtypes, non-null counts) as a quick sanity check.
TRANSACTIONS_CSV = 'data/samples/TRANSACTIONS_SAMPLE.csv'
transactions = read_transactions(TRANSACTIONS_CSV)
transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 787188 entries, 0 to 787187
Data columns (total 21 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   client_id  787188 non-null  uint64        
 1   tran_date  787188 non-null  datetime64[ns]
 2   cat_c2     787188 non-null  int32         
 3   cat_c3     787188 non-null  int32         
 4   cat_c4     787188 non-null  int32         
 5   fl_c6      787188 non-null  int8          
 6   fl_c7      787188 non-null  int8          
 7   fl_c8      787188 non-null  int8          
 8   fl_c9      787188 non-null  int8          
 9   fl_c10     787188 non-null  int8          
 10  fl_c11     787188 non-null  int8          
 11  fl_c12     787188 non-null  int8          
 12  fl_c13     787188 non-null  int8          
 13  fl_c14     787188 non-null  int8          
 14  fl_c15     787188 non-null  int8          
 15  float_c16  787188 non-null  float32       
 16  float_c17  787188 no

In [3]:
# Date variables
#
# Per-client timing features derived from `tran_date`:
#   tx_count            - number of non-null `tran_date` rows
#   tx_days             - number of distinct `tran_date` values
#   tx_avg_per_day      - tx_count / tx_days
#   tx_period_days      - whole days between the client's first and last transaction
#   days_since_last_tx  - whole days from the client's last transaction to the
#                         latest `tran_date` in the whole dataset
#   first_tx_day / last_tx_day - first / last transaction date expressed as a
#                         day offset from the earliest `tran_date` in the dataset

# Dataset-wide anchors used to turn absolute dates into day offsets.
reference_date = transactions['tran_date'].max()
min_date = transactions['tran_date'].min()

tx_agg_dates = (
    transactions
    .groupby('client_id')
    .agg(
        tx_count=('tran_date', 'count'),
        tx_days=('tran_date', 'nunique'),
        first_tx_date=('tran_date', 'min'),
        last_tx_date=('tran_date', 'max'),
    )
    .reset_index()
    .assign(
        tx_avg_per_day=lambda df: df['tx_count'] / df['tx_days'],
        tx_period_days=lambda df: (df['last_tx_date'] - df['first_tx_date']).dt.days,
        days_since_last_tx=lambda df: (reference_date - df['last_tx_date']).dt.days,
        # Replace dates with days from starting point
        first_tx_day=lambda df: (df['first_tx_date'] - min_date).dt.days,
        last_tx_day=lambda df: (df['last_tx_date'] - min_date).dt.days,
    )
    # The raw timestamps are no longer needed once the offsets exist.
    .drop(columns=['first_tx_date', 'last_tx_date'])
)


In [4]:
# Numerical variables
#
# For every numeric transaction column compute mean / std / min / max / sum
# per client and flatten the result into columns named "tx_<column>_<stat>"
# (for example "tx_float_c16_mean").

num_cols = ['float_c16', 'float_c17', 'float_c18', 'int_c19', 'float_c20', 'float_c21']

tx_agg_nums = (
    transactions
    .groupby('client_id')[num_cols]
    .agg(['mean', 'std', 'min', 'max', 'sum'])
)
# The aggregation yields (column, statistic) pairs as column labels.
tx_agg_nums.columns = [f'tx_{column}_{stat}' for column, stat in tx_agg_nums.columns]
tx_agg_nums = tx_agg_nums.reset_index()

# Handle NULL-values in "_std" columns
# A missing std is recorded in the `has_single_tx` indicator before being
# replaced by 0.0, so the information is not lost by the fill.
std_cols = [name for name in tx_agg_nums.columns if name.endswith('_std')]
std_is_missing = tx_agg_nums[std_cols].isna()
tx_agg_nums['has_single_tx'] = std_is_missing.any(axis=1).astype('int8')
tx_agg_nums[std_cols] = tx_agg_nums[std_cols].fillna(0.0)


In [5]:
# Flag variables
#
# Per-client mean of every `fl_*` column, exposed as "tx_<flag>_ratio".
# NOTE(review): the mean is a share of flagged rows only if the flags are
# 0/1 encoded - confirm against the data dictionary.

flag_cols = [name for name in transactions.columns if name.startswith('fl_')]

tx_agg_flags = transactions.groupby('client_id')[flag_cols].mean().reset_index()
# After reset_index the first column is the grouping key, followed by the flags.
tx_agg_flags.columns = ['client_id'] + ['tx_' + name + '_ratio' for name in flag_cols]


In [6]:
# Categorical variables
#
# For each categorical transaction column build two per-client features:
#   tx_<col>_nunique - number of distinct category values seen
#   tx_<col>_mode    - result of `get_series_first_mode_or_nan` on the group
#                      (presumably the first mode, or NaN - see utils)

cat_cols = ['cat_c2', 'cat_c3', 'cat_c4']
aggregates = {
    **{f'tx_{col}_nunique': (col, 'nunique') for col in cat_cols},
    **{f'tx_{col}_mode': (col, get_series_first_mode_or_nan) for col in cat_cols},
}

tx_agg_cat = (
    transactions
    .groupby('client_id')
    .agg(**aggregates)
    .reset_index()
)


In [7]:
# Merge all features
#
# Start from the date features and left-join every other per-client
# transaction feature table onto it by `client_id`.

tx_features = tx_agg_dates
for tx_part in (tx_agg_nums, tx_agg_flags, tx_agg_cat):
    tx_features = tx_features.merge(tx_part, on='client_id', how='left')

tx_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1763 entries, 0 to 1762
Data columns (total 55 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   client_id           1763 non-null   uint64 
 1   tx_count            1763 non-null   int64  
 2   tx_days             1763 non-null   int64  
 3   tx_avg_per_day      1763 non-null   float64
 4   tx_period_days      1763 non-null   int64  
 5   days_since_last_tx  1763 non-null   int64  
 6   first_tx_day        1763 non-null   int64  
 7   last_tx_day         1763 non-null   int64  
 8   tx_float_c16_mean   1763 non-null   float32
 9   tx_float_c16_std    1763 non-null   float32
 10  tx_float_c16_min    1763 non-null   float32
 11  tx_float_c16_max    1763 non-null   float32
 12  tx_float_c16_sum    1763 non-null   float32
 13  tx_float_c17_mean   1763 non-null   float32
 14  tx_float_c17_std    1763 non-null   float32
 15  tx_float_c17_min    1763 non-null   float32
 16  tx_flo

In [8]:
# Preview the first rows of the merged per-client transaction feature table.
tx_features.head()

Unnamed: 0,client_id,tx_count,tx_days,tx_avg_per_day,tx_period_days,days_since_last_tx,first_tx_day,last_tx_day,tx_float_c16_mean,tx_float_c16_std,...,tx_fl_c12_ratio,tx_fl_c13_ratio,tx_fl_c14_ratio,tx_fl_c15_ratio,tx_cat_c2_nunique,tx_cat_c3_nunique,tx_cat_c4_nunique,tx_cat_c2_mode,tx_cat_c3_mode,tx_cat_c4_mode
0,4130085634078954,740,141,5.248227,183,0,90,273,0.041514,0.226716,...,0.339189,0.306757,0.306757,0.062162,16,22,3,15,314,31
1,65971544459550091,273,88,3.102273,181,92,0,181,0.0,0.0,...,0.340659,0.131868,0.113553,0.0,10,9,1,14,314,31
2,68856188319167360,857,165,5.193939,180,92,1,181,0.05972,0.688369,...,0.518086,0.021004,0.021004,0.033839,16,17,3,15,314,31
3,69274198427839620,339,111,3.054054,177,31,65,242,0.008378,0.044569,...,0.345133,0.056047,0.050147,0.094395,13,12,1,14,314,31
4,72522681106258751,417,112,3.723214,180,62,31,211,0.027098,0.161232,...,0.357314,0.100719,0.100719,0.038369,11,11,2,14,314,31


In [9]:
# Summary statistics of the transaction feature table, used to eyeball value ranges.
tx_features.describe()

Unnamed: 0,client_id,tx_count,tx_days,tx_avg_per_day,tx_period_days,days_since_last_tx,first_tx_day,last_tx_day,tx_float_c16_mean,tx_float_c16_std,...,tx_fl_c12_ratio,tx_fl_c13_ratio,tx_fl_c14_ratio,tx_fl_c15_ratio,tx_cat_c2_nunique,tx_cat_c3_nunique,tx_cat_c4_nunique,tx_cat_c2_mode,tx_cat_c3_mode,tx_cat_c4_mode
count,1763.0,1763.0,1763.0,1763.0,1763.0,1763.0,1763.0,1763.0,1763.0,1763.0,...,1763.0,1763.0,1763.0,1763.0,1763.0,1763.0,1763.0,1763.0,1763.0,1763.0
mean,9.308963e+18,446.504821,101.947816,3.791044,163.754396,52.326716,56.918888,220.673284,0.026897,0.116163,...,0.28041,0.21407,0.192786,0.071459,11.861032,15.815655,2.179807,12.986954,295.872944,35.516166
std,5.218831e+18,489.784924,54.576261,4.067479,39.562783,35.786064,49.862259,35.786064,0.05457,0.262373,...,0.144949,0.169206,0.166513,0.10245,5.276624,10.424505,1.775735,6.023728,40.921865,56.759316
min,4130086000000000.0,1.0,1.0,1.0,0.0,0.0,0.0,39.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,5.0,31.0
25%,4.828662e+18,125.5,58.0,2.11183,173.0,31.0,31.0,181.0,0.0,0.0,...,0.181818,0.071429,0.050532,0.0,8.0,8.0,1.0,14.0,305.0,31.0
50%,9.485536e+18,306.0,107.0,2.989796,180.0,62.0,62.0,211.0,0.006192,0.040786,...,0.266029,0.193159,0.166667,0.02963,12.0,14.0,2.0,14.0,314.0,31.0
75%,1.365026e+19,620.0,152.0,4.398123,180.0,92.0,90.0,242.0,0.036091,0.145808,...,0.368028,0.321777,0.3,0.102363,16.0,21.0,3.0,15.0,314.0,31.0
max,1.844024e+19,6480.0,184.0,115.727273,183.0,234.0,271.0,273.0,0.839776,5.479157,...,1.0,0.816783,0.816783,1.0,29.0,74.0,15.0,73.0,419.0,840.0


# App activity

In [10]:
from utils import handle_activity_null_values, read_app_activity

# Load the app-activity sample and immediately pass it through the project's
# null-handling helper (see utils for the exact rules), then print the schema.
activities = handle_activity_null_values(
    read_app_activity('data/samples/APP_ACTIVITY_SAMPLE.csv')
)
activities.info()

<class 'pandas.core.frame.DataFrame'>
Index: 878373 entries, 0 to 882318
Data columns (total 13 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   client_id      878373 non-null  uint64        
 1   device_id      878373 non-null  uint64        
 2   activity_date  878373 non-null  datetime64[ns]
 3   cat_c3         878373 non-null  int32         
 4   cat_c4         878373 non-null  int32         
 5   cat_c5         878373 non-null  int32         
 6   cat_c6         878373 non-null  int32         
 7   cat_c8         878373 non-null  int8          
 8   cat_c9         878373 non-null  int32         
 9   cat_c10        878373 non-null  int8          
 10  float_c11      878373 non-null  float32       
 11  float_c12      878373 non-null  float32       
 12  float_c14      878373 non-null  float32       
dtypes: datetime64[ns](1), float32(3), int32(5), int8(2), uint64(2)
memory usage: 55.3 MB


In [11]:
# Date variables
#
# Per-client timing features derived from `activity_date`:
#   activity_count            - number of non-null `activity_date` rows
#   activity_days             - number of distinct `activity_date` values
#   activity_avg_per_day      - activity_count / activity_days
#   activity_period_days      - whole days between first and last activity
#   days_since_last_activity  - whole days from the client's last activity to
#                               the latest `activity_date` in the dataset
#   first_activity_day / last_activity_day - first / last activity date as a
#                               day offset from the earliest `activity_date`

# Dataset-wide anchors used to turn absolute dates into day offsets.
max_date = activities['activity_date'].max()
min_date = activities['activity_date'].min()

activities_agg_dates = (
    activities
    .groupby('client_id')
    .agg(
        activity_count=('activity_date', 'count'),
        activity_days=('activity_date', 'nunique'),
        first_activity_date=('activity_date', 'min'),
        last_activity_date=('activity_date', 'max'),
    )
    .reset_index()
    .assign(
        activity_avg_per_day=lambda df: df['activity_count'] / df['activity_days'],
        activity_period_days=lambda df: (df['last_activity_date'] - df['first_activity_date']).dt.days,
        days_since_last_activity=lambda df: (max_date - df['last_activity_date']).dt.days,
        # Replace dates with days from starting point
        first_activity_day=lambda df: (df['first_activity_date'] - min_date).dt.days,
        last_activity_day=lambda df: (df['last_activity_date'] - min_date).dt.days,
    )
    # The raw timestamps are no longer needed once the offsets exist.
    .drop(columns=['first_activity_date', 'last_activity_date'])
)

In [12]:
# Numerical variables
#
# For every numeric activity column compute mean / std / min / max / sum per
# client and flatten the result into columns named "activity_<column>_<stat>"
# (for example "activity_float_c11_mean").

num_cols = ['float_c11', 'float_c12', 'float_c14']

activity_agg_nums = (
    activities
    .groupby('client_id')[num_cols]
    .agg(['mean', 'std', 'min', 'max', 'sum'])
)
# The aggregation yields (column, statistic) pairs as column labels.
activity_agg_nums.columns = [f'activity_{column}_{stat}' for column, stat in activity_agg_nums.columns]
activity_agg_nums = activity_agg_nums.reset_index()

# Handle NULL-values in "_std" columns
# A missing std is recorded in the `has_single_activity` indicator before
# being replaced by 0.0, so the information is not lost by the fill.
std_cols = [name for name in activity_agg_nums.columns if name.endswith('_std')]
std_is_missing = activity_agg_nums[std_cols].isna()
activity_agg_nums['has_single_activity'] = std_is_missing.any(axis=1).astype('int8')
activity_agg_nums[std_cols] = activity_agg_nums[std_cols].fillna(0.0)


In [13]:
# Flag variables
#
# Per-client mean of the two int8 columns treated as flags, exposed as
# "activity_<column>_ratio".
# NOTE(review): the mean is a share of flagged rows only if `cat_c8` and
# `cat_c10` are 0/1 encoded - confirm against the data dictionary.

flag_cols = ['cat_c8', 'cat_c10']

activity_agg_flags = activities.groupby('client_id')[flag_cols].mean().reset_index()
# After reset_index the first column is the grouping key, followed by the flags.
activity_agg_flags.columns = ['client_id'] + ['activity_' + name + '_ratio' for name in flag_cols]


In [14]:
# Categorical variables
#
# For each categorical activity column build two per-client features:
#   activity_<col>_nunique - number of distinct category values seen
#   activity_<col>_mode    - result of `get_series_first_mode_or_nan` on the
#                            group (presumably the first mode, or NaN - see utils)

cat_cols = ['cat_c3', 'cat_c4', 'cat_c5', 'cat_c6', 'cat_c9']
aggregates = {
    **{f'activity_{col}_nunique': (col, 'nunique') for col in cat_cols},
    **{f'activity_{col}_mode': (col, get_series_first_mode_or_nan) for col in cat_cols},
}

activity_agg_cat = (
    activities
    .groupby('client_id')
    .agg(**aggregates)
    .reset_index()
)


In [15]:
# Extra features
#
# n_devices: number of distinct `device_id` values observed for each client.

activity_agg_extra = (
    activities
    .groupby('client_id')
    .agg(n_devices=('device_id', 'nunique'))
    .reset_index()
)

In [16]:
# Merge all features
#
# Start from the date features and left-join every other per-client
# activity feature table onto it by `client_id`.

activity_features = activities_agg_dates
for activity_part in (activity_agg_nums, activity_agg_flags, activity_agg_cat, activity_agg_extra):
    activity_features = activity_features.merge(activity_part, on='client_id', how='left')

activity_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1791 entries, 0 to 1790
Data columns (total 37 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   client_id                 1791 non-null   uint64 
 1   activity_count            1791 non-null   int64  
 2   activity_days             1791 non-null   int64  
 3   activity_avg_per_day      1791 non-null   float64
 4   activity_period_days      1791 non-null   int64  
 5   days_since_last_activity  1791 non-null   int64  
 6   first_activity_day        1791 non-null   int64  
 7   last_activity_day         1791 non-null   int64  
 8   activity_float_c11_mean   1791 non-null   float32
 9   activity_float_c11_std    1791 non-null   float32
 10  activity_float_c11_min    1791 non-null   float32
 11  activity_float_c11_max    1791 non-null   float32
 12  activity_float_c11_sum    1791 non-null   float32
 13  activity_float_c12_mean   1791 non-null   float32
 14  activity

# Communications

In [17]:
from utils import encode_comm_categories, read_communications

# Load the communications sample and run it through the project's category
# encoder (see utils for the mapping), then print the resulting schema.
comms = encode_comm_categories(
    read_communications('data/samples/COMMUNICATIONS_SAMPLE.csv')
)
comms.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 978647 entries, 0 to 978646
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   client_id     978647 non-null  uint64        
 1   contact_date  978647 non-null  datetime64[ns]
 2   cat_c2        978647 non-null  int32         
 3   cat_c3        978647 non-null  int32         
 4   cat_c4        978647 non-null  int32         
 5   cat_c5        978647 non-null  int32         
dtypes: datetime64[ns](1), int32(4), uint64(1)
memory usage: 29.9 MB


In [18]:
# Preview the first ten communication rows after category encoding.
comms.head(10)

Unnamed: 0,client_id,contact_date,cat_c2,cat_c3,cat_c4,cat_c5
0,4130085634078954,2025-08-21,707,4,1,3
1,4130085634078954,2025-08-21,671,4,1,3
2,4130085634078954,2025-08-21,672,4,1,3
3,4130085634078954,2025-08-21,671,4,1,3
4,4130085634078954,2025-08-21,678,4,1,3
5,4130085634078954,2025-04-03,355,3,2,3
6,4130085634078954,2025-04-03,678,4,1,3
7,4130085634078954,2025-04-03,672,4,1,3
8,4130085634078954,2025-07-13,672,4,1,3
9,4130085634078954,2025-07-13,671,4,1,3


In [19]:
# Summary statistics of the encoded communications table (date range and code ranges).
comms.describe()

Unnamed: 0,client_id,contact_date,cat_c2,cat_c3,cat_c4,cat_c5
count,978647.0,978647,978647.0,978647.0,978647.0,978647.0
mean,9.45011e+18,2025-04-19 04:28:48.549722368,642.984989,4.020295,1.094241,3.241593
min,4130086000000000.0,2024-12-01 00:00:00,0.0,1.0,1.0,0.0
25%,5.043164e+18,2025-03-05 00:00:00,671.0,4.0,1.0,3.0
50%,9.798488e+18,2025-04-22 00:00:00,673.0,4.0,1.0,3.0
75%,1.384771e+19,2025-06-04 00:00:00,678.0,4.0,1.0,3.0
max,1.844024e+19,2025-08-31 00:00:00,803.0,8.0,3.0,49.0
std,5.227966e+18,,134.969626,0.414063,0.322024,2.399612


In [20]:
# Date variables
#
# Per-client timing features derived from `contact_date`:
#   comm_count            - number of non-null `contact_date` rows
#   comm_days             - number of distinct `contact_date` values
#   comm_avg_per_day      - comm_count / comm_days
#   comm_period_days      - whole days between first and last communication
#   days_since_last_comm  - whole days from the client's last communication to
#                           the latest `contact_date` in the dataset
#   first_comm_day / last_comm_day - first / last communication date as a day
#                           offset from the earliest `contact_date`

# Dataset-wide anchors used to turn absolute dates into day offsets.
max_date = comms['contact_date'].max()
min_date = comms['contact_date'].min()

comms_agg_dates = (
    comms
    .groupby('client_id')
    .agg(
        comm_count=('contact_date', 'count'),
        comm_days=('contact_date', 'nunique'),
        first_comm_date=('contact_date', 'min'),
        last_comm_date=('contact_date', 'max'),
    )
    .reset_index()
    .assign(
        comm_avg_per_day=lambda df: df['comm_count'] / df['comm_days'],
        comm_period_days=lambda df: (df['last_comm_date'] - df['first_comm_date']).dt.days,
        days_since_last_comm=lambda df: (max_date - df['last_comm_date']).dt.days,
        # Replace dates with days from starting point
        first_comm_day=lambda df: (df['first_comm_date'] - min_date).dt.days,
        last_comm_day=lambda df: (df['last_comm_date'] - min_date).dt.days,
    )
    # The raw timestamps are no longer needed once the offsets exist.
    .drop(columns=['first_comm_date', 'last_comm_date'])
)

In [21]:
# Categorical variables
#
# For each categorical communication column build two per-client features:
#   comm_<col>_nunique - number of distinct category values seen
#   comm_<col>_mode    - result of `get_series_first_mode_or_nan` on the group
#                        (presumably the first mode, or NaN - see utils)

cat_cols = ['cat_c2', 'cat_c3', 'cat_c4', 'cat_c5']
# Output column name -> (source column, aggregation); nunique features come
# first, then mode features, which fixes the column order of the result.
aggregates = {f'comm_{col}_nunique': (col, 'nunique') for col in cat_cols}
aggregates |= {f'comm_{col}_mode': (col, get_series_first_mode_or_nan) for col in cat_cols}

comms_agg_cat = comms.groupby('client_id').agg(**aggregates).reset_index()

In [22]:
# Merge features
# Left-join the categorical features onto the date features by `client_id`.
comms_features = pd.merge(
    comms_agg_dates,
    comms_agg_cat,
    on='client_id',
    how='left',
)
comms_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1797 entries, 0 to 1796
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   client_id             1797 non-null   uint64 
 1   comm_count            1797 non-null   int64  
 2   comm_days             1797 non-null   int64  
 3   comm_avg_per_day      1797 non-null   float64
 4   comm_period_days      1797 non-null   int64  
 5   days_since_last_comm  1797 non-null   int64  
 6   first_comm_day        1797 non-null   int64  
 7   last_comm_day         1797 non-null   int64  
 8   comm_cat_c2_nunique   1797 non-null   int64  
 9   comm_cat_c3_nunique   1797 non-null   int64  
 10  comm_cat_c4_nunique   1797 non-null   int64  
 11  comm_cat_c5_nunique   1797 non-null   int64  
 12  comm_cat_c2_mode      1797 non-null   int32  
 13  comm_cat_c3_mode      1797 non-null   int32  
 14  comm_cat_c4_mode      1797 non-null   int32  
 15  comm_cat_c5_mode     

In [23]:
# Summary statistics of the per-client communication feature table.
comms_features.describe()

Unnamed: 0,client_id,comm_count,comm_days,comm_avg_per_day,comm_period_days,days_since_last_comm,first_comm_day,last_comm_day,comm_cat_c2_nunique,comm_cat_c3_nunique,comm_cat_c4_nunique,comm_cat_c5_nunique,comm_cat_c2_mode,comm_cat_c3_mode,comm_cat_c4_mode,comm_cat_c5_mode
count,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0
mean,9.32022e+18,544.600445,116.008904,4.0953,171.503617,49.119644,52.376739,223.880356,61.411797,3.388425,2.985531,5.156928,658.391764,3.998331,1.067334,2.991653
std,5.207803e+18,549.452397,47.559211,3.638602,30.20523,33.104741,45.120872,33.104741,17.998875,0.655149,0.132695,1.645292,94.321366,0.139589,0.257247,0.187105
min,4130086000000000.0,1.0,1.0,1.0,0.0,0.0,0.0,124.0,1.0,1.0,1.0,1.0,20.0,1.0,1.0,0.0
25%,4.83241e+18,181.0,80.0,2.193548,177.0,31.0,31.0,205.0,50.0,3.0,3.0,4.0,672.0,4.0,1.0,3.0
50%,9.516883e+18,378.0,123.0,3.185714,180.0,62.0,62.0,211.0,61.0,3.0,3.0,5.0,672.0,4.0,1.0,3.0
75%,1.364283e+19,745.0,159.0,4.928571,181.0,68.0,90.0,242.0,72.0,4.0,3.0,6.0,678.0,4.0,1.0,3.0
max,1.844024e+19,6525.0,184.0,92.571429,183.0,149.0,267.0,273.0,144.0,6.0,3.0,13.0,749.0,8.0,3.0,6.0


# Combine features from all datasets

In [33]:
# Combine the three per-client feature tables. Inner joins keep only clients
# that appear in all of transactions, app activity and communications, so the
# result has no missing feature values but may have fewer rows than any input.
# `validate='one_to_one'` makes pandas raise a MergeError if a table ever
# carries more than one row per client_id, instead of silently multiplying rows.
client_features = (
    tx_features
    .merge(activity_features, on='client_id', how='inner', validate='one_to_one')
    .merge(comms_features, on='client_id', how='inner', validate='one_to_one')
)

In [34]:
# Shrink numeric dtypes: float64 -> float32 always, int64 -> int32 only when
# every value of the column fits. All casts are collected first and applied
# with a single `astype` call, so the frame is copied once rather than once
# per converted column.

# Reduce float precision to float32
float32_casts = {
    col: 'float32'
    for col in client_features.select_dtypes(include=['float64']).columns
}

# Downcast int64 to int32
int32_limits = np.iinfo(np.int32)
int32_casts = {}
for col in client_features.select_dtypes(include=['int64']).columns:
    if (client_features[col].min() >= int32_limits.min) and (client_features[col].max() <= int32_limits.max):
        int32_casts[col] = 'int32'
    else:
        # Column keeps its int64 dtype; report it so the reader notices.
        print(f'Column {col} exceeds int32 range')

client_features = client_features.astype({**float32_casts, **int32_casts})

In [35]:
# Print the full column list with dtypes and non-null counts of the combined feature table.
client_features.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1761 entries, 0 to 1760
Data columns (total 106 columns):
 #    Column                    Non-Null Count  Dtype  
---   ------                    --------------  -----  
 0    client_id                 1761 non-null   uint64 
 1    tx_count                  1761 non-null   int32  
 2    tx_days                   1761 non-null   int32  
 3    tx_avg_per_day            1761 non-null   float32
 4    tx_period_days            1761 non-null   int32  
 5    days_since_last_tx        1761 non-null   int32  
 6    first_tx_day              1761 non-null   int32  
 7    last_tx_day               1761 non-null   int32  
 8    tx_float_c16_mean         1761 non-null   float32
 9    tx_float_c16_std          1761 non-null   float32
 10   tx_float_c16_min          1761 non-null   float32
 11   tx_float_c16_max          1761 non-null   float32
 12   tx_float_c16_sum          1761 non-null   float32
 13   tx_float_c17_mean         1761 non-null   floa

In [38]:
# Persist the combined per-client feature table.
# NOTE(review): CSV is a text format and does not store dtypes, so any reader
# of this file has to specify the intended column dtypes again when loading.
features_path = Path('data/features/features_1.csv')
# `to_csv` does not create missing directories; make sure the target folder
# exists so the export also works on a fresh checkout.
features_path.parent.mkdir(parents=True, exist_ok=True)
client_features.to_csv(features_path, index=False)