In [1]:
import pandas as pd
import numpy as np

from utils import fill_feature_nan_values, get_series_first_mode_or_nan

# Transactions

In [2]:
from utils import read_transactions

TRANSACTIONS_PATH = 'data/initial/TRANSACTIONS.csv'
# TRANSACTIONS_PATH = 'data/samples/TRANSACTIONS_SAMPLE.csv'

transactions = read_transactions(TRANSACTIONS_PATH)
transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15654626 entries, 0 to 15654625
Data columns (total 21 columns):
 #   Column     Dtype         
---  ------     -----         
 0   client_id  uint64        
 1   tran_date  datetime64[ns]
 2   cat_c2     int32         
 3   cat_c3     int32         
 4   cat_c4     int32         
 5   fl_c6      int8          
 6   fl_c7      int8          
 7   fl_c8      int8          
 8   fl_c9      int8          
 9   fl_c10     int8          
 10  fl_c11     int8          
 11  fl_c12     int8          
 12  fl_c13     int8          
 13  fl_c14     int8          
 14  fl_c15     int8          
 15  float_c16  float32       
 16  float_c17  float32       
 17  float_c18  float32       
 18  int_c19    int32         
 19  float_c20  float32       
 20  float_c21  float32       
dtypes: datetime64[ns](1), float32(5), int32(4), int8(10), uint64(1)
memory usage: 925.6 MB


In [3]:
# Date variables

tx_agg_dates = transactions.groupby('client_id').agg(
    tx_count=('tran_date', 'count'),
    tx_days=('tran_date', 'nunique'),
    first_tx_date=('tran_date', 'min'),
    last_tx_date=('tran_date', 'max'),
).reset_index()

tx_agg_dates['tx_avg_per_day'] = tx_agg_dates['tx_count'] / tx_agg_dates['tx_days']
tx_agg_dates['tx_period_days'] = (tx_agg_dates['last_tx_date'] - tx_agg_dates['first_tx_date']).dt.days

reference_date = transactions['tran_date'].max()
tx_agg_dates['days_since_last_tx'] = (reference_date - tx_agg_dates['last_tx_date']).dt.days

# Replace dates with days from starting point
min_date = transactions['tran_date'].min()
tx_agg_dates['first_tx_day'] = (tx_agg_dates['first_tx_date'] - min_date).dt.days
tx_agg_dates['last_tx_day'] = (tx_agg_dates['last_tx_date'] - min_date).dt.days
tx_agg_dates = tx_agg_dates.drop(columns=['first_tx_date', 'last_tx_date'])


In [4]:
# Numerical variables

num_cols = ['float_c16', 'float_c17', 'float_c18', 'int_c19', 'float_c20', 'float_c21']

tx_agg_nums = transactions.groupby('client_id')[num_cols].agg(['mean', 'std', 'min', 'max', 'sum'])
tx_agg_nums.columns = ['tx_' + '_'.join(col) for col in tx_agg_nums.columns]
tx_agg_nums = tx_agg_nums.reset_index()

# Handle NULL-values in "_std" columns
std_cols = [c for c in tx_agg_nums.columns if c.endswith('_std')]
tx_agg_nums['has_single_tx'] = tx_agg_nums[std_cols].isna().any(axis=1).astype('int8')
tx_agg_nums[std_cols] = tx_agg_nums[std_cols].fillna(0.0)


In [5]:
# Flag variables

flag_cols = [col for col in transactions.columns if col.startswith('fl_')]

tx_agg_flags = transactions.groupby('client_id')[flag_cols].mean()
tx_agg_flags.columns = [f'tx_{c}_ratio' for c in flag_cols]
tx_agg_flags = tx_agg_flags.reset_index()


In [6]:
# Categorical variables

cat_cols = ['cat_c2', 'cat_c3', 'cat_c4']
aggregates = {f'tx_{col}_nunique': (col, 'nunique') for col in cat_cols}
aggregates |= {f'tx_{col}_mode': (col, get_series_first_mode_or_nan) for col in cat_cols}

tx_agg_cat = transactions.groupby('client_id').agg(**aggregates).reset_index()


In [7]:
# Merge all features

tx_features = (
    tx_agg_dates
    .merge(tx_agg_nums, on='client_id', how='left')
    .merge(tx_agg_flags, on='client_id', how='left')
    .merge(tx_agg_cat, on='client_id', how='left')
)
tx_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35070 entries, 0 to 35069
Data columns (total 55 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   client_id           35070 non-null  uint64 
 1   tx_count            35070 non-null  int64  
 2   tx_days             35070 non-null  int64  
 3   tx_avg_per_day      35070 non-null  float64
 4   tx_period_days      35070 non-null  int64  
 5   days_since_last_tx  35070 non-null  int64  
 6   first_tx_day        35070 non-null  int64  
 7   last_tx_day         35070 non-null  int64  
 8   tx_float_c16_mean   35070 non-null  float32
 9   tx_float_c16_std    35070 non-null  float32
 10  tx_float_c16_min    35070 non-null  float32
 11  tx_float_c16_max    35070 non-null  float32
 12  tx_float_c16_sum    35070 non-null  float32
 13  tx_float_c17_mean   35070 non-null  float32
 14  tx_float_c17_std    35070 non-null  float32
 15  tx_float_c17_min    35070 non-null  float32
 16  tx_f

# App activity

In [8]:
from utils import preprocess_app_activity_data, read_app_activity

ACTIVITY_PATH = 'data/initial/APP_ACTIVITY.csv'
# ACTIVITY_PATH = 'data/samples/APP_ACTIVITY_SAMPLE.csv'

activities = read_app_activity(ACTIVITY_PATH)
# activities = preprocess_app_activity_data(activities)
activities.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17738592 entries, 0 to 17738591
Data columns (total 17 columns):
 #   Column         Non-Null Count     Dtype         
---  ------         --------------     -----         
 0   client_id      17738592 non-null  uint64        
 1   device_id      17738592 non-null  uint64        
 2   activity_date  17738592 non-null  datetime64[ns]
 3   cat_c3         17734699 non-null  Int32         
 4   cat_c4         17600200 non-null  Int32         
 5   cat_c5         17738445 non-null  Int32         
 6   cat_c6         17738592 non-null  Int32         
 7   cat_c8         17673411 non-null  Int8          
 8   cat_c9         17693001 non-null  Int32         
 9   cat_c10        17695745 non-null  Int8          
 10  float_c11      17695744 non-null  float32       
 11  float_c12      17695744 non-null  float32       
 12  float_c13      3741017 non-null   float32       
 13  float_c14      17695744 non-null  float32       
 14  float_c15      1

In [9]:
# Date variables

activities_agg_dates = activities.groupby('client_id').agg(
    activity_count=('activity_date', 'count'),
    activity_days=('activity_date', 'nunique'),
    first_activity_date=('activity_date', 'min'),
    last_activity_date=('activity_date', 'max'),
).reset_index()

activities_agg_dates['activity_avg_per_day'] = activities_agg_dates['activity_count'] / activities_agg_dates['activity_days']
activities_agg_dates['activity_period_days'] = (activities_agg_dates['last_activity_date'] - activities_agg_dates['first_activity_date']).dt.days

max_date = activities['activity_date'].max()
activities_agg_dates['days_since_last_activity'] = (max_date - activities_agg_dates['last_activity_date']).dt.days

# Replace dates with days from starting point
min_date = activities['activity_date'].min()
activities_agg_dates['first_activity_day'] = (activities_agg_dates['first_activity_date'] - min_date).dt.days
activities_agg_dates['last_activity_day'] = (activities_agg_dates['last_activity_date'] - min_date).dt.days
activities_agg_dates = activities_agg_dates.drop(columns=['first_activity_date', 'last_activity_date'])
activities_agg_dates.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35672 entries, 0 to 35671
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   client_id                 35672 non-null  uint64 
 1   activity_count            35672 non-null  int64  
 2   activity_days             35672 non-null  int64  
 3   activity_avg_per_day      35672 non-null  float64
 4   activity_period_days      35672 non-null  int64  
 5   days_since_last_activity  35672 non-null  int64  
 6   first_activity_day        35672 non-null  int64  
 7   last_activity_day         35672 non-null  int64  
dtypes: float64(1), int64(6), uint64(1)
memory usage: 2.2 MB


In [10]:
# Numerical variables

# num_cols = ['float_c11', 'float_c12', 'float_c14']
num_cols = ['float_c11', 'float_c12', 'float_c13', 'float_c14', 'float_c15', 'float_c16', 'float_c17']

activity_agg_nums = activities.groupby('client_id')[num_cols].agg(['mean', 'std', 'min', 'max', 'sum'])
activity_agg_nums.columns = ['activity_' + '_'.join(col) for col in activity_agg_nums.columns]
activity_agg_nums = activity_agg_nums.reset_index()

# Handle NULL-values in "_std" columns
std_cols = [c for c in activity_agg_nums.columns if c.endswith('_std')]
activity_agg_nums['has_single_activity'] = activity_agg_nums[std_cols].isna().any(axis=1).astype('int8')
activity_agg_nums[std_cols] = activity_agg_nums[std_cols].fillna(0.0)
activity_agg_nums.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35672 entries, 0 to 35671
Data columns (total 37 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   client_id                35672 non-null  uint64 
 1   activity_float_c11_mean  35672 non-null  float32
 2   activity_float_c11_std   35672 non-null  float32
 3   activity_float_c11_min   35672 non-null  float32
 4   activity_float_c11_max   35672 non-null  float32
 5   activity_float_c11_sum   35672 non-null  float32
 6   activity_float_c12_mean  35672 non-null  float32
 7   activity_float_c12_std   35672 non-null  float32
 8   activity_float_c12_min   35672 non-null  float32
 9   activity_float_c12_max   35672 non-null  float32
 10  activity_float_c12_sum   35672 non-null  float32
 11  activity_float_c13_mean  35333 non-null  float32
 12  activity_float_c13_std   35672 non-null  float32
 13  activity_float_c13_min   35333 non-null  float32
 14  activity_float_c13_max

In [11]:
activity_agg_nums.describe()

Unnamed: 0,client_id,activity_float_c11_mean,activity_float_c11_std,activity_float_c11_min,activity_float_c11_max,activity_float_c11_sum,activity_float_c12_mean,activity_float_c12_std,activity_float_c12_min,activity_float_c12_max,...,activity_float_c16_std,activity_float_c16_min,activity_float_c16_max,activity_float_c16_sum,activity_float_c17_mean,activity_float_c17_std,activity_float_c17_min,activity_float_c17_max,activity_float_c17_sum,has_single_activity
count,35672.0,35672.0,35672.0,35672.0,35672.0,35672.0,35672.0,35672.0,35672.0,35672.0,...,35672.0,22443.0,22443.0,35672.0,22443.0,35672.0,22443.0,22443.0,35672.0,35672.0
mean,9.232191e+18,37.792583,6.649549,25.837099,51.359524,19137.029297,56.773373,22.270895,9.56728,98.224907,...,1.603135,-4.834176,10.086658,1512.785034,6.677791,1.975121,-5.643444,13.107902,1910.251221,0.386325
std,5.313957e+18,26.735794,7.348373,26.914335,28.17485,27379.166016,12.244381,4.382798,11.761169,6.320566,...,1.347729,6.009652,1.800422,2434.085693,1.266891,1.700666,4.516261,3.933831,2992.792236,0.486913
min,844657900000000.0,0.0,0.0,-1.0,0.0,0.0,3.75,0.0,0.0,5.0,...,0.0,-78.443626,-1.76205,-112.512207,-8.073355,0.0,-83.054871,-5.05095,-1766.154663,0.0
25%,4.66733e+18,11.492629,2.074277,1.0,25.0,2477.0,48.328691,20.067261,1.0,100.0,...,0.0,-9.641515,9.660389,0.0,6.064535,0.0,-8.41905,10.776841,0.0,0.0
50%,9.215637e+18,36.564026,3.888989,17.0,54.0,9491.0,56.52618,22.83774,5.0,100.0,...,2.093808,-2.470816,9.888053,549.505219,6.852714,2.416202,-5.881199,12.430051,666.816437,0.0
75%,1.380215e+19,61.288026,7.91098,49.0,77.0,25288.0,65.051586,25.075394,13.0,100.0,...,2.639877,-0.276521,10.280882,2153.651245,7.492867,3.345726,-3.68167,14.472475,2782.461609,1.0
max,1.84458e+19,95.963638,44.761478,95.0,99.0,643924.0,100.0,46.4758,100.0,100.0,...,8.857243,9.80665,78.48233,53059.53125,13.493124,12.401772,13.467001,102.414375,51292.566406,1.0


In [12]:
# Postprocess numerical features (replace NULL-values with medians)
for initial_col in ('float_c13', 'float_c15', 'float_c16', 'float_c17'):
    # feature_cols = [col for col in activity_agg_nums.columns if initial_col in col]
    # feature_cols_medians = [activity_agg_nums[col].median() for col in feature_cols]
    # rows_with_nulls = activity_agg_nums[activity_agg_nums[feature_cols].isna().any(axis=1)]
    # activity_agg_nums.loc[rows_with_nulls.index, feature_cols] = feature_cols_medians
    #
    # # Add new column to indicate which rows had blanks
    # new_missing_col = f'activity_{initial_col}_is_missing'
    # activity_agg_nums[new_missing_col] = 0
    # activity_agg_nums[new_missing_col] = activity_agg_nums[new_missing_col].astype('int8')
    # activity_agg_nums.loc[rows_with_nulls.index, new_missing_col] = 1

    feature_cols = [col for col in activity_agg_nums.columns if initial_col in col]
    feature_cols_medians = [activity_agg_nums[col].median() for col in feature_cols]
    new_missing_col = f'activity_{initial_col}_is_missing'
    fill_feature_nan_values(activity_agg_nums, feature_cols, feature_cols_medians, new_missing_col)

activity_agg_nums.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35672 entries, 0 to 35671
Data columns (total 41 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   client_id                      35672 non-null  uint64 
 1   activity_float_c11_mean        35672 non-null  float32
 2   activity_float_c11_std         35672 non-null  float32
 3   activity_float_c11_min         35672 non-null  float32
 4   activity_float_c11_max         35672 non-null  float32
 5   activity_float_c11_sum         35672 non-null  float32
 6   activity_float_c12_mean        35672 non-null  float32
 7   activity_float_c12_std         35672 non-null  float32
 8   activity_float_c12_min         35672 non-null  float32
 9   activity_float_c12_max         35672 non-null  float32
 10  activity_float_c12_sum         35672 non-null  float32
 11  activity_float_c13_mean        35672 non-null  float32
 12  activity_float_c13_std         35672 non-null 

In [13]:
# Flag variables
flag_cols = ['cat_c8', 'cat_c10']

activity_agg_flags = activities.groupby('client_id')[flag_cols].mean()
activity_agg_flags.columns = [f'activity_{c}_ratio' for c in flag_cols]
activity_agg_flags = activity_agg_flags.reset_index()
activity_agg_flags.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35672 entries, 0 to 35671
Data columns (total 3 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   client_id               35672 non-null  uint64 
 1   activity_cat_c8_ratio   35669 non-null  Float64
 2   activity_cat_c10_ratio  35672 non-null  Float64
dtypes: Float64(2), uint64(1)
memory usage: 905.9 KB


In [14]:
# Postprocess flag features
cols_with_nulls = ['activity_cat_c8_ratio']
for col in cols_with_nulls:
    # val_to_fill = activity_agg_flags[col].median()
    # rows_with_nulls = activity_agg_flags[activity_agg_flags[col].isna()]
    # activity_agg_flags.loc[rows_with_nulls.index, col] = val_to_fill
    #
    # # Add new column to indicate which rows had blanks
    # new_missing_col = f'{col}_is_missing'
    # activity_agg_flags[new_missing_col] = 0
    # activity_agg_flags[new_missing_col] = activity_agg_flags[new_missing_col].astype('int8')
    # activity_agg_flags.loc[rows_with_nulls.index, new_missing_col] = 1

    val_to_fill = activity_agg_flags[col].median()
    new_missing_col = f'{col}_is_missing'
    fill_feature_nan_values(activity_agg_flags, [col], [val_to_fill], new_missing_col)

activity_agg_flags.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35672 entries, 0 to 35671
Data columns (total 4 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   client_id                         35672 non-null  uint64 
 1   activity_cat_c8_ratio             35672 non-null  Float64
 2   activity_cat_c10_ratio            35672 non-null  Float64
 3   activity_cat_c8_ratio_is_missing  35672 non-null  int8   
dtypes: Float64(2), int8(1), uint64(1)
memory usage: 940.7 KB


In [15]:
# Categorical variables
cat_cols = ['cat_c3', 'cat_c4', 'cat_c5', 'cat_c6', 'cat_c9']
aggregates = {f'activity_{col}_nunique': (col, 'nunique') for col in cat_cols}
aggregates |= {f'activity_{col}_mode': (col, get_series_first_mode_or_nan) for col in cat_cols}

activity_agg_cat = activities.groupby('client_id').agg(**aggregates).reset_index()
activity_agg_cat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35672 entries, 0 to 35671
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   client_id                35672 non-null  uint64
 1   activity_cat_c3_nunique  35672 non-null  int64 
 2   activity_cat_c4_nunique  35672 non-null  int64 
 3   activity_cat_c5_nunique  35672 non-null  int64 
 4   activity_cat_c6_nunique  35672 non-null  int64 
 5   activity_cat_c9_nunique  35672 non-null  int64 
 6   activity_cat_c3_mode     35672 non-null  Int32 
 7   activity_cat_c4_mode     35657 non-null  Int32 
 8   activity_cat_c5_mode     35672 non-null  Int32 
 9   activity_cat_c6_mode     35672 non-null  Int32 
 10  activity_cat_c9_mode     35672 non-null  Int32 
dtypes: Int32(5), int64(5), uint64(1)
memory usage: 2.5 MB


In [16]:
# Postprocess categorical features
cols_with_nulls = ['activity_cat_c4_mode']
for col in cols_with_nulls:
    # val_to_fill = activity_agg_cat[col].value_counts().idxmax()
    # # activity_agg_cat['activity_cat_c4_mode'] = activity_agg_cat['activity_cat_c4_mode'].fillna(cat_c4_mode_fill_val)
    # rows_with_nulls = activity_agg_cat[activity_agg_cat[col].isna()]
    # activity_agg_cat.loc[rows_with_nulls.index, col] = val_to_fill
    #
    # # Add new column to indicate which rows had blanks
    # new_missing_col = f'{col}_is_missing'
    # activity_agg_cat[new_missing_col] = 0
    # activity_agg_cat[new_missing_col] = activity_agg_cat[new_missing_col].astype('int8')
    # activity_agg_cat.loc[rows_with_nulls.index, new_missing_col] = 1

    val_to_fill = activity_agg_cat[col].value_counts().idxmax()
    new_missing_col = f'{col}_is_missing'
    fill_feature_nan_values(activity_agg_cat, [col], [val_to_fill], new_missing_col)

activity_agg_cat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35672 entries, 0 to 35671
Data columns (total 12 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   client_id                        35672 non-null  uint64
 1   activity_cat_c3_nunique          35672 non-null  int64 
 2   activity_cat_c4_nunique          35672 non-null  int64 
 3   activity_cat_c5_nunique          35672 non-null  int64 
 4   activity_cat_c6_nunique          35672 non-null  int64 
 5   activity_cat_c9_nunique          35672 non-null  int64 
 6   activity_cat_c3_mode             35672 non-null  Int32 
 7   activity_cat_c4_mode             35672 non-null  Int32 
 8   activity_cat_c5_mode             35672 non-null  Int32 
 9   activity_cat_c6_mode             35672 non-null  Int32 
 10  activity_cat_c9_mode             35672 non-null  Int32 
 11  activity_cat_c4_mode_is_missing  35672 non-null  int8  
dtypes: Int32(5), int64(5), int8(1), 

In [17]:
# Extra features
activity_agg_extra = activities.groupby('client_id').agg(n_devices=('device_id', 'nunique')).reset_index()

In [18]:
# Merge all features
activity_features = (
    activities_agg_dates
    .merge(activity_agg_nums, on='client_id', how='left')
    .merge(activity_agg_flags, on='client_id', how='left')
    .merge(activity_agg_cat, on='client_id', how='left')
    .merge(activity_agg_extra, on='client_id', how='left')
)
activity_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35672 entries, 0 to 35671
Data columns (total 63 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   client_id                         35672 non-null  uint64 
 1   activity_count                    35672 non-null  int64  
 2   activity_days                     35672 non-null  int64  
 3   activity_avg_per_day              35672 non-null  float64
 4   activity_period_days              35672 non-null  int64  
 5   days_since_last_activity          35672 non-null  int64  
 6   first_activity_day                35672 non-null  int64  
 7   last_activity_day                 35672 non-null  int64  
 8   activity_float_c11_mean           35672 non-null  float32
 9   activity_float_c11_std            35672 non-null  float32
 10  activity_float_c11_min            35672 non-null  float32
 11  activity_float_c11_max            35672 non-null  float32
 12  acti

# Communications

In [19]:
from utils import encode_comm_categories, preprocess_comm_data, read_communications

COMMS_PATH = 'data/initial/COMMUNICATIONS.csv'
# COMMS_PATH = 'data/samples/COMMUNICATIONS_SAMPLE.csv'

comms = read_communications(COMMS_PATH)
# comms = preprocess_comm_data(comms)
# comms = encode_comm_categories(comms)
comms.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19586922 entries, 0 to 19586921
Data columns (total 6 columns):
 #   Column        Non-Null Count     Dtype         
---  ------        --------------     -----         
 0   client_id     19586922 non-null  uint64        
 1   contact_date  19586922 non-null  datetime64[ns]
 2   cat_c2        19586922 non-null  category      
 3   cat_c3        19586922 non-null  int32         
 4   cat_c4        19586922 non-null  int32         
 5   cat_c5        19586917 non-null  category      
dtypes: category(2), datetime64[ns](1), int32(2), uint64(1)
memory usage: 504.4 MB


In [20]:
comms.head(10)

Unnamed: 0,client_id,contact_date,cat_c2,cat_c3,cat_c4,cat_c5
0,1064751079059572946,2025-02-04,S8406,4,1,4
1,1064751079059572946,2025-02-04,S3565,4,1,7
2,1064751079059572946,2025-02-04,S8404,4,1,4
3,1064751079059572946,2025-04-07,S27888,4,1,4
4,1064751079059572946,2025-04-07,S3769,4,1,4
5,1064751079059572946,2025-04-07,S3564,4,1,4
6,1064751079059572946,2025-04-07,S3681,4,1,4
7,1064751079059572946,2025-02-05,S3564,4,1,4
8,1064751079059572946,2025-05-31,S27888,4,1,4
9,1064751079059572946,2025-05-31,S3770,4,1,4


In [21]:
comms.describe()

Unnamed: 0,client_id,contact_date,cat_c3,cat_c4
count,19586920.0,19586922,19586920.0,19586920.0
mean,9.239797e+18,2025-04-17 04:33:31.363128576,4.020126,1.09451
min,844657900000000.0,2024-12-01 00:00:00,1.0,1.0
25%,4.627148e+18,2025-03-02 00:00:00,4.0,1.0
50%,9.240516e+18,2025-04-20 00:00:00,4.0,1.0
75%,1.382496e+19,2025-06-01 00:00:00,4.0,1.0
max,1.84458e+19,2025-08-31 00:00:00,8.0,3.0
std,5.337613e+18,,0.4120818,0.3222231


In [22]:
# Date variables

comms_agg_dates = comms.groupby('client_id').agg(
    comm_count=('contact_date', 'count'),
    comm_days=('contact_date', 'nunique'),
    first_comm_date=('contact_date', 'min'),
    last_comm_date=('contact_date', 'max'),
).reset_index()

comms_agg_dates['comm_avg_per_day'] = comms_agg_dates['comm_count'] / comms_agg_dates['comm_days']
comms_agg_dates['comm_period_days'] = (comms_agg_dates['last_comm_date'] - comms_agg_dates['first_comm_date']).dt.days

max_date = comms['contact_date'].max()
comms_agg_dates['days_since_last_comm'] = (max_date - comms_agg_dates['last_comm_date']).dt.days

# Replace dates with days from starting point
min_date = comms['contact_date'].min()
comms_agg_dates['first_comm_day'] = (comms_agg_dates['first_comm_date'] - min_date).dt.days
comms_agg_dates['last_comm_day'] = (comms_agg_dates['last_comm_date'] - min_date).dt.days
comms_agg_dates = comms_agg_dates.drop(columns=['first_comm_date', 'last_comm_date'])

In [23]:
# Categorical variables

aggregates = []
cat_cols = ['cat_c2', 'cat_c3', 'cat_c4', 'cat_c5']
aggregates = {f'comm_{col}_nunique': (col, 'nunique') for col in cat_cols}
aggregates |= {f'comm_{col}_mode': (col, get_series_first_mode_or_nan) for col in cat_cols}

comms_agg_cat = comms.groupby('client_id').agg(**aggregates).reset_index()
comms_agg_cat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35805 entries, 0 to 35804
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   client_id            35805 non-null  uint64  
 1   comm_cat_c2_nunique  35805 non-null  int64   
 2   comm_cat_c3_nunique  35805 non-null  int64   
 3   comm_cat_c4_nunique  35805 non-null  int64   
 4   comm_cat_c5_nunique  35805 non-null  int64   
 5   comm_cat_c2_mode     35805 non-null  category
 6   comm_cat_c3_mode     35805 non-null  int32   
 7   comm_cat_c4_mode     35805 non-null  int32   
 8   comm_cat_c5_mode     35805 non-null  category
dtypes: category(2), int32(2), int64(4), uint64(1)
memory usage: 1.8 MB


In [24]:
# Postprocess categorical features
for col in ('comm_cat_c2_mode', 'comm_cat_c5_mode'):
    mapping = {val: idx for idx, val in enumerate(np.sort(comms_agg_cat[col].unique()))}
    comms_agg_cat[col] = comms_agg_cat[col].map(mapping).astype('int32')
comms_agg_cat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35805 entries, 0 to 35804
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   client_id            35805 non-null  uint64
 1   comm_cat_c2_nunique  35805 non-null  int64 
 2   comm_cat_c3_nunique  35805 non-null  int64 
 3   comm_cat_c4_nunique  35805 non-null  int64 
 4   comm_cat_c5_nunique  35805 non-null  int64 
 5   comm_cat_c2_mode     35805 non-null  int32 
 6   comm_cat_c3_mode     35805 non-null  int32 
 7   comm_cat_c4_mode     35805 non-null  int32 
 8   comm_cat_c5_mode     35805 non-null  int32 
dtypes: int32(4), int64(4), uint64(1)
memory usage: 1.9 MB


In [25]:
# Merge features
comms_features = comms_agg_dates.merge(comms_agg_cat, on='client_id', how='left')
comms_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35805 entries, 0 to 35804
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   client_id             35805 non-null  uint64 
 1   comm_count            35805 non-null  int64  
 2   comm_days             35805 non-null  int64  
 3   comm_avg_per_day      35805 non-null  float64
 4   comm_period_days      35805 non-null  int64  
 5   days_since_last_comm  35805 non-null  int64  
 6   first_comm_day        35805 non-null  int64  
 7   last_comm_day         35805 non-null  int64  
 8   comm_cat_c2_nunique   35805 non-null  int64  
 9   comm_cat_c3_nunique   35805 non-null  int64  
 10  comm_cat_c4_nunique   35805 non-null  int64  
 11  comm_cat_c5_nunique   35805 non-null  int64  
 12  comm_cat_c2_mode      35805 non-null  int32  
 13  comm_cat_c3_mode      35805 non-null  int32  
 14  comm_cat_c4_mode      35805 non-null  int32  
 15  comm_cat_c5_mode   

# Combine features from all datasets

In [33]:
from utils import convert_np_int_dtypes_to_nullable

# Replace dtypes with nullable to prevent their convertion to float64 after merge
tx_features = convert_np_int_dtypes_to_nullable(tx_features)
activity_features = convert_np_int_dtypes_to_nullable(activity_features)
comms_features = convert_np_int_dtypes_to_nullable(comms_features)

client_features = (
    tx_features
    .merge(activity_features, on='client_id', how='inner')
    .merge(comms_features, on='client_id', how='inner')
)
client_features.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35048 entries, 0 to 35047
Data columns (total 132 columns):
 #    Column                            Non-Null Count  Dtype  
---   ------                            --------------  -----  
 0    client_id                         35048 non-null  uint64 
 1    tx_count                          35048 non-null  Int64  
 2    tx_days                           35048 non-null  Int64  
 3    tx_avg_per_day                    35048 non-null  float64
 4    tx_period_days                    35048 non-null  Int64  
 5    days_since_last_tx                35048 non-null  Int64  
 6    first_tx_day                      35048 non-null  Int64  
 7    last_tx_day                       35048 non-null  Int64  
 8    tx_float_c16_mean                 35048 non-null  float32
 9    tx_float_c16_std                  35048 non-null  float32
 10   tx_float_c16_min                  35048 non-null  float32
 11   tx_float_c16_max                  35048 non-null  fl

In [34]:
# Postprocess client features
from utils import fill_feature_nan_values

# Fill NULL-values and create a new column to indicate this
for sub_features, new_missing_col in zip([tx_features, activity_features, comms_features], ('tx_is_missing', 'activity_is_missing', 'comm_is_missing')):
    sub_features = sub_features.drop(columns=['client_id'])
    sub_features_cols = sub_features.columns
    if client_features[client_features[sub_features_cols].isna().all(axis=1)].empty:
        continue

    # for col in sub_features.columns:
    #     val_to_fill = client_features[col].median
    #     client_features = fill_feature_nan_values(client_features, col, val_to_fill)
    #
    # client_features[new_missing_col] = 0
    # client_features[new_missing_col] = client_features[new_missing_col].astype('int8')
    # client_features.loc[rows_with_nulls.index, new_missing_col] = 1

    sub_features_medians = [client_features[col].median() for col in sub_features_cols]
    client_features = fill_feature_nan_values(client_features, sub_features_cols, sub_features_medians, new_missing_col)

client_features.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35048 entries, 0 to 35047
Data columns (total 132 columns):
 #    Column                            Non-Null Count  Dtype  
---   ------                            --------------  -----  
 0    client_id                         35048 non-null  uint64 
 1    tx_count                          35048 non-null  Int64  
 2    tx_days                           35048 non-null  Int64  
 3    tx_avg_per_day                    35048 non-null  float64
 4    tx_period_days                    35048 non-null  Int64  
 5    days_since_last_tx                35048 non-null  Int64  
 6    first_tx_day                      35048 non-null  Int64  
 7    last_tx_day                       35048 non-null  Int64  
 8    tx_float_c16_mean                 35048 non-null  float32
 9    tx_float_c16_std                  35048 non-null  float32
 10   tx_float_c16_min                  35048 non-null  float32
 11   tx_float_c16_max                  35048 non-null  fl

In [35]:
# Reduce float precision to float32
client_features = client_features.astype({col: 'float32' for col in client_features.select_dtypes(include=['float64']).columns})

# Downcast int64 to int32
for int64_dtype, int32_dtype in zip(['int64', 'Int64'], ['int32', 'Int32']):
    int64_cols = [col for col in client_features.columns if client_features[col].dtype.name == int64_dtype]
    for col in int64_cols:
        min_val = client_features[col].min()
        max_val = client_features[col].max()
        if (min_val >= np.iinfo(np.int32).min) and (max_val <= np.iinfo(np.int32).max):
            client_features = client_features.astype({col: int32_dtype})
        else:
            print(f'Column {col} exceeds int32 range')

In [36]:
client_features.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35048 entries, 0 to 35047
Data columns (total 132 columns):
 #    Column                            Non-Null Count  Dtype  
---   ------                            --------------  -----  
 0    client_id                         35048 non-null  uint64 
 1    tx_count                          35048 non-null  Int32  
 2    tx_days                           35048 non-null  Int32  
 3    tx_avg_per_day                    35048 non-null  float32
 4    tx_period_days                    35048 non-null  Int32  
 5    days_since_last_tx                35048 non-null  Int32  
 6    first_tx_day                      35048 non-null  Int32  
 7    last_tx_day                       35048 non-null  Int32  
 8    tx_float_c16_mean                 35048 non-null  float32
 9    tx_float_c16_std                  35048 non-null  float32
 10   tx_float_c16_min                  35048 non-null  float32
 11   tx_float_c16_max                  35048 non-null  fl

In [37]:
client_features.describe()

Unnamed: 0,client_id,tx_count,tx_days,tx_avg_per_day,tx_period_days,days_since_last_tx,first_tx_day,last_tx_day,tx_float_c16_mean,tx_float_c16_std,...,first_comm_day,last_comm_day,comm_cat_c2_nunique,comm_cat_c3_nunique,comm_cat_c4_nunique,comm_cat_c5_nunique,comm_cat_c2_mode,comm_cat_c3_mode,comm_cat_c4_mode,comm_cat_c5_mode
count,35048.0,35048.0,35048.0,35048.0,35048.0,35048.0,35048.0,35048.0,35048.0,35048.0,...,35048.0,35048.0,35048.0,35048.0,35048.0,35048.0,35048.0,35048.0,35048.0,35048.0
mean,9.228491e+18,446.652248,102.330433,3.702068,164.131363,53.070218,55.798419,219.929782,0.029508,0.12595,...,49.77411,223.122346,62.581802,3.383274,2.997147,5.130564,123.023824,4.000799,1.057436,0.999401
std,5.314549e+18,474.062009,54.683926,2.885128,39.380292,36.383763,48.412568,36.383763,0.08614,0.334475,...,41.925627,33.27781,17.357703,0.630731,0.05334,1.61569,15.590284,0.056525,0.237531,0.082922
min,844657900000000.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,-0.03459,0.0,...,0.0,162.0,3.0,1.0,2.0,1.0,0.0,4.0,1.0,0.0
25%,4.662208e+18,121.0,56.0,2.105263,173.0,31.0,31.0,181.0,0.0,0.0,...,16.0,181.0,51.0,3.0,3.0,4.0,122.0,4.0,1.0,1.0
50%,9.210583e+18,308.0,109.0,2.993151,180.0,62.0,62.0,211.0,0.00728,0.043387,...,62.0,211.0,62.0,3.0,3.0,5.0,122.0,4.0,1.0,1.0
75%,1.379687e+19,615.0,151.0,4.470209,181.0,92.0,90.0,242.0,0.034805,0.141631,...,74.0,242.0,73.0,4.0,3.0,6.0,127.0,4.0,1.0,1.0
max,1.84458e+19,8721.0,184.0,139.0,183.0,273.0,273.0,273.0,4.931539,18.192768,...,271.0,273.0,148.0,7.0,3.0,14.0,165.0,8.0,3.0,7.0


In [38]:
client_features.to_csv('data/features/features_5.csv', index=False)