In [1]:
import pandas as pd
import numpy as np
import random 
from datetime import datetime, timedelta
from functools import reduce

# Data Ingestion

In [2]:
# Load data/mock_data.csv with MCC read as text, then keep only the first four
# characters of each MCC. Because of astype(str), a missing MCC becomes the
# literal string 'nan' at this point.
df_c03 = pd.read_csv("data/mock_data.csv", dtype={'MCC': str}).assign(
    MCC=lambda raw: raw['MCC'].astype(str).str[:4]
)

# Second input file, loaded with pandas' default dtype inference.
df_c09 = pd.read_csv("data/mock_data_c09.csv")

  df_c03 = pd.read_csv("data/mock_data.csv", dtype={'MCC':str})


In [3]:
# Reference tables: MCC lookup and currency codes (both are ';'-separated files)

# The lookup's 'Code' column is read as text and renamed to 'MCC' so that it
# shares its name with the 'MCC' column of df_c03.
mcc_df = pd.read_csv("data/mcc_data.csv", sep=";", dtype={'Code': str}).rename(
    columns={'Code': 'MCC'}
)

# Currency table; 'Number' is kept as text.
currency_code = pd.read_csv('data/currency_code.csv', sep=';', dtype={'Number': str})

# Feature Engineering Logic

In [4]:
from src.calculation_features import generate_rolling_features, calculate_time_differences

In [5]:
# In both identifier columns, replace a real NaN as well as the literal string
# 'nan' with the sentinel string '-1'.
for id_col in ('MCC', 'Merchant Id'):
    df_c03[id_col] = df_c03[id_col].replace({np.nan: '-1', 'nan': '-1'})

## Value Mapping Steps

1. MCC value mapping: 
- MCC to MCC Details
- MCC to MCC Transaction Category Code
- MCC to MCC Category
2. Map Terminal Owner/Card Acceptor Name to a broader group (e-commerce, online ads, etc.)

In [6]:
from src.utils import extract_provider_name, categorize_terminal_owner

In [7]:
# 1. Map each MCC code to its details, transaction category code and category.
#    Left join on 'MCC' keeps every transaction row; two of the joined columns
#    are then renamed to their 'MCC ...' names.
df_c03 = (
    df_c03
    .merge(
        mcc_df[['MCC', 'Description', 'Transaction Category Code', 'MCC Category']],
        how='left',
        on='MCC',
    )
    .rename(
        columns={
            'Description': 'MCC Details',
            'Transaction Category Code': 'MCC Trnx Category Code',
        }
    )
)

In [8]:
# 2. Map Terminal Owner/Card Acceptor Name to a broader group.
#    Each 'Terminal Owner' value goes through extract_provider_name first and the
#    result is then passed to categorize_terminal_owner.
terminal_owner = df_c03['Terminal Owner']
df_c03['Cat Terminal Owner'] = terminal_owner.apply(
    lambda owner: categorize_terminal_owner(extract_provider_name(owner))
)

## Time Difference Features

In [9]:
from src.feature_engineering_configs import (
    time_shift_config,
    time_windows,
    freq_config,
    monetary_config_1,
    monetary_config_2,
    monetary_config_3,
    monetary_config_4,
    monetary_config_5,
    monetary_config_6,
    unique_count_config_1,
    unique_count_config_2
)

In [10]:
# Time-difference features: calculate_time_differences receives df_c03 with
# 'PANNumber' as the grouping column and 'Transaction Datetime' as the datetime
# column, configured by time_shift_config and time_windows.
df_time_diff = calculate_time_differences(
    df=df_c03,
    groupby_col='PANNumber',
    datetime_col='Transaction Datetime',
    config=time_shift_config,
    time_window=time_windows,
)

  .rolling(window=window)
  .rolling(window=window)
  .rolling(window=window)
  .rolling(window=window)
  .rolling(window=window)
  .rolling(window=window)
  .rolling(window=window)
  .rolling(window=window)
  .rolling(window=window)
  .rolling(window=window)


In [11]:
# Spot-check the rows of a single card (PANNumber 999988325)

pd.set_option('display.max_columns', 500)

# Identifying columns that are always shown.
base_view_cols = [
    'Transaction Serial No',
    'PANNumber',
    'Transaction Datetime',
    'MCC',
    'MCC Category',
    'Merchant Id',
    'Country Code',
]

# Any df_c03 column whose name contains 'time_diff' but not 'avg'.
# NOTE(review): the output captured for this cell shows only the base columns,
# i.e. this filter matched nothing in df_c03. The time-difference features are
# presumably only present in df_time_diff - confirm which frame should be inspected.
time_diff_view_cols = [
    column
    for column in df_c03.columns
    if ('time_diff' in column) and ('avg' not in column)
]

card_rows = df_c03[df_c03['PANNumber'] == 999988325]
card_rows[base_view_cols + time_diff_view_cols].sort_values(
    by=['PANNumber', 'Transaction Datetime']
)

Unnamed: 0,Transaction Serial No,PANNumber,Transaction Datetime,MCC,MCC Category,Merchant Id,Country Code
14312,974909877,999988325,2024-09-08 02:59:42,4722,Travel/Transportation/Gas and Fuel Services,18622,360
30433,999590208,999988325,2024-10-09 11:51:10,-1,,4619,360
30434,999590353,999988325,2024-10-09 11:51:40,7311,Business/Professional/Miscellaneous Services,4619,360
30435,999590358,999988325,2024-10-09 11:51:40,-1,,4619,360
30436,999590548,999988325,2024-10-09 11:52:20,7311,Business/Professional/Miscellaneous Services,13445,360
30529,999590773,999988325,2024-10-09 11:53:13,5969,Business/Professional/Miscellaneous Services,15611,360


## Frequency Features

In [12]:
# Frequency features: generate_rolling_features is run on df_c03 with freq_config,
# using 'Transaction Serial No' as the key column and 'Transaction Datetime' as
# the datetime column.
df_freq = generate_rolling_features(
    df_c03,
    features_config=freq_config,
    key_col="Transaction Serial No",
    datetime_col="Transaction Datetime",
)

  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")


## Monetary

Calculate SUM, AVG, MAX of Transaction Amount grouped by PANNumber

In [13]:
# Monetary features, configuration 1 (monetary_config_1), computed from df_c03
# with 'Transaction Serial No' as key and 'Transaction Datetime' as datetime column.
df_monetary_1 = generate_rolling_features(
    df_c03,
    features_config=monetary_config_1,
    key_col="Transaction Serial No",
    datetime_col="Transaction Datetime",
)

  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")


Calculate SUM, MAX, MEAN of Transaction Amount to `MCC` grouped by PANNumber

In [14]:
# Monetary features, configuration 2 (monetary_config_2), computed from df_c03
# with 'Transaction Serial No' as key and 'Transaction Datetime' as datetime column.
df_monetary_2 = generate_rolling_features(
    df_c03,
    features_config=monetary_config_2,
    key_col="Transaction Serial No",
    datetime_col="Transaction Datetime",
)

  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")


Calculate SUM, MAX, MEAN of Transaction Amount to `MCC Details` grouped by PANNumber

In [15]:
# Monetary features, configuration 3 (monetary_config_3), computed from df_c03
# with 'Transaction Serial No' as key and 'Transaction Datetime' as datetime column.
df_monetary_3 = generate_rolling_features(
    df_c03,
    features_config=monetary_config_3,
    key_col="Transaction Serial No",
    datetime_col="Transaction Datetime",
)

  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")


Calculate SUM, MAX, MEAN of Transaction Amount to `MCC Trnx Category Code` grouped by PANNumber

In [16]:
# Monetary features, configuration 4 (monetary_config_4), computed from df_c03
# with 'Transaction Serial No' as key and 'Transaction Datetime' as datetime column.
df_monetary_4 = generate_rolling_features(
    df_c03,
    features_config=monetary_config_4,
    key_col="Transaction Serial No",
    datetime_col="Transaction Datetime",
)

  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")


Calculate SUM, MAX, MEAN of Transaction Amount to `MCC Category` grouped by PANNumber

In [17]:
# Monetary features, configuration 5 (monetary_config_5), computed from df_c03
# with 'Transaction Serial No' as key and 'Transaction Datetime' as datetime column.
df_monetary_5 = generate_rolling_features(
    df_c03,
    features_config=monetary_config_5,
    key_col="Transaction Serial No",
    datetime_col="Transaction Datetime",
)

  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")


Calculate SUM, MAX, MEAN of Transaction Amount to `Terminal Owner Category` grouped by PANNumber

In [18]:
# Monetary features, configuration 6 (monetary_config_6), computed from df_c03
# with 'Transaction Serial No' as key and 'Transaction Datetime' as datetime column.
df_monetary_6 = generate_rolling_features(
    df_c03,
    features_config=monetary_config_6,
    key_col="Transaction Serial No",
    datetime_col="Transaction Datetime",
)

  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")


## Unique Count

In [19]:
# Add integer-coded versions of 'MCC' and 'PANNumber' to df_c03 via factorize.
# `uniques` is rebound by the second call and is not read anywhere in this cell.
mcc_codes, uniques = pd.factorize(df_c03["MCC"])
df_c03["MCC Num"] = mcc_codes
pan_codes, uniques = pd.factorize(df_c03["PANNumber"])
df_c03["PANNumber Num"] = pan_codes

# Unique-count features, configuration 1 (unique_count_config_1), computed from
# df_c03 after the two coded columns above have been added.
df_unique_count_1 = generate_rolling_features(
    df_c03,
    features_config=unique_count_config_1,
    key_col="Transaction Serial No",
    datetime_col="Transaction Datetime",
)

  .rolling(window=window, closed="left", min_periods=1)
  .rolling(window=window, closed="left", min_periods=1)
  .rolling(window=window, closed="left", min_periods=1)
  .rolling(window=window, closed="left", min_periods=1)
  .rolling(window=window, closed="left", min_periods=1)
  .rolling(window=window, closed="left", min_periods=1)


Count unique (distinct) `Transaction Amount` values, grouped by PANNumber, MCC, MCC Details, MCC Trnx Category Code, MCC Category, and Terminal Owner Category

In [20]:
# Unique-count features, configuration 2 (unique_count_config_2), computed from
# df_c03 with 'Transaction Serial No' as key and 'Transaction Datetime' as
# datetime column.
df_unique_count_2 = generate_rolling_features(
    df_c03,
    features_config=unique_count_config_2,
    key_col="Transaction Serial No",
    datetime_col="Transaction Datetime",
)

  .rolling(window=window, closed="left", min_periods=1)
  .rolling(window=window, closed="left", min_periods=1)
  .rolling(window=window, closed="left", min_periods=1)
  .rolling(window=window, closed="left", min_periods=1)
  .rolling(window=window, closed="left", min_periods=1)
  .rolling(window=window, closed="left", min_periods=1)
  .rolling(window=window, closed="left", min_periods=1)
  .rolling(window=window, closed="left", min_periods=1)
  .rolling(window=window, closed="left", min_periods=1)
  .rolling(window=window, closed="left", min_periods=1)
  .rolling(window=window, closed="left", min_periods=1)
  .rolling(window=window, closed="left", min_periods=1)


## Getting All Transformed Features

In [None]:
# Key columns present in every feature frame; used as the join key when combining them
merge_keys = [
    'Transaction Serial No',
    'PANNumber',
]


def extract_columns(config, time_windows):
    """Collect the column names a feature config declares for the given windows.

    Args:
        config: iterable of feature entries; each entry is indexed with
            ``['windows']`` to obtain a mapping from window key to column name.
        time_windows: iterable of window keys to look up in every entry.

    Returns:
        A flat list of column names, ordered entry by entry and, within an
        entry, in the order of ``time_windows``.

    Raises:
        KeyError: if an entry has no ``'windows'`` item or lacks a requested window.
    """
    column_names = []
    for feature_cfg in config:
        for window_key in time_windows:
            column_names.append(feature_cfg['windows'][window_key])
    return column_names

# Extract columns
# NOTE(review): as originally written this cell raised NameError on a fresh kernel:
# `time_diff_columns`, `monetary_config`, `unique_count_config`, `df_monetary`,
# `df_unique_count` and `df_mock` are never defined in this notebook. They are
# rebuilt below from the frames and configs that the earlier cells actually create.


def _merge_feature_parts(parts, keys, windows):
    """Outer-join several (frame, config) pairs on ``keys``.

    Each frame is first reduced to ``keys`` plus the columns its own config
    declares for ``windows`` (via ``extract_columns``), so only feature columns
    take part in the join.
    """
    subsets = [frame[keys + extract_columns(config, windows)] for frame, config in parts]
    return reduce(lambda left, right: pd.merge(left, right, on=keys, how='outer'), subsets)


# Every rolling-feature frame paired with the config that produced it.
# NOTE(review): assumes the configs declare distinct column names; overlapping names
# would receive pandas' _x/_y suffixes during the merge - confirm.
monetary_parts = [
    (df_monetary_1, monetary_config_1),
    (df_monetary_2, monetary_config_2),
    (df_monetary_3, monetary_config_3),
    (df_monetary_4, monetary_config_4),
    (df_monetary_5, monetary_config_5),
    (df_monetary_6, monetary_config_6),
]
unique_count_parts = [
    (df_unique_count_1, unique_count_config_1),
    (df_unique_count_2, unique_count_config_2),
]

# NOTE(review): there is no column list for the time-difference features in scope.
# This treats every column of df_time_diff that df_c03 does not have as a feature,
# which presumes calculate_time_differences returns a new frame (df_c03's columns
# plus the new ones) - confirm, or replace with an explicit list of column names.
time_diff_cols = [col for col in df_time_diff.columns if col not in df_c03.columns]
if not time_diff_cols:
    raise ValueError(
        "No time-difference columns found in df_time_diff beyond those of df_c03; "
        "list the feature columns explicitly."
    )
freq_cols = extract_columns(freq_config, time_windows)
monetary_cols = [
    col for _, config in monetary_parts for col in extract_columns(config, time_windows)
]
unique_count_cols = [
    col for _, config in unique_count_parts for col in extract_columns(config, time_windows)
]

# Subset dataframes
df_time_diff = df_time_diff[merge_keys + time_diff_cols]
df_freq = df_freq[merge_keys + freq_cols]
df_monetary = _merge_feature_parts(monetary_parts, merge_keys, time_windows)
df_unique_count = _merge_feature_parts(unique_count_parts, merge_keys, time_windows)

# Merge all feature dataframes + additional features
dfs_to_merge = [df_time_diff, df_freq, df_monetary, df_unique_count]
df_final = reduce(lambda left, right: pd.merge(left, right, on=merge_keys, how='outer'), dfs_to_merge)

# NOTE(review): `df_mock` was undefined; df_c03 is the frame read from
# data/mock_data.csv and the input of every feature step, so it is used here.
# Confirm that it carries 'Card Billing Amount' and 'Confirmed'.
df_final = df_final.merge(
    df_c03[merge_keys + ['Transaction Amount', 'Card Billing Amount', 'Confirmed']],
    on=merge_keys,
    how='left'
)

In [41]:
df_final

Unnamed: 0,Transaction Serial No,PANNumber,TSLastTxn_mins,TSLastTxn_fromacct_to_acct_mins,TSLastTxn_toacct_mins,TSLastTxn_merchant_mins,TSLastTxn_mcc_mins,TSLastTxn_countrycode_mins,MFTxnCount_L15M,MFTxnCount_L1H,...,Unique_AmountTo_L15M,Unique_AmountTo_L1H,Unique_AmountTo_L1D,Unique_AmountTo_L7D,Unique_AmountTo_L14D,Unique_AmountTo_L30D,Unique_AmountTo_L90D,Transaction Amount,Card Billing Amount,Confirmed
0,3022,8339,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,746.66,209.5,8010
1,3454,3621,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,623.59,139.58,7640
2,5096,8481,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,955.41,674.45,5487
3,6487,1066,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,574.86,480.66,8978
4,7938,2887,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,868.09,380.46,6630
