In [1]:
import pandas as pd
import numpy as np
from src.utils import read_query_file

# Data Ingestion

In [2]:
# Load the mock C09 transaction data. "MCC" is read as text so it can be
# joined on the string "MCC" key of the MCC reference table further down.
df_c09 = pd.read_csv("data/mock_data_c09.csv", dtype={"MCC": str})

In [3]:
# MCC and Currency Data

# MCC reference table: the "Code" column is read as text and exposed under the
# name "MCC", which is the key used when joining onto the transaction frame.
mcc_df = pd.read_csv("data/mcc_data.csv", sep=";", dtype={"Code": str}).rename(
    columns={"Code": "MCC"}
)

# Currency reference table.
# NOTE(review): "Card Acceptor Name" reads like a transaction column rather
# than a currency-table column — confirm this dtype entry is intended here.
currency_dtypes = {"Number": str, "Card Acceptor Name": str}
currency_code = pd.read_csv("data/currency_code.csv", sep=";", dtype=currency_dtypes)

In [4]:
# Sampling value substituted for the `sampling_pct` field of the SQL template.
# NOTE(review): unclear whether 0.5 means 0.5 % or a 50 % fraction — confirm
# against the SQL file.
sampling_pct = 0.5

# Read the debit-card query template and fill in the sampling value in one step.
debit_query = read_query_file("sql/join_data/02_debit_card_data.sql").format(
    sampling_pct=sampling_pct
)

# Feature Engineering Logic

In [5]:
from src.calculation_features import (
    generate_rolling_features,
    calculate_time_differences,
)

In [6]:
# Collapse both real missing values (NaN) and the literal string "nan" into the
# sentinel code "-1", so every transaction carries a string MCC.
mcc_missing_map = {np.nan: "-1", "nan": "-1"}
df_c09["MCC"] = df_c09["MCC"].replace(mcc_missing_map)

## Value Mapping Steps

1. MCC value mapping: 
- MCC to MCC Details
- MCC to MCC Transaction Category Code
- MCC to MCC Category
2. Map Terminal Owner / Card Acceptor Name to a broader group (e-commerce, online ads, etc.)

In [7]:
from src.utils import extract_provider_name, categorize_terminal_owner

In [7]:
# 1. Value mapping MCC_code to MCC Details, Trnx Category Code, Category
#
# Left-join the MCC reference attributes onto every transaction, then rename
# the reference columns to the names used by the rest of the notebook.
#
# validate="many_to_one" makes pandas raise a MergeError if the reference table
# contains a repeated MCC. Without it, a duplicated code would silently
# duplicate transaction rows, and those rows feed the rolling features and the
# key-based merges further down.
#
# NOTE(review): this cell rebinds df_c09, so re-running it on an already
# enriched frame merges the reference columns a second time — run it once per
# fresh load of df_c09.
mcc_lookup = mcc_df[["MCC", "Description", "Transaction Category Code", "MCC Category"]]
df_c09 = df_c09.merge(
    mcc_lookup,
    on="MCC",
    how="left",
    validate="many_to_one",
).rename(
    columns={
        "Description": "MCC Details",
        "Transaction Category Code": "MCC Trnx Category Code",
    }
)

In [8]:
# 2. Value mapping Terminal Owner/Card Acceptor Name to bigger group (ecommerce, online ads, etc.)
def _categorize_acceptor(acceptor_name):
    """Return the terminal-owner category for one acceptor name.

    Non-string values (for example missing names) are labelled "unknown";
    strings are passed through extract_provider_name and then
    categorize_terminal_owner.
    """
    if not isinstance(acceptor_name, str):
        return "unknown"
    return categorize_terminal_owner(extract_provider_name(acceptor_name))


df_c09["Cat Card Acceptor Name"] = df_c09["Card Acceptor Name"].apply(
    _categorize_acceptor
)

In [19]:
# Exploratory check: number of distinct acceptor country codes in the data
# (nunique() leaves missing values out of the count by default).
df_c09["Card Acceptor Country Code"].nunique()

49

## Time Difference Features

In [9]:
from src.debit_card_config import (
    time_windows,
    freq_config,
    time_shift_config,
    monetary_config_1,
    monetary_config_2,
    monetary_config_3,
    monetary_config_4,
    monetary_config_5,
    monetary_config_6,
    monetary_config_7,
    monetary_config_8,
    unique_count_config_1,
    unique_count_config_2,
)

In [12]:
# Time-difference features: computed from "Transaction Datetime" with
# "Card No" as the grouping column, driven by time_windows and
# time_shift_config (both imported from src.debit_card_config; their contents
# are not visible in this notebook).
# NOTE(review): the captured output below repeats `.rolling(window=window)`,
# which looks like a warning raised inside the helper — confirm and address it
# there rather than leaving the noise in the notebook.
df_time_diff = calculate_time_differences(
    df=df_c09,
    datetime_col="Transaction Datetime",
    groupby_col="Card No",
    time_window=time_windows,
    config=time_shift_config,
)

  .rolling(window=window)
  .rolling(window=window)
  .rolling(window=window)
  .rolling(window=window)
  .rolling(window=window)
  .rolling(window=window)
  .rolling(window=window)
  .rolling(window=window)
  .rolling(window=window)
  .rolling(window=window)


## Frequency Features

In [None]:
# Frequency features: rolling features described by freq_config (imported from
# src.debit_card_config; its contents are not visible in this notebook).
# "Transaction Serial No" is passed as the row key and "Transaction Datetime"
# as the datetime column.
df_freq = generate_rolling_features(
    df_c09,
    datetime_col="Transaction Datetime",
    key_col="Transaction Serial No",
    features_config=freq_config,
)

## Monetary

Calculate SUM, AVG, MAX of Transaction Amount grouped by Card No

In [None]:
# Monetary features, set 1: rolling features described by monetary_config_1
# (imported from src.debit_card_config; its contents are not visible here).
# "Transaction Serial No" is the row key, "Transaction Datetime" the datetime column.
df_monetary_1 = generate_rolling_features(
    df_c09,
    datetime_col="Transaction Datetime",
    key_col="Transaction Serial No",
    features_config=monetary_config_1,
)

  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")


Calculate SUM, MAX, MEAN of Transaction Amount to `MCC` grouped by Card No

In [None]:
# Monetary features, set 2: rolling features described by monetary_config_2
# (imported from src.debit_card_config; its contents are not visible here).
# Same row key and datetime column as the other rolling-feature cells.
df_monetary_2 = generate_rolling_features(
    df_c09,
    datetime_col="Transaction Datetime",
    key_col="Transaction Serial No",
    features_config=monetary_config_2,
)

  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")


Calculate SUM, MAX, MEAN of Transaction Amount to `MCC Details` grouped by Card No

In [None]:
# Monetary features, set 3: rolling features described by monetary_config_3
# (imported from src.debit_card_config; its contents are not visible here).
# Same row key and datetime column as the other rolling-feature cells.
df_monetary_3 = generate_rolling_features(
    df_c09,
    datetime_col="Transaction Datetime",
    key_col="Transaction Serial No",
    features_config=monetary_config_3,
)

  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")


Calculate SUM, MAX, MEAN of Transaction Amount to `MCC Trnx Category Code` grouped by Card No

In [None]:
# Monetary features, set 4: rolling features described by monetary_config_4
# (imported from src.debit_card_config; its contents are not visible here).
# Same row key and datetime column as the other rolling-feature cells.
df_monetary_4 = generate_rolling_features(
    df_c09,
    datetime_col="Transaction Datetime",
    key_col="Transaction Serial No",
    features_config=monetary_config_4,
)

  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")


Calculate SUM, MAX, MEAN of Transaction Amount to `MCC Category` grouped by Card No

In [None]:
# Monetary features, set 5: rolling features described by monetary_config_5
# (imported from src.debit_card_config; its contents are not visible here).
# Same row key and datetime column as the other rolling-feature cells.
df_monetary_5 = generate_rolling_features(
    df_c09,
    datetime_col="Transaction Datetime",
    key_col="Transaction Serial No",
    features_config=monetary_config_5,
)

  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")


Calculate SUM, MAX, MEAN of Transaction Amount to `Terminal Owner Category` grouped by Card No

In [None]:
# Monetary features, set 6: rolling features described by monetary_config_6
# (imported from src.debit_card_config; its contents are not visible here).
# Same row key and datetime column as the other rolling-feature cells.
df_monetary_6 = generate_rolling_features(
    df_c09,
    datetime_col="Transaction Datetime",
    key_col="Transaction Serial No",
    features_config=monetary_config_6,
)

  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")


In [None]:
# Monetary features, set 7: rolling features described by monetary_config_7
# (imported from src.debit_card_config; its contents are not visible here).
# NOTE(review): unlike the preceding monetary cells, this one has no markdown
# description of what the config aggregates — add one.
df_monetary_7 = generate_rolling_features(
    df_c09,
    datetime_col="Transaction Datetime",
    key_col="Transaction Serial No",
    features_config=monetary_config_7,
)

  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")


In [None]:
# Monetary features, set 8: rolling features described by monetary_config_8
# (imported from src.debit_card_config; its contents are not visible here).
# NOTE(review): this cell has no markdown description of what the config
# aggregates — add one so the set is distinguishable from the other seven.
df_monetary_8 = generate_rolling_features(
    df_c09,
    datetime_col="Transaction Datetime",
    key_col="Transaction Serial No",
    features_config=monetary_config_8,
)

  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")
  .rolling(window, closed="left")


## Unique Count

In [None]:
# Integer-encode the MCC and Card No columns. factorize() returns a
# (codes, uniques) pair; only the codes are stored on the frame.
# `uniques` is rebound by the second call, so afterwards it holds the distinct
# Card No values.
mcc_codes, uniques = df_c09["MCC"].factorize()
df_c09["MCC Num"] = mcc_codes
card_no_codes, uniques = df_c09["Card No"].factorize()
df_c09["Card No Num"] = card_no_codes

# Unique-count features, set 1: rolling features described by
# unique_count_config_1 (imported from src.debit_card_config).
# NOTE(review): presumably the config counts over the numeric columns created
# above — confirm against the config.
df_unique_count_1 = generate_rolling_features(
    df_c09,
    datetime_col="Transaction Datetime",
    key_col="Transaction Serial No",
    features_config=unique_count_config_1,
)

  .rolling(window=window, closed="left", min_periods=1)
  .rolling(window=window, closed="left", min_periods=1)
  .rolling(window=window, closed="left", min_periods=1)
  .rolling(window=window, closed="left", min_periods=1)
  .rolling(window=window, closed="left", min_periods=1)
  .rolling(window=window, closed="left", min_periods=1)


Count unique (distinct) `Transaction Amount` values, grouped by Card No, MCC, MCC Details, MCC Trnx Category Code, MCC Category, and Terminal Owner Category

In [None]:
# Unique-count features, set 2: rolling features described by
# unique_count_config_2 (imported from src.debit_card_config; its contents are
# not visible here). Same row key and datetime column as the other
# rolling-feature cells.
df_unique_count_2 = generate_rolling_features(
    df_c09,
    datetime_col="Transaction Datetime",
    key_col="Transaction Serial No",
    features_config=unique_count_config_2,
)

  .rolling(window=window, closed="left", min_periods=1)
  .rolling(window=window, closed="left", min_periods=1)
  .rolling(window=window, closed="left", min_periods=1)
  .rolling(window=window, closed="left", min_periods=1)
  .rolling(window=window, closed="left", min_periods=1)
  .rolling(window=window, closed="left", min_periods=1)
  .rolling(window=window, closed="left", min_periods=1)
  .rolling(window=window, closed="left", min_periods=1)
  .rolling(window=window, closed="left", min_periods=1)
  .rolling(window=window, closed="left", min_periods=1)
  .rolling(window=window, closed="left", min_periods=1)
  .rolling(window=window, closed="left", min_periods=1)


## Getting All Transformed Features

In [13]:
from functools import reduce

In [16]:
# Columns shared by every feature frame; all merges below join on these.
merge_keys = ["Transaction Serial No", "Card No"]


def extract_columns(config, time_windows):
    """Collect the output column names a feature config defines.

    Args:
        config: Iterable of feature definitions. Each one must support
            ``cfg["windows"][win]``, i.e. hold a "windows" mapping from a
            time-window key to a column name.
        time_windows: Iterable of time-window keys to look up in each
            definition's "windows" mapping.

    Returns:
        A flat list of column names, ordered by feature definition first and
        by time window second.

    Raises:
        KeyError: If a definition has no "windows" entry or lacks one of the
            requested windows.
    """
    columns = []
    for feature_cfg in config:
        for window in time_windows:
            columns.append(feature_cfg["windows"][window])
    return columns


# Resolve the feature column names produced by each config.
# The time-shift config contributes its keys directly; the other configs are
# expanded per time window through extract_columns.
time_shift_cols = [*time_shift_config.keys()]
freq_cols = extract_columns(freq_config, time_windows)

# One list of column names per monetary config, in config order
monetary_configs = [
    monetary_config_1,
    monetary_config_2,
    monetary_config_3,
    monetary_config_4,
    monetary_config_5,
    monetary_config_6,
    monetary_config_7,
    monetary_config_8,
]
monetary_cols = []
for monetary_cfg in monetary_configs:
    monetary_cols.append(extract_columns(monetary_cfg, time_windows))

# One list of column names per unique-count config, in config order
unique_count_configs = [unique_count_config_1, unique_count_config_2]
unique_count_cols = []
for unique_count_cfg in unique_count_configs:
    unique_count_cols.append(extract_columns(unique_count_cfg, time_windows))

In [None]:
# Reduce every feature frame to the merge keys plus its own feature columns
df_time_diff = df_time_diff[merge_keys + time_shift_cols]
df_freq = df_freq[merge_keys + freq_cols]

# Pair each monetary frame with the column list resolved for its config
monetary_sources = (
    df_monetary_1,
    df_monetary_2,
    df_monetary_3,
    df_monetary_4,
    df_monetary_5,
    df_monetary_6,
    df_monetary_7,
    df_monetary_8,
)
monetary_dfs = []
for monetary_frame, monetary_feature_cols in zip(monetary_sources, monetary_cols):
    monetary_dfs.append(monetary_frame[merge_keys + monetary_feature_cols])

unique_count_dfs = [
    df_unique_count_1[merge_keys + unique_count_cols[0]],
    df_unique_count_2[merge_keys + unique_count_cols[1]],
]

# Outer-join all feature frames on the merge keys, left to right.
# NOTE(review): these joins assume the merge keys are unique in every frame;
# a repeated key would multiply rows with each merge. Consider asserting
# uniqueness (or passing validate="one_to_one") once that is confirmed.
dfs_to_merge = [df_time_diff, df_freq] + monetary_dfs + unique_count_dfs
df_final = dfs_to_merge[0]
for feature_frame in dfs_to_merge[1:]:
    df_final = pd.merge(df_final, feature_frame, on=merge_keys, how="outer")

# Attach the raw amounts and the "Confirmed" column from the transaction frame
extra_cols = merge_keys + ["Transaction Amount", "Card Billing Amount", "Confirmed"]
df_final = pd.merge(df_final, df_c09[extra_cols], on=merge_keys, how="left")