In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.metrics import classification_report

from src.config import *
from src.helpers import *

In [2]:
from dotenv import load_dotenv
load_dotenv()

TRAIN_DATA_PATH = os.getenv("TRAIN_DATA_PATH")

In [3]:
data = pd.read_csv(TRAIN_DATA_PATH)

## **Additional features**

**Balance mismatches**

- **Conservation law:** new balance = old balance +- amount 
- TRANSFER and CASH OUT types expect strict conservation
- Mismatch may signal suspecious transactions

In [4]:
EPS = 1e-2

data["orig_balance_mismatch"] = (
    data["type"].isin(STRICT_TYPES)
    & (np.abs(data["oldbalanceOrg"] - data["amount"] - data["newbalanceOrig"]) > EPS)
).astype(int)

data["dest_balance_mismatch"] = (
    data["type"].isin(STRICT_TYPES) & (np.abs(data["oldbalanceDest"] + data["amount"] - data["newbalanceDest"]) > EPS)
).astype(int)

**Amount ratios**

- amount / balance

In [5]:
data["orig_amount_ratio"] = data["amount"] / (data["oldbalanceOrg"] + 1e-6).clip(upper=10)
data["dest_amount_ratio"] = data["amount"] / (data["oldbalanceDest"] + 1e-6).clip(upper=10)

**Stress flags**

- very large amounts and zero balances

In [6]:
data["orig_drain"] = (
    data["amount"] >= 0.98 * data["oldbalanceOrg"]
).astype(int)

data["dest_jump"] = (
    data["amount"] >= 5 * (data["oldbalanceDest"] + 1)
).astype(int)

data["dest_zero_before"] = (data["oldbalanceDest"] == 0).astype(int)

data["orig_zero_after"] = (data["newbalanceOrig"] == 0).astype(int)


## **Preprocessing**

In [7]:
df_test = subset(data, 15)
df_train = data.drop(index=df_test.index)

In [8]:
df_train = remove_correlations(df_train, CORRELATED_FEATURES_TO_REMOVE)
df_train = log_trainsform(df_train, LOG_TRANSFORM_FEATURES)
df_train = encode_categorical(df_train)[0]
df_train = encode_names(df_train, NAME_COLS)[0]

df_test = remove_correlations(df_test, CORRELATED_FEATURES_TO_REMOVE)
df_test = log_trainsform(df_test, LOG_TRANSFORM_FEATURES)
df_test = encode_categorical(df_test)[0]
df_test = encode_names(df_test, NAME_COLS)[0]

In [9]:
train_target_summary = target_summary_table(df_train, TARGET_COL)
train_target_summary


Unnamed: 0,urgency_level,count,percentage
0,0,5302177,99.894
1,1,1848,0.035
2,2,1841,0.035
3,3,1937,0.036


In [10]:
test_target_summary = target_summary_table(df_test, TARGET_COL)
test_target_summary

Unnamed: 0,urgency_level,count,percentage
0,0,935726,99.899
1,1,328,0.035
2,2,310,0.033
3,3,307,0.033


**Feature selection**

- Use simple lightGBM model to assess feature importance

In [11]:
lgbm_features = lgbm_feature_importance(df_train)[0]
lgbm_features

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.117714 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1629
[LightGBM] [Info] Number of data points in the train set: 5307803, number of used features: 19
[LightGBM] [Info] Start training from score -0.001061
[LightGBM] [Info] Start training from score -7.962829
[LightGBM] [Info] Start training from score -7.966624
[LightGBM] [Info] Start training from score -7.915793


Unnamed: 0,feature,importance_norm,importance_pct,importance_cum
0,orig_balance_mismatch,0.2835126,28.351,28.351
1,oldbalanceOrg,0.2040787,20.408,48.759
2,orig_zero_after,0.1794753,17.948,66.707
3,orig_amount_ratio,0.1035589,10.356,77.063
4,dest_amount_ratio,0.07904634,7.905,84.968
5,type_TRANSFER,0.0633638,6.336,91.304
6,orig_drain,0.05568085,5.568,96.872
7,amount,0.01450756,1.451,98.323
8,type_CASH_OUT,0.01089619,1.09,99.413
9,dest_balance_mismatch,0.004238331,0.424,99.837


Weak predictors (~1.7% importance together):

- nameOrig, nameDest
- oldbalanceDest
- step
- dest_jump, dest_zero_before
- type_PAYMENT, type_CASH_IN, type_DEBIT, type_CASH_OUT
- dest_balance_mismatch



In [11]:
df_train = remove_weak_features(df_train, WEAK_FEATURES_TO_REMOVE)
df_test = remove_weak_features(df_test, WEAK_FEATURES_TO_REMOVE)
