In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.metrics import classification_report
from lightgbm import LGBMClassifier

from src.config import *
from src.helpers import *

In [14]:
from dotenv import load_dotenv

# Pull variables from a local .env file (when one exists) into the process
# environment, so the dataset location is not hardcoded in the notebook.
load_dotenv()

# Location of the training CSV. os.getenv returns None when the variable is
# missing, which would otherwise only surface later as an opaque read_csv
# error, so fail right here with an actionable message instead.
TRAIN_DATA_PATH = os.getenv("TRAIN_DATA_PATH")
if not TRAIN_DATA_PATH:
    raise RuntimeError(
        "TRAIN_DATA_PATH is not set; define it in the environment or in a .env file"
    )

In [15]:
# Read the whole training CSV into a DataFrame; the path was resolved from the
# TRAIN_DATA_PATH environment variable.
data = pd.read_csv(filepath_or_buffer=TRAIN_DATA_PATH)

**Transaction amount mismatch feature**

In [16]:
# Preview the first five rows to check that the columns and values loaded as expected.
data.head(n=5)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,urgency_level,id
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,1
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,2
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,3
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,4
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,5


In [None]:
# Size argument handed to subset() for the hold-out split.
# NOTE(review): presumably a percentage of rows — the class-count tables later
# in this notebook put the hold-out at roughly 15% of all rows; confirm
# against subset() in src.helpers.
TEST_SUBSET_SIZE = 15

# Carve out the hold-out rows first, then keep every remaining row for training.
df_test = subset(data, TEST_SUBSET_SIZE)
df_train = data.drop(index=df_test.index)

# The two frames must partition `data`. Dropping by index label removes every
# row carrying that label, so repeated labels (or a hold-out sampled with
# repeats) would silently lose or duplicate rows — catch that here.
assert len(df_train) + len(df_test) == len(data), (
    "train/test split does not partition data"
)

**Preprocessing**

In [5]:
def _preprocess(frame):
    """Run the shared preprocessing steps on one split and return the result.

    Steps, in order: remove_correlations with CORRELATED_FEATURES_TO_REMOVE,
    log_trainsform with LOG_TRANSFORM_FEATURES, encode_categorical, and
    encode_names with NAME_COLS. Keeping them in one place guarantees that the
    train and test splits go through exactly the same sequence.
    """
    frame = remove_correlations(frame, CORRELATED_FEATURES_TO_REMOVE)
    frame = log_trainsform(frame, LOG_TRANSFORM_FEATURES)
    # Both encoders return an indexable result; only element 0 is kept.
    # NOTE(review): whatever the remaining elements hold (possibly fitted
    # encoders or mappings) is discarded, and each split is encoded by its own
    # call — confirm that train and test end up with a consistent encoding.
    frame = encode_categorical(frame)[0]
    frame = encode_names(frame, NAME_COLS)[0]
    return frame


# Train is processed before test, matching the original execution order.
df_train = _preprocess(df_train)
df_test = _preprocess(df_test)

In [6]:
# Class balance of the training split: per the rendered table, one row per
# target class with its row count and percentage share.
# NOTE(review): TARGET_COL presumably names the `urgency_level` column shown
# in the output — confirm in src.config.
train_target_summary = target_summary_table(df_train, TARGET_COL)
train_target_summary


Unnamed: 0,urgency_level,count,percentage
0,0,5302177,99.894
1,1,1848,0.035
2,2,1841,0.035
3,3,1937,0.036


In [7]:
# Same class-balance summary for the hold-out split, shown so its class shares
# can be compared with the training split's.
# NOTE(review): TARGET_COL presumably names the `urgency_level` column shown
# in the output — confirm in src.config.
test_target_summary = target_summary_table(df_test, TARGET_COL)
test_target_summary

Unnamed: 0,urgency_level,count,percentage
0,0,935726,99.899
1,1,328,0.035
2,2,310,0.033
3,3,307,0.033


**Feature selection**

- Use a simple LightGBM model to assess feature importance

In [8]:
# Only element 0 of lgbm_feature_importance's result is kept; the rendered
# output shows it as a per-feature table with normalised, percentage and
# cumulative importance columns.
# NOTE(review): the remaining elements of the return value are discarded here
# — presumably the fitted model or similar; confirm in src.helpers.
lgbm_features = lgbm_feature_importance(df_train)[0]
lgbm_features

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.022183 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1107
[LightGBM] [Info] Number of data points in the train set: 5307803, number of used features: 11
[LightGBM] [Info] Start training from score -0.001061
[LightGBM] [Info] Start training from score -7.962829
[LightGBM] [Info] Start training from score -7.966624
[LightGBM] [Info] Start training from score -7.915793


Unnamed: 0,feature,importance_norm,importance_pct,importance_cum
0,oldbalanceOrg,0.546319,54.632,54.632
1,amount,0.155939,15.594,70.226
2,oldbalanceDest,0.075009,7.501,77.727
3,step,0.070465,7.046,84.773
4,type_TRANSFER,0.065887,6.589,91.362
5,type_CASH_IN,0.031277,3.128,94.49
6,nameDest,0.026099,2.61,97.1
7,type_CASH_OUT,0.015879,1.588,98.688
8,type_PAYMENT,0.012858,1.286,99.974
9,type_DEBIT,0.000258,0.026,100.0


- *nameOrig, type_DEBIT, type_PAYMENT, type_CASH_OUT* predictors are not important for the tree model (the three that appear in the table above together account for ~2.9% of importance; *nameOrig* is not listed in it)
- They will be removed to reduce the variance of the model

In [10]:
# Apply remove_weak_features with WEAK_FEATURES_TO_REMOVE to both splits
# (train first, then test) so they go through the same feature reduction.
df_train, df_test = (
    remove_weak_features(split, WEAK_FEATURES_TO_REMOVE)
    for split in (df_train, df_test)
)
