In [36]:
import os
import sys
import pandas as pd
import polars as pl
import numpy as np
import json
import lightgbm as lgb
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.metrics import balanced_accuracy_score

# Project root: the parent of the directory this notebook is executed from.
ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Expose the project root on the import path (only once, so re-running
# the cell does not keep appending duplicates).
if ROOT not in sys.path:
    sys.path.append(ROOT)

In [50]:
# Processed datasets live under <ROOT>/data/processed.
PROCESSED_DIR = os.path.join(ROOT, 'data', 'processed')
TRAIN_PATH = os.path.join(PROCESSED_DIR, 'train.parquet')
TEST_PATH = os.path.join(PROCESSED_DIR, 'test.parquet')

# Destination of the JSON submission written at the end of the notebook.
PREDICTION_IDS_PATH = os.path.join(ROOT, 'predictions', 'predictions_3.json')

# Load both splits as polars DataFrames.
train, test = pl.read_parquet(TRAIN_PATH), pl.read_parquet(TEST_PATH)

In [30]:
import polars as pl

class EngineerData:
    """
    Data-preparation helpers for the fraud model.

    Combines optional downsampling of the negative class (training only)
    with the selection of the categorical / numerical feature columns, and
    hands the result back as a pandas DataFrame.
    """

    def __init__(self):
        pass

    def downsample_data_pl(self, df: pl.DataFrame, neg_ratio=None, is_train: bool=True, seed: int=23) -> pl.DataFrame:
        """
        Downsample the negative class (``fraud == 0``) of a training DataFrame.

        Parameters:
        df (pl.DataFrame): DataFrame to be downsampled. Must contain a 'fraud'
                           column when downsampling is actually performed.
        neg_ratio (float, optional): Number of negative samples to keep per positive sample.
                                    If None, 0 or negative, the data is returned unchanged.
                                    Defaults to None.
        is_train (bool): If False, the data is returned unchanged. Defaults to True.
        seed (int): Seed passed to the polars sampler. Defaults to 23.

        Returns:
        pl.DataFrame: The sampled negatives followed by all positives when
                      is_train is True and neg_ratio > 0, otherwise the original DataFrame.
        """
        # Nothing to do at inference time or when no positive ratio was requested.
        if not is_train or neg_ratio is None or neg_ratio <= 0:
            return df

        # Separate positive and negative cases
        p_cases = df.filter(pl.col('fraud') == 1)
        n_cases = df.filter(pl.col('fraud') == 0)

        # Sampling is done without replacement, so polars raises if more rows
        # are requested than exist; cap the request at the available negatives.
        n_negative_samples = min(int(p_cases.height * neg_ratio), n_cases.height)
        n_cases = n_cases.sample(n=n_negative_samples, seed=seed)

        df = pl.concat([n_cases, p_cases])
        print("Downsampling...")
        print(f"New target distribution: \n {df['fraud'].value_counts(normalize=True)}")

        return df

    def feature_engineering(self, df):
        """
        Declare the feature columns used by the model.

        No columns are added or modified at the moment; the DataFrame is
        returned as received together with the fixed feature lists.

        Parameters:
        df (pl.DataFrame): Input DataFrame.

        Returns:
        tuple: (df, cat_cols, num_cols) - the unchanged DataFrame, the list of
               categorical column names and the list of numerical column names.
        """

        cat_cols = [
            'client_id', 'card_id', 'use_chip', 'merchant_id',
            'merchant_city', 'merchant_state', 'zip', 'mcc'
        ]
        num_cols = ['amount_clean']

        return df, cat_cols, num_cols

    def process_data(self, df, neg_ratio=10, is_train: bool=True):
        """
        Process data by downsampling if applicable and performing feature engineering.

        Parameters:
        df (pl.DataFrame): Input DataFrame.
        neg_ratio (int): Number of negative cases kept per positive case when
                         downsampling. Ignored when is_train is False.
        is_train (bool): Indicates whether the data is for training or inference.

        Returns:
        tuple: Pandas DataFrame with processed data, list of categorical columns, list of numerical columns.
        """
        df = self.downsample_data_pl(df, neg_ratio=neg_ratio, is_train=is_train)
        df, cat_cols, num_cols = self.feature_engineering(df)

        # Downstream modelling code works on pandas, so convert at the boundary.
        return df.to_pandas(), cat_cols, num_cols


In [31]:
# Number of negative (non-fraud) rows kept per positive (fraud) row when
# downsampling the training set; 10 gives the ~91% / ~9% split shown below.
NEG_RATIO = 10

In [35]:
ed = EngineerData()

# Training split: negatives are downsampled to NEG_RATIO per positive.
train_dw, CAT_COLS, NUM_COLS = ed.process_data(train, neg_ratio=NEG_RATIO, is_train=True)

# Test split: is_train=False disables downsampling. Only the DataFrame is
# needed, so index the returned tuple instead of unpacking into `_`, which
# would overwrite IPython's "last output" variable.
test_ = ed.process_data(test, neg_ratio=NEG_RATIO, is_train=False)[0]

FEATURES = CAT_COLS + NUM_COLS
TARGET = 'fraud'

Downsampling...
New target distribution: 
 shape: (2, 2)
┌───────┬────────────┐
│ fraud ┆ proportion │
│ ---   ┆ ---        │
│ i64   ┆ f64        │
╞═══════╪════════════╡
│ 0     ┆ 0.909091   │
│ 1     ┆ 0.090909   │
└───────┴────────────┘


# Cross-Validation

In [48]:
# LightGBM configuration: binary classification with 200 gradient-boosted
# trees, silent logging and a fixed seed.
lgb_params = dict(
    objective='binary',
    verbose=-1,
    random_state=23,
    boosting_type='gbdt',
    n_estimators=200,
)

estimator = lgb.LGBMClassifier(**lgb_params)

In [47]:
# Feature matrix and target taken from the downsampled training frame.
X, y = train_dw[FEATURES], train_dw[TARGET]

# 5 shuffled folds with a fixed seed.
cv = KFold(n_splits=5, shuffle=True, random_state=23)

results = cross_validate(
    estimator,
    X,
    y,
    scoring='balanced_accuracy',
    cv=cv,
    return_train_score=True,
)

# (label shown in the report, key in the cross_validate result dict)
score_keys = (("Train", "train_score"), ("Validation", "test_score"))

# Per-fold Balanced Accuracy scores for both train and validation
for label, key in score_keys:
    print(f"Folds {label} Balanced Accuracy scores: {results[key]}")

# Mean Balanced Accuracy scores
print("*" * 50)
for label, key in score_keys:
    print(f"Mean {label} Balanced Accuracy score: {np.mean(results[key])}")

Folds Train Balanced Accuracy scores: [0.99995321 0.99990643 0.99995306 0.99990548 1.        ]
Folds Validation Balanced Accuracy scores: [0.94726901 0.94959699 0.95167175 0.94343175 0.94449427]
**************************************************
Mean Train Balanced Accuracy score: 0.9999436370112189
Mean Validation Balanced Accuracy score: 0.9472927561362153


# Predict

In [49]:
# Refit on the full (downsampled) training set and label the test rows.
estimator.fit(X, y)
preds = estimator.predict(test_[FEATURES])

# Work on an explicit copy: assigning a column to a slice of `test_`
# triggers pandas' SettingWithCopyWarning and may or may not write through.
# Only 'id' is needed, so the test set does not have to carry a target column.
submission = test_[['id']].copy()
# Map the predicted class labels (0 / 1) to the 'No' / 'Yes' strings
# written to the submission file.
submission[TARGET] = np.where(preds == 0, 'No', 'Yes')

submission_dict = submission.set_index('id')[TARGET].to_dict()

submission_json = {
    "target": submission_dict
}

# Make sure the output directory exists before writing.
os.makedirs(os.path.dirname(PREDICTION_IDS_PATH), exist_ok=True)
with open(PREDICTION_IDS_PATH, 'w') as f:
    json.dump(submission_json, f)

print(f"Updated predictions saved to {PREDICTION_IDS_PATH}")