In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory, then print them one per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the two training tables from the competition input directory.
train_transaction = pd.read_csv(
    '/kaggle/input/ieee-fraud-detection/train_transaction.csv'
)
train_identity = pd.read_csv(
    '/kaggle/input/ieee-fraud-detection/train_identity.csv'
)

# Left join keeps every transaction row; identity columns are NaN where no
# matching TransactionID exists in the identity table.
df = pd.merge(
    train_transaction,
    train_identity,
    how='left',
    on='TransactionID',
)

In [None]:
# Initialise the DagsHub client for this repository with its MLflow
# integration switched on.
dagshub.init(
    repo_owner='gioeba',
    repo_name='IEEE-CIS-Fraud-Detection',
    mlflow=True,
)

# Feature Engineering

In [None]:
class FeatureEngineeringTransformer(BaseEstimator, TransformerMixin):
    """Add time, device and card-based features to a transaction frame.

    Nothing is learned in ``fit``. Every derived value -- including which
    sparse columns are dropped and the per-``card1`` aggregates -- is computed
    from the frame passed to ``transform``, so the output depends on the rows
    in that particular frame.
    """

    def fit(self, X, y=None):
        """No-op fit; returns ``self`` so the step can be used in a pipeline."""
        return self

    def transform(self, X):
        """Return a new frame with engineered features; ``X`` is not modified.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain ``TransactionDT`` (an offset in seconds) and
            ``DeviceInfo``. ``card1``, ``card2``, ``addr1`` and
            ``TransactionAmt`` are used when they are present after the
            missing-value filter.

        Returns
        -------
        pandas.DataFrame
            A copy of ``X`` in which:

            * ``TransactionDT`` is converted to a timedelta;
            * ``hour``, ``weekday`` and ``day`` are taken from
              ``2017-12-01 + TransactionDT``;
            * ``DeviceInfo`` is lower-cased with missing values set to
              ``'unknown'``; ``device_os`` is its first run of letters and
              ``device_ver`` its first run of digits (as a string);
            * columns whose missing-value fraction is 0.5 or more are removed;
            * ``card1_card2_ratio``, ``card1_addr1``, ``card1_count`` and
              ``card1_mean`` are added when their source columns survived.

        Raises
        ------
        KeyError
            If ``TransactionDT`` or ``DeviceInfo`` is absent from ``X``.
        """
        # Work on a private copy: a transformer must not add columns to, or
        # change dtypes in, the frame owned by the caller.
        X = X.copy()

        START_DATE = pd.to_datetime('2017-12-01')
        X['TransactionDT'] = pd.to_timedelta(X['TransactionDT'], unit='s')
        # Kept as a local Series so no scratch column has to be added and
        # then dropped again.
        timestamps = START_DATE + X['TransactionDT']
        X['hour'] = timestamps.dt.hour
        X['weekday'] = timestamps.dt.weekday
        X['day'] = timestamps.dt.day

        X['DeviceInfo'] = X['DeviceInfo'].fillna('unknown').str.lower()
        X['device_os'] = X['DeviceInfo'].str.extract(r'([a-z]+)', expand=False)
        X['device_ver'] = X['DeviceInfo'].str.extract(r'(\d+)', expand=False)

        # Keep only columns with fewer than half of their values missing.
        # This is evaluated on the frame being transformed, so the surviving
        # column set can differ between frames.
        X = X.loc[:, X.isnull().mean() < 0.5].copy()

        if 'card1' in X.columns and 'card2' in X.columns:
            X['card1_card2_ratio'] = X['card1'] / (X['card2'] + 1)
        if 'card1' in X.columns and 'addr1' in X.columns:
            X['card1_addr1'] = X['card1'].astype(str) + '_' + X['addr1'].astype(str)
        if 'card1' in X.columns and 'TransactionAmt' in X.columns:
            # Group once and reuse; both aggregates are broadcast back to
            # every row of the group.
            amount_by_card = X.groupby('card1')['TransactionAmt']
            card1_count = amount_by_card.transform('count')
            card1_mean = amount_by_card.transform('mean')
            X['card1_count'] = card1_count
            X['card1_mean'] = card1_mean

        return X

# Create the feature-engineering step (its fit learns nothing, so the
# instance can be logged as-is).
feature_engineering = FeatureEngineeringTransformer()

# Store the transformer object as a model artifact in its own experiment/run.
mlflow.set_experiment(experiment_name="ieee-fe")
with mlflow.start_run(run_name="fe-v2"):
    mlflow.sklearn.log_model(
        feature_engineering,
        "feature_engineering",
    )

# Preprocessing

In [None]:
class DynamicPreprocessor(BaseEstimator, TransformerMixin):
    """Impute, scale and encode columns that are chosen by dtype at fit time.

    Numeric columns are median-imputed and standardised; ``object`` columns
    are imputed with the most frequent value and ordinal-encoded, with
    categories unseen during ``fit`` mapped to ``-1``.
    """

    def __init__(self):
        # Holds the fitted ColumnTransformer; stays None until fit() is called.
        self.preprocessor = None

    def fit(self, X, y=None):
        """Pick numeric/categorical columns from ``X`` and fit the transformers.

        Parameters
        ----------
        X : pandas.DataFrame
            Training frame; its dtypes decide which columns are processed.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        cat_cols = X.select_dtypes(include='object').columns.tolist()
        # Select every numeric width, not just int64/float64: columns such as
        # int32/float32/uint8 previously matched neither list and were
        # silently discarded by the ColumnTransformer. Timedeltas are excluded
        # explicitly because NumPy treats timedelta64 as an integer subtype.
        num_cols = X.select_dtypes(include='number', exclude='timedelta').columns.tolist()

        num_transform = Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])

        cat_transform = Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
        ])

        # Columns in neither list are not passed to any transformer.
        self.preprocessor = ColumnTransformer([
            ('num', num_transform, num_cols),
            ('cat', cat_transform, cat_cols)
        ])
        self.preprocessor.fit(X)
        return self

    def transform(self, X):
        """Apply the transformers fitted in ``fit`` to ``X``.

        ``fit`` must have been called first; before that ``self.preprocessor``
        is ``None`` and this call fails with ``AttributeError``.
        """
        return self.preprocessor.transform(X)

# Create the preprocessing step before logging it: no module-level
# `preprocessor` is defined earlier in the notebook, so without this line the
# log_model call below raises NameError on a fresh kernel.
preprocessor = DynamicPreprocessor()

# Store the (unfitted) preprocessor object as a model artifact in its own
# experiment/run.
mlflow.set_experiment("ieee-pre-processor")
with mlflow.start_run(run_name="pre-processor-v2"):
    mlflow.sklearn.log_model(preprocessor, "pre-processor")