In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import gc
import time
import joblib
import os

In [2]:
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder # Choose appropriate encoder
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif, RFE
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_auc_score, roc_curve

In [3]:
# Model imports
import xgboost as xgb

# MLflow imports
import mlflow
print("Libraries imported successfully.")

Libraries imported successfully.


In [4]:
import dagshub
dagshub.init(repo_owner='enelene', repo_name='ieee-cis-fraud-detection-mlops', mlflow=True)

DAGSHUB_REPO_OWNER = 'enelene' # YOUR DagsHub Username
DAGSHUB_REPO_NAME = 'ieee-cis-fraud-detection-mlops' # YOUR Repository Name
MLFLOW_TRACKING_URI = f"https://dagshub.com/{DAGSHUB_REPO_OWNER}/{DAGSHUB_REPO_NAME}.mlflow"

# Set the MLflow Experiment specific to this model
# This will create the experiment if it doesn't exist
MODEL_ARCHITECTURE = "XGBoost"
EXPERIMENT_NAME = f"{MODEL_ARCHITECTURE}_Training"

try:
    # Optional: Initialize DagsHub integration (if not already configured globally)
    # dagshub.init(repo_owner=DAGSHUB_REPO_OWNER, repo_name=DAGSHUB_REPO_NAME, mlflow=True)

    mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
    mlflow.set_experiment(EXPERIMENT_NAME)
    print(f"MLflow tracking URI set to: {MLFLOW_TRACKING_URI}")
    print(f"MLflow active experiment set to: {EXPERIMENT_NAME}")
except Exception as e:
    print(f"Could not configure MLflow: {e}. Ensure DagsHub repo exists and you have access.")
    # Depending on your setup, you might need to manually authenticate DagsHub first.

MLflow tracking URI set to: https://dagshub.com/enelene/ieee-cis-fraud-detection-mlops.mlflow
MLflow active experiment set to: XGBoost_Training


In [5]:
# --- Global Config ---
SAMPLE_ROWS = 100000 
TARGET = 'isFraud'
RANDOM_STATE = 42

# --- Preprocessing Config ---
MISSING_THRESHOLD_DROP_COLS = 90
NUMERICAL_IMPUTATION_STRATEGY = 'median' 
CATEGORICAL_IMPUTATION_STRATEGY = 'Missing'

# --- Feature Engineering Config ---
CREATE_TIME_FEATURES = True
CREATE_INTERACTION_FEATURES = True
CREATE_AGGREGATION_FEATURES = True
CATEGORICAL_ENCODING_STRATEGY = 'LabelEncoding' 

# --- Feature Selection Config ---
APPLY_VARIANCE_THRESHOLD = True
VARIANCE_THRESHOLD_VALUE = 0.01
APPLY_CORRELATION_THRESHOLD = True
CORRELATION_THRESHOLD_DROP_COLS = 0.98
APPLY_MODEL_BASED_FS = True # Using LGBM for importance as an example selector
N_FEATURES_MODEL_BASED = 200 # Number of features to keep from model importance

# --- Training Config ---
N_SPLITS_CV = 5 # Number of folds for cross-validation
N_ITER_RANDOM_SEARCH = 15 # Number of iterations for RandomizedSearchCV hyperparameter tuning

In [6]:

def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if pd.isna(c_min) or pd.isna(c_max): continue
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if pd.isna(c_min) or pd.isna(c_max): continue
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                     if df[col].nbytes > df[col].astype(np.float32).nbytes:
                          df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print(f'Mem. usage decreased to {end_mem:.2f} Mb ({100 * (start_mem - end_mem) / start_mem:.1f}% reduction)')
    return df


In [7]:
def create_time_features(df, dt_column='TransactionDT'):
    if dt_column not in df.columns:
        print(f"Warning: {dt_column} not found. Skipping time features.")
        return df, []
    df['Transaction_DayNumber'] = df[dt_column] // (24 * 60 * 60)
    df['Transaction_SecondInDay'] = df[dt_column] % (24 * 60 * 60)
    df['Transaction_Hour'] = df['Transaction_SecondInDay'] // 3600
    df['Transaction_DayOfWeek'] = (df['Transaction_DayNumber']) % 7
    new_time_features = ['Transaction_DayNumber', 'Transaction_SecondInDay', 'Transaction_Hour', 'Transaction_DayOfWeek']
    print(f"Created time features: {new_time_features}")
    return df, new_time_features

In [8]:
with mlflow.start_run(run_name="Data_Loading_Merge") as run_load:
    print(f"\n--- Starting MLflow Run: {run_load.info.run_name} ({run_load.info.run_uuid}) ---")
    mlflow.log_param("sample_rows", SAMPLE_ROWS if SAMPLE_ROWS is not None else "All")
    mlflow.log_param("random_state", RANDOM_STATE)

    train_trans = pd.read_csv('data/train_transaction.csv', nrows=SAMPLE_ROWS)
    train_id = pd.read_csv('data/train_identity.csv', nrows=SAMPLE_ROWS)
    print(f"Loaded {len(train_trans)} transaction rows, {len(train_id)} identity rows")

    train_df = pd.merge(train_trans, train_id, on='TransactionID', how='left')
    del train_trans, train_id; gc.collect()

    mlflow.log_param("initial_rows", train_df.shape[0])
    mlflow.log_param("initial_cols", train_df.shape[1])
    print(f"Initial merged shape: {train_df.shape}")

    train_df = reduce_mem_usage(train_df)

    # Basic Exploration
    print(f"Target Distribution (%):\n{train_df[TARGET].value_counts(normalize=True) * 100}")

    # Log initial target distribution
    target_counts = train_df[TARGET].value_counts()
    mlflow.log_metric("initial_target_count_0", target_counts.get(0, 0))
    mlflow.log_metric("initial_target_count_1", target_counts.get(1, 0))

    print(f"--- Finished MLflow Run: {run_load.info.run_name} ---")


--- Starting MLflow Run: Data_Loading_Merge (7c6b6d10e8fe4530bb8a7193529f7fa3) ---
Loaded 100000 transaction rows, 100000 identity rows
Initial merged shape: (100000, 434)
Mem. usage decreased to 176.91 Mb (46.6% reduction)
Target Distribution (%):
isFraud
0    97.439
1     2.561
Name: proportion, dtype: float64
--- Finished MLflow Run: Data_Loading_Merge ---
🏃 View run Data_Loading_Merge at: https://dagshub.com/enelene/ieee-cis-fraud-detection-mlops.mlflow/#/experiments/1/runs/7c6b6d10e8fe4530bb8a7193529f7fa3
🧪 View experiment at: https://dagshub.com/enelene/ieee-cis-fraud-detection-mlops.mlflow/#/experiments/1


In [9]:
with mlflow.start_run(run_name=f"Cleaning_Drop{MISSING_THRESHOLD_DROP_COLS}_Num{NUMERICAL_IMPUTATION_STRATEGY}_Cat{CATEGORICAL_IMPUTATION_STRATEGY}") as run_clean:
    print(f"\n--- Starting MLflow Run: {run_clean.info.run_name} ({run_clean.info.run_uuid}) ---")
    mlflow.log_param("missing_threshold_drop_cols", MISSING_THRESHOLD_DROP_COLS)
    mlflow.log_param("numerical_imputation", NUMERICAL_IMPUTATION_STRATEGY)
    mlflow.log_param("categorical_imputation", CATEGORICAL_IMPUTATION_STRATEGY)

    print(f"\nCleaning: Dropping columns with >{MISSING_THRESHOLD_DROP_COLS}% missing values...")
    initial_cols = train_df.columns.tolist()
    missing_values = (train_df.isnull().sum() / len(train_df) * 100)
    cols_to_drop_missing = missing_values[missing_values > MISSING_THRESHOLD_DROP_COLS].index.tolist()
    train_df = train_df.drop(columns=cols_to_drop_missing)
    print(f"Dropped {len(cols_to_drop_missing)} columns due to high NaNs.")
    mlflow.log_param("num_cols_dropped_missing", len(cols_to_drop_missing))
    # Log dropped columns list as artifact
    if cols_to_drop_missing:
        pd.Series(cols_to_drop_missing).to_csv("cols_dropped_missing.txt", index=False, header=False)
        mlflow.log_artifact("cols_dropped_missing.txt", "cleaning_info")
        os.remove("cols_dropped_missing.txt")

    categorical_features = [
        'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6',
        'addr1', 'addr2', 'P_emaildomain', 'R_emaildomain',
        'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9',
        'DeviceType', 'DeviceInfo'
    ] + [f'id_{i}' for i in range(12, 39)]
    categorical_features = [col for col in categorical_features if col in train_df.columns]

    numerical_features = [col for col in train_df.columns if col not in categorical_features
                          and col not in ['TransactionID', 'TransactionDT', TARGET]
                          and train_df[col].dtype in ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']]

    print(f"Cleaning: Imputing remaining NaNs (Num: {NUMERICAL_IMPUTATION_STRATEGY}, Cat: {CATEGORICAL_IMPUTATION_STRATEGY})...")
  
    if NUMERICAL_IMPUTATION_STRATEGY == 'median':
        for col in numerical_features:
            if train_df[col].isnull().any(): train_df[col].fillna(train_df[col].median(), inplace=True)
            
    if CATEGORICAL_IMPUTATION_STRATEGY == 'Missing':
         for col in categorical_features:
            # Ensure correct dtype before filling
            if train_df[col].isnull().any():
                if pd.api.types.is_categorical_dtype(train_df[col].dtype):
                     train_df[col] = train_df[col].cat.add_categories('Missing').fillna('Missing')
                else:
                     # Convert to object if needed to hold the string 'Missing'
                     if not pd.api.types.is_object_dtype(train_df[col].dtype):
                          train_df[col] = train_df[col].astype(object)
                     train_df[col].fillna('Missing', inplace=True)

    final_nan_count = train_df.isnull().sum().sum()
    print(f"NaNs remaining after imputation: {final_nan_count}")
    mlflow.log_metric("remaining_nans", final_nan_count)
    mlflow.log_param("cols_after_cleaning", train_df.shape[1])

    print(f"--- Finished MLflow Run: {run_clean.info.run_name} ---")


--- Starting MLflow Run: Cleaning_Drop90_Nummedian_CatMissing (ff3552e002744e66b7cbbc85f345da71) ---

Cleaning: Dropping columns with >90% missing values...
Dropped 12 columns due to high NaNs.
Cleaning: Imputing remaining NaNs (Num: median, Cat: Missing)...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  if train_df[col].isnull().any(): train_df[col].fillna(train_df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  if train_df[col].isnull().any(): train_df[col].fillna(train_df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inp

NaNs remaining after imputation: 0
--- Finished MLflow Run: Cleaning_Drop90_Nummedian_CatMissing ---
🏃 View run Cleaning_Drop90_Nummedian_CatMissing at: https://dagshub.com/enelene/ieee-cis-fraud-detection-mlops.mlflow/#/experiments/1/runs/ff3552e002744e66b7cbbc85f345da71
🧪 View experiment at: https://dagshub.com/enelene/ieee-cis-fraud-detection-mlops.mlflow/#/experiments/1


In [10]:
#7. Feature Engineering
fe_run_name = f"FE_Time{CREATE_TIME_FEATURES}_Interact{CREATE_INTERACTION_FEATURES}_Agg{CREATE_AGGREGATION_FEATURES}_Enc{CATEGORICAL_ENCODING_STRATEGY}"
with mlflow.start_run(run_name=fe_run_name) as run_fe:
    mlflow.log_param("create_time_features", CREATE_TIME_FEATURES)
    mlflow.log_param("create_interaction_features", CREATE_INTERACTION_FEATURES)
    mlflow.log_param("create_aggregation_features", CREATE_AGGREGATION_FEATURES)
    mlflow.log_param("categorical_encoding_strategy", CATEGORICAL_ENCODING_STRATEGY)

    # Keep track of original feature lists before modification
    numerical_features_orig = numerical_features.copy()
    categorical_features_orig = categorical_features.copy()
    newly_created_features = []

    # --- Create Time Features ---
    if CREATE_TIME_FEATURES:
        print("Creating time features...")
        train_df, new_time_feats = create_time_features(train_df, 'TransactionDT')
        numerical_features.extend([tf for tf in new_time_feats if tf not in numerical_features])
        newly_created_features.extend(new_time_feats)

    # --- Create Interaction Features ---
    if CREATE_INTERACTION_FEATURES:
        print("Creating interaction features...")
        # card1_addr1
        if 'card1' in train_df.columns and 'addr1' in train_df.columns:
            feature_name = 'card1_addr1_interaction'
            train_df[feature_name] = train_df['card1'].astype(str) + '_' + train_df['addr1'].astype(str)
            categorical_features.append(feature_name)
            newly_created_features.append(feature_name)
       
        # TransactionAmt / mean(TransactionAmt for ProductCD)
        if 'TransactionAmt' in train_df.columns and 'ProductCD' in train_df.columns:
             feature_name = 'Amt_div_ProductCD_mean'
             map_mean = train_df.groupby('ProductCD')['TransactionAmt'].transform('mean')
             train_df[feature_name] = train_df['TransactionAmt'] / (map_mean + 1e-6) # Add epsilon
             numerical_features.append(feature_name)
             newly_created_features.append(feature_name)
             print(f"Created: {feature_name}")

    if CREATE_AGGREGATION_FEATURES:
        # Example: Aggregating TransactionAmt by card1
        if 'card1' in train_df.columns and 'TransactionAmt' in train_df.columns:
            agg_col = 'card1'
            target_col = 'TransactionAmt'
            funcs = ['mean', 'std', 'count']
            print(f"Aggregating {target_col} by {agg_col}...")
            agg_df = train_df.groupby(agg_col)[target_col].agg(funcs)
            new_agg_cols = [f'{agg_col}_{target_col}_{func}' for func in funcs]
            agg_df.columns = new_agg_cols
            agg_df = agg_df.reset_index() # Make agg_col a column for merging
            train_df = pd.merge(train_df, agg_df, on=agg_col, how='left')
            # Fill NaNs created by std if group size was 1
            for col in new_agg_cols:
                 if 'std' in col and train_df[col].isnull().any():
                      train_df[col].fillna(0, inplace=True)
                 if col not in numerical_features: numerical_features.append(col)
            newly_created_features.extend(new_agg_cols)
            print(f"Created aggregations for {target_col} by {agg_col}: {new_agg_cols}")

        # Example: Aggregate TransactionAmt based on card1 + addr1 combination
        if 'card1' in train_df.columns and 'addr1' in train_df.columns and 'TransactionAmt' in train_df.columns:
            agg_key = ['card1', 'addr1']
            target_col = 'TransactionAmt'
            key_name = 'card1_addr1' # Use a simpler name for columns
            funcs = ['mean', 'std']
            print(f"Aggregating {target_col} by {key_name}...")
            agg_df = train_df.groupby(agg_key)[target_col].agg(funcs).reset_index()
            new_agg_cols = [f'{key_name}_{target_col}_{func}' for func in funcs]
            agg_df.columns = agg_key + new_agg_cols
            train_df = pd.merge(train_df, agg_df, on=agg_key, how='left')
            for col in new_agg_cols:
                    if 'std' in col and train_df[col].isnull().any():
                        train_df[col].fillna(0, inplace=True)
                    if col not in numerical_features: numerical_features.append(col)
            newly_created_features.extend(new_agg_cols)
            print(f"Created aggregations for {target_col} by {key_name}: {new_agg_cols}")

        # Example: Count unique DeviceInfo per card1
        if 'card1' in train_df.columns and 'DeviceInfo' in train_df.columns:
            agg_key = 'card1'
            target_col = 'DeviceInfo'
            feature_name = f'{agg_key}_nunique_{target_col}'
            print(f"Creating aggregation: {feature_name}")
            train_df[feature_name] = train_df.groupby(agg_key)[target_col].transform('nunique')
            if feature_name not in numerical_features: numerical_features.append(feature_name)
            newly_created_features.append(feature_name)

        # Example: Count transactions per card1 within the same 'Transaction_DayNumber'
        if 'card1' in train_df.columns and 'Transaction_DayNumber' in train_df.columns:
            agg_key = ['card1', 'Transaction_DayNumber']
            feature_name = 'card1_trans_count_today'
            print(f"Creating aggregation: {feature_name}")
            train_df[feature_name] = train_df.groupby(agg_key)['TransactionID'].transform('count')
            if feature_name not in numerical_features: numerical_features.append(feature_name)
            newly_created_features.append(feature_name)

        # --- Aggregations based on Vxxx features (Corrected) ---
        if 'P_emaildomain' in train_df.columns:
            v_features_subset = ['V12', 'V35', 'V53', 'V75'] # Example subset
            v_features_subset = [v for v in v_features_subset if v in train_df.columns] # Ensure they exist
            if v_features_subset:
                agg_key = 'P_emaildomain'
                funcs = ['mean', 'std']
                print(f"Aggregating {v_features_subset} by {agg_key} with functions {funcs}...")

                # Perform aggregation
                agg_df = train_df.groupby(agg_key)[v_features_subset].agg(funcs)

                # --- Flatten the MultiIndex columns ---
                new_v_agg_cols = []
                for v_feat in v_features_subset:
                    for func in funcs:
                        new_col_name = f'{agg_key}_{v_feat}_{func}' # Construct the desired flat name
                        new_v_agg_cols.append(new_col_name)

                # Assign the new flat column names directly
                agg_df.columns = new_v_agg_cols
                # Reset index to bring the grouping key (agg_key) back as a column
                agg_df = agg_df.reset_index()
                # --- End Flattening ---

                print(f"Columns in agg_df after flattening: {agg_df.columns.tolist()}") # Debug print

                # Merge back to the original dataframe
                columns_to_merge = [agg_key] + [col for col in new_v_agg_cols if col in agg_df.columns]
                if len(columns_to_merge) > 1: # Check if there are aggregation columns to merge
                    print(f"Merging columns: {columns_to_merge}")
                    train_df = pd.merge(train_df, agg_df[columns_to_merge], on=agg_key, how='left')

                    # Fill NaNs created by std and add to feature lists
                    for col in new_v_agg_cols:
                        if col in train_df.columns: # Check if column was successfully merged
                            if 'std' in col and train_df[col].isnull().any():
                                train_df[col].fillna(0, inplace=True)
                            # Add to numerical features list if not already present
                            if col not in numerical_features:
                                numerical_features.append(col)
                    # Add only successfully merged columns to newly_created list
                    newly_created_features.extend([col for col in new_v_agg_cols if col in train_df.columns])
                    print(f"Successfully created and merged Vxxx aggregations by {agg_key}")
                else:
                    print(f"Warning: No valid aggregated columns found in agg_df for merging.")

        # --- Frequency Encoding Example ---
        # Can be useful for high-cardinality features
        high_card_features = ['DeviceInfo', 'id_31', 'card1_addr1_interaction'] # Example
        for col in high_card_features:
            if col in train_df.columns:
                feature_name = f'{col}_freq_encoding'
                freq_map = (train_df[col].value_counts() / len(train_df)).to_dict()
                train_df[feature_name] = train_df[col].map(freq_map)
                if feature_name not in numerical_features: numerical_features.append(feature_name)
                newly_created_features.append(feature_name)
                print(f"Created frequency encoding: {feature_name}")

        print(f"Total newly created features in this run: {len(newly_created_features)}")
        mlflow.log_param("num_advanced_features_created", len(newly_created_features))

    mlflow.log_param("num_features_created", len(newly_created_features))
    if newly_created_features:
        pd.Series(newly_created_features).to_csv("newly_created_features.txt", index=False, header=False)
        mlflow.log_artifact("newly_created_features.txt", "feature_engineering_info")
        os.remove("newly_created_features.txt")

    # --- Categorical Encoding ---
    print(f"Applying {CATEGORICAL_ENCODING_STRATEGY}...")
    if CATEGORICAL_ENCODING_STRATEGY == 'LabelEncoding':
        label_encoders = {}
        for col in categorical_features:
            if col in train_df.columns:
                train_df[col] = train_df[col].astype(str) # Ensure string type
                encoder = LabelEncoder()
                train_df[col] = encoder.fit_transform(train_df[col])
                label_encoders[col] = encoder 

    if CATEGORICAL_ENCODING_STRATEGY == 'LabelEncoding':
        features_after_fe = sorted(list(set(numerical_features + categorical_features)))

    # Ensure no leakage columns
    features_after_fe = [f for f in features_after_fe if f not in [TARGET, 'TransactionID', 'TransactionDT', 'TransactionDateTime']]

    mlflow.log_param("cols_after_fe_encoding", len(features_after_fe))

    # Save the list of features after FE to pass to the FS stage
    pd.Series(features_after_fe).to_csv("features_after_fe.txt", index=False, header=False)
    mlflow.log_artifact("features_after_fe.txt", "feature_engineering_info")
    os.remove("features_after_fe.txt")

Creating time features...
Created time features: ['Transaction_DayNumber', 'Transaction_SecondInDay', 'Transaction_Hour', 'Transaction_DayOfWeek']
Creating interaction features...
Created: Amt_div_ProductCD_mean
Aggregating TransactionAmt by card1...


  df['Transaction_DayNumber'] = df[dt_column] // (24 * 60 * 60)
  df['Transaction_SecondInDay'] = df[dt_column] % (24 * 60 * 60)
  df['Transaction_Hour'] = df['Transaction_SecondInDay'] // 3600
  df['Transaction_DayOfWeek'] = (df['Transaction_DayNumber']) % 7
  train_df[feature_name] = train_df['card1'].astype(str) + '_' + train_df['addr1'].astype(str)
  train_df[feature_name] = train_df['TransactionAmt'] / (map_mean + 1e-6) # Add epsilon
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df[col].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting val

Created aggregations for TransactionAmt by card1: ['card1_TransactionAmt_mean', 'card1_TransactionAmt_std', 'card1_TransactionAmt_count']
Aggregating TransactionAmt by card1_addr1...
Created aggregations for TransactionAmt by card1_addr1: ['card1_addr1_TransactionAmt_mean', 'card1_addr1_TransactionAmt_std']
Creating aggregation: card1_nunique_DeviceInfo
Creating aggregation: card1_trans_count_today
Aggregating ['V12', 'V35', 'V53', 'V75'] by P_emaildomain with functions ['mean', 'std']...
Columns in agg_df after flattening: ['P_emaildomain', 'P_emaildomain_V12_mean', 'P_emaildomain_V12_std', 'P_emaildomain_V35_mean', 'P_emaildomain_V35_std', 'P_emaildomain_V53_mean', 'P_emaildomain_V53_std', 'P_emaildomain_V75_mean', 'P_emaildomain_V75_std']
Merging columns: ['P_emaildomain', 'P_emaildomain_V12_mean', 'P_emaildomain_V12_std', 'P_emaildomain_V35_mean', 'P_emaildomain_V35_std', 'P_emaildomain_V53_mean', 'P_emaildomain_V53_std', 'P_emaildomain_V75_mean', 'P_emaildomain_V75_std']
Successfu

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df[col].fillna(0, inplace=True)


Applying LabelEncoding...
🏃 View run FE_TimeTrue_InteractTrue_AggTrue_EncLabelEncoding at: https://dagshub.com/enelene/ieee-cis-fraud-detection-mlops.mlflow/#/experiments/1/runs/eb9c710978014d74b039a9426c087c41
🧪 View experiment at: https://dagshub.com/enelene/ieee-cis-fraud-detection-mlops.mlflow/#/experiments/1


In [11]:
# 8. Feature Selection

print("\nSplitting data into Train/Validation sets for Feature Selection...")
# Use the feature list from the previous step
X = train_df[features_after_fe]
y = train_df[TARGET]

# Stratified split is important for imbalanced datasets
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.25, random_state=RANDOM_STATE, stratify=y
)
del X, y, train_df # Conserve memory
gc.collect()
print(f"Train shape: {X_train.shape}, Validation shape: {X_val.shape}")


current_X_train = X_train.copy()
current_X_val = X_val.copy()
selected_features_final = features_after_fe.copy()

fs_methods_applied = []

if APPLY_VARIANCE_THRESHOLD:
    fs_run_name = f"FS_VarianceThreshold_{VARIANCE_THRESHOLD_VALUE}"
    with mlflow.start_run(run_name=fs_run_name, nested=True) as run_fs_var:
        print(f"\n--- Starting MLflow Run: {run_fs_var.info.run_name} ({run_fs_var.info.run_uuid}) ---")
        mlflow.log_param("fs_method", "VarianceThreshold")
        mlflow.log_param("variance_threshold", VARIANCE_THRESHOLD_VALUE)
        mlflow.log_param("input_features", len(current_X_train.columns))

        try:
            selector_var = VarianceThreshold(threshold=VARIANCE_THRESHOLD_VALUE)
            selector_var.fit(current_X_train) # Fit ONLY on training data

            support_mask = selector_var.get_support()
            current_X_train = current_X_train.loc[:, support_mask]
            current_X_val = current_X_val.loc[:, support_mask]
            selected_features_final = current_X_train.columns.tolist()

            num_selected = len(selected_features_final)
            print(f"Variance Threshold selected {num_selected} features.")
            mlflow.log_param("num_features_selected", num_selected)
            fs_methods_applied.append("VarianceThreshold")

            # Log selected features list
            pd.Series(selected_features_final).to_csv("selected_features_variance.txt", index=False, header=False)
            mlflow.log_artifact("selected_features_variance.txt", "feature_selection_info")
            os.remove("selected_features_variance.txt")
        except Exception as e:
            print(f"ERROR during Variance Threshold: {e}")
            mlflow.log_param("status", "Error")
            mlflow.set_tag("Error", "VarianceThreshold Failed")
        print(f"--- Finished MLflow Run: {run_fs_var.info.run_name} ---")


if APPLY_CORRELATION_THRESHOLD and not current_X_train.empty:
    fs_run_name = f"FS_CorrelationThreshold_{CORRELATION_THRESHOLD_DROP_COLS}"
    with mlflow.start_run(run_name=fs_run_name, nested=True) as run_fs_corr:
        print(f"\n--- Starting MLflow Run: {run_fs_corr.info.run_name} ({run_fs_corr.info.run_uuid}) ---")
        mlflow.log_param("fs_method", "CorrelationThreshold")
        mlflow.log_param("correlation_threshold", CORRELATION_THRESHOLD_DROP_COLS)
        mlflow.log_param("input_features", len(current_X_train.columns))

        try:
            corr_matrix = current_X_train.corr().abs()
            upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
            to_drop_corr = [column for column in upper_tri.columns if any(upper_tri[column] > CORRELATION_THRESHOLD_DROP_COLS)]

            current_X_train = current_X_train.drop(columns=to_drop_corr)
            current_X_val = current_X_val.drop(columns=to_drop_corr)
            selected_features_final = current_X_train.columns.tolist()

            num_selected = len(selected_features_final)
            print(f"Dropped {len(to_drop_corr)} columns due to high correlation (> {CORRELATION_THRESHOLD_DROP_COLS}).")
            print(f"Correlation Threshold resulted in {num_selected} features.")
            mlflow.log_param("num_features_dropped", len(to_drop_corr))
            mlflow.log_param("num_features_selected", num_selected)
            fs_methods_applied.append("CorrelationThreshold")

            pd.Series(selected_features_final).to_csv("selected_features_correlation.txt", index=False, header=False)
            mlflow.log_artifact("selected_features_correlation.txt", "feature_selection_info")
            os.remove("selected_features_correlation.txt")
        except Exception as e:
            print(f"ERROR during Correlation Threshold: {e}")
            mlflow.log_param("status", "Error")
            mlflow.set_tag("Error", "CorrelationThreshold Failed")
        print(f"--- Finished MLflow Run: {run_fs_corr.info.run_name} ---")


if APPLY_MODEL_BASED_FS and not current_X_train.empty: 
    fs_run_name = f"FS_ModelBased_Top{N_FEATURES_MODEL_BASED}"
    with mlflow.start_run(run_name=fs_run_name, nested=True) as run_fs_model:
        print(f"\n--- Starting MLflow Run: {run_fs_model.info.run_name} ({run_fs_model.info.run_uuid}) ---")
        mlflow.log_param("fs_method", "ModelBased_Importance")
        mlflow.log_param("fs_model", "XGBoost")
        mlflow.log_param("n_features_target", N_FEATURES_MODEL_BASED)
        mlflow.log_param("input_features", len(current_X_train.columns))

        try:
            fs_model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='auc',
                                         use_label_encoder=False, 
                                         random_state=RANDOM_STATE)
            fs_model.fit(current_X_train, y_train)

            importances = pd.DataFrame({
                'feature': current_X_train.columns,
                'importance': fs_model.feature_importances_
            }).sort_values('importance', ascending=False)

            # Select top N features
            selected_cols_model = importances.head(N_FEATURES_MODEL_BASED)['feature'].tolist()

            current_X_train = current_X_train[selected_cols_model]
            current_X_val = current_X_val[selected_cols_model]
            selected_features_final = current_X_train.columns.tolist()

            num_selected = len(selected_features_final)
            print(f"Model-Based FS selected top {num_selected} features.")
            mlflow.log_param("num_features_selected", num_selected)
            fs_methods_applied.append("ModelBased_Importance")

            # Log importance data and selected list
            importances.to_csv("fs_model_feature_importances.csv", index=False)
            mlflow.log_artifact("fs_model_feature_importances.csv", "feature_selection_info")
            pd.Series(selected_features_final).to_csv("selected_features_model_based.txt", index=False, header=False)
            mlflow.log_artifact("selected_features_model_based.txt", "feature_selection_info")
            os.remove("fs_model_feature_importances.csv")
            os.remove("selected_features_model_based.txt")
        except Exception as e:
            print(f"ERROR during Model Based FS: {e}")
            mlflow.log_param("status", "Error")
            mlflow.set_tag("Error", "ModelBasedFS Failed")
        print(f"--- Finished MLflow Run: {run_fs_model.info.run_name} ---")


# --- Log the final selected features from the chosen FS strategy ---
with mlflow.start_run(run_name="Final_Feature_Set", nested=True) as run_fs_final:
     mlflow.log_param("feature_selection_methods_applied", " -> ".join(fs_methods_applied))
     mlflow.log_param("final_num_features", len(selected_features_final))
     print(f"\nFinal selected features ({len(selected_features_final)}): {selected_features_final[:10]}...")
     # Log final list
     pd.Series(selected_features_final).to_csv("final_selected_features.txt", index=False, header=False)
     mlflow.log_artifact("final_selected_features.txt", "feature_selection_info")
     os.remove("final_selected_features.txt")


# Final feature set for training
X_train_final = current_X_train
X_val_final = current_X_val
features_for_pipeline = selected_features_final # Use this list for pipeline construction

del current_X_train, current_X_val, X_train, X_val # Clean up memory
gc.collect()



Splitting data into Train/Validation sets for Feature Selection...
Train shape: (75000, 443), Validation shape: (25000, 443)

--- Starting MLflow Run: FS_VarianceThreshold_0.01 (7f752eb33a6d469db4f3a06b48a80036) ---
Variance Threshold selected 411 features.
--- Finished MLflow Run: FS_VarianceThreshold_0.01 ---
🏃 View run FS_VarianceThreshold_0.01 at: https://dagshub.com/enelene/ieee-cis-fraud-detection-mlops.mlflow/#/experiments/1/runs/7f752eb33a6d469db4f3a06b48a80036
🧪 View experiment at: https://dagshub.com/enelene/ieee-cis-fraud-detection-mlops.mlflow/#/experiments/1

--- Starting MLflow Run: FS_CorrelationThreshold_0.98 (a59bd081e354420c834c17be42a7ae30) ---
Dropped 54 columns due to high correlation (> 0.98).
Correlation Threshold resulted in 357 features.
--- Finished MLflow Run: FS_CorrelationThreshold_0.98 ---
🏃 View run FS_CorrelationThreshold_0.98 at: https://dagshub.com/enelene/ieee-cis-fraud-detection-mlops.mlflow/#/experiments/1/runs/a59bd081e354420c834c17be42a7ae30
🧪 Vi

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Model-Based FS selected top 200 features.
--- Finished MLflow Run: FS_ModelBased_Top200 ---
🏃 View run FS_ModelBased_Top200 at: https://dagshub.com/enelene/ieee-cis-fraud-detection-mlops.mlflow/#/experiments/1/runs/33a1cc740a294f57ade8e1985e00b4a2
🧪 View experiment at: https://dagshub.com/enelene/ieee-cis-fraud-detection-mlops.mlflow/#/experiments/1

Final selected features (200): ['V189', 'V155', 'V232', 'V274', 'V45', 'V173', 'V53', 'V197', 'card3', 'V317']...
🏃 View run Final_Feature_Set at: https://dagshub.com/enelene/ieee-cis-fraud-detection-mlops.mlflow/#/experiments/1/runs/fa9edec06f75456e98a735ad7a55f00d
🧪 View experiment at: https://dagshub.com/enelene/ieee-cis-fraud-detection-mlops.mlflow/#/experiments/1


66

In [12]:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier


class FraudFeatureEngineer(BaseEstimator, TransformerMixin):
    def __init__(self,
                 create_time_features: bool      = True,
                 create_interaction_features: bool = True,
                 create_aggregation_features: bool = True,
                 categorical_encoding: str       = "LabelEncoding"):
        self.create_time_features       = create_time_features
        self.create_interaction_features= create_interaction_features
        self.create_aggregation_features= create_aggregation_features
        self.categorical_encoding       = categorical_encoding
        # Will be populated in fit:
        self.label_encoders_ = {}

    def fit(self, X: pd.DataFrame, y=None):
        # For LabelEncoding we need to learn all LabelEncoders on full X
        if self.categorical_encoding == "LabelEncoding":
            cat_cols = X.select_dtypes(include=["object", "category"]).columns
            for col in cat_cols:
                le = LabelEncoder()
                # Fill NAs so fit doesn’t crash
                vals = X[col].fillna("Unknown").astype(str)
                le.fit(vals)
                self.label_encoders_[col] = le
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        df = X.copy()

        newly_created = []

        # --- 1) Time Features ---
        if self.create_time_features and "TransactionDT" in df:
            df, new_time_feats = create_time_features(df, "TransactionDT")
            newly_created += new_time_feats

        # --- 2) Interaction Features ---
        if self.create_interaction_features:
            if {"card1","addr1"}.issubset(df.columns):
                df["card1_addr1_interaction"] = (
                    df["card1"].astype(str) + "_" + df["addr1"].astype(str)
                )
                newly_created.append("card1_addr1_interaction")

            if {"TransactionAmt","ProductCD"}.issubset(df.columns):
                m = df.groupby("ProductCD")["TransactionAmt"].transform("mean")
                df["Amt_div_ProductCD_mean"] = df["TransactionAmt"] / (m + 1e-6)
                newly_created.append("Amt_div_ProductCD_mean")

        # --- 3) Aggregation Features ---
        if self.create_aggregation_features:
            # card1 ↔ TransactionAmt stats
            if {"card1","TransactionAmt"}.issubset(df.columns):
                agg = (
                    df.groupby("card1")["TransactionAmt"]
                      .agg(["mean","std","count"])
                      .rename(columns={
                          "mean":"card1_TransactionAmt_mean",
                          "std":"card1_TransactionAmt_std",
                          "count":"card1_TransactionAmt_count"
                      })
                      .reset_index()
                )
                df = df.merge(agg, on="card1", how="left")
                df["card1_TransactionAmt_std"].fillna(0, inplace=True)
                newly_created += list(agg.columns[1:])

            # card1+addr1 ↔ TransactionAmt stats
            if {"card1","addr1","TransactionAmt"}.issubset(df.columns):
                agg = (
                    df.groupby(["card1","addr1"])["TransactionAmt"]
                      .agg(["mean","std"])
                      .rename(columns={
                          "mean":"card1_addr1_TransactionAmt_mean",
                          "std":"card1_addr1_TransactionAmt_std"
                      })
                      .reset_index()
                )
                df = df.merge(agg, on=["card1","addr1"], how="left")
                df["card1_addr1_TransactionAmt_std"].fillna(0, inplace=True)
                newly_created += ["card1_addr1_TransactionAmt_mean",
                                  "card1_addr1_TransactionAmt_std"]

            # unique DeviceInfo per card1
            if {"card1","DeviceInfo"}.issubset(df.columns):
                df["card1_nunique_DeviceInfo"] = (
                    df.groupby("card1")["DeviceInfo"]
                      .transform("nunique")
                )
                newly_created.append("card1_nunique_DeviceInfo")

        # --- 4) Frequency Encoding (example) ---
        for col in ["DeviceInfo","id_31","card1_addr1_interaction"]:
            if col in df.columns:
                freq = df[col].value_counts(normalize=True)
                name = f"{col}_freq_encoding"
                df[name] = df[col].map(freq)
                newly_created.append(name)

        # --- 5) Categorical Encoding ---
        if self.categorical_encoding == "LabelEncoding":
            for col, le in self.label_encoders_.items():
                if col in df.columns:
                    vals = df[col].fillna("Unknown").astype(str)
                    df[col] = le.transform(vals)

        return df


In [17]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    "n_estimators": [100, 300, 500],
    "max_depth":    [4, 6, 8],
    "learning_rate":[0.01, 0.05, 0.1],
}

clf = XGBClassifier(objective="binary:logistic", eval_metric="auc",
                    use_label_encoder=False, random_state=42)

grid = GridSearchCV(clf,
                    param_grid,
                    scoring="roc_auc",
                    cv=3,           # 3-fold CV
                    n_jobs=-1)
grid.fit(X_train_final, y_train,
         eval_set=[(X_val_final, y_val)],
         verbose=False)

print("Best params:", grid.best_params_)
print("Best CV AUC:",  grid.best_score_)


KeyboardInterrupt: 

In [None]:
from sklearn.model_selection import RandomizedSearchCV
import scipy.stats as st

param_dist = {
    "n_estimators": st.randint(100, 1000),
    "max_depth":    st.randint(3, 12),
    "learning_rate": st.loguniform(1e-3, 1e-1),
    "subsample":    st.uniform(0.6, 0.4),
    "colsample_bytree": st.uniform(0.6, 0.4),
}

rand = RandomizedSearchCV(clf,
                          param_distributions=param_dist,
                          n_iter=50,          # try 50 random combos
                          scoring="roc_auc",
                          cv=3,
                          n_jobs=-1,
                          random_state=42)
rand.fit(X_train_final, y_train,
         eval_set=[(X_val_final, y_val)],
         verbose=False)

print("Best params:", rand.best_params_)
print("Best CV AUC:",  rand.best_score_)


In [None]:
import optuna
from sklearn.metrics import roc_auc_score

def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "max_depth":    trial.suggest_int("max_depth", 3, 12),
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-3, 1e-1),
        "subsample":    trial.suggest_uniform("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.6, 1.0),
        "random_state": 42,
        "use_label_encoder": False,
        "objective": "binary:logistic",
        "eval_metric": "auc",
    }
    model = XGBClassifier(**params)
    model.fit(X_train, y_train,
              early_stopping_rounds=30,
              eval_set=[(X_val, y_val)],
              verbose=False)
    preds = model.predict_proba(X_val)[:,1]
    return roc_auc_score(y_val, preds)

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

print("Best params:", study.best_params)
print("Best AUC:",    study.best_value)


In [None]:
# A typical tuned-params dictionary
XGB_PARAMS = {
    "n_estimators":      500,
    "learning_rate":     0.05,
    "max_depth":         6,
    "min_child_weight":  1,
    "subsample":         0.8,
    "colsample_bytree":  0.8,
    "gamma":             1.0,
    "reg_alpha":         0.0,
    "reg_lambda":        1.0,
    "objective":         "binary:logistic",
    "eval_metric":       "auc",
    "random_state":      42,
}


In [None]:
# from sklearn.pipeline import Pipeline
# from sklearn.compose import ColumnTransformer, make_column_selector
# from sklearn.impute import SimpleImputer
# from sklearn.preprocessing import StandardScaler, OrdinalEncoder
# from xgboost import XGBClassifier

# # 1) Your custom FE transformer (defined earlier)
# fe = FraudFeatureEngineer(
#     create_time_features       = CREATE_TIME_FEATURES,
#     create_interaction_features= CREATE_INTERACTION_FEATURES,
#     create_aggregation_features= CREATE_AGGREGATION_FEATURES,
#     categorical_encoding       = CATEGORICAL_ENCODING_STRATEGY
# )

# # 2) The numeric + categorical “preprocessing” you already built
# numeric_transformer = Pipeline([
#     ('imputer', SimpleImputer(strategy='median')),
#     ('scaler',  StandardScaler())
# ])
# categorical_transformer = Pipeline([
#     ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
#     ('ordinal', OrdinalEncoder(
#         handle_unknown='use_encoded_value',
#         unknown_value=-1
#     ))
# ])
# preprocessor = ColumnTransformer([
#     ('num', numeric_transformer, make_column_selector(dtype_include=np.number)),
#     ('cat', categorical_transformer, make_column_selector(dtype_exclude=np.number))
# ], remainder='drop')

# # 3) The final XGBoost classifier
# clf = XGBClassifier(
#     objective='binary:logistic',
#     eval_metric='auc',
#     use_label_encoder=False,
#     random_state=RANDOM_STATE,
#     **XGB_PARAMS
# )

# # 4) Build the end-to-end pipeline
# full_pipeline = Pipeline([
#     ('feature_engineering', fe),    # <-- Runs all your FE code
#     ('preprocessing',      preprocessor),  # <-- imputes, encodes, scales
#     ('classifier',         clf)            # <-- fits/predicts XGBoost
# ])

# # 5) Fit & predict exactly as before
# full_pipeline.fit(raw_X, raw_y)
# probs = full_pipeline.predict_proba(raw_X_val)[:,1]


In [None]:

# # %% [code]
# # =============================================================================
# # %% 9.3 Hyperparameter Tuning (Using the Robust Pipeline)
# # =============================================================================
# print("\nPerforming Hyperparameter Tuning (RandomizedSearchCV) with Robust Pipeline...")

# # Define parameter distribution for XGBoost (same as before)
# param_dist = {
#     'classifier__n_estimators': [100, 200, 500, 800],
#     'classifier__learning_rate': [0.01, 0.03, 0.05, 0.1],
#     'classifier__max_depth': [3, 4, 5, 6, 7],
#     'classifier__subsample': [0.6, 0.7, 0.8, 0.9],
#     'classifier__colsample_bytree': [0.6, 0.7, 0.8, 0.9],
#     'classifier__gamma': [0, 0.1, 0.2],
#     'classifier__reg_alpha': [0, 0.001, 0.01],
#     'classifier__reg_lambda': [1, 1.1, 1.2]
# }

# # Use the same CV strategy for tuning
# cv_strategy = StratifiedKFold(n_splits=N_SPLITS_CV, shuffle=True, random_state=RANDOM_STATE)

# # Randomized Search on the ROBUST pipeline
# random_search = RandomizedSearchCV(
#     estimator=xgb_pipeline_full, # Use the ROBUST pipeline!
#     param_distributions=param_dist,
#     n_iter=N_ITER_RANDOM_SEARCH,
#     scoring='roc_auc',
#     cv=cv_strategy,
#     random_state=RANDOM_STATE,
#     n_jobs=-1, # Use all available cores
#     verbose=1
# )

# # --- MLflow Run for Hyperparameter Tuning ---
# with mlflow.start_run(run_name=f"HyperparamTuning_RobustPipe_{N_ITER_RANDOM_SEARCH}iters") as run_tune:
#     mlflow.log_param("tuning_method", "RandomizedSearchCV")
#     mlflow.log_param("n_iterations", N_ITER_RANDOM_SEARCH)
#     mlflow.log_param("cv_folds_tuning", N_SPLITS_CV)

#     print(f"Starting RandomizedSearch with {N_ITER_RANDOM_SEARCH} iterations...")
#     start_time = time.time()
#     # Fit on the full CV data (X_cv, y_cv)
#     random_search.fit(X_cv, y_cv)
#     end_time = time.time()
#     print(f"Tuning finished in {(end_time - start_time)/60:.2f} minutes.")

#     best_params = random_search.best_params_
#     best_score = random_search.best_score_

#     print(f"\nBest parameters found: {best_params}")
#     print(f"Best CV AUC score from tuning: {best_score:.5f}")

#     # Log tuning results
#     mlflow.log_params(best_params) # Log the best parameter set found
#     mlflow.log_metric("best_tuning_cv_auc", best_score)
#     mlflow.log_metric("tuning_duration_minutes", (end_time - start_time)/60)

#     # Get the best pipeline found by the search
#     best_pipeline = random_search.best_estimator_

#     # Optional: Log the CV results dataframe from the search
#     cv_results_df = pd.DataFrame(random_search.cv_results_)
#     cv_results_df.to_csv("tuning_cv_results_robust.csv", index=False)
#     mlflow.log_artifact("tuning_cv_results_robust.csv", "hyperparameter_tuning_info")
#     os.remove("tuning_cv_results_robust.csv")

#     print(f"--- Finished MLflow Run: {run_tune.info.run_name} ---")


In [None]:

# # %% [markdown]
# # ## 10. Final Model Training and Logging (Using Robust Pipeline)
# #
# # Train the final robust pipeline using the best hyperparameters found on the entire training dataset (`X_cv`, `y_cv`). Log the final pipeline artifact to MLflow and register it.

# # %% [code]
# # --- MLflow Run for Final Model Training ---
# with mlflow.start_run(run_name="Final_Model_Training_RobustPipe") as run_final:
#     print(f"\n--- Starting MLflow Run: {run_final.info.run_name} ({run_final.info.run_uuid}) ---")

#     # Use best parameters if tuning was done, otherwise use default pipeline
#     if 'best_pipeline' in locals():
#         print("Using best pipeline found during hyperparameter tuning.")
#         final_pipeline = best_pipeline
#         final_params = best_params # From tuning run
#         mlflow.log_params(final_params)
#         mlflow.log_param("hyperparameters_source", "RandomizedSearch")
#     else:
#         print("Using default pipeline (no tuning performed or failed).")
#         # Re-create the default robust pipeline if 'best_pipeline' doesn't exist
#         final_pipeline = Pipeline(steps=[
#             ('preprocessing', preprocessor), # Use the preprocessor defined earlier
#             ('classifier', xgb.XGBClassifier(objective='binary:logistic',
#                                              eval_metric='auc',
#                                              use_label_encoder=False,
#                                              random_state=RANDOM_STATE))
#         ])
#         final_params = final_pipeline.named_steps['classifier'].get_params()
#         mlflow.log_params(final_params)
#         mlflow.log_param("hyperparameters_source", "Default")

#     # Log info about the features the pipeline expects (based on preprocessor)
#     num_feats_pipe = len(pipeline_numerical_features) if pipeline_numerical_features else 0
#     cat_feats_pipe = len(pipeline_categorical_features) if pipeline_categorical_features else 0
#     mlflow.log_param("final_pipeline_num_features_in", num_feats_pipe)
#     mlflow.log_param("final_pipeline_cat_features_in", cat_feats_pipe)
#     mlflow.log_param("training_data_shape", f"{X_cv.shape[0]}x{X_cv.shape[1]}") # Log shape of data used for final fit

#     print(f"Training final {MODEL_ARCHITECTURE} robust pipeline on full training data...")
#     start_time = time.time()
#     # Fit on the combined training data used for CV/tuning (X_cv, y_cv)
#     final_pipeline.fit(X_cv, y_cv)
#     end_time = time.time()
#     training_time = end_time - start_time
#     print(f"Final model training finished in {training_time:.2f} seconds.")
#     mlflow.log_metric("final_model_training_time_seconds", training_time)

#     # --- Log and Register the Final Pipeline ---
#     # Define the signature for the model (optional but recommended)
#     from mlflow.models import infer_signature
#     # Use a sample of the training data for signature inference
#     signature_data_sample = X_cv.head(100) if len(X_cv) > 100 else X_cv
#     signature = infer_signature(signature_data_sample, final_pipeline.predict_proba(signature_data_sample))

#     # Log the pipeline artifact
#     artifact_path = f"{MODEL_ARCHITECTURE.lower()}-robust-pipeline"
#     registered_model_name = f"ieee_fraud_{MODEL_ARCHITECTURE.lower()}_pipeline" # Keep registration name consistent

#     print(f"Logging final robust pipeline to MLflow artifact path: {artifact_path}")
#     print(f"Registering model as: {registered_model_name}")

#     mlflow.sklearn.log_model(
#         sk_model=final_pipeline,
#         artifact_path=artifact_path,
#         signature=signature,
#         registered_model_name=registered_model_name,
#         input_example=signature_data_sample.iloc[:5], # Add input example
#     )

#     # Add tags for easier filtering
#     mlflow.set_tag("model_architecture", MODEL_ARCHITECTURE)
#     mlflow.set_tag("pipeline_type", "Robust_Preprocessed")
#     mlflow.set_tag("final_model_run", "True")

#     print(f"Final robust pipeline logged and registered as '{registered_model_name}'.")
#     print(f"--- Finished MLflow Run: {run_final.info.run_name} ---")


# # %% [markdown]
# # ## 11. Conclusion & Next Steps
# #
# # The XGBoost model experiment is complete. The final **robust pipeline**, incorporating preprocessing and the trained model, has been logged to MLflow and registered under the name `ieee_fraud_xgboost_pipeline`. This pipeline is designed to work directly on raw input data (after merging transaction/identity).
# #
# # **Next Steps:**
# # 1.  Repeat this process for other model architectures (AdaBoost, GLM, Decision Tree, NN, etc.) by creating separate `model_experiment_{Architecture}.ipynb` notebooks, ensuring robust pipelines are used.
# # 2.  Compare the `best_tuning_cv_auc` (or mean CV AUC if no tuning) across all registered models in the MLflow UI.
# # 3.  Promote the best performing model version to "Staging" or "Production" in the MLflow Model Registry.
# # 4.  Use the `model_inference.ipynb` notebook to load the production model from the registry and generate predictions on the competition's test set (feeding it the raw merged test data).
# # 5.  Complete the `README.md` file detailing the experiments, results, and chosen model.

# # %% [code]
# print("\nExperiment Notebook Complete.")
