# SUS2025

In [1]:
!pip install --upgrade imbalanced-learn>=0.12.0

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
category-encoders 2.7.0 requires scikit-learn<1.6.0,>=1.0.0, but you have scikit-learn 1.6.1 which is incompatible.
cesium 0.12.4 requires numpy<3.0,>=2.0, but you have numpy 1.26.4 which is incompatible.[0m[31m
[0m

In [2]:
!pip install xgboost --upgrade

Collecting xgboost
  Downloading xgboost-3.0.2-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.2-py3-none-manylinux_2_28_x86_64.whl (253.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.9/253.9 MB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[?25hInstalling collected packages: xgboost
  Attempting uninstall: xgboost
    Found existing installation: xgboost 2.0.3
    Uninstalling xgboost-2.0.3:
      Successfully uninstalled xgboost-2.0.3
Successfully installed xgboost-3.0.2


In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder, RobustScaler
from sklearn.model_selection import train_test_split
import networkx as nx
from collections import Counter, defaultdict

from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_curve
from sklearn.model_selection import cross_val_score, StratifiedKFold
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')


### 1. Dataset Analysis 

In [4]:
class MoneyLaunderingPreprocessor:
    def __init__(self):
        self.scalers = {}
        self.label_encoders = {}
        self.account_features = {}
        self.network_features = {}
        self.fitted = False
        self.column_mapping = {}  # To handle column name variations
        
    def standardize_column_names(self, df):
        """Standardize column names to handle variations"""
        df_clean = df.copy()
        
        # Define expected column mappings
        column_mappings = {
            # Handle potential variations in column names
            'is laundering': 'Is Laundering',
            'islaundering': 'Is Laundering',
            'from account': 'From Account',
            'fromaccount': 'From Account',
            'to account': 'To Account',
            'toaccount': 'To Account',
            'payment type': 'Payment Type',
            'paymenttype': 'Payment Type',
            'amount paid': 'Amount Paid',
            'amountpaid': 'Amount Paid',
            'type account from': 'Type Account From',
            'typeaccountfrom': 'Type Account From',
            'type account to': 'Type Account To',
            'typeaccountto': 'Type Account To',
            'avg stock from': 'Avg Stock From',
            'avgstockfrom': 'Avg Stock From',
            'avg stock account from': 'Avg Stock From',
            'avg stock to': 'Avg Stock To',
            'avgstockto': 'Avg Stock To',
            'avg stock account to': 'Avg Stock To'
        }
        
        # Create mapping from current columns to standardized names
        new_columns = {}
        for col in df_clean.columns:
            col_lower = col.lower().strip()
            if col_lower in column_mappings:
                new_columns[col] = column_mappings[col_lower]
            else:
                new_columns[col] = col
        
        df_clean.rename(columns=new_columns, inplace=True)
        return df_clean
        
    def load_data(self, train_path, test_path):
        """Load training and test datasets"""
        # The CSV files use space as delimiter with quoted column names
        try:
            # Use space as separator and handle quoted strings
            self.train_df = pd.read_csv(train_path, sep=' ', quotechar='"', skipinitialspace=True)
            self.test_df = pd.read_csv(test_path, sep=' ', quotechar='"', skipinitialspace=True)
            
            print("✓ Successfully loaded with space delimiter")
            
        except Exception as e:
            print(f"Space delimiter failed: {e}")
            
            try:
                # Alternative approach: use whitespace regex separator
                self.train_df = pd.read_csv(train_path, sep=r'\s+', quotechar='"', engine='python')
                self.test_df = pd.read_csv(test_path, sep=r'\s+', quotechar='"', engine='python')
                
                print("✓ Successfully loaded with regex whitespace delimiter")
                
            except Exception as e2:
                print(f"Regex delimiter also failed: {e2}")
                
                # Manual parsing as last resort
                print("Attempting manual parsing...")
                self.train_df = self._manual_parse_csv(train_path, has_target=True)
                self.test_df = self._manual_parse_csv(test_path, has_target=False)
        
        # Clean column names by removing quotes and extra spaces
        self.train_df.columns = self.train_df.columns.str.strip().str.replace('"', '')
        self.test_df.columns = self.test_df.columns.str.strip().str.replace('"', '')
        
        # Standardize column names
        self.train_df = self.standardize_column_names(self.train_df)
        self.test_df = self.standardize_column_names(self.test_df)
        
        print(f"Training set shape: {self.train_df.shape}")
        print(f"Test set shape: {self.test_df.shape}")
        print(f"Training columns: {list(self.train_df.columns)}")
        print(f"Test columns: {list(self.test_df.columns)}")
        
        return self.train_df, self.test_df
    
    def _manual_parse_csv(self, file_path, has_target=True):
        """Manual CSV parsing for space-delimited files with quoted strings"""
        import re
        
        data = []
        with open(file_path, 'r') as f:
            lines = f.readlines()
        
        # Parse header
        header_line = lines[0].strip()
        
        # Extract quoted column names using regex
        column_pattern = r'"([^"]*)"'
        columns = re.findall(column_pattern, header_line)
        
        # If we have target column at the beginning (not quoted)
        if has_target and not header_line.startswith('"'):
            # Split by space, first element is the target column
            parts = header_line.split(' ', 1)
            columns = [parts[0]] + re.findall(column_pattern, parts[1])
        
        print(f"Parsed columns: {columns}")
        
        # Parse data rows
        for line in lines[1:]:
            line = line.strip()
            if not line:
                continue
            
            # Parse each row
            row_data = []
            remaining = line
            
            if has_target:
                # First element (target) is not quoted
                parts = remaining.split(' ', 1)
                row_data.append(int(parts[0]))
                remaining = parts[1] if len(parts) > 1 else ""
            
            # Parse quoted and unquoted values
            while remaining:
                remaining = remaining.strip()
                if not remaining:
                    break
                
                if remaining.startswith('"'):
                    # Find the closing quote
                    end_quote = remaining.find('"', 1)
                    if end_quote != -1:
                        value = remaining[1:end_quote]
                        row_data.append(value)
                        remaining = remaining[end_quote + 1:].strip()
                    else:
                        # Quote not closed, take rest as is
                        row_data.append(remaining[1:])
                        break
                else:
                    # Find next space or quote
                    next_space = remaining.find(' ')
                    next_quote = remaining.find('"')
                    
                    if next_space == -1 and next_quote == -1:
                        # Last value
                        try:
                            row_data.append(float(remaining))
                        except:
                            row_data.append(remaining)
                        break
                    elif next_quote != -1 and (next_space == -1 or next_quote < next_space):
                        # Next value is quoted
                        if next_quote > 0:
                            # There's a value before the quote
                            try:
                                row_data.append(float(remaining[:next_quote].strip()))
                            except:
                                row_data.append(remaining[:next_quote].strip())
                        remaining = remaining[next_quote:]
                    else:
                        # Next value ends at space
                        try:
                            row_data.append(float(remaining[:next_space]))
                        except:
                            row_data.append(remaining[:next_space])
                        remaining = remaining[next_space + 1:]
            
            if len(row_data) == len(columns):
                data.append(row_data)
        
        df = pd.DataFrame(data, columns=columns)
        print(f"Manual parsing result: {df.shape}")
        return df
    
    def print_data_samples(self, n_samples=5):
        """Print sample data from train and test sets"""
        print("\n" + "="*60)
        print("TRAINING SET SAMPLES")
        print("="*60)
        
        print(f"\nFirst {n_samples} rows of training data:")
        print(self.train_df.head(n_samples))
        
        print(f"\nRandom {n_samples} rows of training data:")
        print(self.train_df.sample(n_samples, random_state=42))
        
        print("\n" + "="*60)
        print("TEST SET SAMPLES") 
        print("="*60)
        
        print(f"\nFirst {n_samples} rows of test data:")
        print(self.test_df.head(n_samples))
        
        print(f"\nRandom {n_samples} rows of test data:")
        print(self.test_df.sample(n_samples, random_state=42))
    
    def print_data_info(self):
        """Print detailed information about the datasets"""
        print("\n" + "="*60)
        print("DETAILED DATA INFORMATION")
        print("="*60)
        
        print("\n--- TRAINING SET INFO ---")
        print(f"Shape: {self.train_df.shape}")
        print(f"Memory usage: {self.train_df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
        print("\nData types:")
        print(self.train_df.dtypes)
        print("\nNull values:")
        print(self.train_df.isnull().sum())
        
        print("\n--- TEST SET INFO ---")
        print(f"Shape: {self.test_df.shape}")
        print(f"Memory usage: {self.test_df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
        print("\nData types:")
        print(self.test_df.dtypes)
        print("\nNull values:")
        print(self.test_df.isnull().sum())
    
    def print_statistical_summary(self):
        """Print statistical summary of numerical columns"""
        print("\n" + "="*60)
        print("STATISTICAL SUMMARY")
        print("="*60)
        
        print("\n--- TRAINING SET STATISTICS ---")
        print(self.train_df.describe())
        
        print("\n--- TEST SET STATISTICS ---")
        print(self.test_df.describe())
        
        # Additional statistics for key columns
        if 'Amount Paid' in self.train_df.columns:
            print(f"\n--- AMOUNT PAID ANALYSIS ---")
            print(f"Training - Min: {self.train_df['Amount Paid'].min():,.2f}")
            print(f"Training - Max: {self.train_df['Amount Paid'].max():,.2f}")
            print(f"Training - Median: {self.train_df['Amount Paid'].median():,.2f}")
            print(f"Training - Mean: {self.train_df['Amount Paid'].mean():,.2f}")
            
            print(f"Test - Min: {self.test_df['Amount Paid'].min():,.2f}")
            print(f"Test - Max: {self.test_df['Amount Paid'].max():,.2f}")
            print(f"Test - Median: {self.test_df['Amount Paid'].median():,.2f}")
            print(f"Test - Mean: {self.test_df['Amount Paid'].mean():,.2f}")
    
    def print_categorical_analysis(self):
        """Print analysis of categorical columns"""
        print("\n" + "="*60)
        print("CATEGORICAL VARIABLES ANALYSIS")
        print("="*60)
        
        categorical_cols = ['Payment Type', 'Type Account From', 'Type Account To']
        
        for col in categorical_cols:
            if col in self.train_df.columns:
                print(f"\n--- {col.upper()} DISTRIBUTION ---")
                print("Training set:")
                train_counts = self.train_df[col].value_counts()
                train_pct = self.train_df[col].value_counts(normalize=True) * 100
                for idx in train_counts.index:
                    print(f"  {idx}: {train_counts[idx]:,} ({train_pct[idx]:.1f}%)")
                
                print("Test set:")
                test_counts = self.test_df[col].value_counts()
                test_pct = self.test_df[col].value_counts(normalize=True) * 100
                for idx in test_counts.index:
                    print(f"  {idx}: {test_counts[idx]:,} ({test_pct[idx]:.1f}%)")
    
    def print_target_analysis(self):
        """Print target variable analysis (only for training set)"""
        if 'Is Laundering' not in self.train_df.columns:
            print("Target variable 'Is Laundering' not found in training set")
            return
            
        print("\n" + "="*60)
        print("TARGET VARIABLE ANALYSIS")
        print("="*60)
        
        target_counts = self.train_df['Is Laundering'].value_counts()
        target_pct = self.train_df['Is Laundering'].value_counts(normalize=True) * 100
        
        print(f"Non-laundering (0): {target_counts[0]:,} ({target_pct[0]:.2f}%)")
        print(f"Laundering (1): {target_counts[1]:,} ({target_pct[1]:.2f}%)")
        print(f"Class imbalance ratio: {target_counts[0]/target_counts[1]:.1f}:1")
        
        # Laundering patterns by categorical variables
        categorical_cols = ['Payment Type', 'Type Account From', 'Type Account To']
        
        for col in categorical_cols:
            if col in self.train_df.columns:
                print(f"\n--- LAUNDERING RATE BY {col.upper()} ---")
                laundering_by_cat = self.train_df.groupby(col)['Is Laundering'].agg(['count', 'sum', 'mean'])
                laundering_by_cat.columns = ['Total', 'Laundering_Count', 'Laundering_Rate']
                laundering_by_cat['Laundering_Rate'] = laundering_by_cat['Laundering_Rate'] * 100
                laundering_by_cat = laundering_by_cat.sort_values('Laundering_Rate', ascending=False)
                
                for idx, row in laundering_by_cat.iterrows():
                    print(f"  {idx}: {row['Laundering_Count']}/{row['Total']} ({row['Laundering_Rate']:.2f}%)")
    
    def print_account_analysis(self):
        """Print account-level analysis"""
        print("\n" + "="*60)
        print("ACCOUNT ANALYSIS")
        print("="*60)
        
        # Unique accounts
        if 'From Account' in self.train_df.columns and 'To Account' in self.train_df.columns:
            train_from_accounts = set(self.train_df['From Account'].unique())
            train_to_accounts = set(self.train_df['To Account'].unique())
            test_from_accounts = set(self.test_df['From Account'].unique())
            test_to_accounts = set(self.test_df['To Account'].unique())
            
            all_train_accounts = train_from_accounts.union(train_to_accounts)
            all_test_accounts = test_from_accounts.union(test_to_accounts)
            
            print(f"Unique 'From' accounts in training: {len(train_from_accounts):,}")
            print(f"Unique 'To' accounts in training: {len(train_to_accounts):,}")
            print(f"Total unique accounts in training: {len(all_train_accounts):,}")
            
            print(f"Unique 'From' accounts in test: {len(test_from_accounts):,}")
            print(f"Unique 'To' accounts in test: {len(test_to_accounts):,}")
            print(f"Total unique accounts in test: {len(all_test_accounts):,}")
            
            # Account overlap
            overlapping_accounts = all_train_accounts.intersection(all_test_accounts)
            print(f"Accounts appearing in both train and test: {len(overlapping_accounts):,}")
            print(f"Account overlap percentage: {len(overlapping_accounts)/len(all_train_accounts)*100:.1f}%")
            
            # Most active accounts
            print(f"\n--- MOST ACTIVE SENDER ACCOUNTS (TRAINING) ---")
            from_activity = self.train_df['From Account'].value_counts().head(10)
            for account, count in from_activity.items():
                print(f"  {account}: {count:,} transactions")
            
            print(f"\n--- MOST ACTIVE RECEIVER ACCOUNTS (TRAINING) ---")
            to_activity = self.train_df['To Account'].value_counts().head(10)
            for account, count in to_activity.items():
                print(f"  {account}: {count:,} transactions")
    
    def handle_missing_values(self, df):
        """Handle missing values with domain-specific logic"""
        df_clean = df.copy()
        
        # For financial data, missing average stock might indicate new accounts
        # Fill with 0 or median based on account type
        for col in ['Avg Stock From', 'Avg Stock To']:
            if col in df_clean.columns:
                # Fill missing values with median for each account type
                for acc_type in ['Type Account From', 'Type Account To']:
                    if acc_type in df_clean.columns:
                        type_col = acc_type.replace('Type ', '').replace(' From', '').replace(' To', '')
                        df_clean[col] = df_clean.groupby(acc_type)[col].transform(
                            lambda x: x.fillna(x.median())
                        )
                
                # Fill remaining NaN with overall median
                df_clean[col].fillna(df_clean[col].median(), inplace=True)
        
        # Handle categorical missing values
        categorical_cols = ['Payment Type', 'Type Account From', 'Type Account To']
        for col in categorical_cols:
            if col in df_clean.columns:
                df_clean[col].fillna('Unknown', inplace=True)
        
        return df_clean
    
    def create_account_features(self, df):
        """Create account-level aggregated features"""
        df_enhanced = df.copy()
        
        # Account activity features
        account_stats = {}
        
        # From Account features
        from_stats = df.groupby('From Account').agg({
            'Amount Paid': ['count', 'sum', 'mean', 'std', 'min', 'max'],
            'Payment Type': lambda x: len(x.unique()),
            'To Account': lambda x: len(x.unique())  # Number of unique recipients
        }).reset_index()
        
        from_stats.columns = ['From Account', 'from_tx_count', 'from_total_amount', 
                             'from_avg_amount', 'from_std_amount', 'from_min_amount', 
                             'from_max_amount', 'from_payment_types', 'from_unique_recipients']
        
        # To Account features
        to_stats = df.groupby('To Account').agg({
            'Amount Paid': ['count', 'sum', 'mean', 'std', 'min', 'max'],
            'Payment Type': lambda x: len(x.unique()),
            'From Account': lambda x: len(x.unique())  # Number of unique senders
        }).reset_index()
        
        to_stats.columns = ['To Account', 'to_tx_count', 'to_total_amount', 
                           'to_avg_amount', 'to_std_amount', 'to_min_amount', 
                           'to_max_amount', 'to_payment_types', 'to_unique_senders']
        
        # Merge account features
        df_enhanced = df_enhanced.merge(from_stats, on='From Account', how='left')
        df_enhanced = df_enhanced.merge(to_stats, on='To Account', how='left')
        
        # Fill NaN values for new accounts in test set
        account_feature_cols = [col for col in df_enhanced.columns if col.startswith(('from_', 'to_'))]
        for col in account_feature_cols:
            df_enhanced[col].fillna(0, inplace=True)
        
        return df_enhanced
    
    def create_network_features(self, df):
        """Create network-based features to detect coordinated operations"""
        df_network = df.copy()
        
        # Create transaction network
        G = nx.from_pandas_edgelist(df, source='From Account', target='To Account', 
                                   edge_attr=['Amount Paid', 'Payment Type'], 
                                   create_using=nx.DiGraph())
        
        # Calculate network centrality measures
        try:
            in_degree_centrality = nx.in_degree_centrality(G)
            out_degree_centrality = nx.out_degree_centrality(G)
            betweenness_centrality = nx.betweenness_centrality(G, k=min(1000, len(G.nodes())))
            
            # Map centrality measures back to transactions
            df_network['from_out_degree_centrality'] = df_network['From Account'].map(out_degree_centrality).fillna(0)
            df_network['from_betweenness_centrality'] = df_network['From Account'].map(betweenness_centrality).fillna(0)
            df_network['to_in_degree_centrality'] = df_network['To Account'].map(in_degree_centrality).fillna(0)
            df_network['to_betweenness_centrality'] = df_network['To Account'].map(betweenness_centrality).fillna(0)
            
        except Exception as e:
            print(f"Network analysis warning: {e}")
            # Create dummy features if network analysis fails
            df_network['from_out_degree_centrality'] = 0
            df_network['from_betweenness_centrality'] = 0
            df_network['to_in_degree_centrality'] = 0
            df_network['to_betweenness_centrality'] = 0
        
        # Detect potential circular transactions (A->B->A patterns)
        account_pairs = df.groupby(['From Account', 'To Account']).size().reset_index(name='pair_frequency')
        reverse_pairs = account_pairs.copy()
        reverse_pairs.columns = ['To Account', 'From Account', 'reverse_pair_frequency']
        
        df_network = df_network.merge(account_pairs, on=['From Account', 'To Account'], how='left')
        df_network = df_network.merge(reverse_pairs, on=['From Account', 'To Account'], how='left')
        df_network['pair_frequency'].fillna(1, inplace=True)
        df_network['reverse_pair_frequency'].fillna(0, inplace=True)
        
        # Flag potential circular transactions
        df_network['is_circular'] = (df_network['reverse_pair_frequency'] > 0).astype(int)
        
        return df_network
    
    def create_behavioral_features(self, df):
        """Create behavioral and temporal features"""
        df_behavioral = df.copy()
        
        # Amount-based features
        df_behavioral['amount_to_avg_stock_from_ratio'] = (
            df_behavioral['Amount Paid'] / (df_behavioral['Avg Stock From'] + 1)
        )
        df_behavioral['amount_to_avg_stock_to_ratio'] = (
            df_behavioral['Amount Paid'] / (df_behavioral['Avg Stock To'] + 1)
        )
        
        # Account balance disparity
        df_behavioral['stock_balance_diff'] = (
            df_behavioral['Avg Stock From'] - df_behavioral['Avg Stock To']
        )
        df_behavioral['stock_balance_ratio'] = (
            df_behavioral['Avg Stock From'] / (df_behavioral['Avg Stock To'] + 1)
        )
        
        # Round number detection (common in money laundering)
        df_behavioral['is_round_amount'] = (df_behavioral['Amount Paid'] % 100 == 0).astype(int)
        df_behavioral['is_very_round_amount'] = (df_behavioral['Amount Paid'] % 1000 == 0).astype(int)
        
        # Account type mismatch features
        df_behavioral['account_type_match'] = (
            df_behavioral['Type Account From'] == df_behavioral['Type Account To']
        ).astype(int)
        
        # Create account type interaction features
        df_behavioral['account_type_interaction'] = (
            df_behavioral['Type Account From'] + '_to_' + df_behavioral['Type Account To']
        )
        
        return df_behavioral
    
    def encode_categorical_features(self, df, fit=True):
        """Encode categorical features"""
        df_encoded = df.copy()
        
        categorical_features = ['Payment Type', 'Type Account From', 'Type Account To', 'account_type_interaction']
        
        for feature in categorical_features:
            if feature in df_encoded.columns:
                if fit:
                    self.label_encoders[feature] = LabelEncoder()
                    df_encoded[feature] = self.label_encoders[feature].fit_transform(df_encoded[feature].astype(str))
                else:
                    if feature in self.label_encoders:
                        # Handle unseen categories in test set
                        known_categories = set(self.label_encoders[feature].classes_)
                        df_encoded[feature] = df_encoded[feature].astype(str)
                        df_encoded[feature] = df_encoded[feature].apply(
                            lambda x: x if x in known_categories else 'Unknown'
                        )
                        
                        # Add 'Unknown' to encoder if not present
                        if 'Unknown' not in known_categories:
                            self.label_encoders[feature].classes_ = np.append(
                                self.label_encoders[feature].classes_, 'Unknown'
                            )
                        
                        df_encoded[feature] = self.label_encoders[feature].transform(df_encoded[feature])
        
        return df_encoded
    
    def scale_numerical_features(self, df, fit=True):
        """Scale numerical features using RobustScaler (less sensitive to outliers)"""
        df_scaled = df.copy()
        
        # Identify numerical features (excluding target and IDs)
        numerical_features = df_scaled.select_dtypes(include=[np.number]).columns.tolist()
        
        # Remove target variable and account IDs if present
        exclude_cols = ['Is Laundering', 'From Account', 'To Account']
        numerical_features = [col for col in numerical_features if col not in exclude_cols]
        
        if fit:
            self.scalers['numerical'] = RobustScaler()
            df_scaled[numerical_features] = self.scalers['numerical'].fit_transform(df_scaled[numerical_features])
        else:
            if 'numerical' in self.scalers:
                df_scaled[numerical_features] = self.scalers['numerical'].transform(df_scaled[numerical_features])
        
        return df_scaled
    
    def create_feature_interactions(self, df):
        """Create important feature interactions"""
        df_interactions = df.copy()
        
        # Amount and account type interactions
        for acc_type in ['Type Account From', 'Type Account To']:
            if acc_type in df_interactions.columns:
                interaction_col = f'amount_x_{acc_type.lower().replace(" ", "_")}'
                df_interactions[interaction_col] = (
                    df_interactions['Amount Paid'] * df_interactions[acc_type]
                )
        
        # Payment type and amount interactions
        if 'Payment Type' in df_interactions.columns:
            df_interactions['amount_x_payment_type'] = (
                df_interactions['Amount Paid'] * df_interactions['Payment Type']
            )
        
        return df_interactions
    
    def fit_transform(self, train_df):
        """Fit the preprocessor on training data and transform it"""
        print("\n=== FITTING PREPROCESSOR ON TRAINING DATA ===")
        
        # Step 1: Handle missing values
        print("1. Handling missing values...")
        train_clean = self.handle_missing_values(train_df)
        
        # Step 2: Create account-level features
        print("2. Creating account-level features...")
        train_account = self.create_account_features(train_clean)
        
        # Step 3: Create network features
        print("3. Creating network features...")
        train_network = self.create_network_features(train_account)
        
        # Step 4: Create behavioral features
        print("4. Creating behavioral features...")
        train_behavioral = self.create_behavioral_features(train_network)
        
        # Step 5: Encode categorical features
        print("5. Encoding categorical features...")
        train_encoded = self.encode_categorical_features(train_behavioral, fit=True)
        
        # Step 6: Create feature interactions
        print("6. Creating feature interactions...")
        train_interactions = self.create_feature_interactions(train_encoded)
        
        # Step 7: Scale numerical features
        print("7. Scaling numerical features...")
        train_final = self.scale_numerical_features(train_interactions, fit=True)
        
        self.fitted = True
        print("✓ Preprocessor fitted successfully!")
        
        return train_final
    
    def transform(self, test_df):
        """Transform test data using fitted preprocessor"""
        if not self.fitted:
            raise ValueError("Preprocessor must be fitted before transforming test data")
        
        print("\n=== TRANSFORMING TEST DATA ===")
        
        # Apply same transformation pipeline
        test_clean = self.handle_missing_values(test_df)
        test_account = self.create_account_features(test_clean)
        test_network = self.create_network_features(test_account)
        test_behavioral = self.create_behavioral_features(test_network)
        test_encoded = self.encode_categorical_features(test_behavioral, fit=False)
        test_interactions = self.create_feature_interactions(test_encoded)
        test_final = self.scale_numerical_features(test_interactions, fit=False)
        
        print("✓ Test data transformed successfully!")
        
        return test_final
    
    def get_feature_names(self, df):
        """Get list of feature names (excluding target and IDs)"""
        exclude_cols = ['Is Laundering', 'From Account', 'To Account']
        feature_names = [col for col in df.columns if col not in exclude_cols]
        return feature_names

# Usage example with debugging:
def main():
    # Initialize preprocessor
    preprocessor = MoneyLaunderingPreprocessor()
    
    # Debug CSV loading
    def debug_csv_structure(file_path):
        """Debug function to understand CSV structure"""
        print(f"\n=== DEBUGGING CSV STRUCTURE: {file_path} ===")
        
        # Read first few lines as text
        with open(file_path, 'r') as f:
            for i, line in enumerate(f):
                if i < 3:  # Show first 3 lines
                    print(f"Line {i+1}: {repr(line[:200])}")  # Show first 200 chars
                else:
                    break
        
        # Try different pandas read options
        try:
            df1 = pd.read_csv(file_path, nrows=5)
            print(f"Default read - Shape: {df1.shape}, Columns: {list(df1.columns)}")
        except Exception as e:
            print(f"Default read failed: {e}")
        
        try:
            df2 = pd.read_csv(file_path, nrows=5, quotechar='"')
            print(f"With quotechar - Shape: {df2.shape}, Columns: {list(df2.columns)}")
        except Exception as e:
            print(f"Quotechar read failed: {e}")
            
        try:
            df3 = pd.read_csv(file_path, nrows=5, sep=',', quoting=1)
            print(f"With quoting=1 - Shape: {df3.shape}, Columns: {list(df3.columns)}")
        except Exception as e:
            print(f"Quoting=1 read failed: {e}")
    
    # Debug both files
    try:
        debug_csv_structure('/kaggle/input/statsunderstars/Dataset/train.csv')
        debug_csv_structure('/kaggle/input/statsunderstars/Dataset/test.csv')
    except:
        # If debug files don't exist, try the original names
        debug_csv_structure('sus8_train.csv')
        debug_csv_structure('sus8_test.csv')
    
    # Load data with correct file paths
    try:
        train_df, test_df = preprocessor.load_data('/kaggle/input/statsunderstars/Dataset/train.csv', 
                                                 '/kaggle/input/statsunderstars/Dataset/test.csv')
    except:
        # Fallback to original names
        train_df, test_df = preprocessor.load_data('sus8_train.csv', 'sus8_test.csv')
    
    # If still having issues, stop here for debugging
    if train_df.shape[1] <= 2:
        print("\n❌ CSV parsing issue detected. Please check the debug output above.")
        print("The CSV files might have a different format than expected.")
        return None, None, None, None, None
    
    # Print comprehensive data exploration
    print("\n🔍 EXPLORING LOADED DATA...")
    
    # Basic samples
    preprocessor.print_data_samples(n_samples=3)
    
    # Detailed information
    preprocessor.print_data_info()
    
    # Statistical summaries
    preprocessor.print_statistical_summary()
    
    # Categorical analysis
    preprocessor.print_categorical_analysis()
    
    # Target analysis
    preprocessor.print_target_analysis()
    
    # Account analysis
    preprocessor.print_account_analysis()
    
    # Ask user if they want to continue with preprocessing
    print(f"\n{'='*60}")
    print("DATA EXPLORATION COMPLETE")
    print(f"{'='*60}")
    print("\nReady to proceed with preprocessing...")
    
    # Continue with preprocessing
    train_processed = preprocessor.fit_transform(train_df)
    
    # Preprocess test data
    test_processed = preprocessor.transform(test_df)
    
    # Get feature names
    feature_names = preprocessor.get_feature_names(train_processed)
    
    print(f"\n=== PREPROCESSING COMPLETE ===")
    print(f"Original training features: {len(train_df.columns)}")
    print(f"Processed training features: {len(feature_names)}")
    print(f"Training set shape: {train_processed.shape}")
    print(f"Test set shape: {test_processed.shape}")
    
    # Prepare final datasets
    X_train = train_processed[feature_names]
    y_train = train_processed['Is Laundering'] if 'Is Laundering' in train_processed.columns else None
    X_test = test_processed[feature_names]
    
    return X_train, y_train, X_test, feature_names, preprocessor

if __name__ == "__main__":
    X_train, y_train, X_test, feature_names, preprocessor = main()


=== DEBUGGING CSV STRUCTURE: /kaggle/input/statsunderstars/Dataset/train.csv ===
Line 1: '"Is Laundering" "From Account" "To Account" "Payment Type" "Amount Paid" "Type Account From" "Type Account To" "Avg Stock Account From" "Avg Stock Account To"\n'
Line 2: '1 "U0551" "U2203" "Cheque" 186509.285 "B" "B" 308533361.047 1144577.897\n'
Line 3: '0 "U4572" "U9001" "ACH" 439.995 "D" "B" 37526.141 2084650.425\n'
Default read - Shape: (5, 1), Columns: ['Is Laundering "From Account" "To Account" "Payment Type" "Amount Paid" "Type Account From" "Type Account To" "Avg Stock Account From" "Avg Stock Account To"']
With quotechar - Shape: (5, 1), Columns: ['Is Laundering "From Account" "To Account" "Payment Type" "Amount Paid" "Type Account From" "Type Account To" "Avg Stock Account From" "Avg Stock Account To"']
With quoting=1 - Shape: (5, 1), Columns: ['Is Laundering "From Account" "To Account" "Payment Type" "Amount Paid" "Type Account From" "Type Account To" "Avg Stock Account From" "Avg Stock

In [5]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, balanced_accuracy_score
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.utils import resample
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBClassifier


# Custom metrics implementation
def calculate_balanced_accuracy(y_true, y_pred):
    """Calculate balanced accuracy: average of TPR and TNR"""
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    
    # True Positive Rate (Sensitivity/Recall)
    tpr = tp / (tp + fn) if (tp + fn) > 0 else 0
    
    # True Negative Rate (Specificity)
    tnr = tn / (tn + fp) if (tn + fp) > 0 else 0
    
    balanced_acc = (tpr + tnr) / 2
    return balanced_acc

def calculate_fraud_capture_rate(y_true, y_prob, N=485):
    """Calculate fraud capture rate for top N predictions"""
    # Get indices sorted by probability (highest first)
    sorted_indices = np.argsort(y_prob)[::-1]
    
    # Get top N indices
    top_N_indices = sorted_indices[:N]
    
    # Count frauds in top N predictions
    frauds_in_top_N = np.sum(y_true.iloc[top_N_indices] if hasattr(y_true, 'iloc') else y_true[top_N_indices])
    
    # Total number of frauds in dataset
    total_frauds = np.sum(y_true)
    
    # Fraud capture rate
    fraud_capture_rate = frauds_in_top_N / total_frauds if total_frauds > 0 else 0
    
    return fraud_capture_rate

def calculate_composite_score(y_true, y_pred, y_prob, N=485):
    """Calculate the composite score as arithmetic mean of AUC, Balanced Accuracy, and Fraud Capture Rate"""
    
    # 1. AUC (Area Under the Curve)
    auc_score = roc_auc_score(y_true, y_prob)
    
    # 2. Balanced Accuracy
    balanced_acc = calculate_balanced_accuracy(y_true, y_pred)
    
    # 3. Fraud Capture Rate (Top N predictions)
    fraud_capture = calculate_fraud_capture_rate(y_true, y_prob, N)
    
    # Final composite score (arithmetic mean)
    composite_score = (auc_score + balanced_acc + fraud_capture) / 3
    
    return {
        'auc': auc_score,
        'balanced_accuracy': balanced_acc,
        'fraud_capture_rate': fraud_capture,
        'composite_score': composite_score
    }

class MoneyLaunderingModelTrainer:
    def __init__(self):
        self.models = {}
        self.model_scores = {}
        self.best_model = None
        self.feature_importance = None
        self.N = 485  # Top N predictions for fraud capture rate
        
    def handle_class_imbalance(self, X_train, y_train, method='smote'):
        """Handle severe class imbalance using custom SMOTE implementation"""
        print(f"\nOriginal class distribution:")
        print(f"Class 0: {sum(y_train == 0):,} ({sum(y_train == 0)/len(y_train)*100:.2f}%)")
        print(f"Class 1: {sum(y_train == 1):,} ({sum(y_train == 1)/len(y_train)*100:.2f}%)")
        
        # Convert to numpy arrays for easier manipulation
        if hasattr(X_train, 'values'):
            X_train_np = X_train.values
        else:
            X_train_np = X_train
            
        if hasattr(y_train, 'values'):
            y_train_np = y_train.values
        else:
            y_train_np = y_train
        
        if method == 'smote':
            # Simple oversampling with noise
            minority_class = 1
            majority_class = 0
            
            minority_indices = np.where(y_train_np == minority_class)[0]
            majority_count = np.sum(y_train_np == majority_class)
            minority_count = len(minority_indices)
            
            # Generate synthetic samples to balance classes
            samples_needed = majority_count - minority_count
            
            if samples_needed > 0:
                # Get minority samples
                minority_samples = X_train_np[minority_indices]
                
                # Generate synthetic samples with small random noise
                np.random.seed(42)
                synthetic_samples = []
                for _ in range(samples_needed):
                    # Pick random minority sample and add noise
                    idx = np.random.choice(len(minority_samples))
                    sample = minority_samples[idx].copy()
                    noise = np.random.normal(0, 0.01, sample.shape)
                    synthetic_samples.append(sample + noise)
                
                # Combine original and synthetic data
                X_resampled = np.vstack([X_train_np, np.array(synthetic_samples)])
                y_resampled = np.hstack([y_train_np, np.ones(samples_needed)])
            else:
                X_resampled, y_resampled = X_train_np, y_train_np
                
        elif method == 'combine':
            # First oversample minority class partially
            minority_indices = np.where(y_train_np == 1)[0]
            majority_count = np.sum(y_train_np == 0)
            
            # Oversample to 10% of majority class
            target_minority = int(majority_count * 0.1)
            current_minority = len(minority_indices)
            
            if target_minority > current_minority:
                samples_needed = target_minority - current_minority
                minority_samples = X_train_np[minority_indices]
                
                np.random.seed(42)
                synthetic_samples = []
                for _ in range(samples_needed):
                    idx = np.random.choice(len(minority_samples))
                    sample = minority_samples[idx].copy()
                    noise = np.random.normal(0, 0.01, sample.shape)
                    synthetic_samples.append(sample + noise)
                
                X_temp = np.vstack([X_train_np, np.array(synthetic_samples)])
                y_temp = np.hstack([y_train_np, np.ones(samples_needed)])
            else:
                X_temp, y_temp = X_train_np, y_train_np
            
            # Then undersample majority class
            majority_indices = np.where(y_temp == 0)[0]
            minority_count_new = np.sum(y_temp == 1)
            target_majority = int(minority_count_new * 2)  # 2:1 ratio
            
            if len(majority_indices) > target_majority:
                np.random.seed(42)  # Set seed for reproducibility
                selected_majority = np.random.choice(majority_indices, target_majority, replace=False)
                minority_indices_new = np.where(y_temp == 1)[0]
                
                selected_indices = np.hstack([selected_majority, minority_indices_new])
                X_resampled = X_temp[selected_indices]
                y_resampled = y_temp[selected_indices]
            else:
                X_resampled, y_resampled = X_temp, y_temp
                
        else:  # 'none'
            X_resampled, y_resampled = X_train_np, y_train_np
            
        print(f"\nResampled class distribution ({method}):")
        print(f"Class 0: {sum(y_resampled == 0):,} ({sum(y_resampled == 0)/len(y_resampled)*100:.2f}%)")
        print(f"Class 1: {sum(y_resampled == 1):,} ({sum(y_resampled == 1)/len(y_resampled)*100:.2f}%)")
        
        return X_resampled, y_resampled
    
    def custom_cross_validation(self, model, X, y, cv_folds=5):
        """Custom cross-validation using the composite score"""
        # Convert to numpy arrays if needed
        if hasattr(X, 'values'):
            X_np = X.values
        else:
            X_np = X
            
        if hasattr(y, 'values'):
            y_np = y.values
        else:
            y_np = y
        
        cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)
        
        cv_scores = []
        detailed_scores = []
        
        for fold, (train_idx, val_idx) in enumerate(cv.split(X_np, y_np)):
            X_train_fold, X_val_fold = X_np[train_idx], X_np[val_idx]
            y_train_fold, y_val_fold = y_np[train_idx], y_np[val_idx]
            
            # Train model on fold
            model_fold = model.__class__(**model.get_params())
            model_fold.fit(X_train_fold, y_train_fold)
            
            # Get predictions and probabilities
            y_pred_fold = model_fold.predict(X_val_fold)
            y_prob_fold = model_fold.predict_proba(X_val_fold)[:, 1]
            
            # Calculate composite score
            scores = calculate_composite_score(y_val_fold, y_pred_fold, y_prob_fold, self.N)
            cv_scores.append(scores['composite_score'])
            detailed_scores.append(scores)
        
        return np.array(cv_scores), detailed_scores
    
    def train_models(self, X_train, y_train, use_resampling=True):
        """Train multiple models with custom evaluation metrics"""
        
        # Handle class imbalance if requested
        if use_resampling:
            X_train_balanced, y_train_balanced = self.handle_class_imbalance(X_train, y_train, 'combine')
        else:
            # Convert to numpy arrays
            if hasattr(X_train, 'values'):
                X_train_balanced = X_train.values
            else:
                X_train_balanced = X_train
            if hasattr(y_train, 'values'):
                y_train_balanced = y_train.values
            else:
                y_train_balanced = y_train
        
        # Define models
        models = {
            'RandomForest_Balanced': RandomForestClassifier(
                n_estimators=200,
                max_depth=15,
                min_samples_split=10,
                min_samples_leaf=5,
                class_weight='balanced',
                random_state=42,
                n_jobs=-1
            ),
            
            'RandomForest_Resampled': RandomForestClassifier(
                n_estimators=200,
                max_depth=15,
                min_samples_split=10,
                min_samples_leaf=5,
                random_state=42,
                n_jobs=-1
            ),
            
            'XGBoost_GPU': XGBClassifier(
                n_estimators=500,
                learning_rate=0.05,
                max_depth=8,
                subsample=0.8,
                colsample_bytree=0.8,
                objective='binary:logistic',
                use_label_encoder=False,
                eval_metric='logloss',
                tree_method='gpu_hist',
                predictor='gpu_predictor',
                random_state=42,
                verbosity=0,
                n_jobs=-1
            ),
            'LogisticRegression': LogisticRegression(
                class_weight='balanced',
                random_state=42,
                max_iter=1000,
                C=0.1
            )
        }
        
        print(f"\n{'='*80}")
        print("TRAINING MODELS WITH CUSTOM EVALUATION METRICS")
        print(f"{'='*80}")
        print(f"Using composite score: (AUC + Balanced Accuracy + Fraud Capture Rate@{self.N}) / 3")
        print(f"{'='*80}")
        
        for name, model in models.items():
            print(f"\nTraining {name}...")
            
            try:
                # Choose training data based on model type
                if 'Resampled' in name and use_resampling:
                    X_train_model = X_train_balanced
                    y_train_model = y_train_balanced
                else:
                    # Convert to numpy arrays for consistency
                    if hasattr(X_train, 'values'):
                        X_train_model = X_train.values
                    else:
                        X_train_model = X_train
                    if hasattr(y_train, 'values'):
                        y_train_model = y_train.values
                    else:
                        y_train_model = y_train
                
                # Train model
                model.fit(X_train_model, y_train_model)
                
                # Custom cross-validation on original data
                cv_scores, detailed_scores = self.custom_cross_validation(model, X_train, y_train)
                
                # Calculate average detailed scores
                avg_detailed_scores = {
                    'auc': np.mean([s['auc'] for s in detailed_scores]),
                    'balanced_accuracy': np.mean([s['balanced_accuracy'] for s in detailed_scores]),
                    'fraud_capture_rate': np.mean([s['fraud_capture_rate'] for s in detailed_scores]),
                    'composite_score': cv_scores.mean()
                }
                
                # Store model and scores
                self.models[name] = model
                self.model_scores[name] = {
                    'cv_mean': cv_scores.mean(),
                    'cv_std': cv_scores.std(),
                    'cv_scores': cv_scores,
                    'detailed_scores': avg_detailed_scores
                }
                
                print(f"✓ {name} Results:")
                print(f"   Composite Score: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")
                print(f"   - AUC: {avg_detailed_scores['auc']:.4f}")
                print(f"   - Balanced Accuracy: {avg_detailed_scores['balanced_accuracy']:.4f}")
                print(f"   - Fraud Capture Rate@{self.N}: {avg_detailed_scores['fraud_capture_rate']:.4f}")
                
            except Exception as e:
                print(f"✗ {name} failed: {str(e)}")
        
        return self.models, self.model_scores
    
    def evaluate_models(self, X_train, y_train):
        """Evaluate all trained models using custom metrics"""
        print(f"\n{'='*80}")
        print("MODEL EVALUATION RESULTS")
        print(f"{'='*80}")
        
        # Header
        header = f"{'Model':<25}{'Composite':<12}{'AUC':<8}{'Bal.Acc':<10}{'FCR@485':<10}{'Std':<8}"
        print(header)
        print("-" * 80)
        
        best_score = 0
        best_model_name = None
        
        for name, scores in self.model_scores.items():
            detailed = scores['detailed_scores']
            row = f"{name:<25}"
            row += f"{scores['cv_mean']:<12.4f}"
            row += f"{detailed['auc']:<8.4f}"
            row += f"{detailed['balanced_accuracy']:<10.4f}"
            row += f"{detailed['fraud_capture_rate']:<10.4f}"
            row += f"{scores['cv_std']:<8.4f}"
            print(row)
            
            if scores['cv_mean'] > best_score:
                best_score = scores['cv_mean']
                best_model_name = name
        
        print(f"\n🏆 Best model: {best_model_name}")
        print(f"   Composite Score: {best_score:.4f}")
        
        self.best_model = self.models[best_model_name]
        self.best_model_name = best_model_name
        
        return best_model_name, best_score
    
    def analyze_feature_importance(self, feature_names):
        """Analyze feature importance from the best model"""
        if self.best_model is None:
            print("No best model found. Train models first.")
            return
        
        if hasattr(self.best_model, 'feature_importances_'):
            importance_df = pd.DataFrame({
                'feature': feature_names,
                'importance': self.best_model.feature_importances_
            }).sort_values('importance', ascending=False)
            
            self.feature_importance = importance_df
            
            print(f"\n{'='*60}")
            print(f"TOP 20 MOST IMPORTANT FEATURES ({self.best_model_name})")
            print(f"{'='*60}")
            
            for i, row in importance_df.head(20).iterrows():
                print(f"{row['feature']:<35} {row['importance']:<10.6f}")
            
            return importance_df
        else:
            print("Selected model doesn't have feature_importances_ attribute")
            return None
    
    def make_predictions(self, X_test):
        """Make predictions on test set with detailed analysis"""
        if self.best_model is None:
            print("No best model found. Train and evaluate models first.")
            return None
        
        print(f"\n{'='*60}")
        print(f"MAKING PREDICTIONS WITH {self.best_model_name}")
        print(f"{'='*60}")
        
        # Convert to numpy array if needed
        if hasattr(X_test, 'values'):
            X_test_np = X_test.values
        else:
            X_test_np = X_test
        
        # Get predictions and probabilities
        predictions = self.best_model.predict(X_test_np)
        probabilities = self.best_model.predict_proba(X_test_np)[:, 1]
        
        # Analysis of predictions
        print(f"Total predictions: {len(predictions):,}")
        print(f"Predicted laundering cases: {sum(predictions):,} ({sum(predictions)/len(predictions)*100:.2f}%)")
        
        # Probability analysis
        print(f"\nProbability Analysis:")
        print(f"- Mean probability: {probabilities.mean():.4f}")
        print(f"- Max probability: {probabilities.max():.4f}")
        print(f"- Min probability: {probabilities.min():.4f}")
        print(f"- Std probability: {probabilities.std():.4f}")
        
        # Top N analysis for submission
        top_N_indices = np.argsort(probabilities)[::-1][:self.N]
        top_N_probs = probabilities[top_N_indices]
        
        print(f"\nTop {self.N} Predictions Analysis:")
        print(f"- Probability range: {top_N_probs.min():.4f} to {top_N_probs.max():.4f}")
        print(f"- Mean probability: {top_N_probs.mean():.4f}")
        print(f"- Predictions flagged as laundering: {sum(predictions[top_N_indices])}")
        
        return predictions, probabilities
    
    def create_submission(self, predictions, probabilities, output_file='money_laundering_predictions.csv'):
        """Create submission file with detailed analysis"""
        # Create submission dataframe
        submission_df = pd.DataFrame({
            'Id': range(len(predictions)),
            'Prediction': predictions,
            'Probability': probabilities
        })
        
        # Save to file
        submission_df.to_csv(output_file, index=False)
        
        print(f"\n{'='*60}")
        print("SUBMISSION CREATED")
        print(f"{'='*60}")
        print(f"File: {output_file}")
        print(f"Total cases: {len(predictions):,}")
        print(f"Predicted laundering: {sum(predictions):,}")
        print(f"Top {self.N} cases for review (highest probabilities)")
        
        # Show top 10 cases
        print(f"\nTop 10 highest probability cases:")
        print(submission_df.head(10)[['Id', 'Prediction', 'Probability']].to_string(index=False))
        
        return submission_df

def train_and_evaluate_models(X_train, y_train, X_test, feature_names):
    """Complete pipeline for model training and evaluation with custom metrics"""
    
    # Initialize trainer
    trainer = MoneyLaunderingModelTrainer()
    
    # Train models
    trainer.train_models(X_train, y_train, use_resampling=True)
    
    # Evaluate models
    best_model_name, best_score = trainer.evaluate_models(X_train, y_train)
    
    # Analyze feature importance
    feature_importance = trainer.analyze_feature_importance(feature_names)
    
    # Make predictions
    predictions, probabilities = trainer.make_predictions(X_test)
    
    # Create submission
    submission_df = trainer.create_submission(predictions, probabilities)
    
    return trainer, submission_df, predictions, probabilities, feature_importance

def print_evaluation_methodology():
    """Print explanation of the evaluation methodology"""
    print(f"\n{'='*80}")
    print("EVALUATION METHODOLOGY")
    print(f"{'='*80}")
    
    methodology = """
This model uses a composite evaluation score combining three key metrics:

1. AUC (Area Under the Curve):
   - Measures the model's ability to distinguish between fraudulent and legitimate transactions
   - Range: 0 to 1 (higher is better)
   - Particularly important for imbalanced datasets

2. Balanced Accuracy:
   - Average of True Positive Rate (TPR) and True Negative Rate (TNR)
   - Formula: (TPR + TNR) / 2
   - Ensures good performance on both classes despite imbalance

3. Fraud Capture Rate (Top 485 Predictions):
   - Proportion of actual fraudulent transactions found in top 485 highest-probability predictions
   - Formula: (Frauds in Top 485) / (Total Frauds in Dataset)
   - Critical for real-world AML systems with limited investigation capacity

Final Composite Score:
   - Arithmetic mean of the three metrics: (AUC + Balanced Accuracy + FCR@485) / 3
   - Balances detection performance with practical investigation constraints
"""
    
    print(methodology)
    print(f"{'='*80}")

# Call this to understand the evaluation approach
print_evaluation_methodology()


EVALUATION METHODOLOGY

This model uses a composite evaluation score combining three key metrics:

1. AUC (Area Under the Curve):
   - Measures the model's ability to distinguish between fraudulent and legitimate transactions
   - Range: 0 to 1 (higher is better)
   - Particularly important for imbalanced datasets

2. Balanced Accuracy:
   - Average of True Positive Rate (TPR) and True Negative Rate (TNR)
   - Formula: (TPR + TNR) / 2
   - Ensures good performance on both classes despite imbalance

3. Fraud Capture Rate (Top 485 Predictions):
   - Proportion of actual fraudulent transactions found in top 485 highest-probability predictions
   - Formula: (Frauds in Top 485) / (Total Frauds in Dataset)
   - Critical for real-world AML systems with limited investigation capacity

Final Composite Score:
   - Arithmetic mean of the three metrics: (AUC + Balanced Accuracy + FCR@485) / 3
   - Balances detection performance with practical investigation constraints



In [6]:
# Your existing usage remains the same
trainer, submission_df, predictions, probabilities, feature_importance = train_and_evaluate_models(
    X_train, y_train, X_test, feature_names
)


Original class distribution:
Class 0: 54,621 (98.76%)
Class 1: 686 (1.24%)

Resampled class distribution (combine):
Class 0: 10,924 (66.67%)
Class 1: 5,462 (33.33%)

TRAINING MODELS WITH CUSTOM EVALUATION METRICS
Using composite score: (AUC + Balanced Accuracy + Fraud Capture Rate@485) / 3

Training RandomForest_Balanced...
✓ RandomForest_Balanced Results:
   Composite Score: 0.9982 ± 0.0024
   - AUC: 0.9997
   - Balanced Accuracy: 0.9964
   - Fraud Capture Rate@485: 0.9985

Training RandomForest_Resampled...
✓ RandomForest_Resampled Results:
   Composite Score: 0.9983 ± 0.0013
   - AUC: 0.9999
   - Balanced Accuracy: 0.9949
   - Fraud Capture Rate@485: 1.0000

Training XGBoost_GPU...
✓ XGBoost_GPU Results:
   Composite Score: 0.9993 ± 0.0006
   - AUC: 1.0000
   - Balanced Accuracy: 0.9978
   - Fraud Capture Rate@485: 1.0000

Training LogisticRegression...
✓ LogisticRegression Results:
   Composite Score: 0.9808 ± 0.0050
   - AUC: 0.9950
   - Balanced Accuracy: 0.9576
   - Fraud Captu

In [7]:
with open("submission.txt", "w") as f:
    for pred, prob in zip(predictions, probabilities):
        f.write(f"{prob} {pred}\n")