In [1]:
# Feature Engineering Pipeline (Jupyter Notebook Version)

# Import required libraries
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import (
    StandardScaler,
    OneHotEncoder,
    LabelEncoder,
    FunctionTransformer
)
from sklearn.impute import SimpleImputer, KNNImputer
from xverse.transformer import MonotonicBinning
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

# Load the data
# The CSV location can be overridden through the DATA_PATH environment variable
# so the notebook also runs on machines that do not have this exact folder
# layout. The original absolute path stays the default, so existing runs are
# unaffected.
import os

data_path = os.environ.get(
    "DATA_PATH",
    r"C:\Users\Daniel.Temesgen\Desktop\KIAM-Rsc\week5\Data\data.csv",
)
data = pd.read_csv(data_path)

# Display basic info
print("Data shape:", data.shape)
data.head()

Data shape: (95662, 16)


Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2018-11-15T02:18:49Z,2,0
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2018-11-15T02:19:08Z,2,0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,ChannelId_3,500.0,500,2018-11-15T02:44:21Z,2,0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,21800,2018-11-15T03:32:55Z,2,0
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644,2018-11-15T03:34:21Z,2,0


In [2]:
# Define custom transformers (same as in .py file)

class FeatureAggregator(BaseEstimator, TransformerMixin):
    """Add per-customer aggregate statistics of the transaction amount.

    For every customer the following columns are appended to the data:
    ``total_amount``, ``avg_amount``, ``transaction_count``, ``amount_std``,
    ``amount_min`` and ``amount_max``.

    The statistics are learned in ``fit`` and re-used in ``transform``, so data
    transformed after fitting (e.g. a test split) receives the statistics of the
    data the transformer was fitted on instead of statistics recomputed from
    the batch being transformed. Customers that were not present during ``fit``
    get NaN in the aggregate columns. If ``transform`` is called on an unfitted
    instance, the statistics are computed from the data passed to ``transform``.

    Parameters
    ----------
    customer_id_col : str, default 'customer_id'
        Column identifying the customer; used as the grouping key.
    amount_col : str, default 'amount'
        Numeric column that is aggregated.
    """
    def __init__(self, customer_id_col='customer_id', amount_col='amount'):
        self.customer_id_col = customer_id_col
        self.amount_col = amount_col

    def _aggregate(self, X):
        """Return one row of amount statistics per customer found in X."""
        stats = X.groupby(self.customer_id_col)[self.amount_col].agg(
            total_amount='sum',
            avg_amount='mean',
            transaction_count='count',
            amount_std='std',
            amount_min='min',
            amount_max='max',
        ).reset_index()

        # std is NaN for customers with a single transaction; treat as "no spread"
        stats['amount_std'] = stats['amount_std'].fillna(0)
        return stats

    def fit(self, X, y=None):
        """Learn the per-customer statistics from X and return self."""
        self.customer_stats_ = self._aggregate(X)
        return self

    def transform(self, X):
        """Return a copy of X with the aggregate columns appended.

        The row order and the index of X are preserved.
        """
        # Work on a copy so the caller's frame is never modified
        X = X.copy()

        stats = getattr(self, 'customer_stats_', None)
        if stats is None:
            # Not fitted: fall back to statistics of the batch itself
            stats = self._aggregate(X)

        merged = X.merge(stats, on=self.customer_id_col, how='left')

        # merge() returns a fresh RangeIndex. The right side has exactly one row
        # per customer, so a left merge keeps the rows of X one-to-one and in
        # order, which makes it safe to put the original index back. Without
        # this the result no longer aligns with a y that kept its index.
        merged.index = X.index

        return merged

class DateTimeExtractor(BaseEstimator, TransformerMixin):
    """Derive calendar features from a datetime column.

    When ``datetime_col`` is present it is converted with ``pd.to_datetime``
    and the columns ``transaction_hour``, ``transaction_day``,
    ``transaction_month``, ``transaction_year`` and ``day_of_week`` are added.
    When the column is absent the data is returned as an unchanged copy.
    """
    def __init__(self, datetime_col='transaction_date'):
        self.datetime_col = datetime_col

    def fit(self, X, y=None):
        """Nothing is learned; return self."""
        return self

    def transform(self, X):
        """Return a copy of X with the calendar feature columns added."""
        frame = X.copy()

        # Guard clause: nothing to extract without the source column
        if self.datetime_col not in frame.columns:
            return frame

        frame[self.datetime_col] = pd.to_datetime(frame[self.datetime_col])
        stamps = frame[self.datetime_col].dt

        # (output column, pandas .dt attribute) in the order the columns are added
        calendar_parts = (
            ('transaction_hour', 'hour'),
            ('transaction_day', 'day'),
            ('transaction_month', 'month'),
            ('transaction_year', 'year'),
            ('day_of_week', 'dayofweek'),
        )
        for feature_name, dt_attribute in calendar_parts:
            frame[feature_name] = getattr(stamps, dt_attribute)

        return frame

class CategoricalEncoder(BaseEstimator, TransformerMixin):
    """One-hot and/or label encode selected categorical columns.

    Parameters
    ----------
    one_hot_cols : list of str, optional
        Columns replaced by their one-hot indicator columns. Categories not
        seen during ``fit`` are encoded as all zeros (``handle_unknown='ignore'``).
    label_encode_cols : list of str, optional
        Columns replaced in place by integer codes from a per-column
        ``LabelEncoder``. ``LabelEncoder.transform`` raises ``ValueError`` for
        values that were not seen during ``fit``.
    """
    def __init__(self, one_hot_cols=None, label_encode_cols=None):
        self.one_hot_cols = one_hot_cols or []
        self.label_encode_cols = label_encode_cols or []
        self.one_hot_encoder = None
        self.label_encoders = {}

    def fit(self, X, y=None):
        """Fit the encoders on X and return self."""
        # Start from a clean state so a second fit() never keeps encoders that
        # belong to a previous fit or to a previous column configuration
        self.one_hot_encoder = None
        self.label_encoders = {}

        if self.one_hot_cols:
            # `sparse_output` is the current keyword for a dense result; it
            # replaces the removed `sparse` keyword
            self.one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
            self.one_hot_encoder.fit(X[self.one_hot_cols])

        for col in self.label_encode_cols:
            self.label_encoders[col] = LabelEncoder().fit(X[col])

        return self

    def transform(self, X):
        """Return a copy of X with the configured columns encoded."""
        X = X.copy()

        # One-hot encoding
        if self.one_hot_cols and self.one_hot_encoder is not None:
            one_hot_features = self.one_hot_encoder.transform(X[self.one_hot_cols])
            one_hot_df = pd.DataFrame(
                one_hot_features,
                columns=self.one_hot_encoder.get_feature_names_out(self.one_hot_cols),
                # Re-use the index of X: concat(axis=1) aligns on index labels,
                # so a default RangeIndex would misalign rows (and create NaN
                # rows) whenever X does not have a 0..n-1 index, e.g. after a
                # train/test split
                index=X.index,
            )
            X = pd.concat([X.drop(columns=self.one_hot_cols), one_hot_df], axis=1)

        # Label encoding
        for col in self.label_encode_cols:
            if col in X.columns:  # Skip columns already removed by one-hot encoding
                X[col] = self.label_encoders[col].transform(X[col])

        return X

In [4]:
# Schema check: list every column name of the loaded dataset
print("Columns in the dataset:", list(data.columns))

Columns in the dataset: ['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId', 'CurrencyCode', 'CountryCode', 'ProviderId', 'ProductId', 'ProductCategory', 'ChannelId', 'Amount', 'Value', 'TransactionStartTime', 'PricingStrategy', 'FraudResult']


In [5]:
# Choose the target column.
# For credit risk models, common target names include: 'default', 'target', 'label', 'bad_flag', etc.
target_col = 'default'  # Change this to your actual target column name

# Candidate names that are tried, in order, when `target_col` is not a column
possible_targets = ['default', 'target', 'label', 'bad_flag', 'is_bad', 'fraud']

# Only fall back to auto-detection when the explicitly configured name is
# missing. Previously the search always ran and overwrote `target_col`, so a
# name set above that was not also listed in `possible_targets` was ignored.
if target_col not in data.columns:
    for col in possible_targets:
        if col in data.columns:
            target_col = col
            break
    else:
        # If none of the common names exist, use the last column as target (common in some datasets)
        target_col = data.columns[-1]
        print(f"Using last column '{target_col}' as target variable")

# Now proceed with the split
X = data.drop(target_col, axis=1)
y = data[target_col]

Using last column 'FraudResult' as target variable


In [6]:
# Column groups for the transformers (modify according to your actual data).
# NOTE: these are placeholder names; the dataset loaded above uses different
# column names (e.g. 'Amount', 'ProductCategory', 'TransactionStartTime').
numerical_cols = [
    'amount',
    'total_amount',
    'avg_amount',
    'transaction_count',
    'amount_std',
]
categorical_ohe_cols = ['product_category', 'transaction_type']
categorical_le_cols = ['country']
datetime_col = 'transaction_date'

# Target column: use the configured name when it exists, otherwise guess
target_col = 'default'  # Change this to your actual target column name
if target_col not in data.columns:
    # Candidate targets are the columns holding exactly two distinct values
    binary_cols = [name for name in data.columns if data[name].nunique() == 2]

    # First binary column wins; with no binary column the last column is used
    target_col = binary_cols[0] if binary_cols else data.columns[-1]
    if binary_cols:
        print(f"Automatically detected target column: {target_col}")
    else:
        print(f"Using last column '{target_col}' as target variable")

# Separate features from the target, then hold out 20% of the rows for testing
X = data.drop(columns=target_col)
y = data[target_col]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

Automatically detected target column: FraudResult


In [12]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split

# Load your data
# The CSV location can be overridden through the DATA_PATH environment variable
# so the cell also runs on machines that do not have this exact folder layout.
# The original absolute path stays the default, so existing runs are unaffected.
import os

data_path = os.environ.get(
    "DATA_PATH",
    r"C:\Users\Daniel.Temesgen\Desktop\KIAM-Rsc\week5\Data\data.csv",
)
data = pd.read_csv(data_path)

# Print column information for verification
print("Columns in dataset:", data.columns.tolist())
print("\nSample data:")
print(data.head())

# ======================================================================
# STEP 1: Define your column mappings based on actual data
# ======================================================================

# Map to our expected column types:
TARGET_COL = 'FraudResult'              # Your binary target variable
DATETIME_COL = 'TransactionStartTime'   # Your timestamp column
AMOUNT_COL = 'Amount'                   # Your transaction amount column
CUSTOMER_ID_COL = 'CustomerId'          # Your customer identifier

# Columns created by the custom transformers defined in STEP 2. They must be
# listed in NUMERICAL_COLS: the ColumnTransformer only keeps the columns it is
# given, so engineered columns that are not listed are computed and then
# silently discarded.
AGGREGATE_FEATURE_COLS = [              # Added by FeatureAggregator
    'total_amount',
    'avg_amount',
    'transaction_count',
    'amount_std',
    'amount_min',
    'amount_max'
]
DATETIME_FEATURE_COLS = [               # Added by DateTimeExtractor
    'transaction_hour',
    'transaction_day',
    'transaction_month',
    'transaction_year',
    'day_of_week'
]

# Define which features to use for different transformations
RAW_NUMERICAL_COLS = ['Amount', 'Value']    # Numerical columns present in the raw data
NUMERICAL_COLS = (                          # Numerical features to scale
    RAW_NUMERICAL_COLS + AGGREGATE_FEATURE_COLS + DATETIME_FEATURE_COLS
)
CATEGORICAL_OHE_COLS = [                # Categorical features for one-hot encoding
    'CurrencyCode', 
    'CountryCode',
    'ProductCategory',
    'PricingStrategy'
]

# ======================================================================
# STEP 2: Define custom transformers (updated)
# ======================================================================

class FeatureAggregator(BaseEstimator, TransformerMixin):
    """Add per-customer aggregate statistics of the transaction amount.

    For every customer the columns ``total_amount``, ``avg_amount``,
    ``transaction_count``, ``amount_std``, ``amount_min`` and ``amount_max``
    are appended to the data.

    The statistics are learned in ``fit`` and re-used in ``transform``, so data
    transformed after fitting (e.g. the test split inside a Pipeline) receives
    the statistics of the training data instead of statistics recomputed from
    the batch being transformed. Customers that were not present during ``fit``
    get NaN in the aggregate columns. If ``transform`` is called on an unfitted
    instance, the statistics are computed from the data passed to ``transform``.

    When the customer or amount column is missing from the data, ``transform``
    returns an unchanged copy.

    Parameters
    ----------
    customer_id_col : str, default 'CustomerId'
        Column identifying the customer; used as the grouping key.
    amount_col : str, default 'Amount'
        Numeric column that is aggregated.
    """
    def __init__(self, customer_id_col='CustomerId', amount_col='Amount'):
        self.customer_id_col = customer_id_col
        self.amount_col = amount_col
        # Names of the columns appended by transform()
        self.agg_feature_names = [
            'total_amount', 'avg_amount', 'transaction_count',
            'amount_std', 'amount_min', 'amount_max'
        ]

    def _has_required_columns(self, X):
        """Return True when both the customer and the amount column exist in X."""
        return self.customer_id_col in X.columns and self.amount_col in X.columns

    def _aggregate(self, X):
        """Return one row of amount statistics per customer found in X."""
        stats = X.groupby(self.customer_id_col)[self.amount_col].agg(
            total_amount='sum',
            avg_amount='mean',
            transaction_count='count',
            amount_std='std',
            amount_min='min',
            amount_max='max',
        ).reset_index()

        # std is NaN for customers with a single transaction; treat as "no spread"
        stats['amount_std'] = stats['amount_std'].fillna(0)
        return stats

    def fit(self, X, y=None):
        """Learn the per-customer statistics from X (if possible) and return self."""
        self.customer_stats_ = (
            self._aggregate(X) if self._has_required_columns(X) else None
        )
        return self

    def transform(self, X):
        """Return a copy of X with the aggregate columns appended.

        The row order and the index of X are preserved.
        """
        X = X.copy()

        if not self._has_required_columns(X):
            return X

        stats = getattr(self, 'customer_stats_', None)
        if stats is None:
            # Not fitted (or fitted on data without the columns): fall back to
            # the statistics of the batch itself
            stats = self._aggregate(X)

        merged = X.merge(stats, on=self.customer_id_col, how='left')

        # merge() returns a fresh RangeIndex. The right side has exactly one row
        # per customer, so a left merge keeps the rows of X one-to-one and in
        # order, which makes it safe to put the original index back.
        merged.index = X.index

        return merged

class DateTimeExtractor(BaseEstimator, TransformerMixin):
    """Add hour/day/month/year/weekday columns derived from a timestamp column.

    When ``datetime_col`` is present it is converted with ``pd.to_datetime``
    and five calendar columns are added; otherwise the data is returned as an
    unchanged copy.
    """
    def __init__(self, datetime_col='TransactionStartTime'):
        self.datetime_col = datetime_col
        # Names of the columns appended by transform()
        self.new_features = [
            'transaction_hour', 'transaction_day',
            'transaction_month', 'transaction_year',
            'day_of_week'
        ]

    def fit(self, X, y=None):
        """Nothing is learned; return self."""
        return self

    def transform(self, X):
        """Return a copy of X with the calendar feature columns added."""
        result = X.copy()

        has_timestamp = self.datetime_col in result.columns
        if has_timestamp:
            # Parse once, store the parsed values back, then read the parts
            parsed = pd.to_datetime(result[self.datetime_col])
            result[self.datetime_col] = parsed
            parts = parsed.dt

            result['transaction_hour'] = parts.hour
            result['transaction_day'] = parts.day
            result['transaction_month'] = parts.month
            result['transaction_year'] = parts.year
            result['day_of_week'] = parts.dayofweek

        return result

# ======================================================================
# STEP 3: Create the processing pipeline (updated)
# ======================================================================

def create_feature_pipeline(numerical_cols=None, categorical_cols=None,
                            datetime_col=None, customer_id_col=None,
                            amount_col=None):
    """Create the complete feature engineering pipeline.

    The pipeline runs, in order: DateTimeExtractor, FeatureAggregator and a
    ColumnTransformer that imputes + scales the numerical columns and
    imputes + one-hot encodes the categorical columns. Columns that are in
    neither group are not part of the output (ColumnTransformer drops the
    remainder by default).

    Every argument is optional; when omitted (None) the corresponding
    module-level constant is used, so ``create_feature_pipeline()`` behaves as
    before.

    Parameters
    ----------
    numerical_cols : list of str, optional
        Columns to median-impute and standardise. Defaults to NUMERICAL_COLS.
    categorical_cols : list of str, optional
        Columns to mode-impute and one-hot encode. Defaults to
        CATEGORICAL_OHE_COLS.
    datetime_col : str, optional
        Timestamp column for DateTimeExtractor. Defaults to DATETIME_COL.
    customer_id_col : str, optional
        Grouping column for FeatureAggregator. Defaults to CUSTOMER_ID_COL.
    amount_col : str, optional
        Aggregated column for FeatureAggregator. Defaults to AMOUNT_COL.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The unfitted pipeline.
    """
    # Resolve defaults at call time so later changes to the constants are honoured
    if numerical_cols is None:
        numerical_cols = NUMERICAL_COLS
    if categorical_cols is None:
        categorical_cols = CATEGORICAL_OHE_COLS
    if datetime_col is None:
        datetime_col = DATETIME_COL
    if customer_id_col is None:
        customer_id_col = CUSTOMER_ID_COL
    if amount_col is None:
        amount_col = AMOUNT_COL

    # Numerical features processing
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    # Categorical features processing
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    # Combine all transformers
    preprocessor = ColumnTransformer(transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

    # Main pipeline with all steps
    pipeline = Pipeline(steps=[
        ('datetime_extractor', DateTimeExtractor(datetime_col=datetime_col)),
        ('feature_aggregator', FeatureAggregator(
            customer_id_col=customer_id_col,
            amount_col=amount_col
        )),
        ('preprocessor', preprocessor)
    ])

    return pipeline

# ======================================================================
# STEP 4: Process the data (updated)
# ======================================================================

if TARGET_COL not in data.columns:
    print(f"Error: Target column '{TARGET_COL}' not found in data.")
else:
    # Separate the features from the target
    X = data.drop(columns=TARGET_COL)
    y = data[TARGET_COL]

    # Hold out 20% of the rows for testing
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Fit the pipeline on the training rows only, then apply it to both splits
    pipeline = create_feature_pipeline()
    X_train_processed = pipeline.fit_transform(X_train, y_train)
    X_test_processed = pipeline.transform(X_test)

    def get_feature_names(pipeline):
        """Return the output column names of the fitted pipeline's preprocessor.

        Numerical names come from NUMERICAL_COLS; one-hot names are read from
        the fitted OneHotEncoder, in that order.
        """
        fitted_transformers = pipeline.named_steps['preprocessor'].named_transformers_

        # Numerical columns keep their original names
        names = list(NUMERICAL_COLS) if 'num' in fitted_transformers else []

        # One-hot columns are named by the fitted encoder
        if 'cat' in fitted_transformers:
            encoder = fitted_transformers['cat'].named_steps['onehot']
            names.extend(encoder.get_feature_names_out(CATEGORICAL_OHE_COLS))

        return names

    feature_names = get_feature_names(pipeline)

    # Only label the columns when the name count matches the array width;
    # otherwise warn and let pandas number the columns
    names_match = X_train_processed.shape[1] == len(feature_names)
    if not names_match:
        print(f"Warning: Shape mismatch ({X_train_processed.shape[1]} features, {len(feature_names)} names)")
        print("Using default column names")
    output_columns = feature_names if names_match else None
    X_train_df = pd.DataFrame(X_train_processed, columns=output_columns)
    X_test_df = pd.DataFrame(X_test_processed, columns=output_columns)

    # Re-attach the target; the index is reset so it lines up positionally
    # with the freshly built frames
    X_train_df['target'] = y_train.reset_index(drop=True)
    X_test_df['target'] = y_test.reset_index(drop=True)

    # Persist both splits next to the notebook
    X_train_df.to_csv('train_processed.csv', index=False)
    X_test_df.to_csv('test_processed.csv', index=False)

    # Summary of what was produced
    print("\nProcessing completed successfully!")
    print(f"Training data shape: {X_train_df.shape}")
    print(f"Test data shape: {X_test_df.shape}")
    print("\nProcessed columns:", X_train_df.columns.tolist())

Columns in dataset: ['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId', 'CurrencyCode', 'CountryCode', 'ProviderId', 'ProductId', 'ProductCategory', 'ChannelId', 'Amount', 'Value', 'TransactionStartTime', 'PricingStrategy', 'FraudResult']

Sample data:
         TransactionId         BatchId       AccountId       SubscriptionId  \
0  TransactionId_76871   BatchId_36123  AccountId_3957   SubscriptionId_887   
1  TransactionId_73770   BatchId_15642  AccountId_4841  SubscriptionId_3829   
2  TransactionId_26203   BatchId_53941  AccountId_4229   SubscriptionId_222   
3    TransactionId_380  BatchId_102363   AccountId_648  SubscriptionId_2185   
4  TransactionId_28195   BatchId_38780  AccountId_4841  SubscriptionId_3829   

        CustomerId CurrencyCode  CountryCode    ProviderId     ProductId  \
0  CustomerId_4406          UGX          256  ProviderId_6  ProductId_10   
1  CustomerId_4406          UGX          256  ProviderId_4   ProductId_6   
2  CustomerId_4683    