In [1]:
import pandas as pd
import numpy as np
from xverse.transformer import WOE
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from lightgbm import LGBMClassifier,plot_importance
import catboost as cb
import xgboost as xgb

import logging

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Initialize the logger
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger()

In [3]:
def load_data(path)->pd.DataFrame:
    """Loads data from a CSV file into a pandas DataFrame using advanced techniques.

        Args:
            path (str): The path to the CSV file.
        Returns:
            pandas.DataFrame: The loaded DataFrame.
    """
    try:
        df = pd.read_csv(path, low_memory=False)
        return df
    except Exception as e:
        logger.info(f"Error on Loding Data {e}")

In [4]:
url = "../data/credit_clean.csv"
credit_df = load_data(url)

logger.info("display five rows of the df")
credit_df.head()

2024-07-21 22:29:18,036 - INFO - display five rows of the df


Unnamed: 0.1,Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,...,Value,TransactionStartTime,PricingStrategy,FraudResult,Day,Month,Year,Hour,Minute,Second
0,0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,...,1000,2018-11-15 02:18:49+00:00,2,0,15,11,2018,2,18,49
1,1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,...,20,2018-11-15 02:19:08+00:00,2,0,15,11,2018,2,19,8
2,2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,...,500,2018-11-15 02:44:21+00:00,2,0,15,11,2018,2,44,21
3,3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,...,21800,2018-11-15 03:32:55+00:00,2,0,15,11,2018,3,32,55
4,4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,...,644,2018-11-15 03:34:21+00:00,2,0,15,11,2018,3,34,21


In [5]:
credit_df.drop(['Unnamed: 0', 'TransactionStartTime'], axis=1, inplace=True)

# Feature Engineering
### 1. Aggregate Features

Create features that summarize transaction behaviors for each customer.

In [6]:
aggregate_features = credit_df.groupby('CustomerId').agg({
    'Amount': ['sum', 'mean', 'count', 'std'] })
aggregate_features.columns = ['Total_Amount', 'Average_Amount', 'Transaction_Count', 'Std_of_Amounts']

logger.info("Print Aggregate Features")
aggregate_features


2024-07-21 22:29:40,177 - INFO - Print Aggregate Features


Unnamed: 0_level_0,Total_Amount,Average_Amount,Transaction_Count,Std_of_Amounts
CustomerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CustomerId_1,-10000.0,-10000.000000,1,
CustomerId_10,-10000.0,-10000.000000,1,
CustomerId_1001,20000.0,4000.000000,5,6558.963333
CustomerId_1002,4225.0,384.090909,11,560.498966
CustomerId_1003,20000.0,3333.333333,6,6030.478146
...,...,...,...,...
CustomerId_992,20000.0,3333.333333,6,6088.240030
CustomerId_993,20000.0,4000.000000,5,6745.368782
CustomerId_994,543873.0,5384.881188,101,14800.656784
CustomerId_996,139000.0,8176.470588,17,4433.329648


In [7]:
# Merge aggregate features back to the original data
credit_df = pd.merge(credit_df, aggregate_features, on='CustomerId', how='left')


In [8]:
credit_df

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,...,Day,Month,Year,Hour,Minute,Second,Total_Amount,Average_Amount,Transaction_Count,Std_of_Amounts
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,...,15,11,2018,2,18,49,109921.75,923.712185,119,3042.294251
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,...,15,11,2018,2,19,8,109921.75,923.712185,119,3042.294251
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,...,15,11,2018,2,44,21,1000.00,500.000000,2,0.000000
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,...,15,11,2018,3,32,55,228727.20,6019.136842,38,17169.241610
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,...,15,11,2018,3,34,21,228727.20,6019.136842,38,17169.241610
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95657,TransactionId_89881,BatchId_96668,AccountId_4841,SubscriptionId_3829,CustomerId_3078,UGX,256,ProviderId_4,ProductId_6,financial_services,...,13,2,2019,9,54,9,2438140.00,4255.043630,573,22554.029939
95658,TransactionId_91597,BatchId_3503,AccountId_3439,SubscriptionId_2643,CustomerId_3874,UGX,256,ProviderId_6,ProductId_10,airtime,...,13,2,2019,9,54,25,58499.60,1360.455814,43,2274.756582
95659,TransactionId_82501,BatchId_118602,AccountId_4841,SubscriptionId_3829,CustomerId_3874,UGX,256,ProviderId_4,ProductId_6,financial_services,...,13,2,2019,9,54,35,58499.60,1360.455814,43,2274.756582
95660,TransactionId_136354,BatchId_70924,AccountId_1346,SubscriptionId_652,CustomerId_1709,UGX,256,ProviderId_6,ProductId_19,tv,...,13,2,2019,10,1,10,851985.00,1625.925573,524,3207.920536


In [9]:
credit_df.isna().sum()

TransactionId          0
BatchId                0
AccountId              0
SubscriptionId         0
CustomerId             0
CurrencyCode           0
CountryCode            0
ProviderId             0
ProductId              0
ProductCategory        0
ChannelId              0
Amount                 0
Value                  0
PricingStrategy        0
FraudResult            0
Day                    0
Month                  0
Year                   0
Hour                   0
Minute                 0
Second                 0
Total_Amount           0
Average_Amount         0
Transaction_Count      0
Std_of_Amounts       712
dtype: int64

### 2. Encode Categorical Variables and Scaling Numeric variables

Convert categorical variables into numerical format using Label Encoding.

In [10]:
categorical_features, numerical_features = credit_df.select_dtypes(include=['object']),  credit_df.select_dtypes(exclude=['object'])

In [11]:
# checking the variance numerical columns
numerical_features.var()

CountryCode          0.000000e+00
Amount               1.520457e+10
Value                1.515905e+10
PricingStrategy      5.371778e-01
FraudResult          2.013471e-03
Day                  8.033218e+01
Month                2.729342e+01
Year                 2.499943e-01
Hour                 2.349306e+01
Minute               2.989331e+02
Second               2.996997e+02
Total_Amount         4.753580e+14
Average_Amount       7.341666e+09
Transaction_Count    7.719926e+05
Std_of_Amounts       7.973723e+09
dtype: float64

In [13]:

def scale_numerical_features(df:pd.DataFrame, numerical_features:list)->pd.DataFrame:
    """
    Scales numerical features in a DataFrame using StandardScaler.

    Args:
        df (pandas.DataFrame): The DataFrame containing the data.
        numerical_features (pandas.Index): The index of numerical features to scale.

    Returns:
        pandas.DataFrame: The DataFrame with scaled numerical features.
    """

    scaler = StandardScaler()
    for col in numerical_features:
        if col == "FraudResult":
            continue
        else:
            credit_df[col] = scaler.fit_transform(credit_df[col].values.reshape(-1,1))
    return df

# Apply the scaling function
credit_df = scale_numerical_features(credit_df, numerical_features.columns)

# Calculate variance after scaling
credit_df[numerical_features.columns].var()

CountryCode          0.000000
Amount               1.000010
Value                1.000010
PricingStrategy      1.000010
FraudResult          0.002013
Day                  1.000010
Month                1.000010
Year                 1.000010
Hour                 1.000010
Minute               1.000010
Second               1.000010
Total_Amount         1.000010
Average_Amount       1.000010
Transaction_Count    1.000010
Std_of_Amounts       1.000011
dtype: float64

In [14]:
def encode_categorical_features(df:pd.DataFrame, categorical_cols:list)->pd.DataFrame:
    """
    Encode categorical columns in the DataFrame using LabelEncoder.

    Args:
    df (pd.DataFrame): The input DataFrame.
    categorical_cols (list of str): List of column names to be label encoded.

    Returns:
    pd.DataFrame: The DataFrame with label encoded columns.
    """
    encoder = LabelEncoder()
    for col in categorical_cols:
        df[col] = encoder.fit_transform(df[col])
    
    return df

# Specify categorical features
categorical_features = categorical_features.columns

# Encode categorical features
credit_df = encode_categorical_features(credit_df, categorical_features)

### 3. Handle Missing Values

I'll use imputation to handle missing values.

In [15]:
# Imputation for missing values
def imputation(df:pd.DataFrame)->pd.DataFrame:
    try:
        categorical_features, numerical_features = df.select_dtypes(include=['object']),  df.select_dtypes(exclude=['object'])
        for col in numerical_features.columns:
            df[col].fillna(df[col].mean(), inplace=True)

        # For categorical variables, I use mode imputation
        for column in categorical_features.columns:
            df[column].fillna(df[column].mode()[0], inplace=True)
        
        return df
    except Exception as e:
        logger.info(f"Error: {e}")

credit_df = imputation(credit_df)

# 4. Perform WoE Binning 

In [18]:
""" # Define the target variable
target_variable = 'FraudResult'

# Initialize WoE transformer
woe_transformer = WOE()

# Fit the transformer and transform the data
woe_features = woe_transformer.fit_transform(credit_df.drop(target_variable, axis=1), credit_df[target_variable])

# Ensure WoE-transformed features are in DataFrame format
woe_features = pd.DataFrame(woe_features) """

" # Define the target variable\ntarget_variable = 'FraudResult'\n\n# Initialize WoE transformer\nwoe_transformer = WOE()\n\n# Fit the transformer and transform the data\nwoe_features = woe_transformer.fit_transform(credit_df.drop(target_variable, axis=1), credit_df[target_variable])\n\n# Ensure WoE-transformed features are in DataFrame format\nwoe_features = pd.DataFrame(woe_features) "

# 5. Model Building and Evaluation

In [19]:
# Split the data into training and testing sets
X = credit_df.drop('FraudResult', axis=1)
y = credit_df['FraudResult']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [20]:
# Initialize models
log_reg = LogisticRegression()
decision_tree = DecisionTreeClassifier()
random_forest = RandomForestClassifier()
gbm = GradientBoostingClassifier()
catgb = cb.CatBoostClassifier()
xgboo = xgb.XGBClassifier()
light = LGBMClassifier()

# Train the models
log_reg.fit(X_train, y_train)
decision_tree.fit(X_train, y_train)
random_forest.fit(X_train, y_train)
gbm.fit(X_train, y_train)
catgb.fit(X_train, y_train)
xgboo.fit(X_train, y_train)
light.fit(X_train, y_train)

Learning rate set to 0.065664
0:	learn: 0.4711031	total: 134ms	remaining: 2m 13s
1:	learn: 0.2969498	total: 183ms	remaining: 1m 31s
2:	learn: 0.1915762	total: 208ms	remaining: 1m 9s
3:	learn: 0.1251625	total: 234ms	remaining: 58.2s
4:	learn: 0.0814657	total: 259ms	remaining: 51.5s
5:	learn: 0.0547353	total: 282ms	remaining: 46.7s
6:	learn: 0.0375332	total: 312ms	remaining: 44.3s
7:	learn: 0.0266166	total: 341ms	remaining: 42.3s
8:	learn: 0.0188306	total: 376ms	remaining: 41.4s
9:	learn: 0.0137582	total: 412ms	remaining: 40.8s
10:	learn: 0.0104707	total: 447ms	remaining: 40.2s
11:	learn: 0.0081764	total: 475ms	remaining: 39.1s
12:	learn: 0.0065451	total: 510ms	remaining: 38.7s
13:	learn: 0.0054093	total: 535ms	remaining: 37.6s
14:	learn: 0.0045844	total: 560ms	remaining: 36.8s
15:	learn: 0.0039769	total: 589ms	remaining: 36.2s
16:	learn: 0.0034705	total: 616ms	remaining: 35.6s
17:	learn: 0.0031495	total: 646ms	remaining: 35.3s
18:	learn: 0.0028397	total: 682ms	remaining: 35.2s
19:	learn

In [22]:
# Evaluate the models
models = {'Logistic Regression': log_reg, 'Decision Tree': decision_tree, 'Random Forest': random_forest, 'GBM': gbm, 'CatBoost':catgb, 'XGBoost':xgboo, 'LightGBM':light}
for name, model in models.items():
    y_pred = model.predict(X_test)
    print(f'{name} Performance:')
    print(f'Accuracy: {accuracy_score(y_test, y_pred):.2f}')
    print(f'Precision: {precision_score(y_test, y_pred):.2f}')
    print(f'Recall: {recall_score(y_test, y_pred):.2f}')
    print(f'F1 Score: {f1_score(y_test, y_pred):.2f}')
    print(f'ROC-AUC: {roc_auc_score(y_test, y_pred):.2f}\n')

Logistic Regression Performance:
Accuracy: 1.00
Precision: 0.56
Recall: 0.38
F1 Score: 0.45
ROC-AUC: 0.69

Decision Tree Performance:
Accuracy: 1.00
Precision: 0.89
Recall: 0.85
F1 Score: 0.87
ROC-AUC: 0.92

Random Forest Performance:
Accuracy: 1.00
Precision: 0.92
Recall: 0.87
F1 Score: 0.89
ROC-AUC: 0.94

GBM Performance:
Accuracy: 1.00
Precision: 0.94
Recall: 0.77
F1 Score: 0.85
ROC-AUC: 0.88

CatBoost Performance:
Accuracy: 1.00
Precision: 0.97
Recall: 0.79
F1 Score: 0.87
ROC-AUC: 0.90

XGBoost Performance:
Accuracy: 1.00
Precision: 0.50
Recall: 0.51
F1 Score: 0.51
ROC-AUC: 0.76

LightGBM Performance:
Accuracy: 1.00
Precision: 0.20
Recall: 0.44
F1 Score: 0.28
ROC-AUC: 0.72



### Overall Analysis

- Best Models: Random Forest, CatBoost, and Decision Tree stand out with high precision, recall, F1 Score, and ROC-AUC. They are performing well in terms of both classification accuracy and distinguishing between classes.

- Underperformers: Logistic Regression, XGBoost, and LightGBM show lower performance in precision, recall, and F1 Score. These models might be struggling with certain aspects of the classification task, possibly due to the model’s complexity or its ability to handle imbalanced data.