In [24]:
import pandas as pd
import sys
import numpy as np
import os
from sklearn.model_selection import train_test_split
sys.path.append(os.path.abspath(".."))
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from src.feature_engineering import apply_smote, encode_categorical_features # import encode_categorical_features
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
import warnings
warnings.filterwarnings('ignore','futurewarning')

# User-defined modules
from src.utils import drop_missing_columns, drop_correlated_columns
from src.feature_engineering import (
    create_product_amount_interaction,
    create_customer_aggregate_features,
    create_rfms_features,
    encode_categorical_features
)


In [25]:
# Load the cleaned transaction data.
# The path is resolved relative to the project root (the parent of the working
# directory, the same location added to sys.path for `src`) instead of an absolute,
# machine-specific path, so the notebook runs on any checkout.
# NOTE(review): assumes the `data` folder sits next to `src` in the project root — confirm.
cleaned_data_path = os.path.join(os.path.abspath(".."), "data", "Cleaned_data.csv")
transact_df = pd.read_csv(cleaned_data_path)

In [26]:
# Datetime conversion: parse TransactionStartTime into a pandas datetime column
# so the time-based feature builders below can consume it.
start_times = pd.to_datetime(transact_df['TransactionStartTime'])
transact_df['TransactionStartTime'] = start_times

# **Feature Engineering**

In [27]:
# Add the product/amount interaction feature(s) built by the project helper.
transact_df = transact_df.pipe(create_product_amount_interaction)

In [28]:
# Add per-customer aggregate features computed from the Amount column.
transact_df = transact_df.pipe(
    create_customer_aggregate_features,
    customer_id_col='CustomerId',
    amount_col='Amount',
)

In [29]:
# Add RFM (Recency, Frequency, Monetary) features per customer.
transact_df = transact_df.pipe(
    create_rfms_features,
    customer_id_col='CustomerId',
    transaction_time_col='TransactionStartTime',
    amount_col='Amount',
)

In [30]:
# Encode the categorical columns against the FraudResult target.
# NOTE(review): this runs on the full dataset before the train/test split; if the
# helper derives its encodings from FraudResult, test-set labels leak into the
# features. Consider fitting the encoder on the training split only — confirm.
categorical_cols = [
    'CountryCode',
    'ProviderId',
    'ProductId',
    'ProductCategory',
    'ChannelId',
]
transact_df = transact_df.pipe(
    encode_categorical_features,
    categorical_cols,
    target_col='FraudResult',
)

In [31]:
# Inspect column names, non-null counts and dtypes after feature engineering.
transact_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95662 entries, 0 to 95661
Data columns (total 28 columns):
 #   Column                       Non-Null Count  Dtype              
---  ------                       --------------  -----              
 0   TransactionId                95662 non-null  object             
 1   BatchId                      95662 non-null  object             
 2   AccountId                    95662 non-null  object             
 3   SubscriptionId               95662 non-null  object             
 4   CustomerId                   95662 non-null  object             
 5   Amount                       95662 non-null  float64            
 6   TransactionStartTime         95662 non-null  datetime64[ns, UTC]
 7   PricingStrategy              95662 non-null  int64              
 8   FraudResult                  95662 non-null  int64              
 9   Month                        95662 non-null  int64              
 10  Day                          95662 non-null  i

In [32]:
# Drop the redundant TransactionDate_x / TransactionDate_y columns.
# The previous datetime conversion of TransactionDate_x was removed: the column is
# dropped immediately afterwards, so parsing it first was wasted work.
transact_df = transact_df.drop(columns=['TransactionDate_x', 'TransactionDate_y'])

In [33]:
# Handle missing values: fill gaps in customer_amount_std with the column mean.
# NOTE(review): NaN here presumably comes from customers with a single transaction
# (standard deviation undefined) — confirm that mean imputation is the intended choice.
std_col = 'customer_amount_std'
std_fill_value = transact_df[std_col].mean()
transact_df[std_col] = transact_df[std_col].fillna(std_fill_value)

In [34]:
# Drop identifier columns and the raw timestamp; they are not used as model features.
id_cols = [
    'TransactionId',
    'BatchId',
    'AccountId',
    'SubscriptionId',
    'CustomerId',
    'TransactionStartTime',
]
transact_df = transact_df.drop(columns=id_cols)

In [35]:
# Data splitting: separate features from the FraudResult target, then hold out 20% for testing.
# NOTE(review): the split is not stratified; with a target this imbalanced, consider
# passing stratify=y so both splits keep the same fraud rate.
y = transact_df['FraudResult']
X = transact_df.drop(columns=['FraudResult'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# Re-attach the target to each split as its last column.
train = pd.concat((X_train, y_train), axis=1)
test = pd.concat((X_test, y_test), axis=1)

In [36]:
# Scale numerical features.
# The scaler is fitted on the training split only and then applied to the test split.
scaler = StandardScaler()
numerical_cols = X.select_dtypes(include=np.number).columns

# Work on explicit copies so the column assignments below do not write into
# slices of X (avoids SettingWithCopyWarning).
X_train = X_train.copy()
X_test = X_test.copy()
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

# Rebuild the combined frames from the scaled features. `train` and `test` were
# assembled before scaling, so without this the saved test set would keep raw,
# unscaled values while the balanced training set is scaled.
train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

In [37]:
# Handle class imbalance: oversample the minority class of the training split with SMOTE.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Reassemble features and target into one balanced training frame.
resampled_features = pd.DataFrame(X_train_resampled, columns=X.columns)
resampled_target = pd.Series(y_train_resampled, name='FraudResult')
train_balanced = pd.concat([resampled_features, resampled_target], axis=1)

In [38]:
# Preview the first rows of the balanced (scaled + SMOTE-resampled) training frame.
train_balanced.head()

Unnamed: 0,Amount,PricingStrategy,Month,Day,Hour,Weekday,IsWeekend,ProductCategory_Amount_Mean,customer_total_amount,customer_mean_amount,customer_transaction_count,customer_amount_std,Recency,Frequency,Monetary,ProviderId_encoded,ProductId_encoded,ProductCategory_encoded,ChannelId_encoded,FraudResult
0,-0.056453,-0.34828,-1.064909,-0.321961,-0.919098,1.604086,1.959418,0.626597,0.172292,-0.064638,-0.302877,-0.176384,-0.776423,-0.302877,0.172292,-0.435339,-0.384003,0.712656,-1.200709,0
1,-0.015089,-0.34828,1.040922,0.571105,0.730544,0.53104,-0.510356,-0.783679,0.179151,-0.023416,-0.370115,-0.136331,-0.522962,-0.370115,0.179151,-0.445267,-0.314554,-0.756159,0.775578,0
2,-0.047531,-0.34828,-1.064909,0.794371,1.142955,-0.542005,-0.510356,-0.783679,0.171434,-0.061053,-0.340485,-0.166179,-0.725731,-0.340485,0.171434,-0.445267,-0.294666,-0.756159,0.775578,0
3,-0.056209,-0.34828,-1.064909,-0.433594,-0.300482,1.067563,1.959418,0.626597,0.185842,-0.066662,-0.013413,-0.164355,0.693651,-0.013413,0.185842,-0.435339,-0.384003,0.712656,-1.200709,0
4,0.106567,-0.34828,-0.87347,-0.880127,-1.125304,0.53104,-0.510356,-0.783679,0.275953,-0.030453,0.205394,0.047578,-0.776423,0.205394,0.275953,-0.445267,-0.314554,-0.756159,0.775578,0


In [39]:
# Check the class balance of the resampled training data.
# Count each class once and reuse the counts for both the totals and the percentages.
class_counts = train_balanced['FraudResult'].value_counts()
n_rows = len(train_balanced)
non_fraud_count = class_counts[0]
fraud_count = class_counts[1]
print('The number of Non-Frauds are: ' + str(non_fraud_count) + ' which is', round(non_fraud_count/n_rows * 100,2), '% of the dataset')
print('The number of Frauds are: ' + str(fraud_count) + ' which is', round(fraud_count/n_rows * 100,2), '% of the dataset')

The number of Non-Frauds are: 76372 which is 50.0 % of the dataset
The number of Frauds are: 76372 which is 50.0 % of the dataset


In [20]:
# Inspect dtypes and non-null counts of the balanced training frame.
# NOTE(review): this cell's execution count (In [20]) is out of sequence with its
# neighbours, so the output below may be stale — re-run with Restart & Run All.
train_balanced.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 152744 entries, 0 to 152743
Data columns (total 20 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Amount                       152744 non-null  float64
 1   PricingStrategy              152744 non-null  float64
 2   Month                        152744 non-null  float64
 3   Day                          152744 non-null  float64
 4   Hour                         152744 non-null  float64
 5   Weekday                      152744 non-null  float64
 6   IsWeekend                    152744 non-null  float64
 7   ProductCategory_Amount_Mean  152744 non-null  float64
 8   customer_total_amount        152744 non-null  float64
 9   customer_mean_amount         152744 non-null  float64
 10  customer_transaction_count   152744 non-null  float64
 11  customer_amount_std          152744 non-null  float64
 12  Recency                      152744 non-null  float64
 13 

In [41]:
# Class distribution of the held-out test set (not resampled).
test['FraudResult'].value_counts()

FraudResult
0    19097
1       36
Name: count, dtype: int64

In [42]:
# Save the prepared data for later use.
# Paths are resolved relative to the project root (the parent of the working
# directory, the same location added to sys.path for `src`) instead of an absolute,
# machine-specific path, so the notebook runs on any checkout.
# NOTE(review): assumes the `data` folder sits next to `src` in the project root — confirm.
data_dir = os.path.join(os.path.abspath(".."), "data")
train_balanced.to_csv(os.path.join(data_dir, "train_balanced.csv"), index=False)
test.to_csv(os.path.join(data_dir, "test.csv"), index=False)

**Feature Engineering Summary:**

The feature engineering process focused on transforming raw transaction data into a robust set of features for fraud detection. Key steps included:

* **Datetime Handling:** 'TransactionStartTime' was parsed as a datetime, the redundant 'TransactionDate_x' and 'TransactionDate_y' columns were dropped, and a 'Recency' feature was derived from transaction dates.
* **Missing Value Imputation:** Missing values in the 'customer_amount_std' column were imputed using the mean.
* **Identifier Removal:** Irrelevant identifier columns (TransactionId, BatchId, etc.) and the 'TransactionStartTime' column were dropped.
* **Categorical Encoding:** Categorical features (ProviderId, ProductId, etc.) were encoded using target encoding.
* **Feature Creation:** New features were created, including 'ProductCategory_Amount_Mean', customer aggregate features (total amount, mean amount, transaction count, standard deviation), and RFM (Recency, Frequency, Monetary) features.
* **Data Scaling:** Numerical features were scaled using StandardScaler to ensure consistent scales for model training.
* **Class Imbalance Handling:** The training dataset was balanced using SMOTE (Synthetic Minority Over-sampling Technique), resulting in an equal distribution of fraud and non-fraud cases (50% each), with 76,372 cases of each class in the train_balanced dataset.
* **Data Types:** No explicit data type optimization step appears in this notebook; after scaling, the numerical feature columns are stored as float64.

The resulting DataFrame contains numerical features, encoded categorical features, and the target variable 'FraudResult', ready for model training.