In [1]:
import pandas as pd
import sys
import os
from sklearn.model_selection import train_test_split
sys.path.append(os.path.abspath(".."))
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from src.feature_engineering import apply_smote, encode_categorical_features # import encode_categorical_features
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
import warnings
warnings.filterwarnings('ignore','futurewarning')

# User-defined modules
from src.utils import drop_missing_columns, drop_correlated_columns
from src.feature_engineering import (
    create_time_features,
    create_product_amount_interaction,
    create_customer_aggregate_features,
    create_rfms_features,
    encode_categorical_features,
    apply_smote
)


In [2]:
# Load the data
transact_df = pd.read_csv(r'C:\Users\user\Desktop\BatiBank_SmartCredit\data\data.csv')

In [3]:
transact_df['TransactionDate'] = pd.to_datetime(transact_df['TransactionStartTime']).dt.date
recent_date = pd.to_datetime(transact_df['TransactionDate'].max())
print(type(recent_date))

<class 'pandas._libs.tslibs.timestamps.Timestamp'>


In [4]:
# Create Time-Based Features
transact_df ['TransactionStartTime'] = pd.to_datetime(transact_df['TransactionStartTime'])
transact_df = create_time_features(transact_df)

In [5]:
# Create Interaction Features for Product Category and Amount)
transact_df = create_product_amount_interaction (transact_df)

In [6]:
# Create Customer Aggregate Features
transact_df = create_customer_aggregate_features(transact_df)

In [7]:
# Create RFMS Features
transact_df = create_rfms_features(transact_df)

In [8]:
# Handling High Correlation (Amount/Value)
transact_df = drop_correlated_columns(transact_df,['Value'])

In [9]:
# Encodie Categorical Variables
categorical_cols = ['ProductCategory', 'ChannelId', 'PricingStrategy', 'ProviderId', 'ProductId']
transact_df = encode_categorical_features(transact_df, categorical_cols)


In [None]:
print(transact_df.columns)

In [10]:
# Drop identifiers
identifiers = ['CurrencyCode', 'TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId']
transact_df = transact_df.drop(identifiers, axis=1)

# Drop original time columns
time_cols = ['TransactionStartTime', 'TransactionDate']
transact_df = transact_df.drop(time_cols, axis=1)

In [11]:
print(transact_df.columns)

Index(['CountryCode', 'Amount', 'FraudResult', 'TransactionHour',
       'TransactionDayOfWeek', 'TransactionMonth', 'TransactionYear',
       'ProductCategory_Amount', 'customer_total_amount',
       'customer_mean_amount', 'customer_transaction_count',
       'customer_amount_std', 'Recency', 'Frequency', 'Monetary',
       'ProductCategory_airtime', 'ProductCategory_data_bundles',
       'ProductCategory_financial_services', 'ProductCategory_movies',
       'ProductCategory_other', 'ProductCategory_ticket',
       'ProductCategory_transport', 'ProductCategory_tv',
       'ProductCategory_utility_bill', 'ChannelId_ChannelId_1',
       'ChannelId_ChannelId_2', 'ChannelId_ChannelId_3',
       'ChannelId_ChannelId_5', 'PricingStrategy_0', 'PricingStrategy_1',
       'PricingStrategy_2', 'PricingStrategy_4', 'ProviderId_ProviderId_1',
       'ProviderId_ProviderId_2', 'ProviderId_ProviderId_3',
       'ProviderId_ProviderId_4', 'ProviderId_ProviderId_5',
       'ProviderId_ProviderId_6',

In [16]:
# Data Splitting
X = transact_df.drop('FraudResult', axis=1)
y = transact_df['FraudResult']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

In [17]:
# One-Hot Encode ProductCategory_Amount
train = encode_categorical_features(train, ['ProductCategory_Amount'])
test = encode_categorical_features(test, ['ProductCategory_Amount'])

In [19]:
# Impute Missing Values (Mean Imputation)
for col in train.columns:
    if train[col].isnull().any():
        train[col] = train[col].fillna(train[col].mean())
        test[col] = test[col].fillna(test[col].mean()) 

In [20]:
# Apply SMOTE for Training Set Only
train_balanced = apply_smote(train)

In [23]:
# Feature Scaling
numerical_cols = train_balanced.select_dtypes(include=['number']).columns.tolist()
numerical_cols.remove('FraudResult') #remove target variable
scaler = StandardScaler()
train_balanced[numerical_cols] = scaler.fit_transform(train_balanced[numerical_cols])
test[numerical_cols] = scaler.transform(test[numerical_cols])

In [24]:
# Data Type Optimization
for col in train_balanced.columns:
    if train_balanced[col].dtype == 'int64':
        train_balanced[col] = pd.to_numeric(train_balanced[col], downcast='integer')
    elif train_balanced[col].dtype == 'float64':
        train_balanced[col] = pd.to_numeric(train_balanced[col], downcast='float')
    elif train_balanced[col].dtype == 'object':
        train_balanced[col] = train_balanced[col].astype('category')

for col in test.columns:
    if test[col].dtype == 'int64':
        test[col] = pd.to_numeric(test[col], downcast='integer')
    elif test[col].dtype == 'float64':
        test[col] = pd.to_numeric(test[col], downcast='float')
    elif test[col].dtype == 'object':
        test[col] = test[col].astype('category')


In [25]:
train_balanced.head()

Unnamed: 0,CountryCode,Amount,TransactionHour,TransactionDayOfWeek,TransactionMonth,TransactionYear,customer_total_amount,customer_mean_amount,customer_transaction_count,customer_amount_std,...,ProductCategory_Amount_utility_bill_8800.0,ProductCategory_Amount_utility_bill_8840.0,ProductCategory_Amount_utility_bill_8900.0,ProductCategory_Amount_utility_bill_9000.0,ProductCategory_Amount_utility_bill_90000.0,ProductCategory_Amount_utility_bill_91000.0,ProductCategory_Amount_utility_bill_9500.0,ProductCategory_Amount_utility_bill_95000.0,ProductCategory_Amount_utility_bill_9800.0,FraudResult
0,0.0,-0.487126,-1.034983,1.868401,-0.9884,1.006766,-0.377174,-0.322046,-0.171162,-0.611225,...,False,False,False,False,False,False,False,False,False,0
1,0.0,-0.483643,0.691916,0.730151,1.231405,-0.993279,-0.372279,-0.31815,-0.262924,-0.604897,...,False,False,False,False,False,False,False,False,False,0
2,0.0,-0.486375,1.12364,-0.408098,-0.9884,1.006766,-0.377786,-0.321707,-0.222487,-0.609612,...,False,False,False,False,False,False,False,False,False,0
3,0.0,-0.487105,-0.387396,1.299276,-0.9884,1.006766,-0.367504,-0.322238,0.223884,-0.609324,...,False,False,False,False,False,False,False,False,False,0
4,0.0,-0.473401,-1.250845,0.730151,-0.7866,1.006766,-0.303198,-0.318815,0.522501,-0.57584,...,False,False,False,False,False,False,False,False,False,0


In [26]:
print(train_balanced.columns)

Index(['CountryCode', 'Amount', 'TransactionHour', 'TransactionDayOfWeek',
       'TransactionMonth', 'TransactionYear', 'customer_total_amount',
       'customer_mean_amount', 'customer_transaction_count',
       'customer_amount_std',
       ...
       'ProductCategory_Amount_utility_bill_8800.0',
       'ProductCategory_Amount_utility_bill_8840.0',
       'ProductCategory_Amount_utility_bill_8900.0',
       'ProductCategory_Amount_utility_bill_9000.0',
       'ProductCategory_Amount_utility_bill_90000.0',
       'ProductCategory_Amount_utility_bill_91000.0',
       'ProductCategory_Amount_utility_bill_9500.0',
       'ProductCategory_Amount_utility_bill_95000.0',
       'ProductCategory_Amount_utility_bill_9800.0', 'FraudResult'],
      dtype='object', length=1962)


In [22]:
# Save the prepared data for later use
train_balanced.to_csv(r'C:\Users\user\Desktop\BatiBank_SmartCredit\data\train_balanced.csv', index=False)
test.to_csv(r'C:\Users\user\Desktop\BatiBank_SmartCredit\data\test.csv', index=False)