# Task 3: Feature Engineering

In [1]:
import sys
import os
import pandas as pd
from glob import glob

In [2]:
from pathlib import Path
from importlib import reload
# Make the parent directory (resolved to an absolute path) importable so that
# the `src` package can be imported from this notebook.
project_root = Path('..').resolve()
project_root_str = str(project_root)
if project_root_str not in sys.path:
    sys.path.append(project_root_str)

## Loading the dataset

In [3]:
# import the module to load the data into dataframes
# Reload the module *before* binding DataLoader: reload() re-executes the
# module but does not rebind names that were already imported with
# `from ... import ...`, so importing the class first would leave DataLoader
# pointing at the stale class after editing src/data_loader.py.
import src.data_loader
reload(src.data_loader)
from src.data_loader import DataLoader


<module 'src.data_loader' from 'D:\\Research & Project\\10academy\\week 5\\Challenge\\bati-bank-credit-risk-model\\src\\data_loader.py'>

In [4]:
# Read the raw CSV into a DataFrame through the project's DataLoader
raw_data_path = '../data/raw/data.csv'
data_loader = DataLoader(raw_data_path)
df = data_loader.load_data()

In [5]:
# Display the dataset information: index range, per-column non-null counts
# and dtypes (printed directly by DataFrame.info)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95662 entries, 0 to 95661
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   TransactionId         95662 non-null  object 
 1   BatchId               95662 non-null  object 
 2   AccountId             95662 non-null  object 
 3   SubscriptionId        95662 non-null  object 
 4   CustomerId            95662 non-null  object 
 5   CurrencyCode          95662 non-null  object 
 6   CountryCode           95662 non-null  int64  
 7   ProviderId            95662 non-null  object 
 8   ProductId             95662 non-null  object 
 9   ProductCategory       95662 non-null  object 
 10  ChannelId             95662 non-null  object 
 11  Amount                95662 non-null  float64
 12  Value                 95662 non-null  int64  
 13  TransactionStartTime  95662 non-null  object 
 14  PricingStrategy       95662 non-null  int64  
 15  FraudResult        

In [6]:
# Tabular overview of the dataset: one row per column, giving its name,
# its dtype and how many missing values it contains.
column_overview = {
    'Column Name': df.columns,
    'Data Type': df.dtypes,
    'Missing Values': df.isnull().sum(),
}
df_columns = pd.DataFrame(column_overview)
print(df_columns)

                               Column Name Data Type  Missing Values
TransactionId                TransactionId    object               0
BatchId                            BatchId    object               0
AccountId                        AccountId    object               0
SubscriptionId              SubscriptionId    object               0
CustomerId                      CustomerId    object               0
CurrencyCode                  CurrencyCode    object               0
CountryCode                    CountryCode     int64               0
ProviderId                      ProviderId    object               0
ProductId                        ProductId    object               0
ProductCategory            ProductCategory    object               0
ChannelId                        ChannelId    object               0
Amount                              Amount   float64               0
Value                                Value     int64               0
TransactionStartTime  TransactionS

In [7]:
# load the module to perform feature engineering
# Reload the module *before* binding FeatureEngineering: reload() re-executes
# the module but does not rebind names that were already imported with
# `from ... import ...`, so importing the class first would leave
# FeatureEngineering pointing at the stale class after editing
# src/data_processing.py.
import src.data_processing
reload(src.data_processing)
from src.data_processing import FeatureEngineering


<module 'src.data_processing' from 'D:\\Research & Project\\10academy\\week 5\\Challenge\\bati-bank-credit-risk-model\\src\\data_processing.py'>

In [8]:
# Initialize the feature engineering class with the loaded DataFrame.
# The resulting instance is the object all later processing cells call into.
feature_engineering = FeatureEngineering(df)

In [9]:
# Remove duplicate rows and rebind df to the DataFrame returned by the helper
# (the helper also prints how many rows were removed)
df = feature_engineering.remove_duplicates()

Removed 0 duplicate rows.


### Create Aggregated Features

In [10]:
# Create aggregated features and preview the first rows of the result.
# NOTE(review): the method is called without a DataFrame argument, so it
# presumably works on the frame stored on the FeatureEngineering instance —
# confirm against src/data_processing.py.
df = feature_engineering.create_aggregated_features()
print("Aggregated features created successfully.")
print(df.head())


Aggregated features created successfully.
         TransactionId         BatchId       AccountId       SubscriptionId  \
0  TransactionId_76871   BatchId_36123  AccountId_3957   SubscriptionId_887   
1  TransactionId_73770   BatchId_15642  AccountId_4841  SubscriptionId_3829   
2  TransactionId_26203   BatchId_53941  AccountId_4229   SubscriptionId_222   
3    TransactionId_380  BatchId_102363   AccountId_648  SubscriptionId_2185   
4  TransactionId_28195   BatchId_38780  AccountId_4841  SubscriptionId_3829   

        CustomerId CurrencyCode  CountryCode    ProviderId     ProductId  \
0  CustomerId_4406          UGX          256  ProviderId_6  ProductId_10   
1  CustomerId_4406          UGX          256  ProviderId_4   ProductId_6   
2  CustomerId_4683          UGX          256  ProviderId_6   ProductId_1   
3   CustomerId_988          UGX          256  ProviderId_1  ProductId_21   
4   CustomerId_988          UGX          256  ProviderId_4   ProductId_6   

      ProductCategory    C

### Extract Features

#### Extract Time Features

- Transaction Hour: The hour of the day when the transaction occurred.
- Transaction Day: The day of the month when the transaction occurred.
- Transaction Month: The month when the transaction occurred.
- Transaction Year: The year when the transaction occurred.

In [11]:
# Extract the time-based features and preview the first rows of the result.
# NOTE(review): called without a DataFrame argument, so it presumably works
# on the frame stored on the FeatureEngineering instance — confirm.
df = feature_engineering.extract_time_features()
print("Time features extracted successfully.")
print(df.head())

Time features extracted successfully.
         TransactionId         BatchId       AccountId       SubscriptionId  \
0  TransactionId_76871   BatchId_36123  AccountId_3957   SubscriptionId_887   
1  TransactionId_73770   BatchId_15642  AccountId_4841  SubscriptionId_3829   
2  TransactionId_26203   BatchId_53941  AccountId_4229   SubscriptionId_222   
3    TransactionId_380  BatchId_102363   AccountId_648  SubscriptionId_2185   
4  TransactionId_28195   BatchId_38780  AccountId_4841  SubscriptionId_3829   

        CustomerId CurrencyCode  CountryCode    ProviderId     ProductId  \
0  CustomerId_4406          UGX          256  ProviderId_6  ProductId_10   
1  CustomerId_4406          UGX          256  ProviderId_4   ProductId_6   
2  CustomerId_4683          UGX          256  ProviderId_6   ProductId_1   
3   CustomerId_988          UGX          256  ProviderId_1  ProductId_21   
4   CustomerId_988          UGX          256  ProviderId_4   ProductId_6   

      ProductCategory  ... Pri

#### Encode Categorical Variables
Convert categorical variables into a numerical format using:

- One-Hot Encoding: Converts categorical values into binary vectors.
- Label Encoding: Assigns a unique integer to each category.

In [12]:
# Encode categorical variables using One-Hot Encoding and Label Encoding.
# Unlike the two previous steps, the current df is passed in explicitly and
# df is rebound to the returned frame before previewing its first rows.
df = feature_engineering.encode_categorical_variables(df)
print("Categorical variables encoded successfully.")
print(df.head())

Categorical variables encoded successfully.
         TransactionId         BatchId       AccountId       SubscriptionId  \
0  TransactionId_76871   BatchId_36123  AccountId_3957   SubscriptionId_887   
1  TransactionId_73770   BatchId_15642  AccountId_4841  SubscriptionId_3829   
2  TransactionId_26203   BatchId_53941  AccountId_4229   SubscriptionId_222   
3    TransactionId_380  BatchId_102363   AccountId_648  SubscriptionId_2185   
4  TransactionId_28195   BatchId_38780  AccountId_4841  SubscriptionId_3829   

        CustomerId CurrencyCode  CountryCode  ProviderId     ProductId  \
0  CustomerId_4406          UGX          256           5  ProductId_10   
1  CustomerId_4406          UGX          256           3   ProductId_6   
2  CustomerId_4683          UGX          256           5   ProductId_1   
3   CustomerId_988          UGX          256           0  ProductId_21   
4   CustomerId_988          UGX          256           3   ProductId_6   

   ProductCategory  ...  ProductCate

#### Handle Missing Values

In [13]:
# Fill missing values in the current df and preview the first rows.
# NOTE(review): the pandas FutureWarning shown in this cell's output points at
# `df[col].fillna(..., inplace=True)` calls inside the project's helper code,
# not at this cell. The warning text itself suggests
# `df[col] = df[col].fillna(value)` as the replacement — fix it in the module.
df =  feature_engineering.handle_missing_values(df)
print("Missing values handled successfully.")
print(df.head())

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


Missing values handled successfully.
         TransactionId         BatchId       AccountId       SubscriptionId  \
0  TransactionId_76871   BatchId_36123  AccountId_3957   SubscriptionId_887   
1  TransactionId_73770   BatchId_15642  AccountId_4841  SubscriptionId_3829   
2  TransactionId_26203   BatchId_53941  AccountId_4229   SubscriptionId_222   
3    TransactionId_380  BatchId_102363   AccountId_648  SubscriptionId_2185   
4  TransactionId_28195   BatchId_38780  AccountId_4841  SubscriptionId_3829   

        CustomerId CurrencyCode  CountryCode  ProviderId     ProductId  \
0  CustomerId_4406          UGX          256           5  ProductId_10   
1  CustomerId_4406          UGX          256           3   ProductId_6   
2  CustomerId_4683          UGX          256           5   ProductId_1   
3   CustomerId_988          UGX          256           0  ProductId_21   
4   CustomerId_988          UGX          256           3   ProductId_6   

   ProductCategory  ...  ProductCategory_tv

#### Normalize/Standardize Numerical Features

In [14]:
# Normalize/Standardize Numerical Features
# The two scaling helpers are applied back to back on the same df; only the
# normalized frame is previewed, the standardized one is not printed.
# NOTE(review): if both helpers act on the same columns, standardizing after
# normalizing presumably makes the normalization step redundant — confirm
# which columns each helper touches in src/data_processing.py.
df = feature_engineering.normalize_numerical_features(df)
print("Numerical features normalized successfully.")
print(df.head())
df = feature_engineering.standardize_numerical_features(df)
print("Numerical features standardized successfully.")


Numerical features normalized successfully.
         TransactionId         BatchId       AccountId       SubscriptionId  \
0  TransactionId_76871   BatchId_36123  AccountId_3957   SubscriptionId_887   
1  TransactionId_73770   BatchId_15642  AccountId_4841  SubscriptionId_3829   
2  TransactionId_26203   BatchId_53941  AccountId_4229   SubscriptionId_222   
3    TransactionId_380  BatchId_102363   AccountId_648  SubscriptionId_2185   
4  TransactionId_28195   BatchId_38780  AccountId_4841  SubscriptionId_3829   

        CustomerId CurrencyCode  CountryCode  ProviderId     ProductId  \
0  CustomerId_4406          UGX          256           5  ProductId_10   
1  CustomerId_4406          UGX          256           3   ProductId_6   
2  CustomerId_4683          UGX          256           5   ProductId_1   
3   CustomerId_988          UGX          256           0  ProductId_21   
4   CustomerId_988          UGX          256           3   ProductId_6   

   ProductCategory  ...  ProductCate

#### Save the Processed Data

Persist the processed DataFrame to disk as a CSV file.

In [15]:
# Destination of the processed dataset, relative to the notebook directory
output_file = '../data/processed/processed_data.csv'
# Write the fully processed df to disk via the project helper
feature_engineering.save_processed_data(df, output_file)


Processed data saved to ../data/processed/processed_data.csv
