In [1]:
# Import pandas plus the standard-library helpers used to extend the import path
import pandas as pd
import os
import sys

# Make the project's ../src and ../scripts folders importable. Both paths are
# resolved against the kernel's current working directory. The membership
# check keeps the cell idempotent: re-running it no longer stacks duplicate
# entries onto sys.path.
for _relative_dir in ('../src', '../scripts'):
    _package_dir = os.path.abspath(os.path.join(os.getcwd(), _relative_dir))
    if _package_dir not in sys.path:
        sys.path.append(_package_dir)


In [2]:
# Read the 'data' worksheet of the Excel workbook into a DataFrame
transactions_df = pd.read_excel(
    io='../data/data.xlsx',
    sheet_name='data',
)

In [None]:
from feature_engineering.feature_engineering import AggregateFeatures

# Wrap the loaded transactions, passing 'CustomerId' as the customer-id column
# and 'Amount' as the transaction-amount column.
agg_features = AggregateFeatures(
    data=transactions_df,
    customer_id_col='CustomerId',
    transaction_amount_col='Amount',
)

# Compute the aggregate features and preview the first rows of the result.
agg_data = agg_features.compute_aggregate_features()
agg_data.head()

In [None]:
from feature_engineering.feature_engineering import TimeFeatureExtractor

# Wrap the loaded transactions, using 'TransactionStartTime' as the datetime column.
time_features = TimeFeatureExtractor(
    data=transactions_df,
    datetime_col='TransactionStartTime',
)

# Extract the time features and preview the first rows of the result.
time_data = time_features.extract_time_features()
time_data.head()


In [None]:
from feature_engineering.feature_engineering import CategoricalEncoder

encoder = CategoricalEncoder(data=transactions_df)

# One-hot encode the categorical columns listed below.
one_hot_encoded_data = encoder.one_hot_encode([
    'CurrencyCode',
    'CountryCode',
    'ProviderId',
    'ProductId',
    'ProductCategory',
    'ChannelId',
    'PricingStrategy',
])

# Label encode the binary FraudResult column.
label_encoded_data = encoder.label_encode(['FraudResult'])

# Print a plain-text preview of the one-hot result ...
print("one hot encoded data")
print(one_hot_encoded_data.head())

# ... and leave the label-encoded preview as the cell's displayed output.
print("label encoded data")
label_encoded_data.head()

In [None]:
from feature_engineering.feature_engineering import MissingValueHandler

# Columns handed to the imputer, split into numerical and categorical groups.
numerical_columns = ['Amount', 'Value']
categorical_columns = [
    'CurrencyCode',
    'CountryCode',
    'ProviderId',
    'ProductId',
    'ProductCategory',
    'ChannelId',
    'PricingStrategy',
    'FraudResult',
]

# Handler wrapping the loaded transactions.
handler = MissingValueHandler(data=transactions_df)

# Impute missing values (mean for numerical, most frequent for categorical).
data_imputed = handler.impute_missing_values(
    numerical_columns=numerical_columns,
    categorical_columns=categorical_columns,
)

# Optionally drop columns with more than 5% missing values.
data_cleaned = handler.remove_missing_values(threshold=0.05)

# Write the cleaned data out as CSV.
handler.save_cleaned_data("../data/cleaned_data.csv")


In [None]:
from feature_engineering.feature_engineering import Scaler

# The transaction amount column is labelled 'Amount' (capital A) in the other
# cells of this notebook; pandas column labels are case-sensitive, so the
# previous lower-case 'amount' would not match that column.
scaler = Scaler(data=transactions_df)

# NOTE(review): both calls go through the same Scaler instance — confirm that
# normalize() does not modify the wrapped data in place, otherwise
# standardize() would run on already-normalized values.
normalized_data = scaler.normalize(columns=['Amount'])
standardized_data = scaler.standardize(columns=['Amount'])


In [None]:
from feature_engineering.feature_engineering import WOEIVFeatureEngineering

# The previous arguments ('target', 'feature1', 'feature2') were template
# placeholders rather than column names used anywhere else in this notebook.
# 'FraudResult' is the binary label referenced in the encoding cell, and the
# feature columns below are taken from the categorical columns used there.
# NOTE(review): presumably this computes Weight of Evidence / Information
# Value per feature — confirm the intended target and feature set, and that
# calculate_woe_iv accepts categorical inputs without prior binning.
woe_iv = WOEIVFeatureEngineering(data=transactions_df, target_col='FraudResult')
woe_df, iv_scores = woe_iv.calculate_woe_iv(
    feature_cols=['ProductCategory', 'ChannelId', 'PricingStrategy'],
)
