## Import Dependencies

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
import warnings


## Load Datasets

In [4]:
# Load the two pre-processed CSVs and make sure the Fraud_Data timestamp
# columns are real datetimes before any downstream transformation.
print("Loading pre-processed datasets for Data Transformation...")
try:
    # Only the file reads can raise FileNotFoundError, so only they are guarded.
    fraud_data = pd.read_csv('../data/Fraud_Data_merged.csv')
    creditcard_data = pd.read_csv('../data/creditcard_cleaned.csv')
except FileNotFoundError as e:
    print(f"Error loading file: {e}. Please ensure the specified CSV files are in the Data directory.")
    # Re-raise instead of calling exit(): in a notebook, exit() asks the kernel
    # to shut down (discarding all state), whereas an exception simply stops
    # this cell / "Run All" and leaves a traceback for the reader.
    raise

# CSV round-trips store timestamps as text; coerce them back to datetimes.
# Unparseable values become NaT (errors='coerce') and are dropped below.
for time_col in ('signup_time', 'purchase_time'):
    if time_col in fraud_data.columns and not pd.api.types.is_datetime64_any_dtype(fraud_data[time_col]):
        fraud_data[time_col] = pd.to_datetime(fraud_data[time_col], errors='coerce')

# Drop rows whose timestamps are missing/invalid, and report how many were lost.
initial_rows_fraud = fraud_data.shape[0]
fraud_data = fraud_data.dropna(subset=['signup_time', 'purchase_time'])
if fraud_data.shape[0] < initial_rows_fraud:
    print(f"Dropped {initial_rows_fraud - fraud_data.shape[0]} rows from Fraud_Data due to invalid signup_time or purchase_time after loading merged CSV.")

print("Datasets loaded successfully.")

Loading pre-processed datasets for Data Transformation...
Datasets loaded successfully.


## Data Transformation

### Handle Class Imbalance

In [7]:
# --- Feature / target separation -------------------------------------------
# Identifier and raw timestamp columns are excluded from the Fraud_Data
# features, together with the target column itself.
fraud_non_feature_cols = [
    'user_id', 'signup_time', 'purchase_time', 'device_id', 'ip_address', 'class'
]
y_fraud = fraud_data['class']
X_fraud = fraud_data.drop(columns=fraud_non_feature_cols)

y_creditcard = creditcard_data['Class']
X_creditcard = creditcard_data.drop(columns=['Time', 'Class'])

# --- Stratified 70/30 split --------------------------------------------------
# Both datasets share the same split settings; stratification keeps the class
# ratio identical in the train and test partitions.
split_settings = {'test_size': 0.3, 'random_state': 42}

print("\nPerforming Train-Test Split...")
X_fraud_train, X_fraud_test, y_fraud_train, y_fraud_test = train_test_split(
    X_fraud, y_fraud, stratify=y_fraud, **split_settings
)
print(f"Fraud_Data train shape: {X_fraud_train.shape}, test shape: {X_fraud_test.shape}")

X_creditcard_train, X_creditcard_test, y_creditcard_train, y_creditcard_test = train_test_split(
    X_creditcard, y_creditcard, stratify=y_creditcard, **split_settings
)
print(f"Creditcard_Data train shape: {X_creditcard_train.shape}, test shape: {X_creditcard_test.shape}")


# --- Resampling (training partitions only) -----------------------------------
print("\nHandling Class Imbalance...")

print(f"Original Fraud_Data training set shape: {Counter(y_fraud_train)}")

# SMOTE needs an all-numeric matrix, so the object-typed columns are one-hot
# encoded first. The encoder is fitted on the training partition only.
numerical_cols_fraud_train = X_fraud_train.select_dtypes(include='number').columns
categorical_cols_fraud_train = X_fraud_train.select_dtypes(include='object').columns

encoder_fraud_smote = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
X_fraud_train_cat_encoded = encoder_fraud_smote.fit_transform(
    X_fraud_train[categorical_cols_fraud_train]
)
encoded_feature_names = encoder_fraud_smote.get_feature_names_out(categorical_cols_fraud_train)
X_fraud_train_cat_encoded_df = pd.DataFrame(
    X_fraud_train_cat_encoded,
    columns=encoded_feature_names,
    index=X_fraud_train.index,
)

# Numeric columns first, then the one-hot columns, aligned on the original index.
X_fraud_train_combined = pd.concat(
    [X_fraud_train[numerical_cols_fraud_train], X_fraud_train_cat_encoded_df],
    axis=1,
)

# NOTE(review): SMOTE is applied to unscaled features and to one-hot columns,
# so synthetic rows may carry fractional indicator values -- confirm this is
# intended before modelling.
smote = SMOTE(random_state=42)
X_fraud_train_res, y_fraud_train_res = smote.fit_resample(X_fraud_train_combined, y_fraud_train)
print(f"Resampled Fraud_Data training set shape (SMOTE): {Counter(y_fraud_train_res)}")


# Creditcard_Data: the majority class is randomly under-sampled instead.
print(f"Original Creditcard_Data training set shape: {Counter(y_creditcard_train)}")
rus = RandomUnderSampler(random_state=42)
X_creditcard_train_res, y_creditcard_train_res = rus.fit_resample(
    X_creditcard_train, y_creditcard_train
)
print(f"Resampled Creditcard_Data training set shape (RandomUnderSampler): {Counter(y_creditcard_train_res)}")




Performing Train-Test Split...
Fraud_Data train shape: (105778, 7), test shape: (45334, 7)
Creditcard_Data train shape: (198608, 29), test shape: (85118, 29)

Handling Class Imbalance...
Original Fraud_Data training set shape: Counter({0: 95872, 1: 9906})
Resampled Fraud_Data training set shape (SMOTE): Counter({0: 95872, 1: 95872})
Original Creditcard_Data training set shape: Counter({0: 198277, 1: 331})
Resampled Creditcard_Data training set shape (RandomUnderSampler): Counter({0: 331, 1: 331})


### Normalization and Scaling

In [8]:

print("\nPerforming Normalization and Scaling...")

# This cell rebinds X_fraud_test to its encoded + scaled form. If it is run a
# second time without re-running the split cell, the raw categorical columns
# are gone; fail up front with a clear message *before* touching any state,
# rather than rescaling the training frame and then hitting a KeyError.
missing_categoricals = [
    col for col in categorical_cols_fraud_train if col not in X_fraud_test.columns
]
if missing_categoricals:
    raise RuntimeError(
        "X_fraud_test no longer contains the raw categorical columns "
        f"{missing_categoricals}; this cell appears to have already been run. "
        "Re-run the train/test split cell before scaling again."
    )

# --- Fraud_Data ----------------------------------------------------------------
# Every column of the resampled training frame (numeric + one-hot) is
# standardised; the scaler is fitted on the resampled training data only.
numerical_cols_fraud_resampled = X_fraud_train_res.columns
scaler_fraud = StandardScaler()
X_fraud_train_res = pd.DataFrame(
    scaler_fraud.fit_transform(X_fraud_train_res[numerical_cols_fraud_resampled]),
    columns=numerical_cols_fraud_resampled,
    index=X_fraud_train_res.index,
)

# One-hot encode the test categoricals with the encoder fitted on the training split.
X_fraud_test_cat_encoded = encoder_fraud_smote.transform(X_fraud_test[categorical_cols_fraud_train])
X_fraud_test_cat_encoded_df = pd.DataFrame(X_fraud_test_cat_encoded, columns=encoder_fraud_smote.get_feature_names_out(categorical_cols_fraud_train), index=X_fraud_test.index)

# Combine numerical and encoded categorical features for X_fraud_test
# (same column order as the training frame), then apply the training scaler.
X_fraud_test_combined = pd.concat([X_fraud_test[numerical_cols_fraud_train], X_fraud_test_cat_encoded_df], axis=1)
X_fraud_test = pd.DataFrame(
    scaler_fraud.transform(X_fraud_test_combined),
    columns=numerical_cols_fraud_resampled,
    index=X_fraud_test_combined.index,
)


# --- Creditcard_Data -------------------------------------------------------------
# Identify numerical columns for scaling in Creditcard_Data
numerical_cols_creditcard = X_creditcard_train_res.select_dtypes(include=np.number).columns
scaler_creditcard = StandardScaler()

# Work on explicit copies: both frames were produced by row selection
# (resampling / train_test_split), and assigning columns into such frames can
# trigger pandas' SettingWithCopyWarning.
X_creditcard_train_res = X_creditcard_train_res.copy()
X_creditcard_test = X_creditcard_test.copy()
X_creditcard_train_res[numerical_cols_creditcard] = scaler_creditcard.fit_transform(X_creditcard_train_res[numerical_cols_creditcard])
X_creditcard_test[numerical_cols_creditcard] = scaler_creditcard.transform(X_creditcard_test[numerical_cols_creditcard])
print("Creditcard_Data numerical features scaled using StandardScaler.")


Performing Normalization and Scaling...
Creditcard_Data numerical features scaled using StandardScaler.


### Encode Categorical Features

In [9]:
# Encoding recap: nothing is computed here; the one-hot encoding of Fraud_Data
# already happened ahead of SMOTE in the class-imbalance step.
print(
    "\nCategorical Feature Encoding for Fraud_Data.csv was performed before SMOTE.",
    "Creditcard.csv has no categorical features requiring encoding.",
    sep="\n",
)


print("\n--- Data Transformation Complete ---")

# Shape report for every train/test artefact produced by this notebook.
print(
    "\nFinal shapes of transformed data:",
    f"Fraud_Data X_train_res: {X_fraud_train_res.shape}, y_train_res: {y_fraud_train_res.shape}",
    f"Fraud_Data X_test: {X_fraud_test.shape}, y_test: {y_fraud_test.shape}",
    f"Creditcard_Data X_train_res: {X_creditcard_train_res.shape}, y_train_res: {y_creditcard_train_res.shape}",
    f"Creditcard_Data X_test: {X_creditcard_test.shape}, y_test: {y_creditcard_test.shape}",
    sep="\n",
)

# Plain-text preview of the first rows of each transformed training frame.
print(
    "\nFirst 5 rows of transformed X_fraud_train_res (Fraud_Data):",
    X_fraud_train_res.head(),
    sep="\n",
)
print(
    "\nFirst 5 rows of transformed X_creditcard_train_res (Creditcard_Data):",
    X_creditcard_train_res.head(),
    sep="\n",
)


Categorical Feature Encoding for Fraud_Data.csv was performed before SMOTE.
Creditcard.csv has no categorical features requiring encoding.

--- Data Transformation Complete ---

Final shapes of transformed data:
Fraud_Data X_train_res: (191744, 191), y_train_res: (191744,)
Fraud_Data X_test: (45334, 191), y_test: (45334,)
Creditcard_Data X_train_res: (662, 29), y_train_res: (662,)
Creditcard_Data X_test: (85118, 29), y_test: (85118,)

First 5 rows of transformed X_fraud_train_res (Fraud_Data):
   purchase_value       age  ip_address_int  source_Ads  source_Direct  \
0        0.283339 -0.257716       -0.596492    1.287288      -0.545219   
1        0.057481 -0.137021       -1.028707    1.287288      -0.545219   
2       -1.410600  0.345759       -0.656143   -0.836398      -0.545219   
3       -1.410600 -1.706056        0.329138   -0.836398      -0.545219   
4        0.226875 -0.619801        0.598759   -0.836398      -0.545219   

   source_SEO  browser_Chrome  browser_FireFox  browser