# In [None]:
import time  # Module to measure time intervals

# Start the clock before the third-party imports. Taking this timestamp after
# them would make the "imports" timing below cover only the style setup.
t0 = time.time()

import pandas as pd  # For data manipulation and analysis
import numpy as np  # For numerical operations
import matplotlib.pyplot as plt  # For plotting and data visualization
import seaborn as sns  # For statistical data visualization

# Set Seaborn style for better aesthetics in plots
sns.set_style("whitegrid")

# Elapsed time for the imports and the style setup above
t1 = time.time()
print("Time consumed for imports:", t1 - t0, "seconds")

# Load the dataset (ensure the file path is correct)
data = pd.read_excel("ProjectCreditCard.xlsx")
print(data.head())  # Display the first few rows of the dataset

# Measure time taken for loading the dataset
t2 = time.time()
print("Time consumed for loading data:", t2 - t1, "seconds")

# Display basic dataset information (info() prints its report itself)
data.info()

# Show floats with two decimals in the pandas output printed below.
# The option is addressed by its full name rather than an abbreviated key,
# which pandas would have to resolve by matching against all option names.
pd.set_option("display.float_format", "{:.2f}".format)

# describe() only returns the statistics table; print it so that it is
# actually shown instead of being discarded.
print(data.describe())

# Check for missing values in the dataset (sum over all columns)
print("Total missing values:", data.isnull().sum().sum())

# Print column names for reference
print("Dataset columns:", data.columns)

# Tick labels for the bar chart: position 0 is Class 0, position 1 is Class 1
LABELS = ["Normal", "Fraud"]

# Count the transactions per class with the Series method (the top-level
# pd.value_counts helper is deprecated). Sorting by class value instead of by
# frequency keeps the bars in the same order as LABELS no matter which class
# is more common.
count_classes = data['Class'].value_counts().sort_index()
count_classes.plot(kind='bar', rot=0)
plt.title("Transaction Class Distribution")
plt.xticks(range(2), LABELS)
plt.xlabel("Class")
plt.ylabel("Frequency")
plt.show()

# Print the raw class counts (ordered by frequency)
print(data.Class.value_counts())

# Measure time taken for initial data exploration
t3 = time.time()
print("Time consumed for initial exploration:", t3 - t2, "seconds")

# Separate fraudulent (Class == 1) and non-fraudulent (Class == 0) transactions
fraud = data[data['Class'] == 1]
normal = data[data['Class'] == 0]

# Display shapes of fraudulent and non-fraudulent data
print(f"Shape of fraudulent transactions: {fraud.shape}")
print(f"Shape of non-fraudulent transactions: {normal.shape}")

# Compare descriptive statistics for transaction amounts.
# Both describe() results are Series named "Amount", so without keys= the two
# columns of the combined table would carry the same, indistinguishable header.
print(pd.concat(
    [fraud.Amount.describe(), normal.Amount.describe()],
    axis=1,
    keys=["Fraud", "Normal"],
))

# Measure time taken for data separation
t4 = time.time()
print("Time consumed for data separation:", t4 - t3, "seconds")

# Compare time distributions for fraudulent and non-fraudulent transactions
# (same column labelling as for the amounts above)
print(pd.concat(
    [fraud.Time.describe(), normal.Time.describe()],
    axis=1,
    keys=["Fraud", "Normal"],
))

# Plot the time and amount distributions as two panels of one figure
plt.figure(figsize=(14, 10))

# sns.displot is a figure-level function: it opens a figure of its own and
# does not draw into the subplot prepared here, which would stay empty.
# sns.histplot is the axes-level counterpart and accepts the target axes.

# Time distribution for all transactions
ax_time = plt.subplot(2, 2, 1)
ax_time.set_title('Time Distribution (Seconds)')
sns.histplot(data['Time'], color='blue', ax=ax_time)

# Amount distribution for all transactions
ax_amount = plt.subplot(2, 2, 2)
ax_amount.set_title('Distribution of Amount')
sns.histplot(data['Amount'], color='blue', ax=ax_amount)

# Transaction-time histograms, one panel per class, in a fresh figure
plt.figure(figsize=(14, 12))

# (class value, legend text) per panel; fraudulent first, then non-fraudulent
for panel_index, (class_value, legend_text) in enumerate(
    [(1, "Fraudulent Transactions"), (0, "Non-Fraudulent Transactions")],
    start=1,
):
    plt.subplot(2, 2, panel_index)
    class_rows = data[data.Class == class_value]
    class_rows.Time.hist(bins=35, color='blue', alpha=0.6, label=legend_text)
    plt.legend()

# Draw the pairwise column correlations of the dataset as a heatmap
plt.figure(figsize=(10, 10))
correlation_matrix = data.corr()
sns.heatmap(data=correlation_matrix, cmap="seismic")
plt.title("Feature Correlation Heatmap")
plt.show()

# Elapsed time since the data-separation timestamp (t4)
t5 = time.time()
visualization_seconds = t5 - t4
print("Time consumed for visualization:", visualization_seconds, "seconds")

# ========================
# DATA PREPROCESSING
# ========================
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Scaler used for all three splits; it is fitted on the training split only
scalar = StandardScaler()

# Target is the 'Class' column, features are every remaining column
y = data['Class']
X = data.drop(columns='Class')

# Hold out 30% of the rows as the test set, then take 20% of the remainder
# as the validation set; both splits use the same fixed seed
X_train_v, X_test, y_train_v, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_train, X_validate, y_train, y_validate = train_test_split(
    X_train_v, y_train_v, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training features, then apply the
# same transformation to the training, validation and test features
scalar.fit(X_train)
X_train, X_validate, X_test = (
    scalar.transform(split) for split in (X_train, X_validate, X_test)
)

# Relative class frequencies in the training labels
train_class_counts = y_train.value_counts()
n_train_rows = len(y_train)
w_p = train_class_counts[0] / n_train_rows  # share of class 0 (non-fraudulent)
w_n = train_class_counts[1] / n_train_rows  # share of class 1 (fraudulent)

# Report the class shares and the shape of every split
print(f"Fraudulent transaction weight: {w_n}")
print(f"Non-Fraudulent transaction weight: {w_p}")
print(f"TRAINING: X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"VALIDATION: X_validate: {X_validate.shape}, y_validate: {y_validate.shape}")
print(f"TESTING: X_test: {X_test.shape}, y_test: {y_test.shape}")

# Elapsed time since the visualization timestamp (t5)
t6 = time.time()
print("Time consumed for preprocessing:", t6 - t5, "seconds")

# Import necessary libraries for evaluation metrics
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score

# Function to print evaluation metrics for training and testing
def print_score(label, prediction, train=True):
    """Print accuracy, classification report and confusion matrix for one split.

    Args:
        label: True class labels of the evaluated samples.
        prediction: Predicted class labels, aligned with ``label``.
        train: When truthy the output is headed "Train Result", otherwise
            "Test Result". Only the heading depends on this flag.

    Returns:
        None. The metrics are written to stdout.
    """
    heading = "Train Result" if train else "Test Result"

    # Classification report as a DataFrame so it prints as an aligned table
    clf_report = pd.DataFrame(classification_report(label, prediction, output_dict=True))

    print(f"{heading}:\n================================================")
    print(f"Accuracy Score: {accuracy_score(label, prediction) * 100:.2f}%")
    print("_______________________________________________")
    print(f"Classification Report:\n{clf_report}")
    print("_______________________________________________")
    # Compare against the labels that were passed in, not a module-level
    # variable, so the matrix always matches the predictions being scored.
    print(f"Confusion Matrix: \n {confusion_matrix(label, prediction)}\n")

# Record the time elapsed since the preprocessing timestamp (t6)
t7 = time.time()
print("Time consumed:", t7 - t6, "sec")

# Importing TensorFlow's Keras library for building the ANN model
from tensorflow import keras

# Assemble the ANN layer by layer: three 256-unit ReLU stages, each followed
# by batch normalization and 30% dropout, then a single sigmoid output unit.
n_features = X_train.shape[-1]

# First stage also declares the input width (number of feature columns)
ann_layers = [
    keras.layers.Dense(256, activation='relu', input_shape=(n_features,)),
    keras.layers.BatchNormalization(),
    keras.layers.Dropout(0.3),
]

# Two further hidden stages with the same structure
for _ in range(2):
    ann_layers.extend([
        keras.layers.Dense(256, activation='relu'),
        keras.layers.BatchNormalization(),
        keras.layers.Dropout(0.3),
    ])

# Output layer for binary classification
ann_layers.append(keras.layers.Dense(1, activation='sigmoid'))

model = keras.Sequential(ann_layers)

# Print the layer-by-layer summary of the model
model.summary()

# Record the time elapsed while building the model
t8 = time.time()
print("Time consumed:", t8 - t7, "sec")

# Define evaluation metrics for the model
METRICS = [
    keras.metrics.FalseNegatives(name='fn'),
    keras.metrics.FalsePositives(name='fp'),
    keras.metrics.TrueNegatives(name='tn'),
    keras.metrics.TruePositives(name='tp'),
    keras.metrics.Precision(name='precision'),
    keras.metrics.Recall(name='recall')
]

# Compile the model with Adam optimizer and binary cross-entropy loss
model.compile(optimizer=keras.optimizers.Adam(1e-4), loss='binary_crossentropy', metrics=METRICS)

# Define callback to save the model at each epoch
callbacks = [keras.callbacks.ModelCheckpoint('fraud_model_at_epoch_{epoch}.keras')]

# Class weights to handle class imbalance: each class is weighted inversely to
# its frequency in the training labels (n_samples / (n_classes * class_count)),
# so the rarer class contributes more per sample to the loss.
# NOTE(review): assumes the labels in y_train are integer-valued class ids
# (0/1) - confirm against the dataset.
train_label_counts = y_train.value_counts()
class_weight = {
    int(class_id): len(y_train) / (len(train_label_counts) * class_count)
    for class_id, class_count in train_label_counts.items()
}

# Train the model with the training data
r = model.fit(
    X_train, y_train,
    validation_data=(X_validate, y_validate),  # Validation data
    batch_size=2048,  # Batch size
    epochs=300,  # Number of epochs
    class_weight=class_weight,  # Actually apply the imbalance weights
    callbacks=callbacks,  # Model checkpoint callback
)

# Measure and print time taken for training
t9 = time.time()
print("Time consumed:", t9 - t8, "sec")

# Evaluate the model on test data and print the returned loss/metric values
score = model.evaluate(X_test, y_test)
print(score)

# Plot training metrics over epochs, training curve vs. validation curve
plt.figure(figsize=(12, 16))

# One row per panel: history key, training label, validation label, title
history_panels = [
    ('loss', 'Loss', 'val_Loss', 'Loss Function evolution during training'),
    ('fn', 'fn', 'val_fn', 'False Negatives evolution during training'),
    ('precision', 'Precision', 'val_Precision', 'Precision evolution during training'),
    ('recall', 'Recall', 'val_Recall', 'Recall evolution during training'),
]

for panel_index, (history_key, train_label, val_label, panel_title) in enumerate(
    history_panels, start=1
):
    plt.subplot(4, 2, panel_index)
    plt.plot(r.history[history_key], label=train_label)
    # The validation series is stored under the same key prefixed with "val_"
    plt.plot(r.history[f'val_{history_key}'], label=val_label)
    plt.title(panel_title)
    plt.legend()

# Show the figure explicitly, as done for the earlier plots; without this the
# training curves are never displayed when the file runs as a plain script.
plt.show()

# Raw model outputs for the training and test features
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Round the outputs to obtain the predicted labels used for scoring
ann_train_labels = y_train_pred.round()
ann_test_labels = y_test_pred.round()

# Print evaluation metrics for training and test data
print_score(y_train, ann_train_labels, train=True)
print_score(y_test, ann_test_labels, train=False)

# Start the model-comparison table with the ANN's F1 scores
scores_dict = {}
scores_dict['ANNs'] = {
    'Train': f1_score(y_train, ann_train_labels),
    'Test': f1_score(y_test, ann_test_labels),
}

# Record the time elapsed since training finished (t9)
t10 = time.time()
print("Time consumed:", t10 - t9, "sec")

# Import Random Forest classifier from sklearn
from sklearn.ensemble import RandomForestClassifier

# Train Random Forest classifier.
# The forest is seeded with the same fixed random_state as the data splits so
# that repeated runs build the same trees and report the same scores.
rf_clf = RandomForestClassifier(n_estimators=100, oob_score=False, random_state=42)
rf_clf.fit(X_train, y_train)

# Generate predictions for training and test data
y_train_pred = rf_clf.predict(X_train)
y_test_pred = rf_clf.predict(X_test)

# Print evaluation metrics for Random Forest model
print_score(y_train, y_train_pred, train=True)
print_score(y_test, y_test_pred, train=False)

# Save F1 scores for comparison
scores_dict['Random Forest'] = {
    'Train': f1_score(y_train, y_train_pred),
    'Test': f1_score(y_test, y_test_pred),
}

# Measure and print time taken for Random Forest training and evaluation
t11 = time.time()
print("Time consumed:", t11 - t10, "sec")

# Import LightGBM classifier
from lightgbm import LGBMClassifier

# Train LightGBM classifier with its default settings
lgbm_clf = LGBMClassifier()
lgbm_clf.fit(X_train, y_train)

# Generate predictions for training and test data
y_train_pred = lgbm_clf.predict(X_train)
y_test_pred = lgbm_clf.predict(X_test)

# Print evaluation metrics for LightGBM model
print_score(y_train, y_train_pred, train=True)
print_score(y_test, y_test_pred, train=False)

# Save F1 scores for comparison (key spelled "LightGBM"; it was previously
# misspelled, which would carry over into any table or plot of scores_dict)
scores_dict['LightGBM'] = {
    'Train': f1_score(y_train, y_train_pred),
    'Test': f1_score(y_test, y_test_pred),
}

# Measure and print the time for this section and for the whole script
t12 = time.time()
print("Time consumed:", t12 - t11, "sec")
print("Total time consumed:", t12 - t0, "sec")
