In [None]:
import numpy as np
import pandas as pd

# Location of the pre-processed fraud CSV on the mounted Google Drive
file_path = "/content/drive/MyDrive/Colab Notebooks/processed_fraud_data.csv"

# Widen pandas' text output so the preview below is not truncated:
# no column limit, 1000-character lines, full cell contents.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.width', 1000),
    ('display.max_colwidth', None),
):
    pd.set_option(option_name, option_value)

# Read the dataset and preview its first five rows
df = pd.read_csv(file_path)
print(df.head())



   Transaction_Amount  Transaction_Type  Account_Balance  Device_Type  IP_Address_Flag  Previous_Fraudulent_Activity  Daily_Transaction_Count  Avg_Transaction_Amount_7d  Failed_Transaction_Count_7d  Card_Type  Transaction_Distance  Authentication_Method  Is_Weekend  Fraud_Label  Year  Month  Day  Hour  Minute  Second
0               39.79                 3         93213.17            0                0                             0                        7                     437.63                            3          0                883.17                      0           0            0  2023      8   14    19      30       0
1                1.19                 1         75725.25            1                0                             0                       13                     478.76                            4          2               2203.36                      3           0            1  2023      6    7     4       1       0
2               28.96                 2    

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Ensure Fraud_Label is numeric (unparsable entries become NaN)
df["Fraud_Label"] = pd.to_numeric(df["Fraud_Label"], errors='coerce')

# Fraud_Label is NOT the last column of the CSV (Year ... Second come after
# it), so slicing with iloc[:, :-1] used to keep the label as a feature and
# silently drop 'Second'. Move the label to the end so that positional slicing
# here -- and df.columns[:-1], which is used later to name the feature columns
# -- covers exactly the features. The operation is idempotent on re-run.
feature_columns = [col for col in df.columns if col != "Fraud_Label"]
df = df[feature_columns + ["Fraud_Label"]]

# Labels as a NumPy array so they can be used directly as boolean masks
y = df["Fraud_Label"].values

# Features: every column except the (now last) label column
X = df.iloc[:, :-1].values

# Scale every feature to [-1, 1], the same range the generator's tanh output
# layer produces
scaler = MinMaxScaler(feature_range=(-1, 1))
X_scaled = scaler.fit_transform(X.astype(float))

# Split the scaled rows by class
X_majority = X_scaled[y == 0]  # Non-fraud
X_minority = X_scaled[y == 1]  # Fraud

# Sanity check: the GAN is trained on X_minority, so it must not be empty
print(f"Fraud cases in dataset: {np.sum(y == 1)}")
print(f"Size of X_minority: {X_minority.shape[0]}")
if X_minority.shape[0] == 0:
    raise ValueError("Error: No fraud samples found in X_minority. Please check dataset.")


Fraud cases in dataset: 16067
Size of X_minority: 16067


In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, LeakyReLU
from tensorflow.keras.models import Model

latent_dim = 10  # Size of noise vector

# Seed Python, NumPy and TensorFlow in one call so that weight initialisation,
# the noise draws and the mini-batch sampling are reproducible under
# Restart & Run All.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# Generator Model
def build_generator():
    """Build the generator network.

    The network takes a noise vector of length `latent_dim`, passes it through
    Dense(16) and Dense(32) layers, each followed by LeakyReLU(0.2), and ends
    in a tanh layer with one unit per column of `X`.

    Returns:
        An uncompiled Keras `Model` mapping noise to one synthetic feature row.
    """
    noise_in = Input(shape=(latent_dim,))

    hidden = noise_in
    for units in (16, 32):
        hidden = Dense(units)(hidden)
        hidden = LeakyReLU(alpha=0.2)(hidden)

    # tanh keeps every output in (-1, 1)
    sample_out = Dense(X.shape[1], activation='tanh')(hidden)
    return Model(noise_in, sample_out)

# Instantiate the generator and print its layer/parameter summary
generator = build_generator()
generator.summary()



In [None]:
# Discriminator Model
def build_discriminator():
    """Build and compile the discriminator network.

    The network takes one feature row (one value per column of `X`), passes it
    through Dense(32) and Dense(16) layers, each followed by LeakyReLU(0.2),
    and ends in a single sigmoid unit.

    Returns:
        A Keras `Model` compiled with binary cross-entropy loss, the Adam
        optimizer and an accuracy metric.
    """
    row_in = Input(shape=(X.shape[1],))

    hidden = row_in
    for units in (32, 16):
        hidden = Dense(units)(hidden)
        hidden = LeakyReLU(alpha=0.2)(hidden)

    real_probability = Dense(1, activation='sigmoid')(hidden)

    critic = Model(row_in, real_probability)
    critic.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return critic

# Instantiate the (already compiled) discriminator and print its summary
discriminator = build_discriminator()
discriminator.summary()

In [None]:
# Assemble the stacked model: noise -> generator -> discriminator.
# The discriminator is flagged non-trainable BEFORE `gan` is compiled; it was
# itself compiled earlier, inside build_discriminator(), while still trainable.
# NOTE(review): whether the standalone discriminator keeps learning after this
# flag is set depends on how the installed Keras version treats `trainable`
# after compile() -- confirm against the training loop.
discriminator.trainable = False  # Freeze discriminator while training GAN

# Symbolic pipeline: a noise vector goes in, the discriminator's verdict on the
# generated sample comes out.
gan_input = Input(shape=(latent_dim,))
generated_sample = generator(gan_input)
gan_output = discriminator(generated_sample)

# The combined model is compiled with binary cross-entropy and Adam; no extra
# metrics are requested.
gan = Model(gan_input, gan_output)
gan.compile(loss='binary_crossentropy', optimizer='adam')

gan.summary()

In [None]:
# Sanity check: X_minority should hold one row per fraud label in y
fraud_count = np.count_nonzero(y == 1)
print(f"X_minority shape: {X_minority.shape}")
print(f"Number of fraud samples: {fraud_count}")


X_minority shape: (16067, 19)
Number of fraud samples: 16067


In [None]:
import numpy as np

def train_gan(epochs=5000, batch_size=32, log_every=1000):
    """Alternately train the discriminator and the generator on fraud rows.

    Each iteration draws one batch of generated rows and one batch of real
    fraud rows from `X_minority`, updates the discriminator on both, then
    updates the generator through the stacked `gan` model so that its output
    is scored as real.

    Args:
        epochs: Number of training iterations (one batch each).
        batch_size: Rows per real batch and per generated batch.
        log_every: Print the losses every this many iterations; 0 or None
            disables logging.

    Uses the notebook globals `generator`, `discriminator`, `gan`,
    `X_minority` and `latent_dim`.
    """
    # The targets never change, so build them once
    real_labels = np.ones((batch_size, 1))
    fake_labels = np.zeros((batch_size, 1))

    for epoch in range(epochs):
        # Generate fake samples. predict_on_batch avoids the per-call setup
        # cost of predict(), which is significant inside a tight loop.
        noise = np.random.normal(0, 1, (batch_size, latent_dim))
        generated_data = generator.predict_on_batch(noise)

        # Select random real fraud samples (with replacement)
        idx = np.random.randint(0, X_minority.shape[0], batch_size)
        real_data = X_minority[idx]

        # Train Discriminator.
        # The GAN-assembly cell leaves discriminator.trainable = False. Newer
        # Keras releases read that flag when the training step runs instead of
        # freezing it at compile() time, so without re-enabling it here the
        # discriminator would never be updated.
        discriminator.trainable = True
        d_loss_real = discriminator.train_on_batch(real_data, real_labels)
        d_loss_fake = discriminator.train_on_batch(generated_data, fake_labels)
        d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

        # Train Generator: freeze the discriminator again so that only the
        # generator's weights move while it tries to get its samples scored
        # as real.
        discriminator.trainable = False
        noise = np.random.normal(0, 1, (batch_size, latent_dim))
        g_loss = gan.train_on_batch(noise, real_labels)

        # d_loss is [loss, accuracy] (the discriminator has an accuracy
        # metric); g_loss is a single loss value.
        if log_every and (epoch + 1) % log_every == 0:
            print(f"Epoch {epoch + 1}, D Loss: {d_loss[0]:.4f}, G Loss: {g_loss:.4f}")

# Run the training loop with its default settings
train_gan()


Epoch 1000, D Loss: 1.3687, G Loss: 0.1256
Epoch 2000, D Loss: 1.3697, G Loss: 0.1251
Epoch 3000, D Loss: 1.3706, G Loss: 0.1247
Epoch 4000, D Loss: 1.3712, G Loss: 0.1244
Epoch 5000, D Loss: 1.3717, G Loss: 0.1242


In [None]:
# Number of synthetic fraud rows needed to bring the minority class up to the
# size of the majority class. Derived from the data instead of hard-coded so
# the result stays balanced if the dataset changes.
num_samples = max(X_majority.shape[0] - X_minority.shape[0], 0)

noise = np.random.normal(0, 1, (num_samples, latent_dim))
if num_samples > 0:
    synthetic_samples = generator.predict(noise)
else:
    # Already balanced: keep an empty array with the right number of columns
    # so the following vstack still works.
    synthetic_samples = np.empty((0, X_minority.shape[1]))

[1m532/532[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step


In [None]:
# Combine original and synthetic data (both are still in the scaled [-1, 1]
# feature space at this point)
X_balanced = np.vstack((X_scaled, synthetic_samples))

# Every synthetic row is labelled as fraud. Build those labels in y's dtype:
# np.ones defaults to float64, which would promote integer labels to 0.0/1.0
# in the saved CSV.
synthetic_labels = np.ones(len(synthetic_samples), dtype=y.dtype)
y_balanced = np.hstack((y, synthetic_labels))

# Convert to DataFrame
balanced_df = pd.DataFrame(X_balanced, columns=df.columns[:-1])
balanced_df["Fraud_Label"] = y_balanced

# Save the dataset
balanced_df.to_csv("balanced_sdggan.csv", index=False)

print("Balanced dataset saved successfully!")

Balanced dataset saved successfully!


In [None]:
# Colab-only: hand the scaled, balanced CSV to the browser as a download
from google.colab import files
files.download("balanced_sdggan.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Step 1: Take a copy of the original (unscaled) dataset; its per-column
# min/max define the mapping that has to be inverted.
original_df = df.copy()

# Step 2: Work out which columns to restore. EVERY feature column was scaled
# to [-1, 1] before the GAN was trained, so every one of them must be mapped
# back -- restoring only the continuous ones leaves the categorical codes in
# [-1, 1]. The leading columns of balanced_df are the columns of X_balanced;
# the label column is never denormalized.
x_columns = list(balanced_df.columns[:X_balanced.shape[1]])
feature_columns = [col for col in x_columns if col != "Fraud_Label"]
feature_positions = [x_columns.index(col) for col in feature_columns]

# Step 3: Refit a MinMaxScaler on the original data. MinMaxScaler works column
# by column, so fitting on these columns reproduces the min/max values used
# for normalization. A separate name keeps the original `scaler` intact.
denorm_scaler = MinMaxScaler(feature_range=(-1, 1))
denorm_scaler.fit(original_df[feature_columns].to_numpy(dtype=float))

# Step 4: Invert from X_balanced, which is still scaled, rather than from
# balanced_df itself. Re-running this cell therefore gives the same result
# instead of applying the inverse transform twice.
balanced_df[feature_columns] = denorm_scaler.inverse_transform(
    X_balanced[:, feature_positions]
)

# Step 5: Columns that hold integers in the original data (codes, counts,
# date parts) are snapped back to whole numbers; generated rows would
# otherwise carry fractional values. The cast is skipped if a column
# contains NaN.
for col in feature_columns:
    if pd.api.types.is_integer_dtype(original_df[col]):
        rounded = balanced_df[col].round()
        if not rounded.isna().any():
            rounded = rounded.astype(original_df[col].dtype)
        balanced_df[col] = rounded

# Step 6: Save the denormalized dataset
balanced_df.to_csv("denormalized_balanced_sdggan.csv", index=False)

print("✅ Denormalization complete! File saved as 'denormalized_balanced_sdggan.csv'.")



✅ Denormalization complete! File saved as 'denormalized_balanced_sdggan.csv'.


In [None]:
# Colab-only helper for browser downloads
from google.colab import files

# Hand the denormalized CSV written by the previous cell to the browser
files.download("denormalized_balanced_sdggan.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>