In [None]:
!pip install faker tensorflow

import numpy as np
import pandas as pd
from faker import Faker
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import random

# Shared Faker instance; used below to produce random timestamps and
# credit-card numbers for the transaction records.
fake = Faker()

# Generate a small dataset to train the GAN
def create_real_data(num_samples):
    """Generate ``num_samples`` fake card-transaction records.

    Each record is laid out as ``[masked_card_number, card_type, timestamp,
    amount, transaction_type, location]``; later code selects fields by
    these column positions.

    Args:
        num_samples: Number of records to generate.

    Returns:
        A 2-D ``object`` array of shape ``(num_samples, 6)``.
    """
    # Choice lists are loop-invariant, so build them once.
    card_types = ['visa', 'mastercard', 'amex']
    locations = ['New York', 'Los Angeles', 'San Francisco', 'Houston', 'Chicago']
    transaction_types = ['Purchase', 'Refund', 'Cash Withdrawal']

    records = []
    for _ in range(num_samples):
        # All random draws go through the stdlib ``random`` module so that a
        # single ``random.seed(...)`` controls them (Faker keeps its own RNG).
        amount = round(random.uniform(10.0, 2000.0), 2)
        card_type = random.choice(card_types)
        timestamp = fake.date_time_this_year()
        location = random.choice(locations)
        transaction_type = random.choice(transaction_types)
        card_number = fake.credit_card_number(card_type=card_type)
        # Keep only the first and last four digits of the card number.
        masked_card_number = f"{card_number[:4]} ******** {card_number[-4:]}"
        records.append([masked_card_number, card_type, timestamp, amount, transaction_type, location])

    # dtype=object keeps every field in its native Python type instead of
    # relying on NumPy's type inference; the reshape keeps the result 2-D
    # (0 x 6) even when no records were requested.
    return np.array(records, dtype=object).reshape(-1, 6)

# Build the "real" training set: 1000 Faker-generated transaction records
# that the GAN is trained to imitate.
real_data = create_real_data(1000)  # 1000 samples for training

# Preprocess the data
# Category lists in alphabetical order. ``pd.get_dummies`` orders its columns
# this way for plain string input, so fixing the order here keeps the feature
# layout identical while making it independent of which values occur.
_CARD_TYPE_CATEGORIES = ['amex', 'mastercard', 'visa']
_LOCATION_CATEGORIES = ['Chicago', 'Houston', 'Los Angeles', 'New York', 'San Francisco']
_TRANSACTION_TYPE_CATEGORIES = ['Cash Withdrawal', 'Purchase', 'Refund']


def _one_hot_drop_first(values, categories):
    """One-hot encode ``values`` against ``categories``, dropping the first.

    The first category becomes the all-zeros baseline row. Raises
    ``ValueError`` if ``values`` contains something not in ``categories``.
    """
    unknown = set(values) - set(categories)
    if unknown:
        raise ValueError(f"Unexpected categories: {sorted(map(str, unknown))}")
    categorical = pd.Categorical(values, categories=categories)
    # A Categorical yields one column per declared category, observed or not.
    return pd.get_dummies(categorical, drop_first=True, dtype=np.float32).to_numpy()


def preprocess_data(data):
    """Turn raw transaction records into a numeric feature matrix.

    Args:
        data: 2-D array whose columns are ``[masked_card_number, card_type,
            timestamp, amount, transaction_type, location]``.

    Returns:
        A ``float32`` array of shape ``(len(data), 9)`` with columns
        ``[amount, mastercard, visa, Houston, Los Angeles, New York,
        San Francisco, Purchase, Refund]``. The baseline categories
        (amex, Chicago, Cash Withdrawal) are encoded as all zeros in their
        group. The amount is passed through unscaled.

    Raises:
        ValueError: If a card type, location or transaction type is not one
            of the known categories.
    """
    amounts = data[:, 3].astype(np.float32).reshape(-1, 1)

    card_type_encoded = _one_hot_drop_first(data[:, 1], _CARD_TYPE_CATEGORIES)
    location_encoded = _one_hot_drop_first(data[:, 5], _LOCATION_CATEGORIES)
    transaction_type_encoded = _one_hot_drop_first(data[:, 4], _TRANSACTION_TYPE_CATEGORIES)

    # 1 + 2 + 4 + 2 = 9 columns by construction, regardless of which
    # categories happen to be present in ``data``.
    return np.hstack((amounts, card_type_encoded, location_encoded, transaction_type_encoded))

# Numeric feature matrix (one row per real transaction) used as the
# discriminator's "real" examples during training.
processed_real_data = preprocess_data(real_data)

# Create the generator model
def build_generator(latent_dim=10, output_dim=9):
    """Build the generator: noise vector -> synthetic feature row.

    Args:
        latent_dim: Size of the input noise vector. The default of 10 matches
            the noise shape used elsewhere in this file.
        output_dim: Number of output features. The default of 9 matches the
            processed data's column count.

    Returns:
        An uncompiled ``keras.Sequential`` model with one 128-unit ReLU hidden
        layer and a linear output layer.
    """
    return keras.Sequential([
        # Declare the input with keras.Input rather than the deprecated
        # ``input_dim=`` layer argument.
        keras.Input(shape=(latent_dim,)),
        layers.Dense(128, activation='relu'),
        # Linear output: the amount column is unbounded.
        layers.Dense(output_dim, activation='linear'),
    ])

# Create the discriminator model
def build_discriminator(input_dim=9):
    """Build the discriminator: feature row -> probability of being real.

    Args:
        input_dim: Number of input features. The default of 9 matches the
            processed data's column count.

    Returns:
        An uncompiled ``keras.Sequential`` model with one 128-unit ReLU hidden
        layer and a single sigmoid output unit.
    """
    return keras.Sequential([
        # Declare the input with keras.Input rather than the deprecated
        # ``input_shape=`` layer argument.
        keras.Input(shape=(input_dim,)),
        layers.Dense(128, activation='relu'),
        layers.Dense(1, activation='sigmoid'),
    ])

# Build both networks. Only the discriminator is compiled here, with binary
# cross-entropy and an accuracy metric, because it is trained directly via
# train_on_batch; the generator is trained only through the combined GAN model.
generator = build_generator()
discriminator = build_discriminator()
discriminator.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Create the GAN model
def build_gan(generator, discriminator):
    """Stack generator and discriminator into one compiled model.

    Side effect: sets ``discriminator.trainable = False`` on the object
    passed in before the stacked model is compiled.

    Args:
        generator: Model mapping noise to feature rows.
        discriminator: Model mapping feature rows to a real/fake score.

    Returns:
        A ``keras.Sequential`` model (generator followed by discriminator)
        compiled with binary cross-entropy and the Adam optimizer.
    """
    discriminator.trainable = False

    stacked = keras.Sequential()
    stacked.add(generator)
    stacked.add(discriminator)
    stacked.compile(optimizer='adam', loss='binary_crossentropy')
    return stacked

# Combined generator -> discriminator model used for the generator updates.
# Note: build_gan also marks `discriminator` as non-trainable.
gan = build_gan(generator, discriminator)

# Training the GAN
def train_gan(epochs, batch_size, log_interval=100):
    """Alternate discriminator and generator updates.

    Uses the module-level ``processed_real_data``, ``generator``,
    ``discriminator`` and ``gan`` objects. Each "epoch" here is a single
    training iteration on one randomly drawn batch, not a full pass over
    the data.

    Args:
        epochs: Number of training iterations to run.
        batch_size: Number of real (and of generated) samples per iteration.
        log_interval: Print losses every ``log_interval`` iterations. The
            final iteration is always logged so short runs still report.

    Raises:
        ValueError: If ``log_interval`` is not positive.
    """
    if log_interval <= 0:
        raise ValueError("log_interval must be a positive integer")

    noise_dim = 10  # must match the generator's input size

    # Label arrays do not change between iterations, so build them once.
    real_labels = np.ones((batch_size, 1))
    fake_labels = np.zeros((batch_size, 1))

    for epoch in range(epochs):
        # --- Discriminator step: one real batch, one generated batch ---
        idx = np.random.randint(0, processed_real_data.shape[0], batch_size)
        real_samples = processed_real_data[idx]

        noise = np.random.normal(0, 1, (batch_size, noise_dim))
        # verbose=0 suppresses the per-call progress bar inside the loop.
        fake_samples = generator.predict(noise, verbose=0)

        d_loss_real = discriminator.train_on_batch(real_samples, real_labels)
        d_loss_fake = discriminator.train_on_batch(fake_samples, fake_labels)
        # Element-wise mean of [loss, accuracy] over the two batches.
        d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

        # --- Generator step: labelled "real" so the generator is rewarded
        # for fooling the discriminator ---
        noise = np.random.normal(0, 1, (batch_size, noise_dim))
        g_result = gan.train_on_batch(noise, real_labels)
        # train_on_batch may return a scalar or a list whose first entry is
        # the loss; normalise to a plain float for formatting.
        g_loss = float(np.ravel(g_result)[0])

        step = epoch + 1
        if step % log_interval == 0 or step == epochs:
            print(f"{step}/{epochs} [D loss: {d_loss[0]:.4f}, acc.: {100 * d_loss[1]:.2f}%] [G loss: {g_loss:.4f}]")

# Train the GAN for 50 iterations (train_gan calls them epochs; each one
# processes a single random batch of 32 real and 32 generated samples)
train_gan(epochs=50, batch_size=32)

# Generate synthetic data using the trained generator
def _decode_dummies(scores, categories):
    """Map drop-first dummy scores back to category names.

    ``categories`` lists every category in encoding order, with the dropped
    baseline first. A row decodes to the baseline when none of its scores
    reaches 0.5 (the all-zeros vector is then the nearest valid encoding);
    otherwise it decodes to the category with the highest score.
    """
    scores = np.asarray(scores)
    best = np.argmax(scores, axis=1)
    is_baseline = scores.max(axis=1) < 0.5
    indices = np.where(is_baseline, 0, best + 1)
    return [categories[i] for i in indices]


def generate_synthetic_data(num_samples):
    """Sample the module-level ``generator`` and decode rows into transactions.

    The generator's 9 outputs are read as ``[amount, 2 card-type dummies,
    4 location dummies, 2 transaction-type dummies]``. Categorical fields are
    decoded to match ``preprocess_data``, which uses
    ``pd.get_dummies(..., drop_first=True)``: columns follow alphabetical
    category order and the first category is the all-zeros baseline.

    Args:
        num_samples: Number of synthetic transactions to produce.

    Returns:
        An array with one row per sample and columns ``[transaction_id,
        card_number, card_type, timestamp, amount, transaction_type,
        location]``. Card numbers and timestamps are drawn at random rather
        than produced by the generator.
    """
    noise = np.random.normal(0, 1, (num_samples, 10))
    generated_samples = generator.predict(noise, verbose=0)

    amounts = generated_samples[:, 0]

    # Category names in the encoder's (alphabetical) order, baseline first.
    card_type_names = ['amex', 'mastercard', 'visa']
    location_names = ['Chicago', 'Houston', 'Los Angeles', 'New York', 'San Francisco']
    transaction_type_names = ['Cash Withdrawal', 'Purchase', 'Refund']

    card_type_output = _decode_dummies(generated_samples[:, 1:3], card_type_names)
    location_output = _decode_dummies(generated_samples[:, 3:7], location_names)
    transaction_type_output = _decode_dummies(generated_samples[:, 7:], transaction_type_names)

    transaction_ids = np.arange(1, num_samples + 1)
    card_numbers = [f"{random.randint(1000, 9999)} ******** {random.randint(1000, 9999)}" for _ in range(num_samples)]
    timestamps = [fake.date_time_this_year() for _ in range(num_samples)]

    return np.column_stack((transaction_ids, card_numbers, card_type_output, timestamps, amounts, transaction_type_output, location_output))

# Draw 500 synthetic transactions from the trained generator
synthetic_data = generate_synthetic_data(500)

# Wrap the rows in a labelled DataFrame and preview the first few
synthetic_columns = [
    'Transaction ID',
    'Card Number',
    'Card Type',
    'Transaction Date & Time',
    'Amount',
    'Transaction Type',
    'Location',
]
synthetic_df = pd.DataFrame(synthetic_data, columns=synthetic_columns)
print(synthetic_df.head())

# Persist the synthetic transactions to CSV (without the index column)
synthetic_df.to_csv('synthetic_transaction_data_gan.csv', index=False)

Collecting faker
  Downloading Faker-30.4.0-py3-none-any.whl.metadata (15 kB)
Downloading Faker-30.4.0-py3-none-any.whl (1.8 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/1.8 MB[0m [31m5.0 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━[0m [32m1.4/1.8 MB[0m [31m20.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-30.4.0
