In [1]:
import pandas as pd
import numpy as np
from random import choices
from faker import Faker

# Instantiate a Faker object for generating random data
fake = Faker()

# Set a seed for reproducibility
np.random.seed(0)

# Specify the number of transactions to generate
num_transactions = 100000

# Generate transaction IDs
transaction_ids = range(1, num_transactions + 1)

# Generate account IDs
account_ids = ['12345678'] * num_transactions  # Assuming it's the same account

# Generate transaction datetimes over the past year
date_range = pd.date_range(end='today', periods=num_transactions, freq='1min')
datetimes = date_range.to_pydatetime().tolist()

# Generate transaction amounts
amounts = np.random.gamma(4, 200, num_transactions)  # Randomly generate amounts

# Generate merchant names
merchants = [fake.company() for _ in range(num_transactions)]

# Generate transaction types
transaction_types = choices(['POS', 'ATM', 'Online'], k=num_transactions)

# Generate fraud labels (use a small probability for fraud to reflect reality)
fraud_labels = np.random.choice([0, 1], size=num_transactions, p=[0.99, 0.01])

# Create DataFrame
df = pd.DataFrame({
    'TransactionID': transaction_ids,
    'AccountID': account_ids,
    'Datetime': datetimes,
    'Amount': amounts,
    'Merchant': merchants,
    'Type': transaction_types,
    'Fraud': fraud_labels
})


In [2]:
df.head()

Unnamed: 0,TransactionID,AccountID,Datetime,Amount,Merchant,Type,Fraud
0,1,12345678,2023-04-14 11:38:45.274892,1637.608278,"Lawrence, Hamilton and Gonzalez",ATM,0
1,2,12345678,2023-04-14 11:39:45.274892,897.504799,Brooks and Sons,ATM,0
2,3,12345678,2023-04-14 11:40:45.274892,1706.269011,Young LLC,ATM,0
3,4,12345678,2023-04-14 11:41:45.274892,419.125226,Bolton-Mullen,Online,0
4,5,12345678,2023-04-14 11:42:45.274892,694.509544,Fitzgerald LLC,Online,0


In [12]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction import FeatureHasher

# Define features and target
X = df[['Datetime', 'Amount', 'Merchant', 'Type']]
y = df['Fraud']

# Convert 'Datetime' to ordinal
X['Datetime'] = X['Datetime'].apply(lambda x: x.toordinal())

# Initialize the feature hasher
hasher = FeatureHasher(n_features=100, input_type='string')

# Transform the 'Merchant' column
merchant_hashed = hasher.transform([list(x) for x in X['Merchant']])

# Create a DataFrame from the hashed features
merchant_cols = [f'merchant_{i}' for i in range(100)]
merchant_df = pd.DataFrame(merchant_hashed.toarray(), columns=merchant_cols)

# Drop the original 'Merchant' column and concatenate the hashed features
X_encoded = pd.concat([X.drop('Merchant', axis=1), merchant_df], axis=1)

# Create an instance of SMOTE
smote = SMOTE(random_state=42)

# Resample the data
X_res, y_res = smote.fit_resample(X_encoded, y)

# Now you can see the distribution of classes
print(pd.Series(y_res).value_counts())

# Split the data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

# Define the number of input features
n_features = X_train.shape[1]

# Define the encoder
inputs = Input(shape=(n_features,))
encoded = Dense(16, activation='relu')(inputs)
encoded = Dense(8, activation='relu')(encoded)

# Define the decoder
decoded = Dense(16, activation='relu')(encoded)
decoded = Dense(n_features, activation='sigmoid')(decoded)

# Combine the encoder and the decoder into an autoencoder
autoencoder = Model(inputs, decoded)

# Compile the model
autoencoder.compile(optimizer='adam', loss='mean_squared_error')

# Fit the model on non-fraudulent transactions
autoencoder.fit(X_train[y_train == 0], X_train[y_train == 0],
                epochs=100, batch_size=256, shuffle=True,
                validation_split=0.2)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Datetime'] = X['Datetime'].apply(lambda x: x.toordinal())


ValueError: could not convert string to float: 'ATM'