<a href="https://colab.research.google.com/github/dyna478/Paz/blob/main/AR_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Load the raw transactions; the later cells derive their data from this frame.
df2 = pd.read_csv('/content/financial_anomaly_data.csv')

# Inspect the raw Location categories before missing values are relabelled.
# The original cell read them from `df`, which no cell in this notebook
# defines, so they are taken from the freshly loaded frame instead.
unique_location_types = df2['Location'].unique()
print(unique_location_types)

# Give missing categorical values an explicit label so one-hot encoding keeps
# them as their own category.
categorical_columns = ['Location', 'TransactionType']
df2[categorical_columns] = df2[categorical_columns].fillna('Unknown')

# One-hot encode both categorical columns in a single pass.  The earlier
# TransactionType-only encoding was overwritten immediately, so it is dropped.
df_encoded = pd.get_dummies(df2, columns=categorical_columns, prefix=categorical_columns)
print(df_encoded.head())

In [None]:
# Show the per-column missing-value counts, drop every row that still has a
# missing value, then confirm the cleaned frame (`data1`) has none left.
print(df_encoded.isna().sum())
data1 = df_encoded.dropna(axis=0, how='any')
print(data1.isna().sum())

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, Dropout, Conv1DTranspose
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.utils import Sequence
import matplotlib.pyplot as plt

def generate_time_series(num_samples, time_steps, num_features):
    """Return standard-normal noise of shape (num_samples, time_steps, num_features).

    Values are drawn from NumPy's global random state and cast to float32.
    """
    shape = (num_samples, time_steps, num_features)
    return np.random.standard_normal(size=shape).astype(np.float32)

# Dimensions of the synthetic dataset; the autoencoder in this cell is built
# for windows of shape (time_steps, num_features).
num_samples = 10000
time_steps = 20
num_features = 128

# Generate data whose samples match the model's input shape.  The previous
# `data = data1` passed the 2-D DataFrame to a network built for 3-D
# (samples, time_steps, num_features) input, and left generate_time_series
# and the three dimensions above unused.
data = generate_time_series(num_samples, time_steps, num_features)

# The first 8000 samples train the model; the remaining ones validate it.
train_data = data[:8000]
val_data = data[8000:]

class TimeSeriesGenerator(Sequence):
    """Keras ``Sequence`` that serves autoencoder batches from in-memory data.

    Each batch is returned as ``(batch, batch)`` because the autoencoder's
    target is its own input.

    Args:
        data: Sliceable collection of samples (first axis = sample index).
        batch_size: Maximum number of samples per batch.
        **kwargs: Forwarded to the base-class constructor.
    """

    def __init__(self, data, batch_size=300, **kwargs):
        # Keras expects Sequence subclasses to initialise the base class.
        super().__init__(**kwargs)
        self.data = data
        self.batch_size = batch_size

    def __len__(self):
        """Return the number of batches, counting a trailing partial batch."""
        # Ceiling division: floor division silently dropped the last
        # incomplete batch, so those samples were never served.
        return (len(self.data) + self.batch_size - 1) // self.batch_size

    def __getitem__(self, idx):
        """Return batch ``idx`` as an ``(inputs, targets)`` pair."""
        start = idx * self.batch_size
        batch = self.data[start:start + self.batch_size]
        return batch, batch
# Wrap each split in a generator that serves (input, target) batches.
batch_size = 300
train_generator = TimeSeriesGenerator(data=train_data, batch_size=batch_size)
val_generator = TimeSeriesGenerator(data=val_data, batch_size=batch_size)

def build_autoencoder(input_shape):
    """Build a convolutional autoencoder for fixed-length sequences.

    Args:
        input_shape: ``(time_steps, num_features)`` of one input sample.

    Returns:
        An uncompiled ``Sequential`` model.  Every layer uses stride 1 with
        ``padding='same'``, so the output shape equals the input shape.
    """
    # Take the channel count from the argument rather than the module-level
    # ``num_features`` so the output always matches the requested input.
    output_channels = input_shape[-1]
    model = Sequential([
        # Encoder
        Conv1D(32, kernel_size=3, activation='relu', padding='same', input_shape=input_shape),
        Dropout(0.2),
        Conv1D(16, kernel_size=3, activation='relu', padding='same'),

        # Decoder
        Conv1DTranspose(16, kernel_size=3, activation='relu', padding='same'),
        Dropout(0.2),
        Conv1DTranspose(32, kernel_size=3, activation='relu', padding='same'),
        # Output layer has no activation, so reconstructions are unbounded.
        Conv1DTranspose(output_channels, kernel_size=3, padding='same'),
    ])
    return model
# Training configuration.
epochs = 40
steps_per_epoch = len(train_generator)
validation_steps = len(val_generator)
input_shape = (time_steps, num_features)

# Build the model and train it to minimise the mean squared reconstruction error.
autoencoder = build_autoencoder(input_shape)
autoencoder.compile(loss='mse', optimizer='adam')
autoencoder.summary()

# Multiply the learning rate by 0.2 after 5 epochs without val_loss
# improvement (never below 1e-6); stop after 10 such epochs and restore the
# best weights.
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,
    patience=5,
    min_lr=1e-6,
)
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)

history = autoencoder.fit(
    train_generator,
    validation_data=val_generator,
    epochs=epochs,
    steps_per_epoch=steps_per_epoch,
    validation_steps=validation_steps,
    callbacks=[reduce_lr, early_stopping],
)

# Loss curves for the training and validation sets.
plt.figure(figsize=(10, 6))
for history_key, curve_label in (('loss', 'Training Loss'), ('val_loss', 'Validation Loss')):
    plt.plot(history.history[history_key], label=curve_label)
plt.title('Training and Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss (MSE)')
plt.legend()
plt.grid(True)
plt.show()

# Reconstruct the first validation sample and compare its first feature.
sample = val_data[:1]
reconstructed = autoencoder.predict(sample)
plt.figure(figsize=(10, 6))
for series, curve_label in ((sample, 'Original'), (reconstructed, 'Reconstructed')):
    plt.plot(series[0, :, 0], label=curve_label)
plt.title('Original vs Reconstructed Time-Series')
plt.xlabel('Time Steps')
plt.ylabel('Feature Value')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
import pandas as pd
import numpy as np
from tensorflow.keras.utils import Sequence

# Quick look at the cleaned frame: first rows, remaining NaNs, column dtypes.
print(data1.head())
print(data1.isnull().sum())
print(data1.dtypes)

# Model inputs: the transaction amount plus the one-hot encoded columns.
time_series_data = data1[['Amount', 'Location_London', 'Location_Los Angeles', 'Location_New York', 'Location_San Francisco', 'Location_Tokyo', 'TransactionType_Purchase', 'TransactionType_Transfer', 'TransactionType_Unknown', 'TransactionType_Withdrawal']]

# Cast to float32 before leaving pandas.  The dummy columns do not share the
# dtype of 'Amount' (bool or uint8 depending on the pandas version), so a bare
# to_numpy() can produce an object-dtype array that the model cannot consume.
time_series_data = time_series_data.astype('float32').to_numpy()

# Window length and derived dimensions.
time_steps = 20
num_features = time_series_data.shape[1]
num_samples = len(time_series_data) // time_steps

# Drop the trailing rows that do not fill a whole window, then fold the flat
# table into (num_samples, time_steps, num_features).
time_series_data = time_series_data[:num_samples * time_steps]
time_series_data = time_series_data.reshape((num_samples, time_steps, num_features))

print(f"Reshaped data shape: {time_series_data.shape}")

class TimeSeriesGenerator(Sequence):
    """Keras ``Sequence`` serving ``(input, target)`` autoencoder batches.

    Inputs and targets are the same array because the autoencoder learns to
    reproduce its input.  Sample order is reshuffled after every epoch by
    permuting an index array, so the array passed in is never modified.

    Args:
        data: NumPy array whose first axis indexes samples.
        batch_size: Maximum number of samples per batch.
        **kwargs: Forwarded to the base-class constructor.
    """

    def __init__(self, data, batch_size=300, **kwargs):
        # Keras expects Sequence subclasses to initialise the base class.
        super().__init__(**kwargs)
        self.data = data
        self.batch_size = batch_size
        # Sample order for the current epoch; starts in the original order.
        self.indices = np.arange(len(self.data))

    def __len__(self):
        """Return the number of batches, counting a trailing partial batch."""
        return int(np.ceil(len(self.data) / self.batch_size))

    def __getitem__(self, idx):
        """Return batch ``idx`` as an ``(inputs, targets)`` pair."""
        batch_indices = self.indices[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch_x = self.data[batch_indices]
        batch_y = batch_x
        return batch_x, batch_y

    def on_epoch_end(self):
        """Shuffle the sample order at the end of each epoch."""
        # Shuffling the indices instead of ``self.data`` matters because the
        # generator is built on slices of a larger array: an in-place shuffle
        # of such a view would also reorder the caller's array.
        np.random.shuffle(self.indices)

batch_size = 300

# Samples before index 8000 train the model; the rest validate it.
train_data, val_data = time_series_data[:8000], time_series_data[8000:]
train_generator = TimeSeriesGenerator(data=train_data, batch_size=batch_size)
val_generator = TimeSeriesGenerator(data=val_data, batch_size=batch_size)

# Pull the first batch to check the shapes the model will receive.
batch_x, batch_y = train_generator[0]
print(f"Batch shape (inputs): {batch_x.shape}")
print(f"Batch shape (targets): {batch_y.shape}")

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, Conv1DTranspose, Dropout, BatchNormalization, LeakyReLU
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
import matplotlib.pyplot as plt

def build_autoencoder(input_shape):
    """Assemble the convolutional autoencoder as an uncompiled Sequential model.

    Two encoder stages (the second with stride 2) are followed by two
    transposed-convolution decoder stages (the first with stride 2) and a
    final transposed convolution with ``input_shape[-1]`` output channels.
    """
    def _stage(conv_layer):
        # Every stage is: convolution -> batch norm -> LeakyReLU -> dropout.
        return [conv_layer, BatchNormalization(), LeakyReLU(alpha=0.1), Dropout(0.2)]

    layers = []

    # Encoder
    layers += _stage(Conv1D(32, kernel_size=3, strides=1, padding='same', input_shape=input_shape))
    layers += _stage(Conv1D(16, kernel_size=3, strides=2, padding='same'))

    # Decoder
    layers += _stage(Conv1DTranspose(16, kernel_size=3, strides=2, padding='same'))
    layers += _stage(Conv1DTranspose(32, kernel_size=3, strides=1, padding='same'))

    # Reconstruct the input channels (no activation).
    layers.append(Conv1DTranspose(input_shape[-1], kernel_size=3, padding='same'))

    return Sequential(layers)
# Take the input shape from the values computed during data preparation
# instead of hard-coding (20, 10), so the model follows any change to the
# window length or to the selected feature columns.
input_shape = (time_steps, num_features)

autoencoder = build_autoencoder(input_shape)
autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.summary()

# Multiply the learning rate by 0.2 after 5 epochs without val_loss
# improvement (never below 1e-6); stop after 10 such epochs and restore the
# best weights.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=1e-6)
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

epochs = 40
steps_per_epoch = len(train_generator)
validation_steps = len(val_generator)

history = autoencoder.fit(
    train_generator,
    epochs=epochs,
    steps_per_epoch=steps_per_epoch,
    validation_data=val_generator,
    validation_steps=validation_steps,
    callbacks=[reduce_lr, early_stopping]
)

# Loss curves for the training and validation sets.
plt.figure(figsize=(10, 6))
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Training and Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss (MSE)')
plt.legend()
plt.grid(True)
plt.show()

# Reconstruct the first validation sample and compare its first feature.
sample = val_data[:1]  # Take one sample
reconstructed = autoencoder.predict(sample)

plt.figure(figsize=(10, 6))
plt.plot(sample[0, :, 0], label='Original')
plt.plot(reconstructed[0, :, 0], label='Reconstructed')
plt.title('Original vs Reconstructed Time-Series')
plt.xlabel('Time Steps')
plt.ylabel('Feature Value')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.utils import Sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, Conv1DTranspose, Dropout, BatchNormalization, LeakyReLU
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
import matplotlib.pyplot as plt

# Select the model inputs: the transaction amount plus the one-hot columns.
time_series_data = data1.loc[:, [
    'Amount',
    'Location_London', 'Location_Los Angeles', 'Location_New York',
    'Location_San Francisco', 'Location_Tokyo',
    'TransactionType_Purchase', 'TransactionType_Transfer',
    'TransactionType_Unknown', 'TransactionType_Withdrawal',
]]

print("Data types before conversion:", time_series_data.dtypes)

# Convert every column to float32 and replace any remaining NaN with zero
# before handing the values to NumPy.
time_series_data = time_series_data.astype(np.float32)
if time_series_data.isnull().values.any():
    print("Found NaN values. Replacing with zeros.")
    time_series_data = time_series_data.fillna(0)
time_series_data = time_series_data.to_numpy()

# Sanity checks on the raw matrix.
print("Data shape before reshaping:", time_series_data.shape)
print("Data min/max values:", time_series_data.min(), time_series_data.max())
print("Data contains NaN:", np.isnan(time_series_data).any())
print("Data contains Inf:", np.isinf(time_series_data).any())

time_steps = 20
num_features = time_series_data.shape[1]
num_samples = len(time_series_data) // time_steps

# Keep only the rows that fill whole windows, then fold the flat table into
# (num_samples, time_steps, num_features).
time_series_data = (
    time_series_data[:num_samples * time_steps]
    .reshape((num_samples, time_steps, num_features))
)
print(f"Reshaped data shape: {time_series_data.shape}")
class TimeSeriesGenerator(Sequence):
    """Keras ``Sequence`` serving ``(input, target)`` autoencoder batches.

    Inputs and targets are the same array because the autoencoder learns to
    reproduce its input.  Sample order is reshuffled after every epoch.

    Args:
        data: NumPy array whose first axis indexes samples.
        batch_size: Maximum number of samples per batch.
        **kwargs: Forwarded to the base-class constructor.
    """

    def __init__(self, data, batch_size=300, **kwargs):
        # Keras expects Sequence subclasses to initialise the base class.
        super().__init__(**kwargs)
        self.data = data
        self.batch_size = batch_size
        # Sample order for the current epoch; starts in the original order.
        self.indices = np.arange(len(self.data))

    def __len__(self):
        """Return the number of batches, counting a trailing partial batch."""
        return int(np.ceil(len(self.data) / self.batch_size))

    def __getitem__(self, idx):
        """Return batch ``idx`` as an ``(inputs, targets)`` pair."""
        batch_indices = self.indices[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch_x = self.data[batch_indices]
        batch_y = batch_x
        return batch_x, batch_y

    def on_epoch_end(self):
        """Shuffle the sample order at the end of each epoch."""
        # Permute the index array in place rather than rebuilding
        # ``self.data``: only one batch is copied at a time instead of the
        # whole dataset on every epoch.
        np.random.shuffle(self.indices)
# The first 80% of the windows train the model; the remainder validates it.
train_size = int(0.8 * num_samples)
train_data, val_data = np.split(time_series_data, [train_size])

print(f"Training data shape: {train_data.shape}")
print(f"Validation data shape: {val_data.shape}")

# Cap the batch size at the number of training samples.
batch_size = min(len(train_data), 300)
train_generator = TimeSeriesGenerator(data=train_data, batch_size=batch_size)
val_generator = TimeSeriesGenerator(data=val_data, batch_size=batch_size)

# Smoke-test the generator before training: report the first batch, or the
# error it raised, without aborting the cell.
try:
    batch_x, batch_y = train_generator[0]
    print(f"Batch shape (inputs): {batch_x.shape}")
    print(f"Batch shape (targets): {batch_y.shape}")
    print(f"Data type of batch: {batch_x.dtype}")
except Exception as exc:
    print(f"Error in generator: {exc}")

# Step 3: Build the Autoencoder Model
def build_autoencoder(input_shape):
    """Assemble the convolutional autoencoder as an uncompiled Sequential model.

    Four stride-1 stages (two ``Conv1D``, then two ``Conv1DTranspose``) are
    followed by a final transposed convolution with ``input_shape[-1]``
    output channels.
    """
    def _stage(conv_layer):
        # Every stage is: convolution -> batch norm -> LeakyReLU -> dropout.
        return [conv_layer, BatchNormalization(), LeakyReLU(alpha=0.1), Dropout(0.2)]

    layers = []

    # Encoder
    layers += _stage(Conv1D(32, kernel_size=3, strides=1, padding='same', input_shape=input_shape))
    layers += _stage(Conv1D(16, kernel_size=3, strides=1, padding='same'))

    # Decoder
    layers += _stage(Conv1DTranspose(16, kernel_size=3, strides=1, padding='same'))
    layers += _stage(Conv1DTranspose(32, kernel_size=3, strides=1, padding='same'))

    # Reconstruct the input channels (no activation).
    layers.append(Conv1DTranspose(input_shape[-1], kernel_size=3, padding='same'))

    return Sequential(layers)
# Training configuration.
epochs = 40
steps_per_epoch = len(train_generator)
validation_steps = len(val_generator)
input_shape = (time_steps, num_features)
print(f"Model input shape: {input_shape}")

# Build the model and train it to minimise the mean squared reconstruction error.
autoencoder = build_autoencoder(input_shape)
autoencoder.compile(loss='mse', optimizer='adam')
autoencoder.summary()

# Multiply the learning rate by 0.2 after 5 epochs without val_loss
# improvement (never below 1e-6); stop after 10 such epochs and restore the
# best weights.
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,
    patience=5,
    min_lr=1e-6,
)
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)

history = autoencoder.fit(
    train_generator,
    validation_data=val_generator,
    epochs=epochs,
    steps_per_epoch=steps_per_epoch,
    validation_steps=validation_steps,
    callbacks=[reduce_lr, early_stopping],
)

# Loss curves for the training and validation sets.
plt.figure(figsize=(10, 6))
for history_key, curve_label in (('loss', 'Training Loss'), ('val_loss', 'Validation Loss')):
    plt.plot(history.history[history_key], label=curve_label)
plt.title('Training and Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss (MSE)')
plt.legend()
plt.grid(True)
plt.show()

# Reconstruct the first validation sample and compare its first feature.
sample = val_data[:1]
reconstructed = autoencoder.predict(sample)

plt.figure(figsize=(10, 6))
for series, curve_label in ((sample, 'Original'), (reconstructed, 'Reconstructed')):
    plt.plot(series[0, :, 0], label=curve_label)
plt.title('Original vs Reconstructed Time-Series (First Feature)')
plt.xlabel('Time Steps')
plt.ylabel('Feature Value')
plt.legend()
plt.grid(True)
plt.show()

def detect_anomalies(model, data, threshold_multiplier=3.0):
    """Flag samples whose reconstruction error is unusually high.

    Args:
        model: Object with a ``predict`` method returning an array shaped
            like ``data``.
        data: 3-D array; the error is averaged over axes 1 and 2.
        threshold_multiplier: Number of standard deviations above the mean
            error at which a sample counts as anomalous.

    Returns:
        ``(mse, anomalies, threshold)``: per-sample error, boolean mask of
        samples above the threshold, and the threshold itself.
    """
    squared_error = (data - model.predict(data)) ** 2
    scores = squared_error.mean(axis=(1, 2))
    cutoff = scores.mean() + threshold_multiplier * scores.std()
    return scores, scores > cutoff, cutoff
# Score the validation windows and report how many exceed the threshold.
anomaly_scores, anomalies, threshold = detect_anomalies(autoencoder, val_data)

print(f"Number of anomalies detected: {anomalies.sum()}")
print(f"Percentage of anomalies: {anomalies.sum() / len(anomalies) * 100:.2f}%")
print(f"Anomaly threshold: {threshold:.6f}")

# Per-sample reconstruction error with the threshold drawn as a red line.
plt.figure(figsize=(10, 6))
plt.plot(anomaly_scores)
plt.axhline(
    y=threshold,
    color='r',
    linestyle='-',
    label=f'Threshold ({threshold:.6f})',
)
plt.title('Anomaly Scores')
plt.xlabel('Sample Index')
plt.ylabel('Reconstruction Error (MSE)')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, LSTM, RepeatVector, TimeDistributed, Input, Dropout, BatchNormalization
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_fscore_support, roc_curve, auc
import seaborn as sns

def prepare_data(data1, time_steps=20, feature_columns=None):
    """Select, scale and window the feature columns for the LSTM autoencoder.

    Args:
        data1: DataFrame containing the feature columns.
        time_steps: Number of consecutive rows per window.
        feature_columns: Columns to use as model inputs.  Defaults to the
            amount plus the one-hot Location/TransactionType columns.

    Returns:
        ``(shaped_data, scaler, raw_data, time_steps, num_features)`` where
        ``shaped_data`` is a float32 array of shape
        ``(num_samples, time_steps, num_features)``, ``scaler`` is the fitted
        ``StandardScaler`` and ``raw_data`` is the unscaled feature frame.

    Raises:
        ValueError: If ``time_steps`` is smaller than 1.
    """
    if time_steps < 1:
        raise ValueError(f"time_steps must be at least 1, got {time_steps}")
    if feature_columns is None:
        feature_columns = ['Amount', 'Location_London', 'Location_Los Angeles',
                           'Location_New York', 'Location_San Francisco', 'Location_Tokyo',
                           'TransactionType_Purchase', 'TransactionType_Transfer',
                           'TransactionType_Unknown', 'TransactionType_Withdrawal']

    time_series_data = data1[list(feature_columns)].fillna(0)
    raw_data = time_series_data.copy()

    # NOTE(review): the scaler is fitted on every row, including the rows the
    # caller later holds out for validation.
    scaler = StandardScaler()
    time_series_data_scaled = scaler.fit_transform(time_series_data).astype(np.float32)

    num_features = time_series_data_scaled.shape[1]
    num_samples = len(time_series_data_scaled) // time_steps

    # Drop the trailing rows that do not fill a whole window before reshaping.
    time_series_data_scaled = time_series_data_scaled[:num_samples * time_steps]
    shaped_data = time_series_data_scaled.reshape((num_samples, time_steps, num_features))

    print(f"Data shape after reshaping: {shaped_data.shape}")

    return shaped_data, scaler, raw_data, time_steps, num_features
def build_lstm_autoencoder(input_shape, encoding_dim=8):
    """Build and compile an LSTM autoencoder.

    Args:
        input_shape: ``(time_steps, num_features)`` of one input window.
        encoding_dim: Width of the bottleneck ``Dense`` layer.

    Returns:
        A ``Model`` compiled with Adam (learning rate 0.001) and MSE loss.
    """
    sequence_length, feature_count = input_shape[0], input_shape[1]

    def _lstm_stage(tensor, units, return_sequences):
        # One stage: LSTM -> dropout -> batch normalisation.
        tensor = LSTM(units, activation='relu', return_sequences=return_sequences)(tensor)
        tensor = Dropout(0.2)(tensor)
        return BatchNormalization()(tensor)

    inputs = Input(shape=input_shape)

    # Encoder: the second LSTM returns only its final state.
    hidden = _lstm_stage(inputs, 64, True)
    hidden = _lstm_stage(hidden, 32, False)
    encoded = Dense(encoding_dim, activation='relu')(hidden)

    # Decoder: expand the code and repeat it once per time step.
    hidden = Dense(32, activation='relu')(encoded)
    hidden = RepeatVector(sequence_length)(hidden)
    hidden = _lstm_stage(hidden, 32, True)
    hidden = _lstm_stage(hidden, 64, True)
    outputs = TimeDistributed(Dense(feature_count))(hidden)

    autoencoder = Model(inputs=inputs, outputs=outputs)
    autoencoder.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
    return autoencoder

def train_model(autoencoder, train_data, val_data, epochs=100, batch_size=64):
    """Fit the autoencoder to reproduce its input and return the fit history.

    Training halves the learning rate after 5 epochs without ``val_loss``
    improvement (never below 1e-6), stops after 15 such epochs while restoring
    the best weights, and saves the best model to ``best_autoencoder.h5``.
    """
    callbacks = [
        ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6, verbose=1),
        EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True, verbose=1),
        ModelCheckpoint('best_autoencoder.h5', monitor='val_loss', save_best_only=True, verbose=0),
    ]
    # Inputs double as targets for both the training and validation sets.
    return autoencoder.fit(
        train_data,
        train_data,
        validation_data=(val_data, val_data),
        epochs=epochs,
        batch_size=batch_size,
        shuffle=True,
        callbacks=callbacks,
        verbose=1,
    )

def detect_anomalies(model, data, threshold_percentile=95):
    """Flag samples whose reconstruction error exceeds a percentile cut-off.

    Args:
        model: Object with a ``predict`` method returning an array shaped
            like ``data``.
        data: 3-D array; the error is averaged over axes 1 and 2.
        threshold_percentile: Percentile of the per-sample errors used as the
            anomaly threshold.

    Returns:
        ``(mse, anomalies, threshold, reconstructions)``.
    """
    reconstructions = model.predict(data)
    squared_error = (data - reconstructions) ** 2
    scores = squared_error.mean(axis=(1, 2))
    cutoff = np.percentile(scores, threshold_percentile)
    return scores, scores > cutoff, cutoff, reconstructions

def plot_training_history(history):
    """Draw the training and validation loss curves from a Keras history."""
    curves = (('loss', 'Training Loss'), ('val_loss', 'Validation Loss'))

    plt.figure(figsize=(12, 6))
    for key, label in curves:
        plt.plot(history.history[key], label=label)
    plt.title('Model Loss During Training')
    plt.xlabel('Epoch')
    plt.ylabel('Loss (MSE)')
    plt.legend()
    plt.grid(True)
    plt.show()

def plot_reconstructions(original, reconstructed, sample_idx=0, feature_idx=0):
    """Overlay one feature of one sample with its reconstruction.

    Both arrays are indexed as ``[sample_idx, :, feature_idx]``; the original
    is drawn in blue and the reconstruction in red.
    """
    series = (
        (original, 'b-', 'Original'),
        (reconstructed, 'r-', 'Reconstructed'),
    )

    plt.figure(figsize=(12, 6))
    for values, line_format, label in series:
        plt.plot(values[sample_idx, :, feature_idx], line_format, label=label, linewidth=2)
    plt.title(f'Original vs Reconstructed Time-Series (Sample {sample_idx}, Feature {feature_idx})')
    plt.xlabel('Time Steps')
    plt.ylabel('Normalized Feature Value')
    plt.legend()
    plt.grid(True)
    plt.show()

def plot_anomaly_scores(mse, threshold, anomalies=None):
    """Plot per-sample reconstruction error on a log scale with the threshold.

    Args:
        mse: Per-sample reconstruction errors.
        threshold: Error level drawn as a horizontal red line.
        anomalies: Optional boolean mask; flagged samples are overlaid as red
            markers.
    """
    plt.figure(figsize=(14, 7))
    plt.plot(mse, 'b-', alpha=0.6, label='Reconstruction Error')

    if anomalies is not None:
        flagged_positions = np.where(anomalies)[0]
        flagged_errors = mse[anomalies]
        plt.scatter(flagged_positions, flagged_errors,
                    color='red', alpha=0.7, s=50, label='Anomalies')

    threshold_label = f'Threshold ({threshold:.4f})'
    plt.axhline(y=threshold, color='r', linestyle='-', label=threshold_label)

    plt.title('Anomaly Scores')
    plt.xlabel('Sample Index')
    plt.ylabel('Reconstruction Error (MSE)')
    plt.yscale('log')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

def plot_feature_importance(original, reconstructed, feature_names=None):
    """Plot each feature's mean squared reconstruction error as a bar chart.

    Args:
        original: Array whose last axis indexes features.
        reconstructed: Reconstruction with the same shape as ``original``.
        feature_names: Optional labels, one per feature.  When omitted, the
            default transaction feature names are used if their count matches
            the data; otherwise generic ``Feature i`` labels are used.

    Raises:
        ValueError: If ``feature_names`` is given but its length does not
            match the number of features.
    """
    # Average over the first two axes, leaving one error per feature.
    feature_mse = np.mean(np.square(original - reconstructed), axis=(0, 1))
    num_features = len(feature_mse)

    if feature_names is None:
        default_names = ['Amount', 'London', 'Los Angeles', 'New York',
                         'San Francisco', 'Tokyo', 'Purchase', 'Transfer',
                         'Unknown', 'Withdrawal']
        # The defaults only describe the 10-column layout; for any other
        # width, fall back to generic labels instead of raising IndexError or
        # mislabelling bars.
        if len(default_names) == num_features:
            feature_names = default_names
        else:
            feature_names = [f'Feature {i}' for i in range(num_features)]
    elif len(feature_names) != num_features:
        raise ValueError(
            f"Expected {num_features} feature names, got {len(feature_names)}"
        )

    # Largest error first.
    sorted_idx = np.argsort(feature_mse)[::-1]

    # Plot
    plt.figure(figsize=(10, 6))
    plt.bar(range(num_features), feature_mse[sorted_idx])
    plt.xticks(range(num_features), [feature_names[i] for i in sorted_idx], rotation=45)
    plt.title('Feature Contribution to Reconstruction Error')
    plt.xlabel('Features')
    plt.ylabel('Mean Squared Error')
    plt.tight_layout()
    plt.show()

def run_fraud_detection(data1):
    """Run the full pipeline: prepare data, train, score and plot.

    Returns:
        ``(autoencoder, scaler, mse, anomalies, threshold)`` where the scores
        and the anomaly mask refer to the validation windows.
    """
    print("Preparing data...")
    shaped_data, scaler, _, time_steps, num_features = prepare_data(data1)

    # The first 80% of the windows train the model; the rest validate it.
    split_at = int(0.8 * shaped_data.shape[0])
    train_data, val_data = shaped_data[:split_at], shaped_data[split_at:]
    print(f"Training data shape: {train_data.shape}")
    print(f"Validation data shape: {val_data.shape}")

    print("Building LSTM autoencoder model...")
    autoencoder = build_lstm_autoencoder((time_steps, num_features))
    autoencoder.summary()

    print("Training model...")
    history = train_model(autoencoder, train_data, val_data)

    print("Evaluating model...")
    plot_training_history(history)
    mse, anomalies, threshold, reconstructions = detect_anomalies(autoencoder, val_data)
    plot_reconstructions(val_data, reconstructions)
    plot_anomaly_scores(mse, threshold, anomalies)

    flagged_count = np.sum(anomalies)
    anomaly_percentage = np.mean(anomalies) * 100
    print(f"Number of anomalies detected: {flagged_count} out of {len(anomalies)}")
    print(f"Percentage of anomalies: {anomaly_percentage:.2f}%")
    print(f"Anomaly threshold: {threshold:.6f}")

    plot_feature_importance(val_data, reconstructions)
    return autoencoder, scaler, mse, anomalies, threshold

def predict_anomalies(model, scaler, new_data, threshold, time_steps=20):
    """Score new rows with a trained autoencoder.

    Args:
        model: Object with a ``predict`` method.
        scaler: Fitted scaler with a ``transform`` method.
        new_data: Rows to score; trailing rows that do not fill a whole
            window of ``time_steps`` rows are ignored.
        threshold: Reconstruction error above which a window is anomalous.
        time_steps: Number of rows per window.

    Returns:
        A DataFrame with one row per window and the columns
        ``reconstruction_error`` and ``is_anomaly``, or ``None`` (after
        printing a message) when there are too few rows for one window.
    """
    scaled_data = scaler.transform(new_data)
    num_samples = len(scaled_data) // time_steps

    # Guard: nothing to score without at least one complete window.
    if num_samples <= 0:
        print("Not enough data points for a complete sequence")
        return None

    usable_rows = scaled_data[:num_samples * time_steps]
    shaped_data = usable_rows.reshape((num_samples, time_steps, usable_rows.shape[1]))

    reconstructions = model.predict(shaped_data)
    mse = np.mean(np.square(shaped_data - reconstructions), axis=(1, 2))

    return pd.DataFrame({
        'reconstruction_error': mse,
        'is_anomaly': mse > threshold
    })
