In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
import time
import random

In [2]:
# Folder holding the ieee-fraud-detection CSV files. The default is the folder
# this notebook was originally run from; set the IEEE_FRAUD_DATA_DIR environment
# variable to point at another copy instead of editing this cell.
DATA_DIR = os.environ.get(
    "IEEE_FRAUD_DATA_DIR",
    "/Users/judepereira/Downloads/ieee-fraud-detection",
)
df = pd.read_csv(os.path.join(DATA_DIR, "train_transaction.csv"))

# Derive an integer "day" index from TransactionDT.
# NOTE(review): dividing by 3600 * 24 presumes TransactionDT is in seconds -- confirm.
df["day"] = (df["TransactionDT"] // (3600 * 24)).astype(int)

# Drop the raw timestamp (now summarised by "day") and TransactionID, which is
# not useful for modeling. A new frame is bound rather than mutating in place.
df = df.drop(columns=["TransactionDT", "TransactionID"])

In [3]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch,
    and stores the seed (as a string) in the ``PYTHONHASHSEED`` environment
    variable.

    Args:
        seed: Seed value handed unchanged to each generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # The three seeders are independent of one another, so apply them in turn.
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed):
        apply_seed(seed)

In [4]:
# Single seed shared by the notebook; applied here before the preprocessing
# and modeling cells below run.
SEED = 42
seed_everything(SEED)

In [5]:
# Percentage of missing values in every column
nulls = df.isna().mean().mul(100)

# Columns in which at least 80% of the values are missing
cols_80 = nulls.loc[nulls.ge(80)].index.tolist()

# Keep only the remaining, better-populated columns
df = df.drop(columns=cols_80)

In [6]:
from sklearn.impute import SimpleImputer

# card2, card3, card5, addr1 and addr2 are handled as categoricals (they are
# appended to the categorical list in the encoding cell), so they are kept out
# of the median imputation below.
cat_cols = ["card2", "card3", "card5", "addr1", "addr2"]

# Every numeric column except the target "isFraud", which must not be imputed.
num_cols = list(df.select_dtypes(include=[np.number]).columns)
num_cols.remove("isFraud")

# Numeric, non-categorical columns that actually contain missing values.
nan_cols = [
    col for col in num_cols
    if col not in cat_cols and df[col].isna().any()
]

# Fill the gaps with each column's median.
# NOTE(review): the imputer is fit on the whole frame, i.e. before the
# train/CV split made later in the notebook, so the medians also reflect the
# held-out rows -- confirm this is acceptable.
imputer = SimpleImputer(strategy="median")
df[nan_cols] = imputer.fit_transform(df[nan_cols])

In [7]:
# Categorical handling: one-hot encode the low-cardinality columns (e.g.
# "ProductCD") and drop the high-cardinality ones. Candidates are every
# object-dtype column plus the numerically coded categoricals in cat_cols that
# were excluded from imputation earlier.
cat_cols_rem = df.select_dtypes(include=["object"]).columns.tolist()
cat_cols_rem.extend(cat_cols)

# A column is encoded only if it has at most this many distinct non-null
# values; anything above is dropped.
MAX_ONE_HOT_CARDINALITY = 10

# Build every dummy block from the untouched frame first, then concatenate
# once. This gives the same columns in the same order as encoding column by
# column, without re-copying the whole frame on every iteration or mutating
# df while looping over its columns. With drop_first=True the first level of
# each column is the implicit reference level.
dummy_frames = [
    pd.get_dummies(df[c], prefix=c, drop_first=True)
    for c in cat_cols_rem
    if df[c].nunique() <= MAX_ONE_HOT_CARDINALITY
]
df = pd.concat([df.drop(columns=cat_cols_rem)] + dummy_frames, axis=1)

In [8]:
# Preview the first rows after the categorical encoding step
df.head()

Unnamed: 0,isFraud,TransactionAmt,card1,dist1,C1,C2,C3,C4,C5,C6,...,M1_T,M2_T,M3_T,M4_M1,M4_M2,M5_T,M6_T,M7_T,M8_T,M9_T
0,0,68.5,13926,19.0,1.0,1.0,0.0,0.0,0.0,1.0,...,True,True,True,False,True,False,True,False,False,False
1,0,29.0,2755,8.0,1.0,1.0,0.0,0.0,0.0,1.0,...,False,False,False,False,False,True,True,False,False,False
2,0,59.0,4663,287.0,1.0,1.0,0.0,0.0,0.0,1.0,...,True,True,True,False,False,False,False,False,False,False
3,0,50.0,18132,8.0,2.0,5.0,0.0,0.0,0.0,4.0,...,False,False,False,False,False,True,False,False,False,False
4,0,50.0,4497,8.0,1.0,1.0,0.0,0.0,0.0,1.0,...,False,False,False,False,False,False,False,False,False,False


In [9]:
# Names of every boolean column currently in df
bool_cols = df.select_dtypes(include="bool").columns

# Re-type those columns as integers so that True becomes 1 and False becomes 0
df = df.astype({col: int for col in bool_cols})

In [10]:
# Preview again after the bool -> int cast
df.head()

Unnamed: 0,isFraud,TransactionAmt,card1,dist1,C1,C2,C3,C4,C5,C6,...,M1_T,M2_T,M3_T,M4_M1,M4_M2,M5_T,M6_T,M7_T,M8_T,M9_T
0,0,68.5,13926,19.0,1.0,1.0,0.0,0.0,0.0,1.0,...,1,1,1,0,1,0,1,0,0,0
1,0,29.0,2755,8.0,1.0,1.0,0.0,0.0,0.0,1.0,...,0,0,0,0,0,1,1,0,0,0
2,0,59.0,4663,287.0,1.0,1.0,0.0,0.0,0.0,1.0,...,1,1,1,0,0,0,0,0,0,0
3,0,50.0,18132,8.0,2.0,5.0,0.0,0.0,0.0,4.0,...,0,0,0,0,0,1,0,0,0,0
4,0,50.0,4497,8.0,1.0,1.0,0.0,0.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Create training and CV sets.
# The model is trained on non-fraud rows only; fraud rows appear solely in the
# CV set.
# all non-fraud examples
df_norm = df[df["isFraud"] == 0].copy()
# all fraud examples
df_fraud = df[df["isFraud"] == 1].copy()

# hold out 20% of normals for CV; the split uses the notebook-wide SEED rather
# than a second hard-coded literal, so one constant controls all randomness
norm_train, norm_cv = train_test_split(
    df_norm, test_size=0.2, random_state=SEED
)

# CV set = held-out normals + all frauds (normals first, then frauds)
df_cv = pd.concat([norm_cv, df_fraud], axis=0)
y_cv  = df_cv["isFraud"].values

# drop labels for modeling
X_train = norm_train.drop(columns="isFraud")
X_cv    = df_cv.drop(columns="isFraud")

print("→ Training on normals only:", X_train.shape)
print("→ CV set (normals+fraud):", X_cv.shape)

→ Training on normals only: (455901, 338)
→ CV set (normals+fraud): (134639, 338)


In [12]:
import numpy as np
from sklearn.preprocessing import StandardScaler

# Split the training columns in a single pass: a column counts as one-hot when
# the set of its distinct training values is contained in {0, 1}; every other
# column goes to the non-one-hot list.
binary_values = {0, 1}
one_hot_cols = []
non_one_hot_cols = []
for col in X_train.columns:
    if set(X_train[col].unique()).issubset(binary_values):
        one_hot_cols.append(col)
    else:
        non_one_hot_cols.append(col)


In [13]:
# Learn the scaling statistics from the training rows only
scaler = StandardScaler().fit(X_train[non_one_hot_cols])

# Work on copies so the unscaled frames stay available
X_train_scaled, X_cv_scaled = X_train.copy(), X_cv.copy()

# Standardise the non-one-hot columns of both sets with the training statistics;
# the 0/1 columns are left untouched
X_train_scaled[non_one_hot_cols] = scaler.transform(X_train[non_one_hot_cols])
X_cv_scaled[non_one_hot_cols] = scaler.transform(X_cv[non_one_hot_cols])


In [14]:
# Plain NumPy matrices of the scaled features, ready for tensor conversion
X_train_final = X_train_scaled.to_numpy()
X_cv_final = X_cv_scaled.to_numpy()

In [15]:
# Copy the NumPy arrays into float32 PyTorch tensors (features and CV labels)
x_train = torch.tensor(X_train_final, dtype=torch.float32)
x_valid = torch.tensor(X_cv_final, dtype=torch.float32)
y_valid = torch.tensor(y_cv, dtype=torch.float32)

In [16]:
# An autoencoder learns to reproduce its input, so each sample's target is the
# sample itself

class FraudDatasetUnsupervised(Dataset):
    """Dataset over an indexable collection of samples for reconstruction tasks.

    Args:
        x: Indexable, sized container of samples (anything supporting
            ``len()`` and ``[]``).
        output: When True (default) each item is the pair ``(sample, sample)``,
            i.e. input and target; when False only the sample is returned.
    """

    def __init__(self, x, output=True):
        self.x = x
        self.output = output

    def __len__(self):
        """Number of samples in the dataset."""
        return len(self.x)

    def __getitem__(self, index):
        """Return the sample at ``index``, paired with itself when ``output`` is set."""
        sample = self.x[index]
        return (sample, sample) if self.output else sample

In [17]:
# Wrap both tensors as datasets; with the default output=True every item is an
# (input, target) pair whose two elements are the same sample
training_set = FraudDatasetUnsupervised(x_train)
valid_set = FraudDatasetUnsupervised(x_valid)

In [None]:
# Build PyTorch loaders
BATCH_SIZE = 64

# Training batches are shuffled; the validation loader keeps the dataset order
# (shuffle=False) so per-sample results stay aligned with y_cv / y_valid
train_loader = DataLoader(training_set, batch_size=BATCH_SIZE, shuffle=True)
val_loader   = DataLoader(valid_set,   batch_size=BATCH_SIZE, shuffle=False)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class DropAutoencoder(nn.Module):
    """Fully connected autoencoder with dropout after each hidden layer.

    Layer widths run ``input_size -> intermediate_size_1 ->
    intermediate_size_2 -> code_size`` in the encoder and back in reverse in
    the decoder.

    Args:
        input_size: Number of input (and reconstructed output) features.
        intermediate_size_1: Width of the outer hidden layers (fc1 / fc5).
        intermediate_size_2: Width of the inner hidden layers (fc2 / fc4).
        code_size: Width of the bottleneck code.
        dropout_rate: Dropout probability shared by all dropout applications.
    """

    def __init__(self, input_size, intermediate_size_1, intermediate_size_2, code_size, dropout_rate=0.2):
        super().__init__()

        # Encoder: input -> hidden 1 -> hidden 2 -> code
        self.fc1 = nn.Linear(input_size, intermediate_size_1)
        self.fc2 = nn.Linear(intermediate_size_1, intermediate_size_2)
        self.fc3 = nn.Linear(intermediate_size_2, code_size)

        # Decoder: code -> hidden 2 -> hidden 1 -> input
        self.fc4 = nn.Linear(code_size, intermediate_size_2)
        self.fc5 = nn.Linear(intermediate_size_2, intermediate_size_1)
        self.fc6 = nn.Linear(intermediate_size_1, input_size)

        # One dropout module reused after every hidden activation
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        """Encode ``x`` to the bottleneck code and return its reconstruction."""
        # Encoder hidden layers: linear -> ReLU -> dropout
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = self.dropout(F.relu(layer(hidden)))

        # Bottleneck code: ReLU but no dropout
        code = F.relu(self.fc3(hidden))

        # Decoder hidden layers: linear -> ReLU -> dropout
        hidden = code
        for layer in (self.fc4, self.fc5):
            hidden = self.dropout(F.relu(layer(hidden)))

        # Output layer is left linear (no activation)
        return self.fc6(hidden)


In [21]:
# Loss used for training and validation: mean squared error with the default
# reduction, i.e. one scalar per batch
criterion = torch.nn.MSELoss()

In [22]:
def per_sample_mse(model, generator):
    """Compute the mean squared reconstruction error of every sample.

    The model is switched to evaluation mode (and left in it) and run without
    gradient tracking, so no autograd graph is built during scoring.

    Args:
        model: Module mapping a batch ``x_batch`` to a prediction with the
            same shape as the matching ``y_batch``.
        generator: Iterable yielding ``(x_batch, y_batch)`` pairs of tensors
            with at least two dimensions (the error is averaged over dim 1).

    Returns:
        A list with one NumPy scalar per sample, in iteration order: the
        squared error of that sample averaged over dim 1.
    """
    model.eval()
    criterion = torch.nn.MSELoss(reduction="none")
    batch_losses = []

    with torch.no_grad():
        for x_batch, y_batch in generator:
            # Forward pass
            y_pred = model(x_batch)
            # Element-wise squared error. The prediction is deliberately NOT
            # squeezed: squeezing a (N, 1) prediction to (N,) would broadcast
            # against the (N, 1) target into an (N, N) matrix and yield wrong
            # per-sample errors.
            loss = criterion(y_pred, y_batch)
            batch_losses.extend(torch.mean(loss, dim=1).cpu().numpy())

    return batch_losses

In [23]:
# Re-seed right before construction so the initial weights are reproducible.
seed_everything(SEED)
# The class defined above is DropAutoencoder; the previous name used here
# (DropoutAutoencoder) is not defined anywhere and raised NameError on a fresh
# kernel. Layer widths: input -> 128 -> 64 -> 16 (code) and back.
model = DropAutoencoder(x_train.shape[1], 128, 64, 16, dropout_rate=0.2)
# Per-sample reconstruction errors of the still-untrained model, as a baseline
losses = per_sample_mse(model, val_loader)

In [24]:
# Baseline before training: first five per-sample errors and the overall mean
print(losses[0:5])
print(np.mean(losses))

[np.float32(0.10466372), np.float32(3.11748), np.float32(0.12473496), np.float32(0.24563716), np.float32(0.9639551)]
2.0011084


In [25]:
def evaluate_model(model,generator,criterion):
    """Return the average batch loss of ``model`` over ``generator``.

    The model is switched to evaluation mode (and left in it) and run without
    gradient tracking.

    Args:
        model: Module mapping a batch ``x_batch`` to a prediction with the
            same shape as the matching ``y_batch``.
        generator: Iterable yielding ``(x_batch, y_batch)`` pairs.
        criterion: Callable ``criterion(prediction, target)`` returning a
            scalar tensor.

    Returns:
        The unweighted mean of the per-batch losses (every batch counts
        equally, whatever its size).
    """
    model.eval()
    batch_losses = []
    with torch.no_grad():
        for x_batch, y_batch in generator:
            # Forward pass
            y_pred = model(x_batch)
            # The prediction is compared as-is: squeezing it could change its
            # shape (e.g. (N, 1) -> (N,)) and make the loss broadcast against
            # the target, silently producing a wrong value.
            loss = criterion(y_pred, y_batch)
            batch_losses.append(loss.item())
    mean_loss = np.mean(batch_losses)
    return mean_loss

In [26]:
class EarlyStopping:
    """Track a score that should decrease and signal when training should stop.

    Args:
        patience: Number of consecutive non-improving calls that are still
            tolerated; training continues while the count is ``<= patience``.
        verbose: When True, print a message on every call.

    Attributes set here: ``counter`` (calls since the best score) and
    ``best_score`` (lowest score seen so far, initially infinity).
    """

    def __init__(self, patience=3, verbose=False):
        self.patience, self.verbose = patience, verbose
        self.counter = 0
        self.best_score = np.inf

    def continue_training(self,current_score):
        """Record ``current_score`` and report whether training should go on.

        A score strictly below the best one resets the counter; any other
        score (including a tie) increments it.

        Returns:
            True while the counter does not exceed ``patience``, else False.
        """
        improved = current_score < self.best_score
        if improved:
            self.best_score = current_score
            self.counter = 0
        else:
            self.counter += 1

        if self.verbose:
            if improved:
                print("New best score:", current_score)
            else:
                print(self.counter, " iterations since best score.")

        return self.counter <= self.patience

In [27]:
def training_loop(model,training_generator,valid_generator,optimizer,criterion,max_epochs=100,apply_early_stopping=True,patience=3,verbose=False):
    """Train ``model`` epoch by epoch, validating after every epoch.

    Args:
        model: Module to train; its prediction must have the same shape as the
            target batch.
        training_generator: Iterable of ``(x_batch, y_batch)`` training pairs.
        valid_generator: Iterable of ``(x_batch, y_batch)`` validation pairs,
            passed to ``evaluate_model``.
        optimizer: Optimizer already bound to the model's parameters.
        criterion: Callable ``criterion(prediction, target)`` returning a
            scalar loss tensor.
        max_epochs: Upper bound on the number of epochs.
        apply_early_stopping: When True, stop as soon as ``EarlyStopping``
            reports that training should not continue.
        patience: Patience handed to ``EarlyStopping``.
        verbose: When True, print the losses of every epoch.

    Returns:
        Tuple ``(model, training_execution_time, all_train_losses,
        all_valid_losses)``: the trained model, the elapsed wall-clock time
        from ``time.time()``, and the per-epoch mean training and validation
        losses.
    """
    # Setting the model in training mode
    model.train()

    if apply_early_stopping:
        early_stopping = EarlyStopping(verbose=verbose,patience=patience)

    all_train_losses = []
    all_valid_losses = []

    # Training loop
    start_time=time.time()
    for epoch in range(max_epochs):
        # evaluate_model() leaves the model in eval mode, so re-enable
        # training mode at the start of every epoch
        model.train()
        train_loss=[]
        for x_batch, y_batch in training_generator:
            optimizer.zero_grad()
            # Forward pass
            y_pred = model(x_batch)
            # The prediction is compared as-is: squeezing it could change its
            # shape (e.g. (N, 1) -> (N,)) and make the loss broadcast against
            # the target, silently training on a wrong objective.
            loss = criterion(y_pred, y_batch)
            # Backward pass
            loss.backward()
            optimizer.step()
            train_loss.append(loss.item())

        # mean training loss of the epoch (unweighted over batches)
        epoch_train_loss = np.mean(train_loss)
        all_train_losses.append(epoch_train_loss)
        if verbose:
            print('')
            print('Epoch {}: train loss: {}'.format(epoch, epoch_train_loss))
        # evaluating the model on the validation set after each epoch
        valid_loss = evaluate_model(model,valid_generator,criterion)
        all_valid_losses.append(valid_loss)
        if verbose:
            print('valid loss: {}'.format(valid_loss))
        if apply_early_stopping and not early_stopping.continue_training(valid_loss):
            if verbose:
                print("Early stopping")
            break

    training_execution_time=time.time()-start_time
    return model,training_execution_time,all_train_losses,all_valid_losses

In [28]:
# Adam over all model parameters with a learning rate of 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr = 0.0001)

In [29]:
# Train with the training_loop defaults (max_epochs=100, early stopping with
# patience=3) and print the losses of every epoch.
# NOTE(review): val_loader wraps the CV set, which contains the fraud rows, so
# early stopping is monitored on normals + frauds -- confirm this is intended.
model,training_execution_time,train_losses,valid_losses = training_loop(model,train_loader,val_loader,optimizer,criterion,verbose=True)


Epoch 0: train loss: 0.6331581974392657
valid loss: 1.2728861896923174
New best score: 1.2728861896923174

Epoch 1: train loss: 0.4862475634815568
valid loss: 1.1259676744743559
New best score: 1.1259676744743559

Epoch 2: train loss: 0.43702311404593236
valid loss: 1.0216602943448116
New best score: 1.0216602943448116

Epoch 3: train loss: 0.41130547048518795
valid loss: 0.9358362278798347
New best score: 0.9358362278798347

Epoch 4: train loss: 0.39362815840421833
valid loss: 0.8720011000163825
New best score: 0.8720011000163825

Epoch 5: train loss: 0.37965981093230605
valid loss: 0.8317107914950514
New best score: 0.8317107914950514

Epoch 6: train loss: 0.3662534802001658
valid loss: 0.8018558256172296
New best score: 0.8018558256172296

Epoch 7: train loss: 0.3542225208731681
valid loss: 0.7583249913202775
New best score: 0.7583249913202775

Epoch 8: train loss: 0.3467012395060494
valid loss: 0.746712317991251
New best score: 0.746712317991251

Epoch 9: train loss: 0.34104405609

In [30]:
# Recompute the per-sample reconstruction errors, now with the trained model,
# and show the first five plus the overall mean
losses = per_sample_mse(model, val_loader)
print(losses[0:5])
print(np.mean(losses))

[np.float32(0.0058943215), np.float32(0.9047979), np.float32(0.01538955), np.float32(0.035126124), np.float32(0.35696024)]
0.647031


In [31]:
# Split the per-sample reconstruction errors by true label (0 = genuine, 1 = fraud)
losses_arr = np.array(losses)
labels_arr = y_valid.numpy()
genuine_losses = losses_arr[labels_arr == 0]
fraud_losses = losses_arr[labels_arr == 1]

print("Average fraud reconstruction error:", np.mean(fraud_losses))
print("Average genuine reconstruction error:", np.mean(genuine_losses))

Average fraud reconstruction error: 2.977164
Average genuine reconstruction error: 0.22459523


In [32]:
# evaluation
from sklearn.metrics import (average_precision_score, roc_auc_score)

# Use each sample's reconstruction error as its predicted fraud score and
# compute AUC-ROC and Average Precision on the validation set
AUC_ROC, AP = (
    metric(y_cv, losses) for metric in (roc_auc_score, average_precision_score)
)

# One-row summary table of both metrics
performances = pd.DataFrame({'AUC ROC': [AUC_ROC], 'Average precision': [AP]})

In [33]:
# Display the metrics table
performances

Unnamed: 0,AUC ROC,Average precision
0,0.743221,0.477684


In [34]:
from sklearn.metrics import recall_score

# Threshold at the 70th percentile of the reconstruction errors, so the 30% of
# validation samples with the largest error are flagged as fraud
thr = np.percentile(losses, 70)
# losses is a plain list; convert it explicitly before the element-wise comparison
y_pred = (np.asarray(losses) >= thr).astype(int)
# Binary-class recall: share of true frauds that were flagged
recall = recall_score(y_cv, y_pred)
print(f"Recall at threshold {thr:.4f} = {recall:.4f}")

Recall at threshold 0.1756 = 0.6274
