In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
import time
import random

In [2]:
df = pd.read_csv("/Users/judepereira/Downloads/ieee-fraud-detection/train_transaction.csv")

# derive “day” from TransactionDT, then drop the raw column
df["day"] = (df["TransactionDT"] // (3600 * 24)).astype(int)
df.drop("TransactionDT", axis=1, inplace=True)

# drop TransactionID, as it is not useful for modeling
df.drop("TransactionID", axis=1, inplace=True)

In [3]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

In [4]:
SEED = 42
seed_everything(SEED)

In [5]:
# compute missing % for all columns
nulls = df.isna().mean() * 100

# find columns with more than 80% missing values
cols_80 = nulls[nulls >= 80].index.tolist()

# and drop them!
df.drop(columns=cols_80, inplace=True)

In [6]:
from sklearn.impute import SimpleImputer

# numeric imputation (median) – exclude the target “isFraud”
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
num_cols.remove("isFraud")

# among num_cols, find columns with nans that need to be imputed
nan_cols = [c for c in num_cols if df[c].isna().any()]

# exclude the categorical columns card1, card2, card3, card5, addr1, addr2
cat_cols = ["card1", "card2", "card3", "card5", "addr1", "addr2"]
nan_cols = [c for c in nan_cols if c not in cat_cols]

imputer = SimpleImputer(strategy="median")
df[nan_cols] = imputer.fit_transform(df[nan_cols])

In [7]:
# for remaining categoricals, one‐hot encode small‐cardinaliy ones else drop them
cat_cols_rem = df.select_dtypes(include=["object"]).columns.tolist()

# include the cat_cols that were excluded earlier
cat_cols_rem.extend(cat_cols)

# extract high cardinality categorical columns
high_card_cols = [c for c in cat_cols_rem if df[c].nunique() > 10]

# e.g. “ProductCD”, “MISSING” placeholders, etc.
for c in cat_cols_rem:
    n_uniq = df[c].nunique()
    print(f"Column: {c}, Unique values: {n_uniq}")
    if n_uniq <= 10:
        dummies = pd.get_dummies(df[c], prefix=c, drop_first=True)
        df = pd.concat([df.drop(c, axis=1), dummies], axis=1)

Column: ProductCD, Unique values: 5
Column: card4, Unique values: 4
Column: card6, Unique values: 4
Column: P_emaildomain, Unique values: 59
Column: R_emaildomain, Unique values: 60
Column: M1, Unique values: 2
Column: M2, Unique values: 2
Column: M3, Unique values: 2
Column: M4, Unique values: 3
Column: M5, Unique values: 2
Column: M6, Unique values: 2
Column: M7, Unique values: 2
Column: M8, Unique values: 2
Column: M9, Unique values: 2
Column: card1, Unique values: 13553
Column: card2, Unique values: 500
Column: card3, Unique values: 114
Column: card5, Unique values: 119
Column: addr1, Unique values: 332
Column: addr2, Unique values: 74


In [8]:
print(f"High cardinality categorical columns: {high_card_cols}")

High cardinality categorical columns: ['P_emaildomain', 'R_emaildomain', 'card1', 'card2', 'card3', 'card5', 'addr1', 'addr2']


In [9]:
# find all bool columns in df
bool_cols = df.select_dtypes(include="bool").columns

# cast them to int (True→1, False→0)
df[bool_cols] = df[bool_cols].astype(int)

In [10]:
df.head()

Unnamed: 0,isFraud,TransactionAmt,card1,card2,card3,card5,addr1,addr2,dist1,P_emaildomain,...,M1_T,M2_T,M3_T,M4_M1,M4_M2,M5_T,M6_T,M7_T,M8_T,M9_T
0,0,68.5,13926,,150.0,142.0,315.0,87.0,19.0,,...,1,1,1,0,1,0,1,0,0,0
1,0,29.0,2755,404.0,150.0,102.0,325.0,87.0,8.0,gmail.com,...,0,0,0,0,0,1,1,0,0,0
2,0,59.0,4663,490.0,150.0,166.0,330.0,87.0,287.0,outlook.com,...,1,1,1,0,0,0,0,0,0,0
3,0,50.0,18132,567.0,150.0,117.0,476.0,87.0,8.0,yahoo.com,...,0,0,0,0,0,1,0,0,0,0
4,0,50.0,4497,514.0,150.0,102.0,420.0,87.0,8.0,gmail.com,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# label encode high cardinality categorical columns
from sklearn.preprocessing import LabelEncoder

# Store encoders for later inference
encoders = {}
for col in high_card_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    encoders[col] = le


In [25]:
# separate all columns from high cardinality categorical columns
other_cols = [c for c in df.columns if c not in high_card_cols]
other_cols.remove("isFraud")

In [16]:
# Create training and CV sets
# all non‐fraud examples
df_norm = df[df.isFraud == 0].copy()
# all fraud examples
df_fraud = df[df.isFraud == 1].copy()

# hold out 20% of normals for CV
norm_train, norm_cv = train_test_split(
    df_norm, test_size=0.2, random_state=42
)

# CV set = held‐out normals + all frauds
df_cv = pd.concat([norm_cv, df_fraud], axis=0)
y_cv  = df_cv["isFraud"].values

# drop labels for modeling
X_train = norm_train.drop("isFraud", axis=1)
X_cv    = df_cv.drop("isFraud", axis=1)

print("→ Training on normals only:", X_train.shape)
print("→ CV set (normals+fraud):", X_cv.shape)

→ Training on normals only: (455901, 345)
→ CV set (normals+fraud): (134639, 345)


In [20]:
import numpy as np
from sklearn.preprocessing import StandardScaler

# Identify one-hot columns (all values are 0 or 1)
one_hot_cols = [col for col in X_train.columns if set(X_train[col].unique()) <= {0, 1}]
non_one_hot_cols = [col for col in X_train.columns if col not in one_hot_cols and col not in high_card_cols]


In [22]:
scaler = StandardScaler()

X_train_scaled = X_train.copy()
X_cv_scaled    = X_cv.copy()

X_train_scaled[non_one_hot_cols] = scaler.fit_transform(X_train[non_one_hot_cols])
X_cv_scaled[non_one_hot_cols]    = scaler.transform(X_cv[non_one_hot_cols])


In [26]:
# Convert to PyTorch tensors
X_num_train = torch.tensor(X_train_scaled[other_cols].values, dtype=torch.float32)
X_cat_train = torch.tensor(X_train_scaled[high_card_cols].values, dtype=torch.long)

X_num_val = torch.tensor(X_cv_scaled[other_cols].values, dtype=torch.float32)
X_cat_val = torch.tensor(X_cv_scaled[high_card_cols].values, dtype=torch.long)

y_valid = torch.FloatTensor(y_cv)

In [29]:
from torch.utils.data import TensorDataset, DataLoader

train_ds = TensorDataset(X_num_train, X_cat_train, X_num_train, X_cat_train)
val_ds   = TensorDataset(X_num_val, X_cat_val, X_num_val, X_cat_val)

# Build Pytorch loaders
BATCH_SIZE = 512

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader   = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)


In [30]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class AutoEncoderWithEmbeddings(nn.Module):
    def __init__(self, num_numeric, cat_cardinalities, hidden1=128, hidden2=64, code_size=8, dropout_rate=0.2):
        super(AutoEncoderWithEmbeddings, self).__init__()
        
        # Create embeddings for each categorical feature
        self.embeddings = nn.ModuleList([
            nn.Embedding(num_categories, min(50, (num_categories + 1)//2))
            for num_categories in cat_cardinalities
        ])
        
        emb_size_total = sum([emb.embedding_dim for emb in self.embeddings])
        total_input_size = num_numeric + emb_size_total
        
        self.dropout = nn.Dropout(dropout_rate)
        
        # Encoder
        self.fc1 = nn.Linear(total_input_size, hidden1)
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.fc3 = nn.Linear(hidden2, code_size)
        
        # Decoder
        self.fc4 = nn.Linear(code_size, hidden2)
        self.fc5 = nn.Linear(hidden2, hidden1)
        self.fc6 = nn.Linear(hidden1, total_input_size)
    
    def forward(self, x_num, x_cat):
        # Embed categorical variables
        embeds = [emb(x_cat[:, i]) for i, emb in enumerate(self.embeddings)]
        embeds = torch.cat(embeds, dim=1)
        
        # Concatenate numerical + embeddings
        x = torch.cat([x_num, embeds], dim=1)

        # Encoder with dropout noise
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        
        x = F.relu(self.fc2(x))
        x = self.dropout(x)
        
        code = F.relu(self.fc3(x))
        
        # Decoder
        x = F.relu(self.fc4(code))
        x = self.dropout(x)
        
        x = F.relu(self.fc5(x))
        x = self.dropout(x)
        
        output = self.fc6(x)  # Linear activation
        return output

In [31]:
criterion = torch.nn.MSELoss()

In [39]:
def per_sample_mse(model, generator):
    model.eval()
    criterion = torch.nn.MSELoss(reduction="none")
    batch_losses = []

    for xb_num, xb_cat, yb_num, yb_cat in generator:
        # Forward pass
        y_pred = model(xb_num, xb_cat)

        # Construct full target (numeric + embeddings)
        true_embeds = [model.embeddings[i](yb_cat[:, i]) for i in range(len(model.embeddings))]
        y_true_full = torch.cat([yb_num] + true_embeds, dim=1)

        # Compute Loss
        loss = criterion(y_pred, y_true_full)
        loss_app = list(torch.mean(loss,axis=1).detach().cpu().numpy())
        batch_losses.extend(loss_app)
    
    return batch_losses


In [40]:
# Number of numeric features (including one-hot encodings)
num_numeric = len(other_cols)

# Cardinalities (unique values) of each categorical feature
cat_cardinalities = [df[col].nunique() for col in high_card_cols]

# Initialize the embedding autoencoder model
seed_everything(SEED)
model = AutoEncoderWithEmbeddings(num_numeric=num_numeric, 
                            cat_cardinalities=cat_cardinalities,
                            hidden1=128, hidden2=64, code_size=8, 
                            dropout_rate=0.2)

losses = per_sample_mse(model, val_loader)


In [41]:
print(losses[0:5])
print(np.mean(losses))

[np.float32(0.56106347), np.float32(2.0109246), np.float32(0.55420184), np.float32(0.6527646), np.float32(0.9973502)]
1.4967284


In [42]:
def evaluate_model(model,generator,criterion):
    model.eval()
    batch_losses = []
    for xb_num, xb_cat, yb_num, yb_cat in generator:
        # Forward pass
        y_pred = model(xb_num, xb_cat)

        # Construct full target (numeric + embeddings)
        true_embeds = [model.embeddings[i](yb_cat[:, i]) for i in range(len(model.embeddings))]
        y_true_full = torch.cat([yb_num] + true_embeds, dim=1)

        # Compute Loss
        loss = criterion(y_pred, y_true_full)
        batch_losses.append(loss.item())
    mean_loss = np.mean(batch_losses)    
    return mean_loss

In [43]:
class EarlyStopping:
    
    def __init__(self, patience=3, verbose=False):
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = np.inf
    
    def continue_training(self,current_score):
        if self.best_score > current_score:
            self.best_score = current_score
            self.counter = 0
            if self.verbose:
                print("New best score:", current_score)
        else:
            self.counter+=1
            if self.verbose:
                print(self.counter, " iterations since best score.")
                
        return self.counter <= self.patience 

In [44]:
def training_loop(model,training_generator,valid_generator,optimizer,criterion,max_epochs=100,apply_early_stopping=True,patience=3,verbose=False):
    #Setting the model in training mode
    model.train()

    if apply_early_stopping:
        early_stopping = EarlyStopping(verbose=verbose,patience=patience)
    
    all_train_losses = []
    all_valid_losses = []
    
    #Training loop
    start_time=time.time()
    for epoch in range(max_epochs):
        model.train()
        train_loss=[]
        for xb_num, xb_cat, yb_num, yb_cat in training_generator:
            optimizer.zero_grad()
            # Forward pass
            y_pred = model(xb_num, xb_cat)

            # Construct full target (numeric + embeddings)
            true_embeds = [model.embeddings[i](yb_cat[:, i]) for i in range(len(model.embeddings))]
            y_true_full = torch.cat([yb_num] + true_embeds, dim=1)

            # Compute Loss
            loss = criterion(y_pred, y_true_full)
            # Backward pass
            loss.backward()
            optimizer.step()   
            train_loss.append(loss.item())
        
        #showing last training loss after each epoch
        all_train_losses.append(np.mean(train_loss))
        if verbose:
            print('')
            print('Epoch {}: train loss: {}'.format(epoch, np.mean(train_loss)))
        #evaluating the model on the test set after each epoch    
        valid_loss = evaluate_model(model,valid_generator,criterion)
        all_valid_losses.append(valid_loss)
        if verbose:
            print('valid loss: {}'.format(valid_loss))
        if apply_early_stopping:
            if not early_stopping.continue_training(valid_loss):
                if verbose:
                    print("Early stopping")
                break
        
    training_execution_time=time.time()-start_time
    return model,training_execution_time,all_train_losses,all_valid_losses

In [45]:
optimizer = torch.optim.Adam(model.parameters(), lr = 0.0001)

In [46]:
model,training_execution_time,train_losses,valid_losses = training_loop(model,train_loader,val_loader,optimizer,criterion,verbose=True)


Epoch 0: train loss: 0.7100744528735661
valid loss: 1.0759438130112202
New best score: 1.0759438130112202

Epoch 1: train loss: 0.5721140254694471
valid loss: 0.9307100716199258
New best score: 0.9307100716199258

Epoch 2: train loss: 0.5084841774837203
valid loss: 0.8701037218362206
New best score: 0.8701037218362206

Epoch 3: train loss: 0.46317482911108854
valid loss: 0.8159573852109365
New best score: 0.8159573852109365

Epoch 4: train loss: 0.4268100562721792
valid loss: 0.7773280569809018
New best score: 0.7773280569809018

Epoch 5: train loss: 0.3986800983297303
valid loss: 0.7464546328941679
New best score: 0.7464546328941679

Epoch 6: train loss: 0.37613097525606254
valid loss: 0.7225671778840257
New best score: 0.7225671778840257

Epoch 7: train loss: 0.3561944396608205
valid loss: 0.7017435869449898
New best score: 0.7017435869449898

Epoch 8: train loss: 0.33799978398313424
valid loss: 0.6796386339365302
New best score: 0.6796386339365302

Epoch 9: train loss: 0.3209940885

In [47]:
losses = per_sample_mse(model, val_loader)
print(losses[0:5])
print(np.mean(losses))

[np.float32(0.0038353705), np.float32(0.53931046), np.float32(0.008435252), np.float32(0.019826617), np.float32(0.29362282)]
0.36923152


In [48]:
genuine_losses = np.array(losses)[y_valid.numpy() == 0]
fraud_losses = np.array(losses)[y_valid.numpy() == 1]
print("Average fraud reconstruction error:", np.mean(fraud_losses))
print("Average genuine reconstruction error:", np.mean(genuine_losses))

Average fraud reconstruction error: 1.7159481
Average genuine reconstruction error: 0.12508184


In [49]:
# evaluation
from sklearn.metrics import (average_precision_score, roc_auc_score)

# compute AUC-ROC and Average Precision on the validation set by considering the reconstruction errors as predicted fraud scores

AUC_ROC = roc_auc_score(y_cv, losses)
AP = average_precision_score(y_cv, losses)
    
performances = pd.DataFrame([[AUC_ROC, AP]], columns=['AUC ROC','Average precision'])

In [50]:
performances

Unnamed: 0,AUC ROC,Average precision
0,0.74295,0.475899


In [51]:
from sklearn.metrics import recall_score

thr      = np.percentile(losses, 70)      # e.g. top 30% as “fraud”
y_pred   = (losses >= thr).astype(int)
recall   = recall_score(y_cv, y_pred)    # binary‐class recall
print(f"Recall at threshold {thr:.4f} = {recall:.4f}")

Recall at threshold 0.1027 = 0.6271
