In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
import time
import random

In [2]:
# read in data
txn = pd.read_csv("/Users/judepereira/Downloads/ieee-fraud-detection/train_transaction.csv")
idm = pd.read_csv("/Users/judepereira/Downloads/ieee-fraud-detection/train_identity.csv")

# derive “day” from TransactionDT
txn["day"] = (txn["TransactionDT"] // (3600 * 24)).astype(int)

# derive card activation date from TransactionDT
txn['D1new'] = (txn['TransactionDT'] // (60*60*24)) - txn['D1'] + 2000

# drop TransactionDT as it is no longer needed
txn.drop("TransactionDT", axis=1, inplace=True)

# merge identity info into transaction table
df = txn.merge(idm, on="TransactionID", how="left")
df.drop("TransactionID", axis=1, inplace=True)

In [3]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

In [4]:
SEED = 42
seed_everything(SEED)

In [5]:
# compute missing % for all columns
nulls = df.isna().mean() * 100

# find columns with more than 80% missing values
cols_80 = nulls[nulls >= 80].index.tolist()

# and drop them!
df.drop(columns=cols_80, inplace=True)

In [6]:
# function to identify UIDs
def compute_uids(df):

    return (
        df['card1'].astype(str) + "_" +
        df['addr1'].astype(str) + "_" +
        df['D1new'].astype(str) + "_" +
        df['P_emaildomain'].astype(str) + "_" +
        df['C1'].astype(str)
    )


In [7]:
id_cols = [c for c in df.columns if c.startswith("id_")]

# but id_01 to id_11 are numerical so need to exclude them
id_cat_cols = [c for c in id_cols if not c.startswith("id_0")]
id_cat_cols.remove("id_11")  # id_11 is a numerical column

# extract high cardinality categorical ID columns
id_high_card_cols = [c for c in id_cat_cols if df[c].nunique() > 10]

# one-hot encode categorical features with low cardinality
for c in id_cat_cols:
    n_uniq = df[c].nunique()
    print(f"Column: {c}, Unique values: {n_uniq}")
    if n_uniq <= 10:
        print(f"One-hot encoding {c} with {n_uniq} unique values")
        dummies = pd.get_dummies(df[c], prefix=c, drop_first=True)
        df = pd.concat([df.drop(c, axis=1), dummies], axis=1)

Column: id_12, Unique values: 2
One-hot encoding id_12 with 2 unique values
Column: id_13, Unique values: 54
Column: id_15, Unique values: 3
One-hot encoding id_15 with 3 unique values
Column: id_16, Unique values: 2
One-hot encoding id_16 with 2 unique values
Column: id_17, Unique values: 104
Column: id_19, Unique values: 522
Column: id_20, Unique values: 394
Column: id_28, Unique values: 2
One-hot encoding id_28 with 2 unique values
Column: id_29, Unique values: 2
One-hot encoding id_29 with 2 unique values
Column: id_31, Unique values: 130
Column: id_35, Unique values: 2
One-hot encoding id_35 with 2 unique values
Column: id_36, Unique values: 2
One-hot encoding id_36 with 2 unique values
Column: id_37, Unique values: 2
One-hot encoding id_37 with 2 unique values
Column: id_38, Unique values: 2
One-hot encoding id_38 with 2 unique values


In [8]:
print(f"High cardinality categorical ID columns: {id_high_card_cols}")

High cardinality categorical ID columns: ['id_13', 'id_17', 'id_19', 'id_20', 'id_31']


In [9]:
from sklearn.impute import SimpleImputer

# numeric imputation (median) – exclude the target “isFraud”
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
num_cols.remove("isFraud")

# include id_01 to id_11 as they are numerical
id_num_cols = [c for c in id_cols if c not in id_cat_cols]
num_cols.extend(id_num_cols)

# among num_cols, find columns with nans that need to be imputed
nan_cols = [c for c in num_cols if df[c].isna().any()]

# exclude the categorical columns card1, card2, card3, card5, addr1, addr2
cat_cols = ["card1", "card2", "card3", "card5", "addr1", "addr2"]
nan_cols = [c for c in nan_cols if c not in cat_cols]

imputer = SimpleImputer(strategy="median")
df[nan_cols] = imputer.fit_transform(df[nan_cols])

In [10]:
# for remaining categoricals, one‐hot encode small‐cardinaliy ones
cat_cols_rem = df.select_dtypes(include=["object"]).columns.tolist()

# include the cat_cols that were excluded earlier
cat_cols_rem.extend(cat_cols)

# extract high cardinality categorical columns
high_card_cols = [c for c in cat_cols_rem if df[c].nunique() > 10]

# e.g. “ProductCD”, “MISSING” placeholders, etc.
for c in cat_cols_rem:
    n_uniq = df[c].nunique()
    print(f"Column: {c}, Unique values: {n_uniq}")
    if n_uniq <= 10:
        print(f"One-hot encoding {c} with {n_uniq} unique values")
        dummies = pd.get_dummies(df[c], prefix=c, drop_first=True)
        df = pd.concat([df.drop(c, axis=1), dummies], axis=1)

Column: ProductCD, Unique values: 5
One-hot encoding ProductCD with 5 unique values
Column: card4, Unique values: 4
One-hot encoding card4 with 4 unique values
Column: card6, Unique values: 4
One-hot encoding card6 with 4 unique values
Column: P_emaildomain, Unique values: 59
Column: R_emaildomain, Unique values: 60
Column: M1, Unique values: 2
One-hot encoding M1 with 2 unique values
Column: M2, Unique values: 2
One-hot encoding M2 with 2 unique values
Column: M3, Unique values: 2
One-hot encoding M3 with 2 unique values
Column: M4, Unique values: 3
One-hot encoding M4 with 3 unique values
Column: M5, Unique values: 2
One-hot encoding M5 with 2 unique values
Column: M6, Unique values: 2
One-hot encoding M6 with 2 unique values
Column: M7, Unique values: 2
One-hot encoding M7 with 2 unique values
Column: M8, Unique values: 2
One-hot encoding M8 with 2 unique values
Column: M9, Unique values: 2
One-hot encoding M9 with 2 unique values
Column: id_31, Unique values: 130
Column: DeviceType

In [12]:
print(id_high_card_cols)

['id_13', 'id_17', 'id_19', 'id_20', 'id_31']


In [19]:
# include the high cardinality categorical ID columns but remove duplicates
high_card_cols = list(set(high_card_cols + id_high_card_cols))  # remove duplicates
print(f"High cardinality categorical columns: {high_card_cols}")
print(len(high_card_cols))

High cardinality categorical columns: ['P_emaildomain', 'id_20', 'card3', 'DeviceInfo', 'card2', 'card5', 'addr2', 'card1', 'id_17', 'id_19', 'id_13', 'R_emaildomain', 'addr1', 'id_31']
14


In [20]:
# find all bool columns in df
bool_cols = df.select_dtypes(include="bool").columns

# cast them to int (True→1, False→0)
df[bool_cols] = df[bool_cols].astype(int)

In [21]:
df.head()

Unnamed: 0,isFraud,TransactionAmt,card1,card2,card3,card5,addr1,addr2,dist1,P_emaildomain,...,M2_T,M3_T,M4_M1,M4_M2,M5_T,M6_T,M7_T,M8_T,M9_T,DeviceType_mobile
0,0,68.5,13926,,150.0,142.0,315.0,87.0,19.0,,...,1,1,0,1,0,1,0,0,0,0
1,0,29.0,2755,404.0,150.0,102.0,325.0,87.0,8.0,gmail.com,...,0,0,0,0,1,1,0,0,0,0
2,0,59.0,4663,490.0,150.0,166.0,330.0,87.0,287.0,outlook.com,...,1,1,0,0,0,0,0,0,0,0
3,0,50.0,18132,567.0,150.0,117.0,476.0,87.0,8.0,yahoo.com,...,0,0,0,0,1,0,0,0,0,0
4,0,50.0,4497,514.0,150.0,102.0,420.0,87.0,8.0,gmail.com,...,0,0,0,0,0,0,0,0,0,1


In [22]:
# label encode high cardinality categorical columns
from sklearn.preprocessing import LabelEncoder

# Store encoders for later inference
encoders = {}
for col in high_card_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    encoders[col] = le


In [23]:
# separate all columns from high cardinality categorical columns
other_cols = [c for c in df.columns if c not in high_card_cols]
other_cols.remove("isFraud")

In [24]:
# Create training and CV sets by UID

# Compute UIDs for every transaction
df['UID'] = compute_uids(df)   # assumes compute_uids(df) returns a Series of length df

# Label each UID as fraud if any transaction is fraudulent
uid_labels = df.groupby('UID')['isFraud'].max().rename('UID_isFraud')
df = df.merge(uid_labels, left_on='UID', right_index=True)

# Split UIDs into “normal” vs. “fraud”
normal_uids = uid_labels[uid_labels == 0].index
fraud_uids  = uid_labels[uid_labels == 1].index

# Hold out 20% of normal UIDs for CV:
rng = np.random.RandomState(42)
hold_normals = rng.choice(normal_uids, size=int(0.2 * len(normal_uids)), replace=False)

# TRAIN on the remaining 80% normal UIDs
train_uids = np.setdiff1d(normal_uids, hold_normals)
train_df   = df[df['UID'].isin(train_uids)].copy()

# CV on all fraud UIDs + held‐out normals
cv_uids = np.concatenate([fraud_uids, hold_normals])
cv_df   = df[df['UID'].isin(cv_uids)].copy()

print(f"Training on {len(train_uids)} normal UIDs → {train_df.shape[0]} transactions")
print(f"CV set on {len(cv_uids)} UIDs → {cv_df.shape[0]} transactions")

# Build feature‐matrices (drop isFraud & UID)
X_train = train_df.drop(['isFraud','UID'], axis=1)
X_cv    = cv_df   .drop(['isFraud','UID'], axis=1)
y_cv    = cv_df['isFraud'].values


Training on 250514 normal UIDs → 454451 transactions
CV set on 74791 UIDs → 136089 transactions


In [25]:
# how many UIDs in each set?
n_normal_uids = len(normal_uids)
n_fraud_uids  = len(fraud_uids)
n_train_uids  = train_df['UID'].nunique()
n_cv_uids     = cv_df  ['UID'].nunique()

print(f"Number of normal UIDs available: {n_normal_uids}")
print(f"Number of fraud UIDs available:  {n_fraud_uids}")
print(f"UIDs in training set:            {n_train_uids}")
print(f"UIDs in CV set:                  {n_cv_uids}")


Number of normal UIDs available: 313142
Number of fraud UIDs available:  12163
UIDs in training set:            250514
UIDs in CV set:                  74791


In [26]:
import numpy as np
from sklearn.preprocessing import StandardScaler

# Identify one-hot columns (all values are 0 or 1)
one_hot_cols = [col for col in X_train.columns if set(X_train[col].unique()) <= {0, 1}]
non_one_hot_cols = [col for col in X_train.columns if col not in one_hot_cols and col not in high_card_cols]


In [27]:
scaler = StandardScaler()

X_train_scaled = X_train.copy()
X_cv_scaled    = X_cv.copy()

X_train_scaled[non_one_hot_cols] = scaler.fit_transform(X_train[non_one_hot_cols])
X_cv_scaled[non_one_hot_cols]    = scaler.transform(X_cv[non_one_hot_cols])


In [28]:
# Convert to PyTorch tensors
X_num_train = torch.tensor(X_train_scaled[other_cols].values, dtype=torch.float32)
X_cat_train = torch.tensor(X_train_scaled[high_card_cols].values, dtype=torch.long)

X_num_val = torch.tensor(X_cv_scaled[other_cols].values, dtype=torch.float32)
X_cat_val = torch.tensor(X_cv_scaled[high_card_cols].values, dtype=torch.long)

y_valid = torch.FloatTensor(y_cv)

In [29]:
from torch.utils.data import TensorDataset, DataLoader

train_ds = TensorDataset(X_num_train, X_cat_train, X_num_train, X_cat_train)
val_ds   = TensorDataset(X_num_val, X_cat_val, X_num_val, X_cat_val)

# Build Pytorch loaders
BATCH_SIZE = 512

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader   = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)


In [30]:
# UID‐aware DataLoader for CV aggregation
from collections import defaultdict

# Extract the unique CV UIDs and build a mapping → index
unique_cv_uids = cv_df['UID'].unique()
uid2idx        = {uid: i for i, uid in enumerate(unique_cv_uids)}

# Map your UID column to these integer indices
uid_idx_arr = cv_df['UID'].map(uid2idx).astype(int).values

# Now you can build a LongTensor
uid_cv_tensor = torch.tensor(uid_idx_arr, dtype=torch.long)

# Create your dataset & loader as before
uid_eval_ds     = TensorDataset(X_num_val, X_cat_val, uid_cv_tensor)
uid_eval_loader = DataLoader(uid_eval_ds, batch_size=BATCH_SIZE, shuffle=False)


In [31]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class AutoEncoderWithEmbeddings(nn.Module):
    def __init__(self, num_numeric, cat_cardinalities, hidden1=128, hidden2=64, code_size=8, dropout_rate=0.2):
        super(AutoEncoderWithEmbeddings, self).__init__()
        
        # Create embeddings for each categorical feature
        self.embeddings = nn.ModuleList([
            nn.Embedding(num_categories, min(50, (num_categories + 1)//2))
            for num_categories in cat_cardinalities
        ])
        
        emb_size_total = sum([emb.embedding_dim for emb in self.embeddings])
        total_input_size = num_numeric + emb_size_total
        
        self.dropout = nn.Dropout(dropout_rate)
        
        # Encoder
        self.fc1 = nn.Linear(total_input_size, hidden1)
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.fc3 = nn.Linear(hidden2, code_size)
        
        # Decoder
        self.fc4 = nn.Linear(code_size, hidden2)
        self.fc5 = nn.Linear(hidden2, hidden1)
        self.fc6 = nn.Linear(hidden1, total_input_size)
    
    def forward(self, x_num, x_cat):
        # Embed categorical variables
        embeds = [emb(x_cat[:, i]) for i, emb in enumerate(self.embeddings)]
        embeds = torch.cat(embeds, dim=1)
        
        # Concatenate numerical + embeddings
        x = torch.cat([x_num, embeds], dim=1)

        # Encoder with dropout noise
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        
        x = F.relu(self.fc2(x))
        x = self.dropout(x)
        
        code = F.relu(self.fc3(x))
        
        # Decoder
        x = F.relu(self.fc4(code))
        x = self.dropout(x)
        
        x = F.relu(self.fc5(x))
        x = self.dropout(x)
        
        output = self.fc6(x)  # Linear activation
        return output

In [32]:
criterion = torch.nn.MSELoss()

In [33]:
def per_sample_mse(model, generator):
    model.eval()
    criterion = torch.nn.MSELoss(reduction="none")
    batch_losses = []

    for xb_num, xb_cat, yb_num, yb_cat in generator:
        # Forward pass
        y_pred = model(xb_num, xb_cat)

        # Construct full target (numeric + embeddings)
        true_embeds = [model.embeddings[i](yb_cat[:, i]) for i in range(len(model.embeddings))]
        y_true_full = torch.cat([yb_num] + true_embeds, dim=1)

        # Compute Loss
        loss = criterion(y_pred, y_true_full)
        loss_app = list(torch.mean(loss,axis=1).detach().cpu().numpy())
        batch_losses.extend(loss_app)
    
    return batch_losses


In [34]:
# Number of numeric features (including one-hot encodings)
num_numeric = len(other_cols)

# Cardinalities (unique values) of each categorical feature
cat_cardinalities = [df[col].nunique() for col in high_card_cols]

# Initialize the embedding autoencoder model
seed_everything(SEED)
model = AutoEncoderWithEmbeddings(num_numeric=num_numeric, 
                            cat_cardinalities=cat_cardinalities,
                            hidden1=128, hidden2=64, code_size=8, 
                            dropout_rate=0.2)

losses = per_sample_mse(model, val_loader)


In [35]:
print(losses[0:5])
print(np.mean(losses))

[np.float32(0.71192455), np.float32(0.7349194), np.float32(0.68293494), np.float32(0.7178748), np.float32(0.76193386)]
1.339104


In [36]:
def evaluate_model(model,generator,criterion):
    model.eval()
    batch_losses = []
    for xb_num, xb_cat, yb_num, yb_cat in generator:
        # Forward pass
        y_pred = model(xb_num, xb_cat)

        # Construct full target (numeric + embeddings)
        true_embeds = [model.embeddings[i](yb_cat[:, i]) for i in range(len(model.embeddings))]
        y_true_full = torch.cat([yb_num] + true_embeds, dim=1)

        # Compute Loss
        loss = criterion(y_pred, y_true_full)
        batch_losses.append(loss.item())
    mean_loss = np.mean(batch_losses)    
    return mean_loss

In [37]:
class EarlyStopping:
    
    def __init__(self, patience=3, verbose=False):
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = np.inf
    
    def continue_training(self,current_score):
        if self.best_score > current_score:
            self.best_score = current_score
            self.counter = 0
            if self.verbose:
                print("New best score:", current_score)
        else:
            self.counter+=1
            if self.verbose:
                print(self.counter, " iterations since best score.")
                
        return self.counter <= self.patience 

In [38]:
def training_loop(model,training_generator,valid_generator,optimizer,criterion,max_epochs=100,apply_early_stopping=True,patience=3,verbose=False):
    #Setting the model in training mode
    model.train()

    if apply_early_stopping:
        early_stopping = EarlyStopping(verbose=verbose,patience=patience)
    
    all_train_losses = []
    all_valid_losses = []
    
    #Training loop
    start_time=time.time()
    for epoch in range(max_epochs):
        model.train()
        train_loss=[]
        for xb_num, xb_cat, yb_num, yb_cat in training_generator:
            optimizer.zero_grad()
            # Forward pass
            y_pred = model(xb_num, xb_cat)

            # Construct full target (numeric + embeddings)
            true_embeds = [model.embeddings[i](yb_cat[:, i]) for i in range(len(model.embeddings))]
            y_true_full = torch.cat([yb_num] + true_embeds, dim=1)

            # Compute Loss
            loss = criterion(y_pred, y_true_full)
            # Backward pass
            loss.backward()
            optimizer.step()   
            train_loss.append(loss.item())
        
        #showing last training loss after each epoch
        all_train_losses.append(np.mean(train_loss))
        if verbose:
            print('')
            print('Epoch {}: train loss: {}'.format(epoch, np.mean(train_loss)))
        #evaluating the model on the test set after each epoch    
        valid_loss = evaluate_model(model,valid_generator,criterion)
        all_valid_losses.append(valid_loss)
        if verbose:
            print('valid loss: {}'.format(valid_loss))
        if apply_early_stopping:
            if not early_stopping.continue_training(valid_loss):
                if verbose:
                    print("Early stopping")
                break
        
    training_execution_time=time.time()-start_time
    return model,training_execution_time,all_train_losses,all_valid_losses

In [39]:
optimizer = torch.optim.Adam(model.parameters(), lr = 0.0001)

In [40]:
model,training_execution_time,train_losses,valid_losses = training_loop(model,train_loader,val_loader,optimizer,criterion,verbose=True)


Epoch 0: train loss: 0.6050630776664695
valid loss: 0.8247278764059669
New best score: 0.8247278764059669

Epoch 1: train loss: 0.46598865763024166
valid loss: 0.7771255816508057
New best score: 0.7771255816508057

Epoch 2: train loss: 0.42154230090143446
valid loss: 0.7228615339985467
New best score: 0.7228615339985467

Epoch 3: train loss: 0.3828485046286841
valid loss: 0.6553017530114131
New best score: 0.6553017530114131

Epoch 4: train loss: 0.34616664870902225
valid loss: 0.5985279115183013
New best score: 0.5985279115183013

Epoch 5: train loss: 0.31369785125459637
valid loss: 0.5678653494083792
New best score: 0.5678653494083792

Epoch 6: train loss: 0.29016324369287166
valid loss: 0.5427642594042578
New best score: 0.5427642594042578

Epoch 7: train loss: 0.27126387380869
valid loss: 0.5227748528683096
New best score: 0.5227748528683096

Epoch 8: train loss: 0.2555021621656042
valid loss: 0.5040838949774441
New best score: 0.5040838949774441

Epoch 9: train loss: 0.2414187775

In [41]:
# Compute UID-level reconstruction errors
def compute_uid_errors(model, loader, idx2uid):
    model.eval()
    criterion = torch.nn.MSELoss(reduction='none')
    uid_errors = defaultdict(list)

    with torch.no_grad():
        for xb_num, xb_cat, uid_idxs in loader:
            # Forward pass
            y_pred = model(xb_num, xb_cat)
            # rebuild full “true” vector
            true_embeds = [model.embeddings[i](xb_cat[:, i])
                           for i in range(len(model.embeddings))]
            y_true_full = torch.cat([xb_num] + true_embeds, dim=1)

            losses = criterion(y_pred, y_true_full).mean(dim=1).cpu().numpy()
            for idx, err in zip(uid_idxs.cpu().numpy(), losses):
                actual_uid = idx2uid[idx]
                uid_errors[actual_uid].append(err)

    return uid_errors


In [42]:
# evaluate UID‐level reconstruction errors

idx2uid    = list(unique_cv_uids)
uid_errors = compute_uid_errors(model, uid_eval_loader, idx2uid)

# average per‐UID
uid_avg = {u: np.mean(errs) for u, errs in uid_errors.items()}
uid_df  = pd.DataFrame.from_dict(uid_avg, orient='index', 
                                  columns=['avg_error'])
uid_df['true_label'] = uid_df.index.isin(fraud_uids).astype(int)

# metrics
from sklearn.metrics import roc_auc_score, accuracy_score
auc = roc_auc_score(uid_df['true_label'], uid_df['avg_error'])
print(f"UID‐level ROC AUC: {auc:.4f}")


UID‐level ROC AUC: 0.7976


In [43]:
from sklearn.metrics import recall_score

train_errs = per_sample_mse(model, train_loader)
thresh     = np.percentile(train_errs, 70)  # 70th percentile as threshold
uid_df['pred'] = (uid_df['avg_error'] >= thresh).astype(int)
recall   = recall_score(uid_df['true_label'], uid_df['pred'])    # binary‐class recall
print(f"Recall at 70th percentile train-error threshold = {recall:.4f}")

Recall at 70th percentile train-error threshold = 0.7234


In [44]:
# UID‐level evaluation via fraction‐rule

# Recompute uid_errors if needed
idx2uid    = list(unique_cv_uids)
uid_errors = compute_uid_errors(model, uid_eval_loader, idx2uid)

# Build train‐error threshold (70th pct of per‐sample errs)
train_errs = per_sample_mse(model, train_loader)
thresh     = np.percentile(train_errs, 70)  # 70th percentile as threshold

# For each UID, compute fraction of its txns flagged as fraud
uid_frac = {
    uid: np.mean(np.array(errs) >= thresh)
    for uid, errs in uid_errors.items()
}

# Build DataFrame and true labels
uid_df = pd.DataFrame.from_dict(uid_frac, orient='index',
                                columns=['fraud_frac'])
uid_df['true_label'] = uid_df.index.isin(fraud_uids).astype(int)

# Classify UID as fraud if majority of its txns exceed threshold
uid_df['pred'] = (uid_df['fraud_frac'] >= 0.50).astype(int)

# Compute metrics
auc_frac = roc_auc_score(uid_df['true_label'], uid_df['fraud_frac'])
recall_frac = recall_score(uid_df['true_label'], uid_df['pred'])

print(f"UID‐level ROC AUC (fraction rule):       {auc_frac:.4f}")
print(f"UID‐level recall (majority‐vote rule): {recall_frac:.4f}")


UID‐level ROC AUC (fraction rule):       0.7331
UID‐level recall (majority‐vote rule): 0.7258
