In [1]:

# Imports
import os, time, random
import numpy as np
import pandas as pd

# Sklearn
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

# Torch
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset

# Utils
from collections import defaultdict


In [2]:
# NOTE: Update these paths if running on a different machine.
txn = pd.read_csv("/Users/judepereira/Downloads/ieee-fraud-detection/train_transaction.csv")
idm = pd.read_csv("/Users/judepereira/Downloads/ieee-fraud-detection/train_identity.csv")

# Derive simple features (day, D1new) and drop TransactionDT
txn["day"] = (txn["TransactionDT"] // (3600 * 24)).astype(int)
txn["D1new"] = (txn["TransactionDT"] // (60*60*24)) - txn["D1"] + 2000
txn.drop("TransactionDT", axis=1, inplace=True)

# Merge identity into transactions, drop TransactionID afterward
df = txn.merge(idm, on="TransactionID", how="left")
df.drop("TransactionID", axis=1, inplace=True)

In [3]:
# seeding
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

SEED = 42
seed_everything(SEED)    

In [4]:

# UID construction helpers
def safe_str(series):
    """Convert to string with an explicit missing token to avoid 'nan' ambiguity."""
    return series.astype("object").where(~series.isna(), "__MISSING__").astype(str)

def compute_uids(df):
    """Stable UID built from card/address/email/C1 with explicit missing token."""
    return (
        safe_str(df['card1']) + "_" +
        safe_str(df['addr1']) + "_" +
        safe_str(df['D1new']) + "_" +
        safe_str(df['P_emaildomain']) + "_" +
        safe_str(df['C1'])
    )


In [5]:
# compute missing % for all columns
nulls = df.isna().mean() * 100

# find columns with more than 80% missing values
cols_80 = nulls[nulls >= 80].index.tolist()

# and drop them!
df.drop(columns=cols_80, inplace=True)

In [6]:
# categorical ID columns
id_cols = [c for c in df.columns if c.startswith("id_")]

# but id_01 to id_11 are numerical so need to exclude them
id_cat_cols = [c for c in id_cols if not c.startswith("id_0")]
id_cat_cols.remove("id_11")  # id_11 is a numerical column

# extract high cardinality categorical ID columns
id_high_card_cols = [c for c in id_cat_cols if df[c].nunique() > 10]

# one-hot encode categorical features with low cardinality
for c in id_cat_cols:
    n_uniq = df[c].nunique()
    print(f"Column: {c}, Unique values: {n_uniq}")
    if n_uniq <= 10:
        print(f"One-hot encoding {c} with {n_uniq} unique values")
        dummies = pd.get_dummies(df[c], prefix=c, drop_first=True)
        df = pd.concat([df.drop(c, axis=1), dummies], axis=1)

Column: id_12, Unique values: 2
One-hot encoding id_12 with 2 unique values
Column: id_13, Unique values: 54
Column: id_15, Unique values: 3
One-hot encoding id_15 with 3 unique values
Column: id_16, Unique values: 2
One-hot encoding id_16 with 2 unique values
Column: id_17, Unique values: 104
Column: id_19, Unique values: 522
Column: id_20, Unique values: 394
Column: id_28, Unique values: 2
One-hot encoding id_28 with 2 unique values
Column: id_29, Unique values: 2
One-hot encoding id_29 with 2 unique values
Column: id_31, Unique values: 130
Column: id_35, Unique values: 2
One-hot encoding id_35 with 2 unique values
Column: id_36, Unique values: 2
One-hot encoding id_36 with 2 unique values
Column: id_37, Unique values: 2
One-hot encoding id_37 with 2 unique values
Column: id_38, Unique values: 2
One-hot encoding id_38 with 2 unique values


In [7]:
# numeric imputation (median) – exclude the target “isFraud”
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
num_cols.remove("isFraud")

# include id_01 to id_11 as they are numerical
id_num_cols = [c for c in id_cols if c not in id_cat_cols]
num_cols.extend(id_num_cols)

# remove duplicates from the list
num_cols = list(set(num_cols))

# among num_cols, find columns with nans that need to be imputed
nan_cols = [c for c in num_cols if df[c].isna().any()]

# exclude the categorical columns card1, card2, card3, card5, addr1, addr2
cat_cols = ["card1", "card2", "card3", "card5", "addr1", "addr2"]
nan_cols = [c for c in nan_cols if c not in cat_cols]
print(f"Columns with NaNs to be imputed: {nan_cols}")

imputer = SimpleImputer(strategy="median")
df[nan_cols] = imputer.fit_transform(df[nan_cols])

Columns with NaNs to be imputed: ['V128', 'V26', 'V232', 'V69', 'V182', 'V106', 'V320', 'V122', 'V257', 'dist1', 'V10', 'V307', 'V99', 'V22', 'V290', 'V171', 'V264', 'V212', 'V88', 'V243', 'V61', 'V296', 'V245', 'V60', 'V20', 'V115', 'V293', 'V67', 'V105', 'V62', 'V169', 'V36', 'id_11', 'V283', 'V35', 'id_17', 'V47', 'V227', 'V63', 'V310', 'V284', 'V17', 'V9', 'V319', 'V111', 'V39', 'V66', 'V291', 'V268', 'V83', 'V311', 'V321', 'D4', 'D11', 'V101', 'V215', 'V118', 'V276', 'V42', 'V224', 'V206', 'V112', 'V104', 'V219', 'V306', 'V124', 'V82', 'V134', 'V24', 'V110', 'V91', 'V222', 'V57', 'V248', 'V187', 'V192', 'V258', 'V175', 'V230', 'V198', 'V54', 'V41', 'V266', 'V262', 'V120', 'V263', 'V279', 'V304', 'V51', 'V71', 'V170', 'V226', 'V107', 'V177', 'V267', 'V173', 'V221', 'V302', 'V38', 'V183', 'V75', 'V53', 'V2', 'V195', 'V31', 'D3', 'V135', 'V259', 'V318', 'V96', 'V172', 'V70', 'V43', 'V236', 'D5', 'V207', 'V241', 'V210', 'V186', 'V220', 'V19', 'V133', 'V208', 'V269', 'V242', 'V308', 'V

In [8]:
# for remaining categoricals, one‐hot encode small‐cardinaliy ones
cat_cols_rem = df.select_dtypes(include=["object"]).columns.tolist()

# include the cat_cols that were excluded earlier
cat_cols_rem.extend(cat_cols)

# remove duplicates from the list
cat_cols_rem = list(set(cat_cols_rem))

# extract high cardinality categorical columns
high_card_cols = [c for c in cat_cols_rem if df[c].nunique() > 10]

# e.g. “ProductCD”, “MISSING” placeholders, etc.
for c in cat_cols_rem:
    n_uniq = df[c].nunique()
    print(f"Column: {c}, Unique values: {n_uniq}")
    if n_uniq <= 10:
        print(f"One-hot encoding {c} with {n_uniq} unique values")
        dummies = pd.get_dummies(df[c], prefix=c, drop_first=True)
        df = pd.concat([df.drop(c, axis=1), dummies], axis=1)

Column: M3, Unique values: 2
One-hot encoding M3 with 2 unique values
Column: M5, Unique values: 2
One-hot encoding M5 with 2 unique values
Column: M6, Unique values: 2
One-hot encoding M6 with 2 unique values
Column: card1, Unique values: 13553
Column: M4, Unique values: 3
One-hot encoding M4 with 3 unique values
Column: addr1, Unique values: 332
Column: R_emaildomain, Unique values: 60
Column: DeviceType, Unique values: 2
One-hot encoding DeviceType with 2 unique values
Column: M2, Unique values: 2
One-hot encoding M2 with 2 unique values
Column: M7, Unique values: 2
One-hot encoding M7 with 2 unique values
Column: card5, Unique values: 119
Column: addr2, Unique values: 74
Column: card3, Unique values: 114
Column: card2, Unique values: 500
Column: card4, Unique values: 4
One-hot encoding card4 with 4 unique values
Column: card6, Unique values: 4
One-hot encoding card6 with 4 unique values
Column: M1, Unique values: 2
One-hot encoding M1 with 2 unique values
Column: P_emaildomain, Uni

In [9]:
# include the high cardinality categorical ID columns but remove duplicates
high_card_cols = list(set(high_card_cols + id_high_card_cols))  # remove duplicates
print(f"High cardinality categorical columns: {high_card_cols}")
print(len(high_card_cols))

High cardinality categorical columns: ['id_31', 'card5', 'card1', 'addr2', 'card3', 'id_13', 'addr1', 'R_emaildomain', 'id_17', 'card2', 'id_19', 'P_emaildomain', 'DeviceInfo', 'id_20']
14


In [10]:
# find all bool columns in df
bool_cols = df.select_dtypes(include="bool").columns

# cast them to int (True→1, False→0)
df[bool_cols] = df[bool_cols].astype(int)

In [11]:
df.head()

Unnamed: 0,isFraud,TransactionAmt,card1,card2,card3,card5,addr1,addr2,dist1,P_emaildomain,...,card6_credit,card6_debit,card6_debit or credit,M1_T,ProductCD_H,ProductCD_R,ProductCD_S,ProductCD_W,M8_T,M9_T
0,0,68.5,13926,,150.0,142.0,315.0,87.0,19.0,,...,1,0,0,1,0,0,0,1,0,0
1,0,29.0,2755,404.0,150.0,102.0,325.0,87.0,8.0,gmail.com,...,1,0,0,0,0,0,0,1,0,0
2,0,59.0,4663,490.0,150.0,166.0,330.0,87.0,287.0,outlook.com,...,0,1,0,1,0,0,0,1,0,0
3,0,50.0,18132,567.0,150.0,117.0,476.0,87.0,8.0,yahoo.com,...,0,1,0,0,0,0,0,1,0,0
4,0,50.0,4497,514.0,150.0,102.0,420.0,87.0,8.0,gmail.com,...,1,0,0,0,1,0,0,0,0,0


In [12]:
# label encode high cardinality categorical columns and
# Store encoders for later inference
encoders = {}
for col in high_card_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    encoders[col] = le


In [13]:
# separate all columns from high cardinality categorical columns
other_cols = [c for c in df.columns if c not in high_card_cols]
other_cols.remove("isFraud")

In [14]:
# Faster way to split UIDs into train/val/test sets
# Create UID column using the robust helper
df['UID'] = compute_uids(df)

# Label UIDs as fraud if any txn is fraud, then split
uid_labels = df.groupby('UID')['isFraud'].max().rename('UID_isFraud')
df = df.merge(uid_labels, left_on='UID', right_index=True)

rng = np.random.RandomState(42)
normal_uids = uid_labels[uid_labels == 0].index.values
fraud_uids  = uid_labels[uid_labels == 1].index.values

# Split normal UIDs as 60% in train, 20% in val, 20% in hold
rng.shuffle(normal_uids)
n_norm = len(normal_uids)
n_train = int(0.60 * n_norm)
n_val   = int(0.20 * n_norm)

train_norm_uids = normal_uids[:n_train]
val_norm_uids   = normal_uids[n_train:n_train+n_val]
hold_norm_uids  = normal_uids[n_train+n_val:]

train_uids = set(train_norm_uids)
val_uids   = set(val_norm_uids)
test_uids  = set(hold_norm_uids) | set(fraud_uids)

def slice_by_uids(df_in, uids):
    return df_in[df_in['UID'].isin(uids)].copy()

train_df = slice_by_uids(df, train_uids)  # normals only
val_df   = slice_by_uids(df, val_uids)    # normals only
test_df  = slice_by_uids(df, test_uids)   # mixture

print(f"Train(normals) UIDs: {len(train_uids)}  | rows: {train_df.shape[0]}")
print(f"Val(normals)   UIDs: {len(val_uids)}    | rows: {val_df.shape[0]}")
print(f"Test(mix)      UIDs: {len(test_uids)}   | rows: {test_df.shape[0]}")

# Feature matrices per split (drop label & UID)
X_train = train_df.drop(['isFraud','UID'], axis=1)
X_val   = val_df  .drop(['isFraud','UID'], axis=1)
X_test  = test_df .drop(['isFraud','UID'], axis=1)


Train(normals) UIDs: 187885  | rows: 340202
Val(normals)   UIDs: 62628    | rows: 113875
Test(mix)      UIDs: 74792   | rows: 136463


In [15]:
# Identify one-hot columns (all values are 0 or 1)
one_hot_cols = [col for col in X_train.columns if set(X_train[col].unique()) <= {0, 1}]

# Define columns to be standardized
std_cols = [col for col in X_train.columns if col not in one_hot_cols and col not in high_card_cols]


In [16]:
# Standardize features
scaler = StandardScaler()

X_train_scaled = X_train.copy()
X_val_scaled    = X_val.copy()
X_test_scaled   = X_test.copy()

X_train_scaled[std_cols] = scaler.fit_transform(X_train[std_cols])
X_val_scaled[std_cols]   = scaler.transform(X_val[std_cols])
X_test_scaled[std_cols]  = scaler.transform(X_test[std_cols])


In [17]:
# Convert to PyTorch tensors
X_num_train = torch.tensor(X_train_scaled[other_cols].values, dtype=torch.float32)
X_cat_train = torch.tensor(X_train_scaled[high_card_cols].values, dtype=torch.long)

X_num_val = torch.tensor(X_val_scaled[other_cols].values, dtype=torch.float32)
X_cat_val = torch.tensor(X_val_scaled[high_card_cols].values, dtype=torch.long)

X_num_test = torch.tensor(X_test_scaled[other_cols].values, dtype=torch.float32)
X_cat_test = torch.tensor(X_test_scaled[high_card_cols].values, dtype=torch.long)


In [18]:
# Create TensorDatasets for train/val/test sets
train_ds = TensorDataset(X_num_train, X_cat_train, X_num_train, X_cat_train)
val_ds   = TensorDataset(X_num_val, X_cat_val, X_num_val, X_cat_val)
test_ds  = TensorDataset(X_num_test, X_cat_test, X_num_test, X_cat_test)

# Build Pytorch loaders
BATCH_SIZE = 512

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader   = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)
test_loader  = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False)


In [19]:
# model definition with categorical embeddings
import torch
import torch.nn as nn
import torch.nn.functional as F

class AutoEncoderWithEmbeddings(nn.Module):
    def __init__(self, num_numeric, cat_cardinalities, hidden1=128, hidden2=64, code_size=8, dropout_rate=0.2):
        super(AutoEncoderWithEmbeddings, self).__init__()
        
        # Create embeddings for each categorical feature
        self.embeddings = nn.ModuleList([
            nn.Embedding(num_categories, min(50, (num_categories + 1)//2))
            for num_categories in cat_cardinalities
        ])
        
        emb_size_total = sum([emb.embedding_dim for emb in self.embeddings])
        total_input_size = num_numeric + emb_size_total
        
        self.dropout = nn.Dropout(dropout_rate)
        
        # Encoder
        self.fc1 = nn.Linear(total_input_size, hidden1)
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.fc3 = nn.Linear(hidden2, code_size)
        
        # Decoder
        self.fc4 = nn.Linear(code_size, hidden2)
        self.fc5 = nn.Linear(hidden2, hidden1)
        self.fc6 = nn.Linear(hidden1, total_input_size)
    
    def forward(self, x_num, x_cat):
        # Embed categorical variables
        embeds = [emb(x_cat[:, i]) for i, emb in enumerate(self.embeddings)]
        embeds = torch.cat(embeds, dim=1)
        
        # Concatenate numerical + embeddings
        x = torch.cat([x_num, embeds], dim=1)

        # Encoder with dropout noise
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        
        x = F.relu(self.fc2(x))
        x = self.dropout(x)
        
        code = F.relu(self.fc3(x))
        
        # Decoder
        x = F.relu(self.fc4(code))
        x = self.dropout(x)
        
        x = F.relu(self.fc5(x))
        x = self.dropout(x)
        
        output = self.fc6(x)  # Linear activation
        return output

In [20]:

def per_sample_mse(model, generator):
    # Targets are built from *detached* embedding lookups to avoid target leakage
    model.eval()
    crit = torch.nn.MSELoss(reduction="none")
    losses = []
    with torch.no_grad():
        for xb_num, xb_cat, yb_num, yb_cat in generator:
            # Forward pass
            y_pred = model(xb_num, xb_cat)

            # Construct full target (numeric + embeddings)
            true_embeds = []
            if xb_cat.shape[1] > 0:
                for i in range(len(model.embeddings)):
                    true_embeds.append(model.embeddings[i](yb_cat[:, i]))
            y_true_full = torch.cat([yb_num] + true_embeds, dim=1) if true_embeds else yb_num

            # Compute Loss
            l = crit(y_pred, y_true_full).mean(dim=1).cpu().numpy()
            losses.extend(l.tolist())
    return np.array(losses)


In [21]:

def evaluate_model(model,generator,criterion):
    # Detached embedding targets for validation loss
    model.eval()
    batch_losses = []
    with torch.no_grad():
        for xb_num, xb_cat, yb_num, yb_cat in generator:
            # Forward pass
            y_pred = model(xb_num, xb_cat)

            # Construct full target (numeric + embeddings)
            true_embeds = []
            if xb_cat.shape[1] > 0:
                for i in range(len(model.embeddings)):
                    true_embeds.append(model.embeddings[i](yb_cat[:, i]))
            y_true_full = torch.cat([yb_num] + true_embeds, dim=1) if true_embeds else yb_num

            # Compute Loss
            loss = criterion(y_pred, y_true_full).mean(dim=1)
            batch_losses.extend(list(loss.cpu().numpy()))
    return float(np.mean(batch_losses))


In [22]:

def training_loop(model,training_generator,valid_generator,optimizer,criterion,
                  max_epochs=100,apply_early_stopping=True,patience=3,verbose=False):
    # Train with detached embedding targets; early stop on Val(normals) and restore best checkpoint
    model.train()

    class EarlyStopping:
        def __init__(self, patience=3, verbose=False):
            self.patience = patience
            self.verbose = verbose
            self.counter = 0
            self.best_score = np.inf
            self.best_state = None
        def step(self, current_score, model):
            if current_score < self.best_score - 1e-4: # tolerance of 1e-4
                self.best_score = current_score
                self.counter = 0
                self.best_state = {k: v.cpu().clone() for k,v in model.state_dict().items()}
                if self.verbose:
                    print("New best score:", current_score)
            else:
                self.counter += 1
                if self.verbose:
                    print(f"No improvement. Patience {self.counter}/{self.patience}")
            return self.counter < self.patience

    if apply_early_stopping:
        early_stopping = EarlyStopping(verbose=verbose,patience=patience)

    all_train_losses = []
    all_valid_losses = []
    start_time=time.time()

    for epoch in range(max_epochs):
        model.train()
        train_loss=[]
        for xb_num, xb_cat, yb_num, yb_cat in training_generator:
            optimizer.zero_grad()
            y_pred = model(xb_num, xb_cat)
            # detach embedding targets
            with torch.no_grad():
                true_embeds = []
                if xb_cat.shape[1] > 0:
                    for i in range(len(model.embeddings)):
                        true_embeds.append(model.embeddings[i](yb_cat[:, i]))
                y_true_full = torch.cat([yb_num] + true_embeds, dim=1) if true_embeds else yb_num
            loss = criterion(y_pred, y_true_full).mean()
            loss.backward()
            optimizer.step()
            train_loss.append(loss.item())

        if verbose:
            print('')
            print('Epoch {}: train loss: {}'.format(epoch, np.mean(train_loss)))
        valid_loss = evaluate_model(model,valid_generator,criterion)
        all_valid_losses.append(valid_loss)
        if verbose:
            print('valid loss: {}'.format(valid_loss))
        if apply_early_stopping:
            if not early_stopping.step(valid_loss, model):
                if verbose:
                    print("Early stopping")
                break

    training_execution_time=time.time()-start_time
    if apply_early_stopping and early_stopping.best_state is not None:
        model.load_state_dict(early_stopping.best_state)

    return model,training_execution_time,all_train_losses,all_valid_losses


In [23]:
# Number of numeric features (including one-hot encodings)
num_numeric = len(other_cols)

# Cardinalities of only high cardinality categorical features
cat_cardinalities = [df[col].nunique() for col in high_card_cols]

# Initialize the embedding autoencoder model
seed_everything(SEED)
model = AutoEncoderWithEmbeddings(num_numeric=num_numeric, 
                            cat_cardinalities=cat_cardinalities,
                            hidden1=128, hidden2=64, code_size=8, dropout_rate=0.2)

# specify loss criterion and optimizer
criterion = torch.nn.MSELoss(reduction="none")
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

# Generate losses before training
losses = per_sample_mse(model, val_loader)


In [24]:
print(losses[0:5])
print(np.mean(losses))

[0.77852422 0.6913051  0.91999686 0.72580612 2.81249809]
0.9808297051247598


In [25]:
# Train with early stopping on Val (normals)
model, train_time, _, _ = training_loop(model, train_loader, val_loader, optimizer, criterion,
                                        max_epochs=500, apply_early_stopping=True, patience=5, verbose=True)


Epoch 0: train loss: 0.6593874987803008
valid loss: 0.5122092962265015
New best score: 0.5122092962265015

Epoch 1: train loss: 0.5216837655780907
valid loss: 0.48909398913383484
New best score: 0.48909398913383484

Epoch 2: train loss: 0.4952052961614795
valid loss: 0.4779715836048126
New best score: 0.4779715836048126

Epoch 3: train loss: 0.48269162688936507
valid loss: 0.46698975563049316
New best score: 0.46698975563049316

Epoch 4: train loss: 0.4693346645150866
valid loss: 0.45358574390411377
New best score: 0.45358574390411377

Epoch 5: train loss: 0.4567858086044627
valid loss: 0.445102334022522
New best score: 0.445102334022522

Epoch 6: train loss: 0.4473322482037365
valid loss: 0.4362899661064148
New best score: 0.4362899661064148

Epoch 7: train loss: 0.437493185889452
valid loss: 0.4257003664970398
New best score: 0.4257003664970398

Epoch 8: train loss: 0.42901792140831624
valid loss: 0.4185340404510498
New best score: 0.4185340404510498

Epoch 9: train loss: 0.42211763

In [26]:
# Define 70% threshold based on validation set
val_errs = per_sample_mse(model, val_loader)
thresh = float(np.percentile(val_errs, 70.0))
print("Validation 70th percentile error threshold =", thresh)

Validation 70th percentile error threshold = 0.29802235960960366


In [27]:
# final test evaluation (UID level) using val-based threshold
from sklearn.metrics import roc_auc_score
# Compute errors on test set
test_errs = per_sample_mse(model, test_loader)

# Construct dictionary to store UID-level errors
uid_errors = defaultdict(list)
for uid, err in zip(test_df['UID'].values, test_errs):
    uid_errors[uid].append(err)

# Average error per UID for average rule
uid_avg = {u: float(np.mean(es)) for u, es in uid_errors.items()}

# Create data frame to store average error per UID
uid_df  = pd.DataFrame.from_dict(uid_avg, orient='index', 
                                  columns=['avg_error'])

# Define ground-truth label
uid_df['true_label'] = uid_df.index.isin(fraud_uids).astype(int)

# metrics
auc = roc_auc_score(uid_df['true_label'], uid_df['avg_error'])
print(f"UID‐level ROC AUC: {auc:.4f}")

UID‐level ROC AUC: 0.8035


In [28]:
from sklearn.metrics import recall_score

uid_df['pred'] = (uid_df['avg_error'] >= thresh).astype(int)
recall   = recall_score(uid_df['true_label'], uid_df['pred'])    # binary‐class recall
print(f"Recall at 70th percentile val-error threshold = {recall:.4f}")

Recall at 70th percentile val-error threshold = 0.7590


In [29]:
# UID‐level evaluation via fraction‐rule

# For each UID, compute fraction of its txns flagged as fraud
uid_frac = {u: float((np.array(es) >= thresh).mean()) for u, es in uid_errors.items()}

# Build DataFrame and true labels
uid_df = pd.DataFrame.from_dict(uid_frac, orient='index',
                                columns=['fraud_frac'])
uid_df['true_label'] = uid_df.index.isin(fraud_uids).astype(int)

# Classify UID as fraud if majority of its txns exceed threshold
uid_df['pred'] = (uid_df['fraud_frac'] >= 0.50).astype(int)

# Compute metrics
auc_frac = roc_auc_score(uid_df['true_label'], uid_df['fraud_frac'])
recall_frac = recall_score(uid_df['true_label'], uid_df['pred'])

print(f"UID‐level ROC AUC (fraction rule):       {auc_frac:.4f}")
print(f"UID‐level recall (majority‐vote rule): {recall_frac:.4f}")


UID‐level ROC AUC (fraction rule):       0.7271
UID‐level recall (majority‐vote rule): 0.7641
