In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder, LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import random
import os

In [None]:
# Load the IEEE-CIS transaction CSV from Google Drive by file ID and read it
# into `kaggle_set`. The download is skipped when the file is already on disk.
import sys, subprocess
from pathlib import Path

# gdown is only needed for the download; install it on demand when the import
# fails. Only ImportError is handled so unrelated failures are not hidden.
try:
    import gdown
except ImportError:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "gdown"])
    import gdown

# File IDs to pull from Google Drive
TXN_ID = "1-D90o63nu_b-N1h-pNUlIRkUMVuP5JLs"
# NOTE(review): presumably the identity table; it is not downloaded or read in this cell.
IDM_ID = "1tUZyy06wbS9l-3yTaXC_O8TgbQLjrg6G"

DATA_DIR = Path("./data/ieee-fraud")
DATA_DIR.mkdir(parents=True, exist_ok=True)

# Download if missing
txn_path = DATA_DIR / "train_transaction.csv"

if not txn_path.exists():
    downloaded_to = gdown.download(id=TXN_ID, output=str(txn_path), quiet=False)
    # Fail here with a clear message instead of letting read_csv report a
    # confusing error about a file that was never written.
    if downloaded_to is None or not txn_path.exists():
        raise FileNotFoundError(f"Download of Google Drive file {TXN_ID} to {txn_path} failed")

# Read the CSV with the default C parser; fall back to the more forgiving
# Python parser only when the C parser cannot tokenize the file.
try:
    kaggle_set = pd.read_csv(txn_path, low_memory=False)
except pd.errors.ParserError:
    # `low_memory` is a C-parser option and is rejected by engine="python",
    # so it must not be passed to the fallback.
    kaggle_set = pd.read_csv(txn_path, engine="python")



Downloading...
From (original): https://drive.google.com/uc?id=1-D90o63nu_b-N1h-pNUlIRkUMVuP5JLs
From (redirected): https://drive.google.com/uc?id=1-D90o63nu_b-N1h-pNUlIRkUMVuP5JLs&confirm=t&uuid=82dc73a6-cfd8-4bc5-954c-070ab9184192
To: /Users/sara/Desktop/data/ieee-fraud/train_transaction.csv
100%|██████████| 683M/683M [19:05<00:00, 597kB/s] 


NameError: name 'idm' is not defined

In [3]:
# Shape (rows, columns) of the frame read from train_transaction.csv; the
# label names that file rather than the identity table.
print("train_transaction:", kaggle_set.shape)

train_identity  : (590540, 394)


In [None]:
# Load the data from my computer. This cell is currently disabled because we're loading from Drive; the data are tabular and of mixed types.
#kaggle_set = pd.read_csv('/Users/sara/Desktop/train_transaction.csv')

In [4]:
# Check the proportion of fraud cases so we can think about what would be a
# good final contamination level.
fraud_proportion = kaggle_set['isFraud'].sum() / kaggle_set.shape[0]

# Drop the labels so they can't be used as a factor. drop() already returns a
# new frame, so an extra .copy() of the full dataset is not needed.
df = kaggle_set.drop(columns=['isFraud'])

# Add a per-row count of NaNs across the columns. Missingness is kept as a
# count rather than one-hot encoded because one-hot encoding would cluster
# fraud cases together.
df['nan_count'] = df.isnull().sum(axis=1)

# --- Categorical factors -----------------------------------------------------
categorical_df = df.select_dtypes(include=['object']).copy()
# Fill missing values in categorical columns with the mode of the column.
for column in categorical_df.columns:
    if categorical_df[column].isnull().any():
        modes = categorical_df[column].mode()
        # An all-NaN column has no mode; use an explicit placeholder instead
        # of failing on an empty lookup.
        fill_value = modes.iloc[0] if not modes.empty else 'missing'
        categorical_df[column] = categorical_df[column].fillna(fill_value)

# Encode every categorical column as integer codes, one encoder per column.
categorical_df = categorical_df.apply(lambda x: LabelEncoder().fit_transform(x.astype(str)))

# --- Numerical factors -------------------------------------------------------
# NOTE(review): every non-object column is treated as a feature here, which
# includes identifier-like columns such as TransactionID - confirm that is intended.
numerical_df = df.select_dtypes(exclude=['object']).copy()
for column in numerical_df.columns:
    # Convert to float and Z-normalize. Processing one column at a time keeps
    # peak memory low on this wide frame.
    values = numerical_df[column].astype(float)
    spread = values.std()
    centered = values - values.mean()
    # A constant (or single-value / all-NaN) column has std 0 or NaN; dividing
    # by it would turn the whole column into NaN/inf, so only centre it.
    if spread > 0:
        centered = centered / spread

    # Missing values take the column mean, which is 0 after normalization.
    numerical_df[column] = centered.fillna(0.0)
   

In [5]:
# Reduce dimensionality with PCA on the normalized numeric features. A float
# n_components asks scikit-learn to keep as many components as are needed to
# explain that share of the variance - 95% here.
# NOTE(review): an earlier comment on this line mentioned 79% (possibly
# Adrian's number); confirm which threshold is intended.
pca = PCA(n_components=0.95)
our_pca = pca.fit_transform(numerical_df.drop(columns='nan_count'))

# Name the components PC1..PCk, then put the encoded categorical columns and
# the nan_count feature back next to them.
component_labels = [f'PC{number}' for number in range(1, our_pca.shape[1] + 1)]
df_pca = pd.concat(
    [pd.DataFrame(our_pca, columns=component_labels), categorical_df.reset_index(drop=True)],
    axis=1,
)
df_pca['nan_count'] = numerical_df['nan_count'].reset_index(drop=True)

In [6]:
def detect_anomalies(data, contamination, random_state=42, source=None):
    """Fit an Isolation Forest on `data` and return per-row anomaly results.

    Parameters
    ----------
    data : DataFrame
        Fully numeric feature frame the forest is fitted and scored on.
    contamination : float
        Expected share of anomalies, passed straight to IsolationForest.
    random_state : int, default 42
        Seed for the forest so repeated runs give the same result.
    source : DataFrame, optional
        Frame whose columns are carried into the result so flagged rows can be
        inspected. Defaults to the notebook-level `df`, which keeps existing
        calls working unchanged. Must have one row per row of `data`.

    Returns
    -------
    DataFrame
        A copy of `source` with three added columns:
        `anomaly_score` (raw decision-function value, negative = anomalous),
        `anomaly_likelihood` (score rescaled to [0, 1], 1 = most anomalous) and
        `is_anomaly` (1 for flagged rows, 0 otherwise).

    Raises
    ------
    ValueError
        If `source` and `data` do not have the same number of rows.
    """
    if source is None:
        source = df  # notebook-level frame holding the original columns
    if len(source) != len(data):
        raise ValueError(
            f"source has {len(source)} rows but data has {len(data)}; "
            "they must describe the same rows"
        )

    # Step 4: Isolation Forest
    model = IsolationForest(contamination=contamination, random_state=random_state)
    model.fit(data)

    scores = model.decision_function(data)
    # Flip the min-max scaling so that a higher value means more anomalous.
    scaled_scores = 1 - MinMaxScaler().fit_transform(scores.reshape(-1, 1))

    # Step 5: Package results
    results = source.copy()
    results['anomaly_score'] = scores
    results['anomaly_likelihood'] = scaled_scores.flatten()
    # predict() labels a row -1 exactly when its decision-function value is
    # negative, so the flag is read off `scores` instead of scoring every row
    # a second time.
    results['is_anomaly'] = (scores < 0).astype(int)

    return results

In [7]:
# This is where the model is actually run. Contamination is set to 0.3 for
# reasons discussed later in the notebook.
# The forest is fitted on df_pca; the returned frame is built from df, which is
# mixed-type data with missing values.
results = detect_anomalies(data=df_pca, contamination=0.3)

In [8]:
# Attach the original fraud labels so the flagged rows can be compared with them.
results['is_fraud'] = kaggle_set['isFraud']
# Display the five rows with the highest anomaly likelihood.
results.sort_values(by="anomaly_likelihood", ascending=False).iloc[:5]

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,V335,V336,V337,V338,V339,nan_count,anomaly_score,anomaly_likelihood,is_anomaly,is_fraud
583412,3570412,15605960,430.0,S,11755,174.0,150.0,visa,195.0,credit,...,0.0,0.0,104060.0,104060.0,104060.0,118,-0.381264,1.0,1,0
452627,3439627,11560420,250.0,S,10024,321.0,150.0,visa,144.0,credit,...,250.0,0.0,750.0,750.0,750.0,95,-0.369513,0.973614,1,0
429406,3416406,10859391,600.0,S,12316,548.0,150.0,visa,195.0,credit,...,0.0,0.0,64800.0,64800.0,64800.0,47,-0.363836,0.960867,1,0
85794,3072794,1811724,50.0,H,15186,512.0,150.0,mastercard,224.0,debit,...,0.0,0.0,0.0,0.0,0.0,165,-0.362491,0.957845,1,0
507373,3494373,13291141,300.0,S,9043,170.0,150.0,visa,195.0,credit,...,0.0,0.0,0.0,0.0,0.0,77,-0.351051,0.932156,1,0


In [9]:
# Fold label and prediction into one integer code per row:
# is_fraud contributes 4 (or 0) and is_anomaly subtracts 1 (or 0).
results['final_ans'] = 4 * results['is_fraud'] - results['is_anomaly']

outcome_codes = results['final_ans']
TP_count = outcome_codes.eq(3).sum()   # fraud and flagged:          4 - 1 = 3
FP_count = outcome_codes.eq(-1).sum()  # not fraud but flagged:      0 - 1 = -1
FN_count = outcome_codes.eq(4).sum()   # fraud but not flagged:      4 - 0 = 4
TN_count = outcome_codes.eq(0).sum()   # not fraud and not flagged:  0 - 0 = 0

# Each ratio falls back to 0 when its denominator is empty.
flagged_total = TP_count + FP_count
fraud_total = TP_count + FN_count
Precision = TP_count / flagged_total if flagged_total > 0 else 0
Recall = TP_count / fraud_total if fraud_total > 0 else 0
precision_plus_recall = Precision + Recall
F1_score = 2 * (Precision * Recall) / precision_plus_recall if precision_plus_recall > 0 else 0

print(f"Precision: {Precision:.4f}, Recall: {Recall:.4f}, F1 Score: {F1_score:.4f}")

Precision: 0.0753, Recall: 0.6460, F1 Score: 0.1349


In [None]:
# In this cell we analyse which contamination level is optimal for anomaly detection.
# Choosing this level based on our entire dataset is a bit of overfitting,
# so we ultimately ignore it in favor of a contamination level based on the fraud
# proportion in the dataset, biased towards improved recall.
# We selected 30% contamination as our best judgement for improving recall as much
# as we reasonably can, but one can adjust this as needed.

def find_optimal_contamination(data, contamination_range=np.arange(0.01, 0.5, 0.01),
                               labels=None, random_state=42):
    """Sweep contamination levels and return the one with the best F1 score.

    The forest is fitted and scored once. Contamination does not influence how
    the trees are built - it only sets the score threshold below which a row is
    flagged - so each level is evaluated by re-thresholding the same scores at
    the matching percentile (the rule scikit-learn uses to derive `offset_`).

    Parameters
    ----------
    data : DataFrame
        Fully numeric feature frame to fit and score.
    contamination_range : iterable of float
        Contamination levels to evaluate.
    labels : array-like, optional
        Ground-truth fraud labels (1 = fraud), one per row of `data`, in the
        same row order. Defaults to the notebook-level `kaggle_set['isFraud']`.
    random_state : int, default 42
        Seed for the Isolation Forest.

    Returns
    -------
    tuple
        `(best_contamination, best_f1)`; `(0, 0)` when no level reaches an
        F1 score above zero.

    Raises
    ------
    ValueError
        If `labels` and `data` do not have the same number of rows.

    Prints one line with F1, precision and recall per evaluated level.
    """
    if labels is None:
        labels = kaggle_set['isFraud']  # notebook-level ground truth
    is_fraud = np.asarray(labels) == 1
    if len(is_fraud) != len(data):
        raise ValueError(
            f"labels has {len(is_fraud)} entries but data has {len(data)} rows"
        )

    # Fit and score once; only the threshold changes inside the loop.
    model = IsolationForest(random_state=random_state)
    model.fit(data)
    raw_scores = model.score_samples(data)

    best_f1 = 0
    best_contamination = 0

    for contaminations in contamination_range:
        # Rows scoring below this percentile are the ones a forest fitted with
        # this contamination level would flag.
        threshold = np.percentile(raw_scores, 100.0 * contaminations)
        is_anomaly = raw_scores < threshold

        TP_count = np.sum(is_anomaly & is_fraud)
        FP_count = np.sum(is_anomaly & ~is_fraud)
        FN_count = np.sum(~is_anomaly & is_fraud)

        Precision = TP_count / (TP_count + FP_count) if (TP_count + FP_count) > 0 else 0
        Recall = TP_count / (TP_count + FN_count) if (TP_count + FN_count) > 0 else 0
        F1_score = 2 * (Precision * Recall) / (Precision + Recall) if (Precision + Recall) > 0 else 0
        print(f"Contamination: {contaminations:.2f}, F1 Score: {F1_score:.4f}, Precision: {Precision:.4f}, Recall: {Recall:.4f}")

        if F1_score > best_f1:
            best_f1 = F1_score
            best_contamination = contaminations

    return best_contamination, best_f1

In [None]:
# Sweep the contamination levels on the PCA feature frame. The call prints the
# F1 score for each level and returns the best one.
# Note: the contamination level is crucial for the performance of the Isolation Forest model.
# The optimal level is around 0.06 for our dataset
# (Contamination: 0.06, F1 Score: 0.2135, Precision: 0.1690, Recall: 0.2898).
find_optimal_contamination(data=df_pca)

Contamination: 0.01, F1 Score: 0.1094, Precision: 0.2460, Recall: 0.0703
Contamination: 0.02, F1 Score: 0.1663, Precision: 0.2286, Recall: 0.1307
Contamination: 0.03, F1 Score: 0.1967, Precision: 0.2130, Recall: 0.1826
Contamination: 0.04, F1 Score: 0.2082, Precision: 0.1952, Recall: 0.2232
Contamination: 0.05, F1 Score: 0.2130, Precision: 0.1811, Recall: 0.2587
Contamination: 0.06, F1 Score: 0.2135, Precision: 0.1690, Recall: 0.2898
Contamination: 0.07, F1 Score: 0.2113, Precision: 0.1584, Recall: 0.3170
Contamination: 0.08, F1 Score: 0.2086, Precision: 0.1499, Recall: 0.3428


(np.float64(0.060000000000000005), np.float64(0.21352681118083286))

In [None]:
# Compute the ROC AUC of the anomaly likelihood against the fraud labels, so
# this model can be compared with other models.
from sklearn.metrics import roc_auc_score
roc_auc = roc_auc_score(
    y_true=results['is_fraud'],
    y_score=results['anomaly_likelihood'],
)
print(f"ROC AUC Score: {roc_auc:.4f}")


ROC AUC Score: 0.7383
