In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
#load data, in this case data are tabluar and of mixed types
kaggle_set = pd.read_csv('/Users/sara/Desktop/train_transaction.csv')

In [10]:
# Check the proportion of fraud cases so we can set contamination level
fraud_proportion = kaggle_set['isFraud'].sum()/kaggle_set.shape[0]  

# Drop the labels so they can't be used as a factor
df = kaggle_set.copy().drop(columns=['isFraud']) 

# Add a count of nans in columns. Not using one-hot encoding because it will cluster fraud cases together. 
df['nan_count'] = df.isnull().sum(axis=1)

# drop categoriacal and normalize numerical columns
for column in df.columns:
    if df[column].dtype == 'object':
        # Drop categorical columns
        df = df.drop(columns=[column])
    else:
        # Convert numerical columns to float type and Z-normalize
        df[column] = df[column].astype(float)
        df[column] = (df[column]- df[column].mean())/ df[column].std()
        df[column] = df[column].fillna(df[column].mean())

In [None]:
#USE PCA to reduce dimensionality

pca = PCA(n_components=0.79)  # Keep 79% of variance (I think this was Adrian's number but correct me if I'm wrong)
our_pca = pca.fit_transform(df)

df_pca = pd.DataFrame(our_pca, columns=[f'PC{i+1}' for i in range(our_pca.shape[1])])

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,...,PC28,PC29,PC30,PC31,PC32,PC33,PC34,PC35,PC36,PC37
0,-1.001706,-1.801878,-0.489763,-0.109898,0.580226,-2.620180,-0.095879,0.873571,0.503034,-0.664809,...,-0.536312,0.384384,-0.013301,-0.141127,-0.468333,-0.516122,0.182616,0.285175,-0.211934,-0.098105
1,-0.859679,-0.893833,-0.456142,-0.711481,-0.113292,-1.714956,-0.785104,1.024596,0.527242,0.871230,...,-0.246510,0.531709,-0.753970,0.311011,0.972958,-0.143113,-0.331173,0.571395,-0.012171,0.203299
2,-1.091533,-2.071031,-0.561796,0.065166,0.918129,-3.101187,-0.023381,0.652952,0.101622,0.815020,...,-0.577988,0.216474,0.277321,-0.438641,-1.098503,-0.642928,0.425275,0.065258,-0.360146,-0.013328
3,1.771189,-3.027413,-0.398349,1.065552,2.465435,-1.447535,-2.322455,-0.519230,1.271450,1.551222,...,-1.100929,1.068537,-0.201278,-1.461905,-0.107566,0.298104,0.180501,0.327528,-0.693011,-0.375399
4,-0.561672,0.906274,-0.431643,-1.183070,-0.566948,-0.714862,-0.336493,-0.100575,-0.828533,-0.347156,...,-0.658807,0.551564,3.150147,0.694725,0.721570,0.233560,-0.329801,-2.858915,0.965574,0.012583
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
590535,-1.044135,-3.500048,-0.132132,-0.032347,0.181965,0.108341,1.023852,-0.151815,0.393984,-5.159322,...,0.299378,-0.317615,0.434326,-0.251797,0.262653,-0.078158,-0.417996,-0.683644,-0.785237,-0.557063
590536,-1.044411,-2.149833,-0.735307,0.260718,1.126555,-3.342198,0.265988,0.474282,-0.058550,0.792514,...,0.196928,-0.411799,-0.126302,0.622010,1.055668,0.334589,-0.339464,0.070562,0.098586,0.072097
590537,-1.251552,-3.044963,-0.049614,-0.652464,-0.832677,0.602407,2.215143,-1.259183,-1.518077,0.120776,...,0.295979,-0.154167,-0.027807,0.398860,0.886591,0.462746,-0.225308,0.042711,0.137775,0.033404
590538,0.394821,-1.037966,-1.016117,3.789701,7.115883,-2.146840,1.437945,-3.407979,-2.819554,3.143359,...,0.216333,-1.566922,0.315043,1.460021,-0.656892,0.726706,0.079019,-0.517470,0.836246,-0.147128


In [22]:
def detect_anomalies(data, contamination=0.1, random_state=42):
#takes a DataFrame, applies Isolation Forest, and returns a DataFrame with anomaly scores and predictions.

    # Step 4: Isolation Forest
    model = IsolationForest(contamination=contamination, random_state=random_state)
    model.fit(data)

    scores = model.decision_function(data)
    preds = model.predict(data)
    scaled_scores = 1 - MinMaxScaler().fit_transform(scores.reshape(-1, 1))

    # Step 5: Package results
    results = df.copy()
    results['anomaly_score'] = scores
    results['anomaly_likelihood'] = scaled_scores.flatten()
    results['is_anomaly'] = (preds == -1).astype(int)

    return results

In [None]:
# df is mixed-type data with missing values
results = detect_anomalies(df_pca, contamination=fraud_proportion)

In [25]:
results['is_fraud'] = kaggle_set['isFraud']  # Add original fraud labels for comparison
results.sort_values("anomaly_likelihood", ascending=False).head() # View top anomalies

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,dist1,...,V335,V336,V337,V338,V339,nan_count,anomaly_score,anomaly_likelihood,is_anomaly,is_fraud
211103,-0.493719,-0.535608,1.10792,-0.872391,0.99779,-0.281828,-1.36452,0.454745,0.074098,-2.6719640000000003e-17,...,142.057914,200.659506,-0.082803,-0.138042,-0.123567,-2.986215,-0.266314,1.0,1,0
25947,-1.579842,-1.451762,1.191545,0.378739,-1.194953,-0.281828,-0.103745,0.385943,0.074098,-2.6719640000000003e-17,...,-0.152632,-0.103909,1.173767,0.629057,0.907175,-1.14586,-0.264835,0.996821,1,0
214364,-0.47459,-0.515538,1.003388,-1.449396,-1.194953,-0.281828,-1.413012,0.454745,0.074098,-2.6719640000000003e-17,...,107.398528,110.065584,-0.082803,-0.138042,-0.123567,-2.986215,-0.260212,0.986882,1,0
583412,1.69024,1.783247,1.233357,0.378739,-1.194953,-0.281828,-0.103745,0.385943,0.074098,-2.6719640000000003e-17,...,-0.152632,-0.103909,155.582193,94.890937,127.565761,-1.120299,-0.259896,0.986203,1,0
25908,-1.58007,-1.451886,1.191545,0.378739,-1.194953,-0.281828,-0.103745,0.385943,0.074098,-2.6719640000000003e-17,...,-0.152632,-0.103909,0.545482,0.245508,0.391804,-1.14586,-0.248482,0.961665,1,0


In [26]:
results['final_ans']= results['is_fraud']*4 - results['is_anomaly']

TP_count = (results['final_ans'] == 3).sum() # True positive is 4-1 = 3
FP_count = (results['final_ans'] == -1).sum() # False positive is 0-1 = -1
FN_count = (results['final_ans'] == 4).sum() # False negative is 4-0 = 4
TN_count = (results['final_ans'] == 0).sum() # True negative is 0-0 = 0

Precision = TP_count / (TP_count + FP_count) if (TP_count + FP_count) > 0 else 0
Recall = TP_count / (TP_count + FN_count) if (TP_count + FN_count) > 0 else 0
F1_score = 2 * (Precision * Recall) / (Precision + Recall) if (Precision + Recall) > 0 else 0

print(f"Precision: {Precision:.4f}, Recall: {Recall:.4f}, F1 Score: {F1_score:.4f}")

Precision: 0.1988, Recall: 0.1988, F1 Score: 0.1988
