# Fraud Detection Model

In [1]:
import warnings
warnings.filterwarnings('ignore')


import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import IsolationForest
# from sklearn.neighbors import LocalOutlierFactor
import joblib
from datetime import datetime
from sklearn.metrics import confusion_matrix, classification_report, recall_score

# reading in synthetic data that should be in the same folder
df = pd.read_csv("synthetic_plaid_transactions.csv")

In [2]:

NUMBER_FEATURES = 12

# using scaler created to normalize data
def apply_scaler(df, state):
    data = create_matrix(df, state)
    return state['scaler'].transform(data)

In [4]:


def create_matrix(df, state):
    rows = []
    for _, row in df.iterrows():
        rows.append(create_row(row, state))
    return np.vstack(rows) if rows else np.zeros((0, NUMBER_FEATURES))


# converting the transaction into numbers to be added to the matrix
def create_row(txn, state):
    features = []

    # adding amount features
    amount = float(txn.get('amount', 0.0))
    features.append(amount)
    # log to reduce impact from spike
    features.append(np.log1p(max(amount, 0.0)))


    # adding date and time related features
    dt = pd.to_datetime(txn.get('date', datetime.utcnow()), errors='coerce')
    if pd.isna(dt):
        dt = pd.Timestamp(datetime.utcnow())

    features.extend([
        dt.dayofweek,
        dt.day,
        dt.month,
        getattr(dt, 'hour', 12) if not pd.isna(getattr(dt, 'hour', np.nan)) else 12,
    ])
    features.append(1 if dt.dayofweek >= 5 else 0)

    # get merchant and the channel encodings
    merchant = txn.get('merchant_name', 'Unknown')
    if pd.isna(merchant):
        merchant = 'Unknown'
    channel = txn.get('payment_channel', 'online')
    if pd.isna(channel):
        channel = 'online'

    merchant_encoder = state['merchant_encoder']
    channel_encoder = state['channel_encoder']

    if merchant not in merchant_encoder.classes_:
        merchant = 'Unknown'
    if channel not in channel_encoder.classes_:
        channel = 'online'


    merchant_encoded = int(merchant_encoder.transform([merchant])[0])
    channel_encoded  = int(channel_encoder.transform([channel])[0])

    features.append(merchant_encoded)
    features.append(channel_encoded)

    # adding txn status feature
    features.append(1 if bool(txn.get('pending', False)) else 0)

    mean_merchant = state['merchant_mean']
    std_merchant  = state['merchant_std']

    # using zscore to determine how far from the avg it is
    if merchant in mean_merchant:
        m_mean = float(mean_merchant[merchant])
        m_std  = float(std_merchant.get(merchant, 0.0))
        denom  = m_std if m_std and m_std > 0 else 1.0
        z = (amount - m_mean) / (denom + 1e-6)
        
    else:
        g_mean = float(state.get('global_amount_mean', amount))
        g_std  = float(state.get('global_amount_std', 1.0))
        denom  = g_std if g_std and g_std > 0 else 1.0
        z = (amount - g_mean) / (denom + 1e-6)
        # features.append(10.0)
        # features.append(10.0)
    z = float(np.clip(z, -5.0, 5.0))

    features.append(z)
    features.append(abs(z))
    return np.array(features, dtype=float)





In [5]:
# encode the data into numbers and determine scaler needed

feature_state = {}

# encoding categorical terms into numbers
merchant_encoder = LabelEncoder()
channel_encoder = LabelEncoder()

merchant_series = df.get('merchant_name', pd.Series(['Unknown'] * len(df))).fillna('Unknown')
channel_series  = df.get('payment_channel', pd.Series(['online'] * len(df))).fillna('online')

merchant_series = pd.concat([merchant_series, pd.Series(['Unknown'])], ignore_index=True)
channel_series  = pd.concat([channel_series,  pd.Series(['online', 'in_store'])], ignore_index=True)

merchant_encoder.fit(merchant_series)
channel_encoder.fit(channel_series)

# grouping data to get the avg and std of amounts
grp = df.groupby('merchant_name', dropna=False)['amount']
merchant_mean = grp.mean().to_dict()
merchant_std  = grp.std().fillna(0.0).to_dict()

# need these for any merchants that arn't in training data
global_mean = float(df['amount'].mean())
global_std  = float(df['amount'].std() or 1.0)

tmp_state = {
    'merchant_encoder': merchant_encoder,
    'channel_encoder' : channel_encoder,
    'merchant_mean'   : merchant_mean,
    'merchant_std'    : merchant_std,
    'global_amount_mean': global_mean,
    'global_amount_std' : global_std,
    'scaler'          : None,
}

# build out a feature table for the txns to determine best scaling
data_unscaled = create_matrix(df, tmp_state)
scaler = StandardScaler().fit(data_unscaled)

feature_state.update(tmp_state)
feature_state['scaler'] = scaler


In [6]:
# config and train models

isolation_forest = IsolationForest(
    contamination=0.02,
    n_estimators=500, #100
    random_state=42,
    n_jobs=-1,
    warm_start=False
)


print(f"Training IF Model")
train_data = apply_scaler(df, feature_state)
isolation_forest.fit(train_data)




Training IF Model


0,1,2
,n_estimators,500
,max_samples,'auto'
,contamination,0.02
,max_features,1.0
,bootstrap,False
,n_jobs,-1
,random_state,42
,verbose,0
,warm_start,False


In [7]:
models = {
    'isolation_forest': isolation_forest,
    'contamination': 0.02,
    'random_state': 42,
}

In [8]:
y_true = df['is_anomaly'].astype(int).values
y_pred = models['isolation_forest'].predict(train_data)

# -1 for outliers  1 for normal
y_pred = np.where(y_pred == -1, 1, 0)


con_matrix = confusion_matrix(y_true, y_pred)
tn, fp, fn, tp = con_matrix.ravel()

print("----Confusion Matrix")
print(con_matrix)
print(f"True Negatives : {tn}")
print(f"False Positives: {fp}")
print(f"False Negatives: {fn}")
print(f"True Positives : {tp}")
print()

print("----Classification Report")
print(classification_report(
    y_true, y_pred,
    target_names=['Normal', 'Fraud'],
    digits=3
))


----Confusion Matrix
[[32422   343]
 [  155   322]]
True Negatives : 32422
False Positives: 343
False Negatives: 155
True Positives : 322

----Classification Report
              precision    recall  f1-score   support

      Normal      0.995     0.990     0.992     32765
       Fraud      0.484     0.675     0.564       477

    accuracy                          0.985     33242
   macro avg      0.740     0.832     0.778     33242
weighted avg      0.988     0.985     0.986     33242



In [9]:
train_scores = -isolation_forest.score_samples(train_data)

# putting low_cutoff bit lower to setup for medium risk to be top 10% and high at 2%
# balancing out not catching all fraud but marking for review
low_cutoff  = np.quantile(train_scores, 0.90)
high_cutoff = np.quantile(train_scores, 0.98)



Chose the balanced setup ( contamination=0.02, n_estimators=500) because it helps the model catch more real fraud cases without flagging too many normal transactions by mistake. Setting it around 2% makes the model sensitive enough to catch unusual activity, but not so much that it flags everything. Went with this so the system can automatically catch the most likely fraud while marking medium-risk cases using threshold cutoffs which will be able to be reviewed in the system.

In [10]:
# save model and needed feature data
model_output_path = "fraud_model.joblib"


bundle = {
    'feature_state': {
        'merchant_encoder': feature_state['merchant_encoder'],
        'channel_encoder': feature_state['channel_encoder'],
        'merchant_mean': feature_state['merchant_mean'],
        'merchant_std': feature_state['merchant_std'],
        'scaler': feature_state['scaler'],
        'risk_thresholds': {
            'LOW_RISK_MAX': float(low_cutoff),
            'HIGH_RISK_MIN': float(high_cutoff)
        }
    },
    'models': {
        'isolation_forest': models['isolation_forest'],
        'contamination': models['contamination'],
        'random_state': models['random_state'],
    }
}
joblib.dump(bundle, model_output_path)

print(f"Model Saved")

Model Saved


### Running predictions on txns

In [11]:
# these are the utils that should be exposed to fraud detection service
def predict_single(transaction, feature_state, models):
    df = pd.DataFrame([transaction])

    thresholds = feature_state.get('risk_thresholds', {})
    LOW_RISK_MAX  = thresholds.get('LOW_RISK_MAX')
    HIGH_RISK_MIN = thresholds.get('HIGH_RISK_MIN')
    
    data = apply_scaler(df, feature_state)

    if_pred = models['isolation_forest'].predict(data)[0]
    if_score = float(-models['isolation_forest'].score_samples(data)[0])

    is_fraud = (if_pred == -1)

    if if_score >= HIGH_RISK_MIN:
        tier = "high"
    elif if_score >= LOW_RISK_MAX:
        tier = "medium"
    else:
        tier = "low"

    return {
        'is_fraud': bool(is_fraud),
        'fraud_score': if_score,
        'isolation_forest_prediction': int(if_pred),
        'risk_level': tier
    }


# will need util to load model and data
def load_pipeline(filepath):
    bundle = joblib.load(filepath)
    return bundle['feature_state'], bundle['models']


def predict_new_transaction(transaction_dict, model_path="fraud_model.joblib"):
    feature_state, models = load_pipeline(model_path)

    return predict_single(transaction_dict, feature_state, models)

In [12]:
txn_low1 = {
    "merchant_name": "Starbucks",
    "amount": 6.25,
    "payment_channel": "in_store",
    "pending": False,
    "date": "2025-10-23T08:15:00",
    "authorized_date": "2025-10-23T08:15:00",
}

txn_low2 = {
    "merchant_name": "Amazon",
    "amount": 48.99,
    "payment_channel": "online",
    "pending": False,
    "date": "2025-10-22T19:30:00",
    "authorized_date": "2025-10-22T19:30:00",
}

txn_low3 = {
    "merchant_name": "Uber",
    "amount": 12.80,
    "payment_channel": "online",
    "pending": False,
    "date": "2025-09-20T19:42:00"
}

txn_low4= {
    'transaction_id': 'new_001',
    'account_id': 1,
    'user_id': 1,
    'item_id': 1,
    'plaid_transaction_id': 'plaid_123456789',
    "merchant_name": "Chevron",
    "amount": 38.27,
    "payment_channel": "in_store",
    "pending": False,
    "date": "2025-09-18T18:55:00",
    "authorized_date": "2025-09-18T18:55:00",
    'iso_currency_code': 'USD',
},

txn_low5 = {
    'transaction_id': 'new_001',
    'account_id': 1,
    'user_id': 1,
    'item_id': 1,
    'plaid_transaction_id': 'plaid_123456789',
    "merchant_name": "Walmart",
    "amount": 41.16,
    "payment_channel": "in_store",
    "pending": False,
    "date": "2025-09-21T13:25:00",
    "authorized_date": "2025-09-21T13:25:00",
    'iso_currency_code': 'USD',
},


In [13]:
txn_med1 = {
    "merchant_name": "Best Buy",
    "amount": 275.49,
    "payment_channel": "in_store",
    "pending": False,
    "date": "2025-10-19T13:45:00",
    "authorized_date": "2025-10-19T13:45:00",
}

txn_med2 = {
    "merchant_name": "DoorDash",
    "amount": 89.75,
    "payment_channel": "online",
    "pending": False,
    "date": "2025-10-21T23:10:00",
    "authorized_date": "2025-10-21T23:10:00",
}


In [14]:
txn_high1 = {
    "merchant_name": "CryptoExchange.io",
    "amount": 2450.00,
    "payment_channel": "online",
    "pending": False,
    "date": "2025-10-18T02:35:00",
    "authorized_date": "2025-10-18T02:35:00",
}

txn_high2 = {
    "merchant_name": "Luxury Motors",
    "amount": 7800.00,
    "payment_channel": "in_store",
    "pending": False,
    "date": "2025-10-20T10:12:00",
    "authorized_date": "2025-10-20T10:12:00",
}

txn_high3 = {
    'transaction_id': 'new_001',
    'account_id': 1,
    'user_id': 1,
    'item_id': 1,
    'plaid_transaction_id': 'plaid_123456789',
    'name': 'Starbucks purchase',
    'merchant_name': 'Starbucks',
    'amount': 850.00,
    'iso_currency_code': 'USD',
    'date': '2025-10-24',
    'authorized_date': '2025-10-24',
    'pending': False,
    'payment_channel': 'online',
    'removed': False
}


In [15]:
for name, txn in {
    "LOW1": txn_low1,
    "LOW2": txn_low2,
    "LOW3": txn_low3,
    "LOW4": txn_low4,
    "LOW5": txn_low5,
    "MED1": txn_med1,
    "MED2": txn_med2,
    "HIGH1": txn_high1,
    "HIGH2": txn_high2,
    "HIGH3": txn_high3
}.items():
    res = predict_new_transaction(txn, "fraud_model.joblib")
    print(f"{name}: {res['risk_level']:<6}:  results={res}")


LOW1: low   :  results={'is_fraud': False, 'fraud_score': 0.45656183665083006, 'isolation_forest_prediction': 1, 'risk_level': 'low'}
LOW2: low   :  results={'is_fraud': False, 'fraud_score': 0.4553521359084569, 'isolation_forest_prediction': 1, 'risk_level': 'low'}
LOW3: low   :  results={'is_fraud': False, 'fraud_score': 0.4821653262675381, 'isolation_forest_prediction': 1, 'risk_level': 'low'}
LOW4: medium:  results={'is_fraud': False, 'fraud_score': 0.5332787658371704, 'isolation_forest_prediction': 1, 'risk_level': 'medium'}
LOW5: medium:  results={'is_fraud': False, 'fraud_score': 0.5332787658371704, 'isolation_forest_prediction': 1, 'risk_level': 'medium'}
MED1: medium:  results={'is_fraud': False, 'fraud_score': 0.543653925876238, 'isolation_forest_prediction': 1, 'risk_level': 'medium'}
MED2: low   :  results={'is_fraud': False, 'fraud_score': 0.4656770753884981, 'isolation_forest_prediction': 1, 'risk_level': 'low'}
HIGH1: high  :  results={'is_fraud': True, 'fraud_score': 0.