In [1]:
import pandas as pd
import numpy as np
import random

# Seed NumPy's legacy global RNG so the simulated dataset is identical on every run.
np.random.seed(42)

n_samples = 1000

# --- Simulated transaction features --------------------------------------
# The order of the draws below fixes the generated values for the given seed,
# so the statements must stay in this sequence.
transaction_amount = np.clip(
    np.random.exponential(scale=1500, size=n_samples), 1, 15000
)
recent_7d_payment_methods_count = np.random.poisson(lam=1.5, size=n_samples)
num_recent_login_failures = np.random.poisson(lam=1, size=n_samples)
account_age_days = np.random.randint(1, 365, size=n_samples)
is_domestic_ip = np.random.choice(
    [True, False], size=n_samples, p=[0.85, 0.15]
)

# --- Fraud label (for realism) -------------------------------------------
# Each entry is a boolean array marking the transactions that trip one signal.
fraud_signals = [
    transaction_amount > 5000,
    recent_7d_payment_methods_count >= 3,
    num_recent_login_failures >= 3,
    (account_age_days <= 30) & (transaction_amount > 3000),
    ~is_domestic_ip & (transaction_amount > 2000),
]
# Number of signals tripped per transaction (booleans sum as 0/1).
fraud = np.sum(fraud_signals, axis=0)
# A transaction is labelled fraudulent when at least 2 signals fire.
label = (fraud >= 2).astype(int)

# --- Assemble the DataFrame ----------------------------------------------
df = pd.DataFrame(dict(
    transaction_amount=np.round(transaction_amount, 2),
    recent_7d_payment_methods_count=recent_7d_payment_methods_count,
    num_recent_login_failures=num_recent_login_failures,
    account_age_days=account_age_days,
    is_domestic_ip=is_domestic_ip,
    label=label,
))



In [2]:
# Preview the simulated dataset; the bare expression is rendered as the cell's output.
df

Unnamed: 0,transaction_amount,recent_7d_payment_methods_count,num_recent_login_failures,account_age_days,is_domestic_ip,label
0,703.90,0,1,258,True,0
1,4515.18,4,0,249,True,0
2,1975.12,2,0,77,True,0
3,1369.41,1,0,180,True,0
4,254.44,2,0,66,True,0
...,...,...,...,...,...,...
995,144.08,1,0,6,True,0
996,3739.05,1,2,214,True,0
997,220.70,0,1,206,True,0
998,4500.74,0,1,67,False,0


In [3]:
# Write the simulated dataset to a CSV file in the working directory.
# index=False leaves the DataFrame's row index out of the file.
df.to_csv('sample_fraud_data.csv', index=False)

In [4]:
import pandas as pd
from IPython.display import display

# Path to the data file from the book's GitHub repository (chapter07 folder)
DATA_PATH = "sample_fraud_data.csv"

sample_data = pd.read_csv(DATA_PATH)
print("Sample transaction data ('sample_fraud_data.csv') loaded.")
print("--------------------------------------------------")
print("First 5 rows:")
display(sample_data.head())
print("\n--------------------------------------------------")
print(f"Dataset shape: {sample_data.shape}")
print("--------------------------------------------------")
print("Fraud vs. Legitimate distribution:")
# Share of each class in the `label` column, as a two-column table.
# The column names are set explicitly via rename_axis / reset_index(name=...)
# instead of renaming whatever reset_index() happens to produce: the default
# names differ between pandas versions ('index'/'label' before 2.0,
# 'label'/'proportion' from 2.0), which previously left the class-label
# column titled "Proportion".
label_distribution = (
    sample_data['label']
    .value_counts(normalize=True)
    .rename_axis('Label')
    .reset_index(name='Proportion')
)
display(label_distribution)
print("--------------------------------------------------")


Sample transaction data ('sample_fraud_data.csv') loaded.
--------------------------------------------------
First 5 rows:


Unnamed: 0,transaction_amount,recent_7d_payment_methods_count,num_recent_login_failures,account_age_days,is_domestic_ip,label
0,703.9,0,1,258,True,0
1,4515.18,4,0,249,True,0
2,1975.12,2,0,77,True,0
3,1369.41,1,0,180,True,0
4,254.44,2,0,66,True,0



--------------------------------------------------
Dataset shape: (1000, 6)
--------------------------------------------------
Fraud vs. Legitimate distribution:


Unnamed: 0,Proportion,proportion
0,0,0.962
1,1,0.038


--------------------------------------------------


In [5]:
def calculate_rule_detection_performance(df, rules_conditions, label_col='label'):
    """
    Evaluate a set of rules against labelled transactions.

    Each rule is a pandas query string; it is applied to ``df`` with
    ``DataFrame.query`` and the rows it selects are treated as "flagged".

    Parameters
    ----------
    df : pandas.DataFrame
        Transactions to evaluate. Must contain every column referenced by the
        rule query strings, plus ``label_col``.
    rules_conditions : dict
        Mapping of rule name -> pandas query string.
    label_col : str, default 'label'
        Name of the column whose values are summed to count fraudulent rows
        (1 = fraud, 0 = legitimate).

    Returns
    -------
    pandas.DataFrame
        One row per rule, in the iteration order of ``rules_conditions``, with
        the columns 'Rule Name', 'Legitimate Transactions Flagged',
        'Fraudulent Transactions Flagged' and 'Detection Rate (Fraud/Flagged)'.
        The detection rate is the precision of the rule (fraud flagged divided
        by total flagged) and is 0.0 when the rule flags nothing. The columns
        are present even when ``rules_conditions`` is empty.
    """
    result_columns = [
        'Rule Name',
        'Legitimate Transactions Flagged',
        'Fraudulent Transactions Flagged',
        'Detection Rate (Fraud/Flagged)',
    ]
    results_list = []

    for rule_name, condition_query_str in rules_conditions.items():
        # Rows selected by the rule's condition.
        flagged_by_rule_df = df.query(condition_query_str)
        num_total_flagged = len(flagged_by_rule_df)

        if num_total_flagged > 0:
            # Sum of the 1s in the label column; cast so both branches yield a plain int.
            num_fraud_flagged = int(flagged_by_rule_df[label_col].sum())
            num_legit_flagged = num_total_flagged - num_fraud_flagged
            detection_rate = num_fraud_flagged / num_total_flagged
        else:
            # Nothing flagged: report zeros rather than dividing by zero.
            num_fraud_flagged = 0
            num_legit_flagged = 0
            detection_rate = 0.0

        results_list.append({
            'Rule Name': rule_name,
            'Legitimate Transactions Flagged': num_legit_flagged,
            'Fraudulent Transactions Flagged': num_fraud_flagged,
            'Detection Rate (Fraud/Flagged)': detection_rate
        })

    # Passing `columns` keeps the schema stable when no rules were supplied.
    return pd.DataFrame(results_list, columns=result_columns)

# Expert rules as (rule name, pandas query string) pairs; the query strings
# are evaluated against the transaction columns by DataFrame.query.
rule_specs = [
    ('High Value Transaction (>5000)',
     "transaction_amount > 5000"),
    ('Multiple Recent Payment Methods (>=3)',
     "recent_7d_payment_methods_count >= 3"),
    ('Frequent Login Failures Recently (>=3)',
     "num_recent_login_failures >= 3"),
    ('New Account & Large Transaction (>3000, age<=30d)',
     "(account_age_days <= 30) and (transaction_amount > 3000)"),
    ('International IP & High Value Transaction (>2000)',
     "(is_domestic_ip == False) and (transaction_amount > 2000)"),
]
defined_rules = dict(rule_specs)

# Score every rule against a copy of the loaded sample data.
rule_performance_df = calculate_rule_detection_performance(
    df=sample_data.copy(),
    rules_conditions=defined_rules,
)

print("\nPerformance of Defined Fraud Detection Rules:")
display(rule_performance_df)



Performance of Defined Fraud Detection Rules:


Unnamed: 0,Rule Name,Legitimate Transactions Flagged,Fraudulent Transactions Flagged,Detection Rate (Fraud/Flagged)
0,High Value Transaction (>5000),18,13,0.419355
1,Multiple Recent Payment Methods (>=3),160,30,0.157895
2,Frequent Login Failures Recently (>=3),53,16,0.231884
3,"New Account & Large Transaction (>3000, age<=30d)",7,12,0.631579
4,International IP & High Value Transaction (>2000),24,12,0.333333
