In [16]:
# Import necessary libraries
import ipaddress
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import joblib



In [12]:
# Load data
def load_data(file_path):
    df = pd.read_csv(file_path)
    print(f"Loaded {df.shape[0]} transactions with {df.shape[1]} features")
    return df
df = load_data('output.csv')
df.head
df.dropna()

Loaded 2512 transactions with 16 features


Unnamed: 0,TransactionID,AccountID,TransactionAmount,TransactionDate,TransactionType,Location,DeviceID,IP Address,MerchantID,Channel,CustomerAge,CustomerOccupation,TransactionDuration,LoginAttempts,AccountBalance,PreviousTransactionDate
0,TX000001,AC00128,14.09,2023-04-11 16:29:14,Debit,San Diego,D000380,162.198.218.92,M015,ATM,70,Doctor,81,1,5112.21,2024-11-04 08:08:08
1,TX000002,AC00455,376.24,2023-06-27 16:44:19,Debit,Houston,D000051,13.149.61.4,M052,ATM,68,Doctor,141,1,13758.91,2024-11-04 08:09:35
2,TX000003,AC00019,126.29,2023-07-10 18:16:08,Debit,Mesa,D000235,215.97.143.157,M009,Online,19,Student,56,1,1122.35,2024-11-04 08:07:04
3,TX000004,AC00070,184.50,2023-05-05 16:32:11,Debit,Raleigh,D000187,200.13.225.150,M002,Online,26,Student,25,1,8569.06,2024-11-04 08:09:06
4,TX000005,AC00411,13.45,2023-10-16 17:51:24,Credit,Atlanta,D000308,65.164.3.100,M091,Online,26,Student,198,1,7429.40,2024-11-04 08:06:39
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2507,TX002508,AC00297,856.21,2023-04-26 17:09:36,Credit,Colorado Springs,D000625,21.157.41.17,M072,Branch,33,Doctor,109,1,12690.79,2024-11-04 08:11:29
2508,TX002509,AC00322,251.54,2023-03-22 17:36:48,Debit,Tucson,D000410,49.174.157.140,M029,Branch,48,Doctor,177,1,254.75,2024-11-04 08:11:42
2509,TX002510,AC00095,28.63,2023-08-21 17:08:50,Debit,San Diego,D000095,58.1.27.124,M087,Branch,56,Retired,146,1,3382.91,2024-11-04 08:08:39
2510,TX002511,AC00118,185.97,2023-02-24 16:24:46,Debit,Denver,D000634,21.190.11.223,M041,Online,23,Student,19,1,1776.91,2024-11-04 08:12:22


In [13]:
# Process Data
def preprocess_data(df):
    processed_df = df.copy()
    
    processed_df['TransactionDate'] = pd.to_datetime(processed_df['TransactionDate'])
    processed_df['PreviousTransactionDate'] = pd.to_datetime(processed_df['PreviousTransactionDate'])
    
    processed_df['TimeSincePrevTx'] = (processed_df['TransactionDate'] - processed_df['PreviousTransactionDate']).dt.total_seconds() / 3600
    
    processed_df['TransactionHour'] = processed_df['TransactionDate'].dt.hour
    processed_df['TransactionDay'] = processed_df['TransactionDate'].dt.day_name()
    processed_df['IsWeekend'] = processed_df['TransactionDate'].dt.dayofweek >= 5
    
    processed_df['IPClass'] = processed_df['IP Address'].apply(lambda x: ipaddress.ip_address(x).version)
    
    return processed_df

df_processed = preprocess_data(df)
df_processed.head()

Unnamed: 0,TransactionID,AccountID,TransactionAmount,TransactionDate,TransactionType,Location,DeviceID,IP Address,MerchantID,Channel,...,CustomerOccupation,TransactionDuration,LoginAttempts,AccountBalance,PreviousTransactionDate,TimeSincePrevTx,TransactionHour,TransactionDay,IsWeekend,IPClass
0,TX000001,AC00128,14.09,2023-04-11 16:29:14,Debit,San Diego,D000380,162.198.218.92,M015,ATM,...,Doctor,81,1,5112.21,2024-11-04 08:08:08,-13743.648333,16,Tuesday,False,4
1,TX000002,AC00455,376.24,2023-06-27 16:44:19,Debit,Houston,D000051,13.149.61.4,M052,ATM,...,Doctor,141,1,13758.91,2024-11-04 08:09:35,-11895.421111,16,Tuesday,False,4
2,TX000003,AC00019,126.29,2023-07-10 18:16:08,Debit,Mesa,D000235,215.97.143.157,M009,Online,...,Student,56,1,1122.35,2024-11-04 08:07:04,-11581.848889,18,Monday,False,4
3,TX000004,AC00070,184.5,2023-05-05 16:32:11,Debit,Raleigh,D000187,200.13.225.150,M002,Online,...,Student,25,1,8569.06,2024-11-04 08:09:06,-13167.615278,16,Friday,False,4
4,TX000005,AC00411,13.45,2023-10-16 17:51:24,Credit,Atlanta,D000308,65.164.3.100,M091,Online,...,Student,198,1,7429.4,2024-11-04 08:06:39,-9230.254167,17,Monday,False,4


Unnamed: 0,TransactionID,AccountID,TransactionAmount,TransactionDate,TransactionType,Location,DeviceID,IP Address,MerchantID,Channel,...,CustomerOccupation,TransactionDuration,LoginAttempts,AccountBalance,PreviousTransactionDate,TimeSincePrevTx,TransactionHour,TransactionDay,IsWeekend,IPClass
0,TX000001,AC00128,14.09,2023-04-11 16:29:14,Debit,San Diego,D000380,162.198.218.92,M015,ATM,...,Doctor,81,1,5112.21,2024-11-04 08:08:08,-13743.648333,16,Tuesday,False,4
1,TX000002,AC00455,376.24,2023-06-27 16:44:19,Debit,Houston,D000051,13.149.61.4,M052,ATM,...,Doctor,141,1,13758.91,2024-11-04 08:09:35,-11895.421111,16,Tuesday,False,4
2,TX000003,AC00019,126.29,2023-07-10 18:16:08,Debit,Mesa,D000235,215.97.143.157,M009,Online,...,Student,56,1,1122.35,2024-11-04 08:07:04,-11581.848889,18,Monday,False,4
3,TX000004,AC00070,184.5,2023-05-05 16:32:11,Debit,Raleigh,D000187,200.13.225.150,M002,Online,...,Student,25,1,8569.06,2024-11-04 08:09:06,-13167.615278,16,Friday,False,4
4,TX000005,AC00411,13.45,2023-10-16 17:51:24,Credit,Atlanta,D000308,65.164.3.100,M091,Online,...,Student,198,1,7429.4,2024-11-04 08:06:39,-9230.254167,17,Monday,False,4


In [14]:
# FEATURE ENGINEERING
def engineer_features(df):
    featured_df = df.copy()
    
    account_stats = featured_df.groupby('AccountID').agg({
        'TransactionAmount': ['mean', 'std', 'max'],
        'TransactionDuration': ['mean', 'std'],
        'LoginAttempts': ['mean', 'max']
    })
    
    account_stats.columns = ['_'.join(col).strip() for col in account_stats.columns.values]
    account_stats = account_stats.reset_index()
    
    featured_df = pd.merge(featured_df, account_stats, on='AccountID', how='left')
    
    featured_df['AmountToAvgRatio'] = featured_df['TransactionAmount'] / featured_df['TransactionAmount_mean']
    
  
    location_mode = featured_df.groupby('AccountID')['Location'].apply(
        lambda x: x.mode()[0] if not x.mode().empty else None
    ).reset_index()
    location_mode.columns = ['AccountID', 'UsualLocation']
    
    featured_df = pd.merge(featured_df, location_mode, on='AccountID', how='left')
    featured_df['UnusualLocation'] = featured_df['Location'] != featured_df['UsualLocation']
    
    return featured_df

df_featured = engineer_features(df_processed)
df_featured.head()

Unnamed: 0,TransactionID,AccountID,TransactionAmount,TransactionDate,TransactionType,Location,DeviceID,IP Address,MerchantID,Channel,...,TransactionAmount_mean,TransactionAmount_std,TransactionAmount_max,TransactionDuration_mean,TransactionDuration_std,LoginAttempts_mean,LoginAttempts_max,AmountToAvgRatio,UsualLocation,UnusualLocation
0,TX000001,AC00128,14.09,2023-04-11 16:29:14,Debit,San Diego,D000380,162.198.218.92,M015,ATM,...,460.698571,487.969555,1397.59,138.428571,73.461684,1.0,1,0.030584,Austin,True
1,TX000002,AC00455,376.24,2023-06-27 16:44:19,Debit,Houston,D000051,13.149.61.4,M052,ATM,...,304.622857,240.297881,787.41,128.857143,59.159188,1.0,1,1.235101,Baltimore,True
2,TX000003,AC00019,126.29,2023-07-10 18:16:08,Debit,Mesa,D000235,215.97.143.157,M009,Online,...,237.0475,197.693348,489.31,71.25,43.645351,1.0,1,0.532762,Louisville,True
3,TX000004,AC00070,184.5,2023-05-05 16:32:11,Debit,Raleigh,D000187,200.13.225.150,M002,Online,...,236.48375,275.157989,890.24,100.5,61.836189,1.375,4,0.78018,Charlotte,True
4,TX000005,AC00411,13.45,2023-10-16 17:51:24,Credit,Atlanta,D000308,65.164.3.100,M091,Online,...,280.796667,310.729033,826.23,119.0,74.659226,1.0,1,0.047899,Atlanta,False


In [23]:
class RulesEngine:
    def __init__(self):
        self.rules = []
        self.thresholds = {'default': 0.5}
    
    def add_rule(self, rule_function, weight=1.0, description=""):
        self.rules.append({
            'function': rule_function,
            'weight': weight,
            'description': description
        })
    
    def set_threshold(self, category, threshold):
        self.thresholds[category] = threshold
    
    def score_transaction(self, transaction, category='default'):
        total_score = 0
        total_weight = 0
        detailed_scores = {}
        
        for rule in self.rules:
            rule_score = rule['function'](transaction)
            weighted_score = rule_score * rule['weight']
            total_score += weighted_score
            total_weight += rule['weight']
            detailed_scores[rule['description']] = rule_score
        
        # Normalize score
        final_score = total_score / total_weight if total_weight > 0 else 0
        is_fraud = final_score >= self.thresholds[category]
        
        return {
            'score': final_score,
            'is_fraud': is_fraud,
            'detailed_scores': detailed_scores,
            'threshold': self.thresholds[category]
        }
    
    def batch_score(self, transactions_df):
        results = []
        for _, transaction in transactions_df.iterrows():
            results.append(self.score_transaction(transaction))
        return results

def unusual_amount_rule(tx):
    return 0.25 if tx['TransactionAmount'] > tx.get('TransactionAmount_mean', 0) * 3 else 0

def unusual_hour_rule(tx):
    return 0.15 if (tx.get('TransactionHour', -1) >= 23 or tx.get('TransactionHour', -1) <= 4) else 0

def unusual_location_rule(tx):
    return 0.2 if tx.get('UnusualLocation', False) else 0

def login_attempts_rule(tx):
    return 0.1 * tx['LoginAttempts'] if tx['LoginAttempts'] > 1 else 0

def quick_transaction_rule(tx):
    return 0.1 if tx.get('TransactionDuration', 0) < tx.get('TransactionDuration_mean', 0) * 0.3 else 0

def occupation_amount_rule(tx):
    return 0.15 if (tx['CustomerOccupation'] == 'Student' and tx['TransactionAmount'] > 1000) else 0

def reporting_threshold_rule(tx):
    return 0.15 if 9000 <= tx['TransactionAmount'] <= 9999 else 0

def create_rules_engine():
    engine = RulesEngine()
    
    engine.add_rule(unusual_amount_rule, weight=1.0, description="Unusual transaction amount")
    engine.add_rule(unusual_hour_rule, weight=0.8, description="Unusual transaction hour")
    engine.add_rule(unusual_location_rule, weight=1.0, description="Unusual location")
    engine.add_rule(login_attempts_rule, weight=0.7, description="Multiple login attempts")
    engine.add_rule(quick_transaction_rule, weight=0.6, description="Unusually quick transaction")
    engine.add_rule(occupation_amount_rule, weight=0.5, description="High amount for occupation")
    engine.add_rule(reporting_threshold_rule, weight=1.0, description="Amount just below reporting threshold")
    
    engine.set_threshold('default', 0.5)
    
    return engine

rules_engine = create_rules_engine()
joblib.dump(rules_engine, 'fraud_rules_model.pkl')
print("Rules engine created and saved")

Rules engine created and saved


In [24]:
def test_rules_engine(engine, transactions_df):
    results = []
    
    for idx, transaction in transactions_df.iterrows():
        score_result = engine.score_transaction(transaction)
        
        results.append({
            'TransactionID': transaction['TransactionID'],
            'Score': score_result['score'],
            'IsFraud': score_result['is_fraud'],
            'Details': score_result['detailed_scores']
        })
    
    results_df = pd.DataFrame(results)
    return results_df

loaded_engine = joblib.load('fraud_rules_model.pkl')
test_results = test_rules_engine(loaded_engine, df_featured.head(10))
print("Test Results:")
test_results[['TransactionID', 'Score', 'IsFraud']].head()


Test Results:


Unnamed: 0,TransactionID,Score,IsFraud
0,TX000001,0.035714,False
1,TX000002,0.035714,False
2,TX000003,0.035714,False
3,TX000004,0.046429,False
4,TX000005,0.0,False


In [32]:
def score_transaction_realtime(transaction_data, rules_engine):
    # Preprocess transaction
    df = pd.DataFrame([transaction_data])
    df = preprocess_data(df)
    df = engineer_features(df)
    
    # Get transaction as dictionary
    transaction = df.iloc[0].to_dict()
    
    # Generate prediction
    result = rules_engine.score_transaction(transaction)
    
    return {
        'transaction_id': transaction_data.get('TransactionID', 'Unknown'),
        'fraud_score': result['score'],
        'is_fraud': result['is_fraud'],
        'risk_level': 'high' if result['score'] >= 0.5 else 'medium' if result['score'] >= 0.3 else 'low',
        'rule_details': result['detailed_scores']
    }

# Example transaction to score
sample_transaction = {
    'TransactionID': 'TX999999',
    'AccountID': 'AC00128',
    'TransactionAmount': 2000.00, 
    'TransactionDate': '2023-12-01 02:15:00',  
    'TransactionType': 'Debit',
    'Location': 'New York',  
    'DeviceID': 'D000999',
    'IP Address': '45.232.190.7',
    'MerchantID': 'M112',
    'Channel': 'Online',
    'CustomerAge': 70,
    'CustomerOccupation': 'Doctor',
    'TransactionDuration': 25,  
    'LoginAttempts': 6, 
    'AccountBalance': 5112.21,
    'PreviousTransactionDate': '2023-11-30 10:05:00'
}

# Score the sample transaction
sample_result = score_transaction_realtime(sample_transaction, loaded_engine)
print("Sample Transaction Score:")
for key, value in sample_result.items():
    print(f"{key}: {value}")


Sample Transaction Score:
transaction_id: TX999999
fraud_score: 0.09642857142857145
is_fraud: False
risk_level: low
rule_details: {'Unusual transaction amount': 0, 'Unusual transaction hour': 0.15, 'Unusual location': 0, 'Multiple login attempts': 0.6000000000000001, 'Unusually quick transaction': 0, 'High amount for occupation': 0, 'Amount just below reporting threshold': 0}
