In [2]:
# General
import pandas as pd
import numpy as np

# Feature Engineering
from sklearn.feature_selection import mutual_info_classif

# Data Preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Models
from sklearn.ensemble import RandomForestClassifier

# Evaluations
from sklearn.metrics import (
    precision_score, recall_score, f1_score, accuracy_score,
    classification_report, roc_auc_score, confusion_matrix
)

def load_data(file_path="/Users/davidchan/ai-compliance-engine/datasets/cleaned_dataset.csv"):
    """Load the cleaned dataset"""
    return pd.read_csv(file_path)

def evaluate_model_performance(y_test, y_pred, y_prob, model_name="Random Forest"):
    """Evaluate the model performance with precision, recall, f1-score, and ROC AUC"""
    print(f"==== {model_name} ====")
    print(classification_report(y_test, y_pred))
    print(f"ROC AUC Score: {roc_auc_score(y_test, y_prob):.4f}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("\n")

# Load Dataset
df = load_data()
y = df['FLAG']
X = df.drop(columns=['FLAG', 'address hash'])  # Drop unnecessary columns

# --- Feature Selection ---
# Correlation
correlation_scores = X.corrwith(y).abs()

# Mutual Information
mi_scores = mutual_info_classif(X, y)
mi_scores = pd.Series(mi_scores, index=X.columns)

# Random Forest Feature Importance
rf = RandomForestClassifier(random_state=42)
rf.fit(X, y)
rf_importance = pd.Series(rf.feature_importances_, index=X.columns)

# Normalize scores
correlation_scores_norm = (correlation_scores - correlation_scores.min()) / (correlation_scores.max() - correlation_scores.min())
mi_scores_norm = (mi_scores - mi_scores.min()) / (mi_scores.max() - mi_scores.min())
rf_importance_norm = (rf_importance - rf_importance.min()) / (rf_importance.max() - rf_importance.min())

# Weighted average of scores
combined_scores = (correlation_scores_norm + mi_scores_norm + rf_importance_norm) / 3

# Select top 20 features
top_20_features = combined_scores.sort_values(ascending=False).head(20).index
print(f"Top 20 Features: {top_20_features}")

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X[top_20_features], y, test_size=0.2, random_state=42, stratify=y)

# Scale Features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Random Forest Classifier ---
rf_model = RandomForestClassifier(
    n_estimators=100, 
    max_depth=10,  # Adjust depth based on experimentation
    random_state=42
)
rf_model.fit(X_train_scaled, y_train)

# Predictions
y_pred_rf = rf_model.predict(X_test_scaled)
y_prob_rf = rf_model.predict_proba(X_test_scaled)[:, 1]

# Evaluate Model Performance
evaluate_model_performance(y_test, y_pred_rf, y_prob_rf, model_name="Random Forest Classifier")

Top 20 Features: Index(['Time Diff between first and last (Mins)',
       'ERC20 most sent token type hash', 'ERC20 min val rec',
       'ERC20 most rec token type hash', 'Total ERC20 tnxs',
       'ERC20 uniq rec token name', 'ERC20 uniq rec addr', 'ERC20 avg val rec',
       'ERC20 uniq rec contract addr', 'max value received',
       'Avg min between received tnx', 'total ether received',
       'total transactions (including tnx to create contract',
       'avg val received', 'ERC20 total Ether received', 'Received Tnx',
       'ERC20 max val rec', 'total ether balance',
       'Unique Received From Addresses', 'Sent tnx'],
      dtype='object')
==== Random Forest Classifier ====
              precision    recall  f1-score   support

         0.0       0.98      1.00      0.99      1533
         1.0       0.99      0.94      0.96       436

    accuracy                           0.98      1969
   macro avg       0.99      0.97      0.98      1969
weighted avg       0.98      0.98  

In [4]:
import joblib

# Save the trained model
joblib.dump(rf_model, '/Users/davidchan/ai-compliance-engine/app/ai_models/fraud_model_v1.pkl')
print("Model saved successfully")


Model saved successfully
