In [139]:
# General
import pandas as pd
import numpy as np

# Feature Engineering
from sklearn.feature_selection import mutual_info_classif
from sklearn.tree import DecisionTreeClassifier

# Data preprocessing
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Models
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Evaluations
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report, roc_auc_score, confusion_matrix

In [140]:
def load_data(file_path="/Users/lionsee/Desktop/ai-compliance-engine/datasets/cleaned_dataset.csv"):
    """Load the cleaned dataset"""
    return pd.read_csv(file_path)

def evaluate_model(true_labels, predicted_labels):
    """Evaluate the model performance with precision, recall, f1-score, and accuracy"""
    precision = precision_score(true_labels, predicted_labels)
    recall = recall_score(true_labels, predicted_labels)
    f1 = f1_score(true_labels, predicted_labels)
    accuracy = accuracy_score(true_labels, predicted_labels)
    print(f"Precision: {precision:.4f}") # % of TP out of all P
    print(f"Recall: {recall:.4f}") 
    print(f"F1-Score: {f1:.4f}") # measures the effectiveness of the model's classification
    print(f"Accuracy: {accuracy:.4f}")

def evaluate_model_performance(y_test, y_pred, y_prob, model_name="Model"):
    print(f"==== {model_name} ====")
    print(classification_report(y_test, y_pred))
    print(f"ROC AUC Score: {roc_auc_score(y_test, y_prob):.4f}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("\n")

In [141]:
df = load_data()
y = df['FLAG']
X = df.drop(columns=['FLAG', 'address hash']) 

# --- Feature Selection ---

# Correlation
correlation_scores = X.corrwith(y).abs()

# Mutual Information
mi_scores = mutual_info_classif(X, y)
mi_scores = pd.Series(mi_scores, index=X.columns)

# Random Forest Feature Importance
rf = RandomForestClassifier(random_state=42)
rf.fit(X, y)
rf_importance = pd.Series(rf.feature_importances_, index=X.columns)

# Normalize scores
correlation_scores_norm = (correlation_scores - correlation_scores.min()) / (correlation_scores.max() - correlation_scores.min())
mi_scores_norm = (mi_scores - mi_scores.min()) / (mi_scores.max() - mi_scores.min())
rf_importance_norm = (rf_importance - rf_importance.min()) / (rf_importance.max() - rf_importance.min())

# Weighted average of scores
combined_scores = (correlation_scores_norm + mi_scores_norm + rf_importance_norm) / 3

# Select top 20 features
top_20_features = combined_scores.sort_values(ascending=False).head(20).index

print(top_20_features)

# played around and found 20 was the best? 

Index(['Time Diff between first and last (Mins)',
       'ERC20 most sent token type hash', 'ERC20 min val rec',
       'ERC20 most rec token type hash', 'Total ERC20 tnxs',
       'ERC20 uniq rec token name', 'ERC20 uniq rec addr',
       'ERC20 uniq rec contract addr', 'ERC20 avg val rec',
       'max value received', 'Avg min between received tnx',
       'total ether received',
       'total transactions (including tnx to create contract',
       'avg val received', 'ERC20 max val rec', 'Received Tnx',
       'ERC20 total Ether received', 'total ether balance',
       'Unique Received From Addresses', 'Sent tnx'],
      dtype='object')


In [142]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X[top_20_features], y, test_size=0.2, random_state=42, stratify=y)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Logistic Regression
log_reg = LogisticRegression(max_iter=1000, random_state=42)
log_reg.fit(X_train_scaled, y_train)
y_pred_lr = log_reg.predict(X_test_scaled)
y_prob_lr = log_reg.predict_proba(X_test_scaled)[:, 1]
evaluate_model_performance(y_test, y_pred_lr, y_prob_lr, model_name="Logistic Regression")


### Decision Tree
dtree = DecisionTreeClassifier(max_depth=5, random_state=42)
dtree.fit(X_train, y_train)
y_pred_dt = dtree.predict(X_test)
y_prob_dt = dtree.predict_proba(X_test)[:, 1]
evaluate_model_performance(y_test, y_pred_dt, y_prob_dt, model_name="Decision Tree")

### Isolation Forest 
iso_forest = IsolationForest(n_estimators=100, contamination=0.1, random_state=42)  # Adjust 'contamination' as needed
iso_forest.fit(X_train)
y_pred_scores = iso_forest.decision_function(X_test)  
y_pred = iso_forest.predict(X_test)  # -1 = anomaly, 1 = normal (model convention)

# Change the labels to match our data
y_pred_binary = [1 if i == -1 else 0 for i in y_pred]

evaluate_model_performance(y_test, y_pred_binary, y_pred_scores, model_name="Isolation Forest")

==== Logistic Regression ====
              precision    recall  f1-score   support

         0.0       0.87      0.96      0.91      1533
         1.0       0.79      0.49      0.60       436

    accuracy                           0.86      1969
   macro avg       0.83      0.73      0.76      1969
weighted avg       0.85      0.86      0.84      1969

ROC AUC Score: 0.8640
Confusion Matrix:
[[1475   58]
 [ 223  213]]


==== Decision Tree ====
              precision    recall  f1-score   support

         0.0       0.98      0.98      0.98      1533
         1.0       0.92      0.92      0.92       436

    accuracy                           0.97      1969
   macro avg       0.95      0.95      0.95      1969
weighted avg       0.97      0.97      0.97      1969

ROC AUC Score: 0.9722
Confusion Matrix:
[[1498   35]
 [  33  403]]


==== Isolation Forest ====
              precision    recall  f1-score   support

         0.0       0.77      0.89      0.82      1533
         1.0      