In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import xgboost as xgb

# Step 1: Load the Dataset
data = pd.read_csv('/Users/davidchan/ai-compliance-engine/datasets/fraud_dataset.csv')

# Step 2: Preprocess the Data
# Encode the 'FLAG' column
data['FLAG'] = data['FLAG'].map({'Fraud': 1, 'Non - Fraud': 0})

# Handle missing values (if any)
data = data.fillna(0)

# Separate features and target variable
X = data.drop(['FLAG'], axis=1)
y = data['FLAG']

# Step 3: Split the Dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Train the XGBoost Model
# Convert data to DMatrix format (optional but efficient for XGBoost)
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Set hyperparameters
params = {
    'objective': 'binary:logistic',  # Binary classification
    'eval_metric': 'logloss',       # Evaluation metric
    'learning_rate': 0.1,           # Learning rate
    'max_depth': 6,                 # Max depth of trees
    'lambda': 1.0,                  # L2 regularization
    'alpha': 0.0,                   # L1 regularization
    'seed': 42                      # Random seed
}

# Train the model
xgb_model = xgb.train(params, dtrain, num_boost_round=100)

# Step 5: Make Predictions and Evaluate
y_pred_proba = xgb_model.predict(dtest)
y_pred = (y_pred_proba > 0.5).astype(int)  # Convert probabilities to binary predictions

# Evaluate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Accuracy: 0.9891
Precision: 0.9851
Recall: 0.9663
F1 Score: 0.9756


In [3]:
import joblib

# Save the trained model
joblib.dump(xgb_model, '/Users/davidchan/ai-compliance-engine/app/ai_models/fraud_model_v2.pkl')
print("Model saved successfully")


Model saved successfully


In [4]:
print(xgb_model.feature_names)

['Unnamed: 0', 'Avg min between sent tnx', 'Avg min between received tnx', 'Time Diff between first and last (Mins)', 'Sent tnx', 'Received Tnx', 'Number of Created Contracts', 'max value received ', 'avg val received', 'avg val sent', 'total Ether sent', 'total ether balance', ' ERC20 total Ether received', ' ERC20 total ether sent', ' ERC20 total Ether sent contract', ' ERC20 uniq sent addr', ' ERC20 uniq rec token name', ' ERC20 most sent token type', ' ERC20_most_rec_token_type']
