In [None]:
# Standard Libraries
import math
from sklearn.utils import resample
import pickle
import warnings
# Scientific Computing and Data Manipulation
import numpy as np
import pandas as pd
import scipy
from scipy import stats
from scipy.spatial.distance import mahalanobis

# Machine Learning Models and Utilities
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn.decomposition import PCA
from sklearn.ensemble import (
    IsolationForest,
    RandomForestClassifier,
    StackingClassifier,
    VotingClassifier
)
from sklearn.experimental import enable_halving_search_cv
from sklearn.feature_selection import RFE, SelectFromModel, mutual_info_classif
from sklearn.linear_model import LogisticRegression, LassoCV
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    average_precision_score,
    balanced_accuracy_score,
    brier_score_loss,
    classification_report,
    confusion_matrix,
    f1_score,
    log_loss,
    matthews_corrcoef,
    precision_recall_curve,
    precision_score,
    recall_score,
    roc_auc_score,
    roc_curve,
    auc
)
from sklearn.model_selection import (
    GridSearchCV,
    HalvingRandomSearchCV,
    RandomizedSearchCV,
    RepeatedStratifiedKFold,
    cross_val_score,
    learning_curve,
    train_test_split
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler, label_binarize
from sklearn.svm import OneClassSVM
from sklearn.tree import DecisionTreeClassifier, plot_tree

# XGBoost
import xgboost as xgb
from xgboost import XGBClassifier

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import shap

# Imbalanced Learning
import imblearn
from imblearn.over_sampling import SMOTE

# Parallelization
from joblib import Parallel, delayed

# Google Colab Utilities
from google.colab import drive, files

# Warnings
# Install a blanket "ignore" filter: every warning category is silenced for
# the rest of the session.
warnings.simplefilter('ignore')

# Google Drive Mount
# Attach Google Drive to the Colab filesystem at the mount point below.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Load the previously saved models and the filtered training frame from Drive.
# NOTE: unpickling can execute arbitrary code, so these files must come from a
# trusted source.
DATASET_DIR = '/content/drive/My Drive/IEEE_dataset'


def _load_pickle(file_name):
    """Unpickle and return the object stored at ``DATASET_DIR/file_name``."""
    with open(f'{DATASET_DIR}/{file_name}', 'rb') as handle:
        return pickle.load(handle)


voting_model_RandomForest = _load_pickle('voting_model_RandomForest.pkl')
voting_model_LASSO = _load_pickle('voting_model_LASSO.pkl')
svm_models = _load_pickle('one_class_svm_models.pkl')
filtered_trainDF = pd.read_pickle(f'{DATASET_DIR}/filtered_trainDF.pkl')

In [None]:
# Feature / target preparation.
# Splits the frame into features and target, turns timestamp and categorical
# columns into integers, and fills missing values with statistics computed on
# the training split only.
X = filtered_trainDF.drop(columns=['isFraud'])  # Features
y = filtered_trainDF['isFraud']  # Target

# Datetime columns (and any column whose name contains 'Timestamp') are parsed
# with unparseable entries coerced to NaT, then cast to int64.
# NOTE(review): confirm how NaT values produced by the coercion should behave
# under the int64 cast; they are not treated as missing afterwards.
for col in X.columns:
    if X[col].dtype == 'datetime64[ns]' or 'Timestamp' in col:
        X[col] = pd.to_datetime(X[col], errors='coerce').astype('int64')  # Convert to timestamp (int)

# The split parameters are left unchanged on purpose: the saved models are
# evaluated on this hold-out, presumably the same one used when they were
# trained - TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Work on independent copies so the column assignments below modify exactly
# these frames and do not rely on chained assignment into a slice of X.
X_train = X_train.copy()
X_test = X_test.copy()

label_encoders = {}
for col in X_train.columns:
    col_dtype = X_train[col].dtype
    if col_dtype == 'object' or isinstance(col_dtype, pd.CategoricalDtype):
        le = LabelEncoder()
        # Fit on the union of train and test values (as strings) so that
        # transforming the test split never meets an unseen label.
        le.fit(pd.concat([X_train[col], X_test[col]]).astype(str).unique())
        X_train[col] = le.transform(X_train[col].astype(str))
        X_test[col] = le.transform(X_test[col].astype(str))
        label_encoders[col] = le

# Impute missing values in both splits with values derived from the training
# split only.
for col in X_train.columns:
    col_dtype = X_train[col].dtype
    # Any plain NumPy integer/float column counts as numeric; the previous
    # check matched only float64/int64, so e.g. float32 columns were imputed
    # with the mode instead of the mean.
    if isinstance(col_dtype, np.dtype) and col_dtype.kind in 'iuf':
        # Numeric columns: fill with training mean (accumulated in float64 so
        # narrow float dtypes do not overflow or lose precision).
        impute_value = X_train[col].astype('float64').mean()
    else:
        # Non-numeric columns: fill with training mode.
        modes = X_train[col].mode()
        if modes.empty:
            # Column is entirely missing in the training split: there is no
            # value to impute with, so leave it untouched.
            continue
        impute_value = modes.iloc[0]
    X_train[col] = X_train[col].fillna(impute_value)
    X_test[col] = X_test[col].fillna(impute_value)




In [None]:
# Predict with the two voting models and the One-Class SVM, combine the three
# predictions by majority vote, and report metrics on the hold-out split.

# The recorded TypeError ("'XGBClassifier' object is not subscriptable") shows
# that estimators_[0] is already the fitted XGBClassifier, not a
# (name, estimator) tuple, so it must not be indexed with [1].
expected_features = voting_model_RandomForest.estimators_[0].get_booster().feature_names
if expected_features is None:
    raise ValueError(
        "The XGBoost booster carries no feature names; "
        "cannot determine which columns the models expect."
    )

# Select only those features from X_test, in the order the booster lists them.
X_test_subset = X_test[expected_features]

# Predict for balanced models (Random Forest and Lasso)
y_pred_rf = voting_model_RandomForest.predict(X_test_subset)
y_pred_lasso = voting_model_LASSO.predict(X_test_subset)

# Predict for the One-Class SVM model
# NOTE(review): assumes svm_models is a single estimator trained on the same
# feature subset - verify against the notebook that produced the pickle.
y_pred_svm = svm_models.predict(X_test_subset)

# Convert SVM predictions to 0/1 (0 for inliers, 1 for outliers)
y_pred_svm = np.where(y_pred_svm == 1, 0, 1)  # Assuming 1 is inlier, -1 is outlier in SVM

# Combine predictions using majority voting. With three binary voters the
# majority label is 1 exactly when at least two of them vote 1.
# Assumes all three prediction arrays contain only 0/1 labels.
votes = np.vstack([y_pred_rf, y_pred_lasso, y_pred_svm])
y_pred_combined = (votes.sum(axis=0) >= 2).astype(np.asarray(y_pred_rf).dtype)

# Evaluate the combined predictions
# Note: the ROC AUC below is computed from hard 0/1 labels, not from scores.
print("Combined Model Performance:")
print(f"Balanced Accuracy: {balanced_accuracy_score(y_test, y_pred_combined)}")
print(f"ROC AUC Score: {roc_auc_score(y_test, y_pred_combined)}")
print(classification_report(y_test, y_pred_combined))

# Confusion matrix
cm_combined = confusion_matrix(y_test, y_pred_combined)
print(f"Combined Model Confusion Matrix: \n{cm_combined}")

TypeError: 'XGBClassifier' object is not subscriptable