## MMTHE01 - Masters Thesis

### E3. Thesis - Apply and Evaluate different XAI methods - Case Study with the ANN Model

* Applying XAI on a Deep Learning AI model (ANN Model)

#### Importing the libraries

In [1]:
### import general libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import os
import time
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import recall_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

from sklearn.preprocessing import LabelEncoder

In [2]:
os.chdir(r'S:\Semester 4\Masters Thesis Report\6. Analysis')

#### Importing the dataset

In [3]:
dataset = pd.read_csv('train_dataset_final_encoded.csv')

In [4]:
dataset.head()

Unnamed: 0,isFraud,TransactionDT,TransactionAmt,card1,C3,C9,C12,C13,C14,TransactionID,...,card4_discover,card4_mastercard,card4_visa,card6_charge card,card6_credit,card6_debit,card6_debit or credit,M4_M0,M4_M1,M4_M2
0,0,86400,68.5,13926,0.0,1.0,0.0,1.0,1.0,2987000,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,0,86401,29.0,2755,0.0,0.0,0.0,1.0,1.0,2987001,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,0,86469,59.0,4663,0.0,1.0,0.0,1.0,1.0,2987002,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
3,0,86499,50.0,18132,0.0,1.0,0.0,25.0,1.0,2987003,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4,0,86506,50.0,4497,0.0,0.0,0.0,1.0,1.0,2987004,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [5]:
dataset.shape

(590540, 201)

### 5.1 Split the data into Train-Test

#### 5.1.1 Separate the features and the label

In [6]:
dataset_final = dataset.drop('TransactionID', axis=1)

In [7]:
#X = dataset.iloc[:, 1:].values
#y = dataset.iloc[:,0].values

In [8]:
X = dataset_final.iloc[:, 1:]
y = dataset_final.iloc[:,0]

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify=y, random_state = 1)

### 5.2 Applying SMOTE

In [None]:
# Applying SMOTE only to the training data
smote = SMOTE(random_state=1)
X_train, y_train = smote.fit_resample(X_train_im, y_train_im)

### 5.3 Feature Scaling

In [10]:
sc = StandardScaler()
X_tn_scaled = sc.fit_transform(X_train)
X_tt_scaled = sc.fit_transform(X_test)


# Convert to dataframe
X_train_scaled = pd.DataFrame(X_tn_scaled, columns=X_train.columns)
X_test_scaled = pd.DataFrame(X_tt_scaled, columns=X_test.columns)

### 5.4 Model Fitting

In [11]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

In [12]:
# Build an ANN model
model = Sequential([
    Input(shape=(X_train.shape[1],)),  # Explicit Input layer instead of input_dim in Dense
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['AUC'])

# Early stopping to prevent overfitting
early_stop = EarlyStopping(monitor='val_AUC', patience=3, restore_best_weights=True, mode='max')

In [13]:
# Train the ANN model with timing
history = model.fit(
    X_train_scaled, y_train,
    validation_split=0.2,
    epochs=20,
    batch_size=256,
    callbacks=[early_stop],
    verbose=1
)

Epoch 1/20
[1m1477/1477[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 4ms/step - AUC: 0.7477 - loss: 0.1595 - val_AUC: 0.8621 - val_loss: 0.1040
Epoch 2/20
[1m1477/1477[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - AUC: 0.8437 - loss: 0.1094 - val_AUC: 0.8731 - val_loss: 0.1007
Epoch 3/20
[1m1477/1477[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step - AUC: 0.8543 - loss: 0.1051 - val_AUC: 0.8769 - val_loss: 0.0990
Epoch 4/20
[1m1477/1477[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - AUC: 0.8676 - loss: 0.1010 - val_AUC: 0.8814 - val_loss: 0.0970
Epoch 5/20
[1m1477/1477[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 4ms/step - AUC: 0.8759 - loss: 0.0978 - val_AUC: 0.8856 - val_loss: 0.0952
Epoch 6/20
[1m1477/1477[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - AUC: 0.8777 - loss: 0.0959 - val_AUC: 0.8859 - val_loss: 0.0950
Epoch 7/20
[1m1477/1477[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s

### 5.5 Applying XAI methods to the ANN Model

#### 5.5.6 Applying Permutation Importance to the ANN Model
* E3D. Thesis - Test different XAI methods with the ANN Model (Permutation Importance)

In [None]:
from sklearn.inspection import permutation_importance

In [None]:
start_time = time.time()

In [None]:
# Compute permutation importance
result = permutation_importance(
    model, X_test_scaled, y_test,
    n_repeats=10, random_state=42, scoring='roc_auc'
)

In [None]:
end_time = time.time()
explanation_time = end_time - start_time
print(f"Permutation Important on ANN (Explanation Time): {explanation_time:.2f} seconds")

In [None]:
# Store results in DataFrame
pi_df = pd.DataFrame({
    'feature': X_test.columns,
    'importance_mean': result.importances_mean,
    'importance_std': result.importances_std
}).sort_values(by='importance_mean', ascending=False)

In [None]:
print(pi_df.head(15))

In [None]:
# Plot top 15 features
top_n = 10
plt.figure(figsize=(10,6))
plt.barh(pi_df['feature'].head(top_n)[::-1], pi_df['importance_mean'].head(top_n)[::-1])
plt.xlabel("Permutation Importance (Mean decrease in ROC-AUC)")
plt.title("Top Features by Permutation Importance - ANN")
plt.show()

#### 5.5.2 Applying Counterfactual to the ANN Model
* E3D. Thesis - Test different XAI methods with the ANN Model (Counterfactual)

In [None]:
from sklearn.metrics import pairwise_distances

In [None]:
# Show floats with 2 decimal places and avoid scientific notation
pd.set_option('display.float_format', '{:.4f}'.format)

In [None]:
# Assume: X_train, X_test, model
query_instance = X_test_scaled.iloc[0].copy()
query_instance_values = query_instance.values

In [None]:
def predict_label(model, x):
    """
    Predict class for a single instance.
    """
    if hasattr(model, "predict_proba"):
        return model.predict_proba(x.reshape(1, -1))[0,1]  # probability of class 1
    else:
        return model.predict(x.reshape(1, -1))[0]

In [None]:
def distance(x1, x2):
    """Euclidean distance between two instances"""
    return np.linalg.norm(x1 - x2)

In [None]:
def generate_counterfactual(model, x0, total_cfs=3, max_trials=10000, step_size=0.1):
    """
    Generates counterfactuals using random perturbations.
    """
    cf_list = []
    trials = 0
    
    while len(cf_list) < total_cfs and trials < max_trials:
        # Perturb features randomly
        x_cf = x0 + np.random.normal(0, step_size, size=x0.shape)
        
        # Check if prediction flips
        pred_orig = predict_label(model, x0)
        pred_cf = predict_label(model, x_cf)
        
        # For binary classification, flip the label
        if (pred_orig < 0.5 and pred_cf >= 0.5) or (pred_orig >= 0.5 and pred_cf < 0.5):
            cf_list.append(x_cf)
        
        trials += 1
    
    return np.array(cf_list)

In [None]:
start_time = time.time()

In [None]:
counterfactuals = generate_counterfactual(model=model, x0=query_instance_values, total_cfs=3)

In [None]:
end_time = time.time()
explanation_time = end_time - start_time
print(f"Counterfactual on ANN (Explanation Time): {explanation_time:.2f} seconds")

In [None]:
df_cf = pd.DataFrame(counterfactuals, columns=X_test_scaled.columns)
df_compare = pd.concat([query_instance.to_frame().T, df_cf], keys=['Original', 'Counterfactual'])
df_transposed = df_compare.T

In [None]:
df_transposed

#### 5.5.3 Applying Adversarial Explanations to the ANN Model
* E3D. Thesis - Test different XAI methods with the ANN Model (Adversarial Explanation)

In [None]:
from scipy.optimize import differential_evolution

In [None]:
# Define adversarial explanation function 
def adversarial_explanation(model,
    x_orig,                     # 1D numpy array (already preprocessed to model input space)
    target_label=None,          # desired target label (0 or 1). If None -> flip original label.
    feature_bounds=None,        # list of (min, max) for each feature (in same scaled space as x_orig)
    maxiter=200,                # DE iterations
    popsize=15,                 # DE population size multiplier
    penalty_coef=50.0,          # strength of constraint penalty
    norm='l2',                  # 'l2' or 'linf'
    random_state=0
):
    """
    Find a minimal perturbation delta such that model.predict_proba(x_orig + delta) yields target_label.
    Uses differential_evolution (global, gradient-free).
    Returns: dict { 'delta', 'x_adv', 'orig_prob', 'adv_prob', 'success', 'distance' }
    """
    rng = np.random.RandomState(random_state)
    x_orig = np.asarray(x_orig).astype(float).ravel()
    n = x_orig.size

    # current predicted label and probability
    prob_orig = model.predict_proba(x_orig.reshape(1, -1))[0,1]
    label_orig = int(prob_orig >= 0.5)
    if target_label is None:
        target = 1 - label_orig
    else:
        target = int(target_label)

    # bounds for delta: by default allow changes within (min_feature - val, max_feature - val)
    if feature_bounds is None:
        # set bounds from training feature ranges (we'll use training min/max scaled)
        # Compute safe bounds around x_orig: +/- 3 STD (in scaled space that's reasonable)
        # but to be general, allow range [-clip, +clip] where clip = max(abs(min,max)) * 1.1
        # Here we set per-feature bounds to ensure we stay inside plausible values.
        fb = []
        X_all = np.vstack([X_train_scaled.values, X_test_scaled.values])
        min_col = X_all[:, :].min(axis=0)
        max_col = X_all[:, :].max(axis=0)
        for i in range(n):
            # bounds for delta so that x_adv remains in [min_col[i], max_col[i]]
            lo = min_col[i] - x_orig[i]
            hi = max_col[i] - x_orig[i]
            fb.append((lo, hi))
    else:
        # feature_bounds given as list of (min,max) for each feature in input space
        fb = []
        for i, (mn, mx) in enumerate(feature_bounds):
            fb.append((mn - x_orig[i], mx - x_orig[i]))

    # objective: minimize norm(delta) + penalty * max(0, threshold - prob_target)
    def objective(delta_flat):
        delta = np.array(delta_flat)
        x_candidate = x_orig + delta
        # clip to bounds (avoid invalid values)
        # compute predicted probability of target class
        proba = model.predict_proba(x_candidate.reshape(1, -1))[0, 1]
        # fitness for target: we want proba_target >= 0.5 if target==1; else <=0.5
        if target == 1:
            violation = max(0.0, 0.5 - proba)
        else:
            violation = max(0.0, proba - 0.5)
        # distance measure
        if norm == 'l2':
            dist = np.linalg.norm(delta)
        elif norm == 'linf':
            dist = np.max(np.abs(delta))
        else:
            dist = np.linalg.norm(delta)
        # objective: distance + penalty * violation
        return dist + penalty_coef * violation

    # differential evolution
    result = differential_evolution(
        objective,
        fb,
        maxiter=maxiter,
        popsize=popsize,
        tol=1e-5,
        polish=True,
        updating='deferred',
        seed=random_state,
        mutation=(0.5, 1.0),
        recombination=0.7,
    )

    delta_opt = result.x
    x_adv = x_orig + delta_opt
    prob_adv = model.predict_proba(x_adv.reshape(1, -1))[0,1]
    success = (prob_adv >= 0.5 and target == 1) or (prob_adv < 0.5 and target == 0)
    distance = np.linalg.norm(delta_opt) if norm == 'l2' else np.max(np.abs(delta_opt))

    return {
        'delta': delta_opt,
        'x_adv': x_adv,
        'orig_prob': prob_orig,
        'adv_prob': prob_adv,
        'success': success,
        'distance': distance,
        'result_obj': result
    }

In [None]:
start_time = time.time()

In [None]:
# Run adversarial explanation for one test instance
idx = 0
x0 = X_test_scaled.iloc[idx].values  # already scaled to model input
ae = adversarial_explanation(model, x0, target_label=None, maxiter=100, popsize=10, penalty_coef=200.0, norm='l2', random_state=0)

In [None]:
end_time = time.time()
explanation_time = end_time - start_time
print(f"Adversarial Expanations on ANN (Explanation Time): {explanation_time:.2f} seconds")

In [None]:
print("Original prob (class=1):", ae['orig_prob'])
print("Adversarial prob (class=1):", ae['adv_prob'])
print("Success flipped?:", ae['success'])
print("L2 distance of delta:", ae['distance'])

In [None]:
# show top changed features
delta = ae['delta']
df_changes = pd.DataFrame({
    'feature': X_test_scaled.columns,
    'orig': x0,
    'adv': ae['x_adv'],
    'delta': delta,
    'abs_delta': np.abs(delta)
}).sort_values('abs_delta', ascending=False)

print("\nTop feature changes (by absolute perturbation):")
print(df_changes.head(10).to_string(index=False))

In [None]:
# Take top 10 features by absolute change
top_changes = df_changes.head(10).sort_values('abs_delta', ascending=True)

plt.figure(figsize=(8, 6))
plt.barh(top_changes['feature'], top_changes['delta'], color='skyblue')
plt.xlabel('Change (delta)')
plt.title('Top 10 Features Changed by Adversarial Example')
plt.grid(axis='x')
plt.show()