In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from ucimlrepo import fetch_ucirepo 

# Fetch the dataset from UCI repository
def fetch_adult_data():
    adult = fetch_ucirepo(id=2)
    X = adult.data.features
    y = adult.data.targets
    y = y.squeeze()  # Remove the second axis
    return X, y

def clean_adult_data(X: pd.DataFrame, y: pd.Series) -> (pd.DataFrame, pd.Series):
    # Renaming columns for easier access
    X.rename(columns={'capital-gain': 'gain', 'capital-loss': 'loss', 'native-country': 'country',
                      'hours-per-week': 'hours', 'marital-status': 'marital', 'sex': 'gender'}, inplace=True)
    
    # Replace missing values ('?') with NaN
    X.replace('?', pd.NA, inplace=True)
    
    # Drop rows with missing values
    X.dropna(inplace=True)
    y = pd.Series(y)[X.index].reset_index(drop=True)  # Keep the same indices for `y`
    
    # Map target labels
    y = y.map({'<=50K': 0, '>50K': 1, '<=50K.': 0, '>50K.': 1})

    # Normalize numerical features
    numerical_features = ['age', 'fnlwgt', 'education-num', 'gain', 'loss', 'hours']
    scaler = MinMaxScaler()
    X[numerical_features] = scaler.fit_transform(X[numerical_features])
    
    return X.reset_index(drop=True), y.reset_index(drop=True)

def encode_features(X: pd.DataFrame) -> pd.DataFrame:
    # Apply one-hot encoding to categorical columns
    X_encoded = pd.get_dummies(X).astype(float)
    return X_encoded

# Fetch and clean the data
X, y = fetch_adult_data()
X_cleaned, y_cleaned = clean_adult_data(X, y)

# Encode the features
X_encoded = encode_features(X_cleaned)


In [27]:
import torch
import torch.nn as nn
import torch.optim as optim

# Define PyTorch Logistic Regression model
class LogisticRegressionModel(nn.Module):
    def __init__(self, input_dim):
        super(LogisticRegressionModel, self).__init__()
        self.linear = nn.Linear(input_dim, 1)
    
    def forward(self, x):
        return torch.sigmoid(self.linear(x))

# Training the logistic regression model
def train_pytorch_logistic_regression(X_train, y_train, input_dim, lr=0.01, epochs=200):
    model = LogisticRegressionModel(input_dim)
    criterion = nn.BCELoss()
    optimizer = optim.SGD(model.parameters(), lr=lr)
    
    X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
     # Convert labels (y_train) to numeric type, then to PyTorch tensor
    y_train_numeric = pd.to_numeric(y_train, errors='coerce').astype(float)
    y_train_tensor = torch.tensor(y_train_numeric.values, dtype=torch.float32).view(-1, 1)
    
    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        outputs = model(X_train_tensor)
        loss = criterion(outputs, y_train_tensor)
        loss.backward()
        optimizer.step()

        if (epoch + 1) % 10 == 0:
            print(f'Epoch [{epoch + 1}/{epochs}], Loss: {loss.item():.4f}')
    
    return model


In [28]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_cleaned, test_size=0.3, random_state=42)

# Model 1: Train on all attributes except age
X_train_no_age = X_train.drop(columns=['age'])
X_test_no_age = X_test.drop(columns=['age'])

# Ensure all columns are numeric (floats)
X_train_no_age = X_train_no_age.astype(float)
X_test_no_age = X_test_no_age.astype(float)

input_dim_no_age = X_train_no_age.shape[1]
model_no_age = train_pytorch_logistic_regression(X_train_no_age, y_train, input_dim_no_age)

# Model 2: Train on all attributes except age, sex, and race
X_train_no_age_sex_race = X_train.drop(columns=['age', 'gender_Female', 'gender_Male'])

X_test_no_age_sex_race = X_test.drop(columns=['age', 'gender_Female', 'gender_Male'])

# Ensure all columns are numeric (floats)
X_train_no_age_sex_race = X_train_no_age_sex_race.astype(float)
X_test_no_age_sex_race = X_test_no_age_sex_race.astype(float)

input_dim_no_age_sex_race = X_train_no_age_sex_race.shape[1]
model_no_age_sex_race = train_pytorch_logistic_regression(X_train_no_age_sex_race, y_train, input_dim_no_age_sex_race)

Epoch [10/200], Loss: 0.6689
Epoch [20/200], Loss: 0.6423
Epoch [30/200], Loss: 0.6212
Epoch [40/200], Loss: 0.6043
Epoch [50/200], Loss: 0.5907
Epoch [60/200], Loss: 0.5795
Epoch [70/200], Loss: 0.5703
Epoch [80/200], Loss: 0.5626
Epoch [90/200], Loss: 0.5560
Epoch [100/200], Loss: 0.5503
Epoch [110/200], Loss: 0.5454
Epoch [120/200], Loss: 0.5410
Epoch [130/200], Loss: 0.5372
Epoch [140/200], Loss: 0.5336
Epoch [150/200], Loss: 0.5304
Epoch [160/200], Loss: 0.5275
Epoch [170/200], Loss: 0.5247
Epoch [180/200], Loss: 0.5222
Epoch [190/200], Loss: 0.5197
Epoch [200/200], Loss: 0.5174
Epoch [10/200], Loss: 0.6762
Epoch [20/200], Loss: 0.6553
Epoch [30/200], Loss: 0.6382
Epoch [40/200], Loss: 0.6239
Epoch [50/200], Loss: 0.6119
Epoch [60/200], Loss: 0.6018
Epoch [70/200], Loss: 0.5931
Epoch [80/200], Loss: 0.5856
Epoch [90/200], Loss: 0.5791
Epoch [100/200], Loss: 0.5733
Epoch [110/200], Loss: 0.5682
Epoch [120/200], Loss: 0.5635
Epoch [130/200], Loss: 0.5593
Epoch [140/200], Loss: 0.555

In [29]:
from captum.attr import IntegratedGradients
import torch

# Function to apply Integrated Gradients
def explain_with_ig(model, X_test):
    ig = IntegratedGradients(model)
    
    # Convert the test set to PyTorch tensor
    X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
    
    # Compute attributions using Integrated Gradients
    attributions, delta = ig.attribute(X_test_tensor, target=0, return_convergence_delta=True)
    
    return attributions, delta

In [30]:
# Apply Integrated Gradients to Model 1 (excluding age)
attributions_no_age, delta_no_age = explain_with_ig(model_no_age, X_test_no_age)

# Print the attributions and convergence delta for Model 1
print("Attributions for Model 1 (Excluding Age):", attributions_no_age)
print("Convergence delta for Model 1:", delta_no_age)

# Apply Integrated Gradients to Model 2 (excluding age, sex, and race)
attributions_no_age_sex_race, delta_no_age_sex_race = explain_with_ig(model_no_age_sex_race, X_test_no_age_sex_race)

# Print the attributions and convergence delta for Model 2
print("Attributions for Model 2 (Excluding Age, Sex, and Race):", attributions_no_age_sex_race)
print("Convergence delta for Model 2:", delta_no_age_sex_race)


Attributions for Model 1 (Excluding Age): tensor([[ 0.0027, -0.0135,  0.0000,  ..., -0.0000, -0.0000,  0.0000],
        [ 0.0009, -0.0040,  0.0000,  ..., -0.0659, -0.0000,  0.0000],
        [ 0.0015, -0.0080,  0.0000,  ..., -0.0582, -0.0000,  0.0000],
        ...,
        [ 0.0018, -0.0092,  0.0000,  ..., -0.0669, -0.0000,  0.0000],
        [ 0.0018, -0.0089,  0.0000,  ..., -0.0650, -0.0000,  0.0000],
        [ 0.0012, -0.0091,  0.0000,  ..., -0.0665, -0.0000,  0.0000]],
       dtype=torch.float64)
Convergence delta for Model 1: tensor([-4.5800e-08, -3.8636e-08, -1.2884e-08,  ..., -5.6466e-09,
        -3.0050e-09, -1.3951e-08], dtype=torch.float64)
Attributions for Model 2 (Excluding Age, Sex, and Race): tensor([[ 0.0015, -0.0373, -0.0000,  ..., -0.0000, -0.0000,  0.0000],
        [ 0.0005, -0.0112, -0.0000,  ..., -0.0302, -0.0000,  0.0000],
        [ 0.0009, -0.0237, -0.0000,  ..., -0.0286, -0.0000,  0.0000],
        ...,
        [ 0.0010, -0.0258, -0.0000,  ..., -0.0310, -0.0000,  0.

In [31]:
print(len(attributions_no_age))

13567


In [40]:
# Function to calculate fidelity+ (comprehensiveness)
def fidelity_plus(model, X_test, original_preds, attributions, top_k=5):
    X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
    
    # Sort attributions by importance
    top_attributions = torch.argsort(attributions, descending=True)
    
    # Remove top_k important features
    for i in range(X_test_tensor.size(0)):
        top_features = top_attributions[i, :top_k]
        X_test_tensor[i, top_features] = 0
    
    # Re-run the model to get new predictions with removed features
    with torch.no_grad():
        new_predictions = (model(X_test_tensor) > 0.5).int()
    
    # Compare new predictions to original predictions (fidelity+)
    fidelity_plus_score = 1 - (1 / X_test_tensor.size(0)) * torch.sum((new_predictions == original_preds).float())
    
    return fidelity_plus_score.item()

# Function to calculate fidelity- (sufficiency)
def fidelity_minus(model, X_test, original_preds, attributions, top_k=5):
    X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
    
    # Sort attributions by importance
    top_attributions = torch.argsort(attributions, descending=True)
    
    # Keep only top_k important features and zero out the rest
    for i in range(X_test_tensor.size(0)):
        mask = torch.zeros(X_test_tensor.size(1), dtype=torch.float32)
        top_features = top_attributions[i, :top_k]
        mask[top_features] = 1
        X_test_tensor[i] = X_test_tensor[i] * mask
    
    # Re-run the model to get new predictions with only important features
    with torch.no_grad():
        new_predictions = (model(X_test_tensor) > 0.5).int()
    
    # Compare new predictions to original predictions (fidelity-)
    fidelity_minus_score = 1 - (1 / X_test_tensor.size(0)) * torch.sum((new_predictions == original_preds).float())
    
    return fidelity_minus_score.item()



In [44]:
# For model 1 (excluding age):
original_preds_no_age = (model_no_age(torch.tensor(X_test_no_age.values, dtype=torch.float32)) > 0.5).int()

# For model 2 (excluding age, sex, and race):
original_preds_no_age_sex_race = (model_no_age_sex_race(torch.tensor(X_test_no_age_sex_race.values, dtype=torch.float32)) > 0.5).int()

# Calculate fidelity+ and fidelity- for model_no_age (first model)
fidelity_plus_score_no_age = fidelity_plus(model_no_age, X_test_no_age, original_preds_no_age, attributions_no_age, top_k=5000)
fidelity_minus_score_no_age = fidelity_minus(model_no_age, X_test_no_age, original_preds_no_age, attributions_no_age, top_k=5000)

# Calculate fidelity+ and fidelity- for model_no_age_sex_race (second model)
fidelity_plus_score_no_age_sex_race = fidelity_plus(model_no_age_sex_race, X_test_no_age_sex_race, original_preds_no_age_sex_race, attributions_no_age_sex_race, top_k=5000)
fidelity_minus_score_no_age_sex_race = fidelity_minus(model_no_age_sex_race, X_test_no_age_sex_race, original_preds_no_age_sex_race, attributions_no_age_sex_race, top_k=5000)

# Print the results:
print(f"Fidelity+ (Comprehensiveness) Score for Model 1 (Excluding Age): {fidelity_plus_score_no_age}")
print(f"Fidelity- (Sufficiency) Score for Model 1 (Excluding Age): {fidelity_minus_score_no_age}")

print(f"Fidelity+ (Comprehensiveness) Score for Model 2 (Excluding Age, Sex, and Race): {fidelity_plus_score_no_age_sex_race}")
print(f"Fidelity- (Sufficiency) Score for Model 2 (Excluding Age, Sex, and Race): {fidelity_minus_score_no_age_sex_race}")

Fidelity+ (Comprehensiveness) Score for Model 1 (Excluding Age): 0.00044232606887817383
Fidelity- (Sufficiency) Score for Model 1 (Excluding Age): 5.960464477539063e-08
Fidelity+ (Comprehensiveness) Score for Model 2 (Excluding Age, Sex, and Race): 5.960464477539063e-08
Fidelity- (Sufficiency) Score for Model 2 (Excluding Age, Sex, and Race): 5.960464477539063e-08
