In [48]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from ucimlrepo import fetch_ucirepo 

# Fetch the dataset from UCI repository
def fetch_adult_data():
    adult = fetch_ucirepo(id=2)
    X = adult.data.features
    y = adult.data.targets
    y = y.squeeze()  # Remove the second axis
    return X, y

def clean_adult_data(X: pd.DataFrame, y: pd.Series) -> (pd.DataFrame, pd.Series):
    # Renaming columns for easier access
    X.rename(columns={'capital-gain': 'gain', 'capital-loss': 'loss', 'native-country': 'country',
                      'hours-per-week': 'hours', 'marital-status': 'marital'}, inplace=True)
    
    print("Column names:", X.columns)
    
    # Replace missing values ('?') with NaN
    X.replace('?', pd.NA, inplace=True)
    
    # Drop rows with missing values
    X.dropna(inplace=True)
    y = y.loc[X.index]
    
    # Map target labels
    y = y.map({'<=50K': 0, '>50K': 1, '<=50K.': 0, '>50K.': 1})

    # Group by categorical columns
    group_columns = ['workclass', 'education', 'marital', 'occupation', 'relationship', 'race', 'country']

    # Finding significant groups, i.e., counting samples per group
    adult_counting_groups = X.groupby(group_columns).size().to_frame('m').reset_index()
    
    
    # Filter for groups with more than `group_threshold` elements
    group_threshold = 50
    significant_groups = adult_counting_groups[adult_counting_groups['m'] > group_threshold]
    print("Number of significant groups:", significant_groups.shape[0])
    
    # Get the significant groups as a list of tuples
    adult_list_significant = significant_groups[group_columns].apply(tuple, axis=1).tolist()
    
    # Filtering for only rows that belong to the significant groups
    X_filtered = X[X[group_columns].apply(tuple, axis=1).isin(adult_list_significant)]
    y_filtered = y.reindex(X_filtered.index)

    # Drop any rows where y might have missing values after reindexing
    X_filtered = X_filtered[y_filtered.notna()]
    y_filtered = y_filtered.dropna()

    # Normalize numerical features
    numerical_features = ['age', 'fnlwgt', 'education-num', 'gain', 'loss', 'hours']
    scaler = MinMaxScaler()
    X_filtered[numerical_features] = scaler.fit_transform(X_filtered[numerical_features])
    
    return X_filtered.reset_index(drop=True), y_filtered.reset_index(drop=True)

def encode_features(X: pd.DataFrame) -> pd.DataFrame:
    # Apply one-hot encoding to categorical columns
    X_encoded = pd.get_dummies(X).astype(float)
    return X_encoded

# Fetch and clean the data
X, y = fetch_adult_data()
X_cleaned, y_cleaned = clean_adult_data(X, y)

# Encode the features
X_encoded = encode_features(X_cleaned)
print("Columns after encoding:", X_encoded.columns)
print("length of y_cleaned:", len(y_cleaned))


Column names: Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital',
       'occupation', 'relationship', 'race', 'sex', 'gain', 'loss', 'hours',
       'country'],
      dtype='object')
Number of significant groups: 145
Columns after encoding: Index(['age', 'fnlwgt', 'education-num', 'gain', 'loss', 'hours',
       'workclass_Federal-gov', 'workclass_Local-gov', 'workclass_Private',
       'workclass_Self-emp-inc', 'workclass_Self-emp-not-inc',
       'workclass_State-gov', 'education_10th', 'education_11th',
       'education_7th-8th', 'education_Assoc-acdm', 'education_Assoc-voc',
       'education_Bachelors', 'education_Doctorate', 'education_HS-grad',
       'education_Masters', 'education_Prof-school', 'education_Some-college',
       'marital_Divorced', 'marital_Married-civ-spouse',
       'marital_Never-married', 'occupation_Adm-clerical',
       'occupation_Craft-repair', 'occupation_Exec-managerial',
       'occupation_Farming-fishing', 'occupation_Hand

In [33]:
import torch
import torch.nn as nn
import torch.optim as optim

# Define PyTorch Logistic Regression model
class LogisticRegressionModel(nn.Module):
    def __init__(self, input_dim):
        super(LogisticRegressionModel, self).__init__()
        self.linear = nn.Linear(input_dim, 1)
    
    def forward(self, x):
        return torch.sigmoid(self.linear(x))

# Training the logistic regression model
def train_pytorch_logistic_regression(X_train, y_train, input_dim, lr=0.01, epochs=200):
    model = LogisticRegressionModel(input_dim)
    criterion = nn.BCELoss()
    optimizer = optim.SGD(model.parameters(), lr=lr)
    
    X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
     # Convert labels (y_train) to numeric type, then to PyTorch tensor
    y_train_numeric = pd.to_numeric(y_train, errors='coerce').astype(float)
    y_train_tensor = torch.tensor(y_train_numeric.values, dtype=torch.float32).view(-1, 1)
    
    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        outputs = model(X_train_tensor)
        loss = criterion(outputs, y_train_tensor)
        loss.backward()
        optimizer.step()

        #if (epoch + 1) % 10 == 0:
            #print(f'Epoch [{epoch + 1}/{epochs}], Loss: {loss.item():.4f}')
    
    return model


# Function to predict using the PyTorch model
def predict_pytorch_logistic_regression(model, X):
    model.eval()  # Set model to evaluation mode
    X_tensor = torch.tensor(X.values, dtype=torch.float32)
    with torch.no_grad():
        probabilities = model(X_tensor).squeeze()  # Get predicted probabilities
        predictions = (probabilities >= 0.5).float()  # Convert probabilities to binary predictions (0 or 1)
    return predictions.numpy()  # Return predictions as NumPy array

In [43]:
import numpy as np

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_cleaned, test_size=0.3, random_state=42)

# Model 1: Train on all attributes - h_p model
input_dim_no_age = X_train.shape[1]
model_h_p = train_pytorch_logistic_regression(X_train, y_train, input_dim_no_age)

# Predict on training and test data
y_train_pred_h0 = predict_pytorch_logistic_regression(model_h_p, X_train)
y_test_pred_h0 = predict_pytorch_logistic_regression(model_h_p, X_test)

# Calculate accuracy
train_accuracy = accuracy_score(y_train, y_train_pred_h0)
test_accuracy = accuracy_score(y_test, y_test_pred_h0)

# Print accuracy results
print(f"Training Accuracy h_p: {train_accuracy:.4f}")
print(f"Test Accuracy h_p: {test_accuracy:.4f}")

# Model 2: Train on all attributes except age, sex, and race
X_train_no_sex = X_train.drop(columns=['sex_Female', 'sex_Male'])
X_test_no_sex = X_test.drop(columns=['sex_Female', 'sex_Male'])

# Ensure all columns are numeric (floats)
X_train_no_sex = X_train_no_sex.astype(float)
X_test_no_sex = X_test_no_sex.astype(float)

input_dim_no_sex = X_train_no_sex.shape[1]
model_no_sex = train_pytorch_logistic_regression(X_train_no_sex, y_train, input_dim_no_sex)

# Predict on training and test data
y_train_pred = predict_pytorch_logistic_regression(model_no_sex, X_train_no_sex)
y_test_pred = predict_pytorch_logistic_regression(model_no_sex, X_test_no_sex)

# Calculate accuracy
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

# Print accuracy results
print(f"Training Accuracy h_0: {train_accuracy:.4f}")
print(f"Test Accuracy h_0: {test_accuracy:.4f}")

Training Accuracy h_p: 0.6816
Test Accuracy h_p: 0.6902
Training Accuracy h_0: 0.6801
Test Accuracy h_0: 0.6886


In [57]:
import torch
import torch.nn.functional as F
from sklearn.metrics import accuracy_score

print("Columns in X_test:", X_test.columns)
print("X_test shape:", X_test.shape)
print("X_train shape:", X_train.shape)

# Function to calculate accuracy using sklearn's accuracy_score
def calculate_accuracy(model, X, y_true):
    """Calculates accuracy for a given model, input data X, and true labels y_true."""
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():
        # Convert data to torch tensors
        X_tensor = torch.tensor(X.values, dtype=torch.float32)
        
        # Get predictions from the model (probabilities between 0 and 1)
        y_pred = model(X_tensor).squeeze()

        # Convert probabilities to binary predictions (0 or 1)
        y_pred_rounded = (y_pred >= 0.5).float()

        # Calculate accuracy using sklearn's accuracy_score
        accuracy = accuracy_score(y_true, y_pred_rounded)
    
    return accuracy

# Function to filter data based on sex and calculate accuracy per group
def get_sex_filtered_accuracy(model, X, y, sex_filter):
    """Calculates accuracy for sex_Female and sex_Male groups."""
    # Filter the data using the sex filter (from the original X_test with sex columns)
    X_filtered = X[sex_filter]
    y_filtered = y[sex_filter]
    
    # Calculate accuracy for the filtered data
    accuracy = calculate_accuracy(model, X_filtered, y_filtered)
    
    return accuracy

# Filter for sex_Female and sex_Male using the original test set (X_test)
female_filter = X_test['sex_Female'] == 1
male_filter = X_test['sex_Male'] == 1

# Calculate accuracy for sex_Female and sex_Male in model_h_p (with sex attributes)
accuracy_female_h_p = get_sex_filtered_accuracy(model_h_p, X_test, y_test, female_filter)
accuracy_male_h_p = get_sex_filtered_accuracy(model_h_p, X_test, y_test, male_filter)

print(f"Accuracy for sex_Female h_p: {accuracy_female_h_p:.4f}")
print(f"Accuracy for sex_Male h_p: {accuracy_male_h_p:.4f}")

# Now calculate accuracy for sex_Female and sex_Male in model_no_sex (without sex attributes)
# Use X_test_no_sex for the data, but use the same sex filter based on X_test
accuracy_female_h_0 = get_sex_filtered_accuracy(model_no_sex, X_test_no_sex, y_test, female_filter)
accuracy_male_h_0 = get_sex_filtered_accuracy(model_no_sex, X_test_no_sex, y_test, male_filter)

print(f"Accuracy for sex_Female h_0: {accuracy_female_h_0:.4f}")
print(f"Accuracy for sex_Male h_0: {accuracy_male_h_0:.4f}")

# Calculate the difference in accuracy (h_p model minus no_sex model)
diff_female_accuracy = accuracy_female_h_p - accuracy_female_h_0
diff_male_accuracy = accuracy_male_h_p - accuracy_male_h_0

print(f"Difference in accuracy female: {diff_female_accuracy:.4f}")
print(f"Difference in accuracy male: {diff_male_accuracy:.4f}")


Columns in X_test: Index(['age', 'fnlwgt', 'education-num', 'gain', 'loss', 'hours',
       'workclass_Federal-gov', 'workclass_Local-gov', 'workclass_Private',
       'workclass_Self-emp-inc', 'workclass_Self-emp-not-inc',
       'workclass_State-gov', 'education_10th', 'education_11th',
       'education_7th-8th', 'education_Assoc-acdm', 'education_Assoc-voc',
       'education_Bachelors', 'education_Doctorate', 'education_HS-grad',
       'education_Masters', 'education_Prof-school', 'education_Some-college',
       'marital_Divorced', 'marital_Married-civ-spouse',
       'marital_Never-married', 'occupation_Adm-clerical',
       'occupation_Craft-repair', 'occupation_Exec-managerial',
       'occupation_Farming-fishing', 'occupation_Handlers-cleaners',
       'occupation_Machine-op-inspct', 'occupation_Other-service',
       'occupation_Prof-specialty', 'occupation_Protective-serv',
       'occupation_Sales', 'occupation_Tech-support',
       'occupation_Transport-moving', 'relatio

In [58]:
from captum.attr import IntegratedGradients
import torch

# Function to apply Integrated Gradients
def explain_with_ig(model, X_test):
    ig = IntegratedGradients(model)
    
    # Convert the test set to PyTorch tensor
    X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
    
    # Compute attributions using Integrated Gradients
    attributions, delta = ig.attribute(X_test_tensor, return_convergence_delta=True)
    # Print the shape of attributions
    print("Shape of attributions:", attributions.shape)
    
    return attributions, delta

In [59]:
# Apply Integrated Gradients to Model 1 (excluding age)
attributions_h_p, delta_h_p = explain_with_ig(model_h_p, X_test)

# Print the attributions and convergence delta for Model 1
print("Attributions for Model 1 (h_p):", attributions_h_p)
print("Convergence delta for Model 1:", delta_h_p)

# Apply Integrated Gradients to Model 2 (excluding sex)
attributions_no_sex, delta_no_sex = explain_with_ig(model_no_sex, X_test_no_sex)

# Print the attributions and convergence delta for Model 2
print("Attributions for Model 2 (Excluding Age, Sex, and Race):", attributions_no_sex)
print("Convergence delta for Model 2:", delta_no_sex)


In [55]:
print(len(attributions_h_p))
print(attributions_h_p.shape)
print(len(attributions_no_sex))

NameError: name 'attributions_h_p' is not defined

In [40]:
# Function to calculate fidelity+ (comprehensiveness)
def fidelity_plus(model, X_test, original_preds, attributions, top_k=5):
    X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
    
    # Sort attributions by importance
    top_attributions = torch.argsort(attributions, descending=True)
    
    # Remove top_k important features
    for i in range(X_test_tensor.size(0)):
        top_features = top_attributions[i, :top_k]
        X_test_tensor[i, top_features] = 0
    
    # Re-run the model to get new predictions with removed features
    with torch.no_grad():
        new_predictions = (model(X_test_tensor) > 0.5).int()
    
    # Compare new predictions to original predictions (fidelity+)
    fidelity_plus_score = 1 - (1 / X_test_tensor.size(0)) * torch.sum((new_predictions == original_preds).float())
    
    return fidelity_plus_score.item()

# Function to calculate fidelity- (sufficiency)
def fidelity_minus(model, X_test, original_preds, attributions, top_k=5):
    X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
    
    # Sort attributions by importance
    top_attributions = torch.argsort(attributions, descending=True)
    
    # Keep only top_k important features and zero out the rest
    for i in range(X_test_tensor.size(0)):
        mask = torch.zeros(X_test_tensor.size(1), dtype=torch.float32)
        top_features = top_attributions[i, :top_k]
        mask[top_features] = 1
        X_test_tensor[i] = X_test_tensor[i] * mask
    
    # Re-run the model to get new predictions with only important features
    with torch.no_grad():
        new_predictions = (model(X_test_tensor) > 0.5).int()
    
    # Compare new predictions to original predictions (fidelity-)
    fidelity_minus_score = 1 - (1 / X_test_tensor.size(0)) * torch.sum((new_predictions == original_preds).float())
    
    return fidelity_minus_score.item()



In [44]:
# For model 1 (excluding age):
original_preds_no_age = (model_no_age(torch.tensor(X_test_no_age.values, dtype=torch.float32)) > 0.5).int()

# For model 2 (excluding age, sex, and race):
original_preds_no_age_sex_race = (model_no_age_sex_race(torch.tensor(X_test_no_age_sex_race.values, dtype=torch.float32)) > 0.5).int()

# Calculate fidelity+ and fidelity- for model_no_age (first model)
fidelity_plus_score_no_age = fidelity_plus(model_no_age, X_test_no_age, original_preds_no_age, attributions_no_age, top_k=5000)
fidelity_minus_score_no_age = fidelity_minus(model_no_age, X_test_no_age, original_preds_no_age, attributions_no_age, top_k=5000)

# Calculate fidelity+ and fidelity- for model_no_age_sex_race (second model)
fidelity_plus_score_no_age_sex_race = fidelity_plus(model_no_age_sex_race, X_test_no_age_sex_race, original_preds_no_age_sex_race, attributions_no_age_sex_race, top_k=5000)
fidelity_minus_score_no_age_sex_race = fidelity_minus(model_no_age_sex_race, X_test_no_age_sex_race, original_preds_no_age_sex_race, attributions_no_age_sex_race, top_k=5000)

# Print the results:
print(f"Fidelity+ (Comprehensiveness) Score for Model 1 (Excluding Age): {fidelity_plus_score_no_age}")
print(f"Fidelity- (Sufficiency) Score for Model 1 (Excluding Age): {fidelity_minus_score_no_age}")

print(f"Fidelity+ (Comprehensiveness) Score for Model 2 (Excluding Age, Sex, and Race): {fidelity_plus_score_no_age_sex_race}")
print(f"Fidelity- (Sufficiency) Score for Model 2 (Excluding Age, Sex, and Race): {fidelity_minus_score_no_age_sex_race}")

Fidelity+ (Comprehensiveness) Score for Model 1 (Excluding Age): 0.00044232606887817383
Fidelity- (Sufficiency) Score for Model 1 (Excluding Age): 5.960464477539063e-08
Fidelity+ (Comprehensiveness) Score for Model 2 (Excluding Age, Sex, and Race): 5.960464477539063e-08
Fidelity- (Sufficiency) Score for Model 2 (Excluding Age, Sex, and Race): 5.960464477539063e-08
