In [1]:
# ECE 570 HW2 - P12 - Devin Bresser

# imports
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler


In [2]:
# import Adult dataset (per https://archive.ics.uci.edu/dataset/2/adult)

from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
adult = fetch_ucirepo(id=2) 
  
# data (as pandas dataframes) 
X = adult.data.features 
y = adult.data.targets 
  
# metadata 
#print(adult.metadata) 
  
# variable information 
#print(adult.variables) 

X.head(3)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States


In [3]:
# Comment: There is a fair amount of preprocessing that needs to be done on this dataset.
# I will make a few modifications for the purpose of this HW that should preserve the ability to analyze fairness
# without adding too much complexity to the model.

# 1. Remove "education": we already have "education-num" to work with.
# 2. Remove "capital-gain" and "capital-loss". I observe that >90% of capital-gain and capital-loss values are 0 so I will exclude them.
# 4. Remove "relationship": this feels redundant given that we already have sex and marital status.
# 5. Remove "fnlwgt": This feature is a prediction - it is not concrete data so I will exclude it.
# 6. Binarize y.
# 7. Implement one-hot encoding on the categorical features.

In [4]:
# Implement the changes above to X:
columns_to_remove = ["education", "capital-gain", "capital-loss", "relationship", "fnlwgt"]
categorical_columns = ["workclass", "marital-status", "occupation", "race", "sex", "native-country"]
label_mappings = {} # create a dictionary to store the label mappings for later analysis

X_reduced = X.drop(columns=columns_to_remove)

# X_label_encoded = pd.DataFrame(index=X_reduced.index)

# for col in categorical_columns:
#     X_reduced[col] = X_reduced[col].astype('category')  # Convert to 'category' dtype
#     label_mappings[col] = dict(enumerate(X_reduced[col].cat.categories))
#     X_label_encoded[col] = X_reduced[col].cat.codes
    
X_onehot = pd.get_dummies(X_reduced, columns=categorical_columns, dtype=int)

# Binarize y
y["income"] = y["income"].replace({'<=50K': 0, '>50K': 1, '<=50K.': 0, '>50K.': 1})


# Perform dropna() on X, and then ensure that X and y are still aligned:
#X_label_encoded = X_label_encoded.dropna()
X_onehot = X_onehot.dropna()
y = y.loc[X_onehot.index] 

  y["income"] = y["income"].replace({'<=50K': 0, '>50K': 1, '<=50K.': 0, '>50K.': 1})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y["income"] = y["income"].replace({'<=50K': 0, '>50K': 1, '<=50K.': 0, '>50K.': 1})


In [5]:
# Split into train/test data and implement a scaler

X_train, X_test, y_train, y_test = train_test_split(X_onehot, y, test_size=0.2)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [6]:
# Using Logistic Regression

# Define and train model
model = LogisticRegression(max_iter=10000, class_weight="balanced")
model.fit(X_train_scaled, y_train.values.ravel())

# Make predictions on X_test
y_pred = model.predict(X_test_scaled)
y_pred_series = pd.Series(y_pred, index=X_test.index, name='predictions')

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(classification_report(y_test, y_pred))
cfm = confusion_matrix(y_test, y_pred)
print(cfm)

Accuracy: 0.7934281912171154
              precision    recall  f1-score   support

           0       0.94      0.78      0.85      7486
           1       0.54      0.83      0.65      2283

    accuracy                           0.79      9769
   macro avg       0.74      0.81      0.75      9769
weighted avg       0.85      0.79      0.81      9769

[[5845 1641]
 [ 377 1906]]


In [7]:
# So my vanilla log-reg classifier is OK, but not that good. It has low precision for the "1" label.
# Essentially, it makes a lot of false positives, meaning it assigns "1" often when the true label is "0".
# Precision and recall for the "0" class are pretty good, but this isn't very meaningful because that is the majority class
# There are a number of strategies that I could try to employ to improve the model accuracy 
# but for now let's proceed to the fairness implementations.

In [8]:
# (2) Implementing fairness constraints
# Let's start by looking at how the model did with respect to group fairness.
# Let's investigate how the model discriminated based upon sex as this is a simple binary attribute to work with.

# Extract indices in test data with sex_Male = 1 and sex_Female = 1
idx_m = X_test[X_test["sex_Male"]==1].index
idx_f = X_test[X_test["sex_Female"]==1].index

y_test_m = y_test.loc[idx_m]
y_test_f = y_test.loc[idx_f]

y_pred_m = y_pred_series.loc[idx_m]
y_pred_f = y_pred_series.loc[idx_f]

# Recall definition of demographic parity: P(Y_pred = 1 | A = a) = P(Y_pred = 1 | A = b)
# We can compute those probabilities empirically from the model
P_y_pred_1_m = y_pred_m.mean()
P_y_pred_1_f = y_pred_f.mean()
print(f"Model accuracy: {accuracy}")
print(f"Empirical probability that y_pred = 1 given group Male: {P_y_pred_1_m}")
print(f"Empirical probability that y_pred = 1 given group Female: {P_y_pred_1_f}")

Model accuracy: 0.7934281912171154
Empirical probability that y_pred = 1 given group Male: 0.4787888103289271
Empirical probability that y_pred = 1 given group Female: 0.1323935029114312


In [9]:
# As we can see, our model is exhibiting failure in demographic parity.
# It is significantly more likely to predict Y_pred = 1 given Male versus Female.

# To remedy this, we could implement the fairness constraint on the sex attribute.
# min_w Sum L(y_pred_i , y_i) + lambda*|Cov(Y_pred, A)|
# Note Cov(Y_pred, A) = E[Y_pred * A] - E[Y_pred]*E[A]

In [10]:
# Comment: At this point in the homework, I was not able to figure out how to implement the fairness 
# algorithms for the Adult dataset. I will complete the rest of the HW (Fairness Constraints, FairBatch, 
# and Post Processing from the provided skeleton code file & synthetic dataset.