In [1]:
import numpy as np
import pandas as pd

from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import MinMaxScaler

from aif360.sklearn.datasets import fetch_lawschool_gpa
from aif360.sklearn.inprocessing import GridSearchReduction
from aif360.sklearn.metrics import difference

pip install 'aif360[AdversarialDebiasing]'
pip install 'aif360[inFairness]'
pip install 'aif360[AdversarialDebiasing]'
pip install 'aif360[OptimalTransport]'


In [2]:
# Fetch the train and test subsets with identical options: numeric-only
# features and with "gender" passed as `dropcols`.
X_train, y_train = fetch_lawschool_gpa(
    "train",
    numeric_only=True,
    dropcols="gender",
)
X_test, y_test = fetch_lawschool_gpa(
    "test",
    numeric_only=True,
    dropcols="gender",
)
X_train

Unnamed: 0_level_0,Unnamed: 1_level_0,race,lsat,ugpa
race,gender,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.0,1,0.0,38.0,3.3
1.0,0,1.0,34.0,4.0
1.0,0,1.0,34.0,3.9
1.0,0,1.0,45.0,3.3
1.0,1,1.0,39.0,2.5
1.0,...,...,...,...
1.0,1,1.0,31.0,2.7
1.0,1,1.0,37.0,2.4
1.0,0,1.0,29.0,3.0
1.0,1,1.0,39.0,3.2


In [3]:
scaler = MinMaxScaler()

# The min-max range is fitted on the training features only and then applied
# unchanged to the test features. Each result is wrapped back into a DataFrame
# carrying the original column labels and index.
# NOTE: X_train / X_test are overwritten here, so re-running this cell on its
# own would transform the already-scaled X_test a second time.
X_train = pd.DataFrame(
    data=scaler.fit_transform(X_train),
    index=X_train.index,
    columns=X_train.columns,
)
X_test = pd.DataFrame(
    data=scaler.transform(X_test),
    index=X_test.index,
    columns=X_test.columns,
)

X_train.head()

The history saving thread hit an unexpected error (OperationalError('database or disk is full')).History will not be written to the database.


Unnamed: 0_level_0,Unnamed: 1_level_0,race,lsat,ugpa
race,gender,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.0,1,0.0,0.72973,0.825
1.0,0,1.0,0.621622,1.0
1.0,0,1.0,0.621622,0.975
1.0,0,1.0,0.918919,0.825
1.0,1,1.0,0.756757,0.625


In [4]:
# 1. Baseline model: linear regression wrapped so the target goes through the
#    min-max transformer.
tt = TransformedTargetRegressor(regressor=LinearRegression(), transformer=scaler)

# 2. Fit on the training features with the 'race' column removed
tt.fit(X_train.drop(columns=["race"]), y_train)

# 3. Predict on the test features, again without 'race'
y_pred = tt.predict(X_test.drop(columns=["race"]))

# 4. Test features plus true target (Y), prediction (Y_pred) and group label (T)
df_test_results = X_test.assign(
    Y=getattr(y_test, 'values', y_test),  # raw values when available, else y_test itself
    Y_pred=y_pred,
    T=X_test['race'],
)

# 5. Define the discrimination statistic function
def compute_discrimination_statistic(df, target_col='Y', pred_col='Y_pred', group_col='T',
                                     group_a=0, group_b=1):
    """Return the difference in mean signed prediction error between two groups.

    The signed error of a row is ``df[pred_col] - df[target_col]``. The result
    is ``mean(error | group A) - mean(error | group B)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the target, prediction and group columns.
    target_col, pred_col, group_col : str
        Names of the true-target, predicted-target and group-membership columns.
    group_a, group_b : scalar, default 0 and 1
        Values of ``group_col`` that identify group A and group B. The defaults
        reproduce the original hard-coded 0/1 encoding.

    Returns
    -------
    float
        ``mean_A - mean_B``; NaN if either group has no rows.
    """
    errors = df[pred_col] - df[target_col]

    # Plain boolean arrays are used as masks so that selection is positional
    # and does not depend on index alignment (the index may contain duplicates).
    in_group_a = (df[group_col] == group_a).to_numpy()
    in_group_b = (df[group_col] == group_b).to_numpy()

    mean_A = errors[in_group_a].mean()
    mean_B = errors[in_group_b].mean()

    return mean_A - mean_B

# 6. Compute discrimination statistic
# Evaluate on the test-set results frame, spelling out the column names
# (these match the function's defaults).
disc_stat = compute_discrimination_statistic(
    df_test_results,
    target_col='Y',
    pred_col='Y_pred',
    group_col='T',
)
print(f"Discrimination statistic (mean error difference): {disc_stat}")

Discrimination statistic (mean error difference): 0.5701362005059881


In [6]:
# Overall test-set MAE of the baseline linear regression.
lr_mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
lr_mae

0.7400826321650612

In [7]:
# MAE difference between the groups defined by the "race" attribute for the
# baseline predictions.
lr_mae_diff = difference(
    mean_absolute_error,
    y_test,
    y_pred,
    prot_attr="race",
)
lr_mae_diff

0.2039259052574448

In [8]:
# Attach true and predicted targets to the test features, then split the rows
# by the value of the 'race' column.
df_test = X_test.assign(y_true=y_test, y_pred=y_pred)
race0_rows = df_test[df_test['race'] == 0]
race1_rows = df_test[df_test['race'] == 1]

group0_mae = mean_absolute_error(race0_rows['y_true'], race0_rows['y_pred'])
group1_mae = mean_absolute_error(race1_rows['y_true'], race1_rows['y_pred'])

print(f"Group 0 MAE: {group0_mae:.4f}")
print(f"Group 1 MAE: {group1_mae:.4f}")

# Worst-case (largest) per-group error.
max_group_mae = max(group0_mae, group1_mae)
print(f"Max group MAE: {max_group_mae:.4f}")

Group 0 MAE: 0.9287
Group 1 MAE: 0.7248
Max group MAE: 0.9287


In [9]:
# Group-attribute frame: the scaled training features minus lsat and ugpa.
T_train = X_train.drop(columns=["lsat", "ugpa"])

In [10]:
# Model inputs: the scaled training features without the 'race' column.
X_train2 = X_train.drop(columns=["race"])

In [65]:
# Extremes of the unscaled training target.
y_train_min, y_train_max = y_train.min(), y_train.max()

In [11]:
# Min-max scale the training target as a single column.
# NOTE: this re-fits the shared `scaler` object on y_train, so from this point
# on `scaler` holds the target scaling rather than the feature scaling it was
# fitted with earlier.
y_train_scaled = pd.DataFrame(
    data=scaler.fit_transform(y_train.to_numpy().reshape(-1, 1)),
    index=y_train.index,
    # reuse the existing column labels if y_train has any, else name it 'target'
    columns=getattr(y_train, 'columns', ['target']),
)

In [12]:
# Side-by-side frame: features, scaled target, then the group attribute.
D = pd.concat(
    [X_train2, y_train_scaled, T_train],
    axis="columns",
)

In [13]:
# Display the combined frame (features, scaled target, group attribute).
D

Unnamed: 0_level_0,Unnamed: 1_level_0,lsat,ugpa,target,race
race,gender,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.0,1,0.729730,0.825,0.488636,0.0
1.0,0,0.621622,1.000,0.688131,1.0
1.0,0,0.621622,0.975,0.398990,1.0
1.0,0,0.918919,0.825,0.758838,1.0
1.0,1,0.756757,0.625,0.482323,1.0
1.0,...,...,...,...,...
1.0,1,0.540541,0.675,0.430556,1.0
1.0,1,0.702703,0.600,0.545455,1.0
1.0,0,0.486486,0.750,0.525253,1.0
1.0,1,0.756757,0.800,0.558081,1.0


In [14]:
# Repackage every row of D as a triple [[lsat, ugpa], target, race].
# dtype=object keeps the nested [lsat, ugpa] list as a single element.
D_formatted = np.array(
    [
        [[lsat, ugpa], target, race]
        for lsat, ugpa, target, race in D[['lsat', 'ugpa', 'target', 'race']].to_numpy()
    ],
    dtype=object,
)

In [None]:
import importlib

import numpy as np

max_attempts = 40  # Prevent infinite loops
attempt = 0
theta_opt = "No Solution Found"

# Retry until QNDLR returns something other than a string (a string result is
# treated as "no solution") or the attempt budget is used up.
while attempt < max_attempts:
    print(f"🔁 Attempt {attempt + 1}")
    import QNDLR_MAE
    # The module is reloaded before every attempt.
    # NOTE(review): presumably to reset module-level state between tries; confirm.
    QNDLR = importlib.reload(QNDLR_MAE).QNDLR

    # The third positional argument (0.1) is the value the solver's log
    # output reports as `epsilon`.
    theta_opt = QNDLR(D_formatted, 0.05, 0.1, 1.5, 0)
    print(theta_opt)
    attempt += 1
    if not isinstance(theta_opt, str):
        break

if not isinstance(theta_opt, str):
    print("✅ Solution found!")
    print(theta_opt)
else:
    print("❌ QNDLR failed after multiple attempts.")


🔁 Attempt 1
❌ Unfair theta=[-0.07085537 -0.09296775] → Penalty=2.357222, UB=0.207222 > epsilon=0.1
❌ Unfair theta=[-0.07439814 -0.09296775] → Penalty=2.358219, UB=0.208219 > epsilon=0.1
❌ Unfair theta=[-0.07085537 -0.09761614] → Penalty=2.357768, UB=0.207768 > epsilon=0.1
❌ Unfair theta=[-0.0673126  -0.09761614] → Penalty=2.356771, UB=0.206771 > epsilon=0.1
❌ Unfair theta=[-0.06376984 -0.09994033] → Penalty=2.356048, UB=0.206048 > epsilon=0.1
❌ Unfair theta=[-0.06376984 -0.09529194] → Penalty=2.355501, UB=0.205501 > epsilon=0.1
❌ Unfair theta=[-0.06022707 -0.09412985] → Penalty=2.354368, UB=0.204368 > epsilon=0.1
❌ Unfair theta=[-0.05314153 -0.10110243] → Penalty=2.353194, UB=0.203194 > epsilon=0.1
❌ Unfair theta=[-0.04428461 -0.10516977] → Penalty=2.351180, UB=0.201180 > epsilon=0.1
❌ Unfair theta=[-0.04074184 -0.09935928] → Penalty=2.349501, UB=0.199501 > epsilon=0.1
❌ Unfair theta=[-0.02922784 -0.09906876] → Penalty=2.346228, UB=0.196228 > epsilon=0.1
❌ Unfair theta=[-0.01328538 -0.

In [15]:
# NOTE(review): this replaces whatever the QNDLR search cell assigned to
# `theta_opt` with fixed coefficients — presumably copied from an earlier
# successful run; confirm, since everything below depends on these values.
theta_opt = [0.31726307, 0.42657659]

In [16]:
# Scaled test features without the 'race' column.
X_test_scaled = X_test.drop(columns=["race"])

In [17]:
# Linear prediction in scaled space: features times coefficients (no intercept term).
y_pred_scaled_qndlr = X_test_scaled @ theta_opt


In [18]:
# inverse_transform is given a single column, so reshape the predictions to (n, 1).
y_pred_scaled_qndlr_2d = y_pred_scaled_qndlr.to_numpy().reshape(-1, 1)

# Undo the scaling and collapse back to 1-D.
# This relies on `scaler` having last been fitted on y_train (the cell that
# builds y_train_scaled), not on the features.
y_pred_qndlr = scaler.inverse_transform(y_pred_scaled_qndlr_2d).flatten()


In [19]:
# Overall test-set MAE of the QNDLR predictions.
QNDLR_mae = mean_absolute_error(y_true=y_test, y_pred=y_pred_qndlr)
QNDLR_mae

0.7634846116145819

In [20]:
# MAE difference across the "race" groups for the QNDLR predictions.
# Stored under its own name (matching `QNDLR_mae`) so the baseline
# linear-regression value held in `lr_mae_diff` is not overwritten.
QNDLR_mae_diff = difference(mean_absolute_error, y_test, y_pred_qndlr, prot_attr="race")
QNDLR_mae_diff

0.09630325279714824

In [21]:
def compute_discrimination_statistic_QNDLR(D_formatted, theta_opt):
    """
    Return d(θ): mean signed error of the T == 0 samples minus that of the
    T == 1 samples, for a linear predictor with coefficients ``theta_opt``.

    ``D_formatted`` is a sequence of ``[X, Y, T]`` triples. Predictions are
    ``X · θ`` when the stacked X is multi-dimensional. When every X is a
    scalar, the prediction is ``X * θ`` for a single coefficient and
    ``X * θ[0]`` otherwise (any further coefficients are ignored).
    """
    # Split the triples into three parallel arrays: features, targets, groups.
    features, targets, groups = (
        np.array([sample[position] for sample in D_formatted])
        for position in range(3)
    )
    coef = np.array(theta_opt).flatten()

    if features.ndim != 1:
        # (n_samples x n_features) matrix: ordinary dot product
        predictions = features.dot(coef)
    else:
        # scalar features: scale by the sole / first coefficient
        multiplier = coef if coef.size == 1 else coef[0]
        predictions = features * multiplier

    signed_error = predictions - targets

    mean_error_t0 = np.mean(signed_error[groups == 0])
    mean_error_t1 = np.mean(signed_error[groups == 1])
    return mean_error_t0 - mean_error_t1

In [22]:
# NOTE(review): D_formatted is built from the min-max scaled *training* data,
# whereas the earlier compute_discrimination_statistic call used the unscaled
# test set, so the two statistics are not on the same data or scale.
compute_discrimination_statistic_QNDLR(D_formatted, theta_opt)

np.float64(0.03217354482468135)

In [67]:
import importlib

max_attempts = 40  # Prevent infinite loops
attempt = 0
theta_opt2 = "No Solution Found"

# Tolerance divided by the training-target range (y_train_max - y_train_min).
# NOTE(review): presumably this converts a tolerance in original target units
# to the min-max scaled target; confirm.
unscaled_epsilon = 0.07
scaled_epsilon = unscaled_epsilon / (y_train_max - y_train_min)

# Retry until QNDLR returns something other than a string (a string result is
# treated as "no solution") or the attempt budget is used up.
while attempt < max_attempts:
    print(f"🔁 Attempt {attempt + 1}")
    import QNDLR_MAE_DIFF
    # The module is reloaded before every attempt.
    # NOTE(review): presumably to reset module-level state between tries; confirm.
    QNDLR = importlib.reload(QNDLR_MAE_DIFF).QNDLR

    theta_opt2 = QNDLR(D_formatted, 0.05, scaled_epsilon, 1, 0)
    print(theta_opt2)
    attempt += 1
    if not isinstance(theta_opt2, str):
        break

if not isinstance(theta_opt2, str):
    print("✅ Solution found!")
    print(theta_opt2)
else:
    print("❌ QNDLR failed after multiple attempts.")

🔁 Attempt 1
❌ Unfair theta=[ 0.07129298 -0.00123642] → Penalty=1.127207, UB=0.136045 > epsilon=0.00883838383838384
❌ Unfair theta=[ 0.07485763 -0.00123642] → Penalty=1.126334, UB=0.135173 > epsilon=0.00883838383838384
❌ Unfair theta=[ 0.07129298 -0.00129825] → Penalty=1.127213, UB=0.136051 > epsilon=0.00883838383838384
❌ Unfair theta=[ 0.07485763 -0.0011746 ] → Penalty=1.126328, UB=0.135166 > epsilon=0.00883838383838384
❌ Unfair theta=[ 0.07663996 -0.00111278] → Penalty=1.125885, UB=0.134724 > epsilon=0.00883838383838384
❌ Unfair theta=[ 0.08020461 -0.00111278] → Penalty=1.125013, UB=0.133851 > epsilon=0.00883838383838384
❌ Unfair theta=[ 0.08466042 -0.00105096] → Penalty=1.123916, UB=0.132755 > epsilon=0.00883838383838384
❌ Unfair theta=[ 0.08644274 -0.00092732] → Penalty=1.123467, UB=0.132306 > epsilon=0.00883838383838384
❌ Unfair theta=[ 0.0922353  -0.00077277] → Penalty=1.122034, UB=0.130873 > epsilon=0.00883838383838384
❌ Unfair theta=[ 0.10025576 -0.00071094] → Penalty=1.120066, 

In [74]:
# NOTE(review): this replaces whatever the QNDLR_MAE_DIFF search cell assigned
# to `theta_opt2` with fixed coefficients — presumably copied from an earlier
# successful run; confirm, since everything below depends on these values.
theta_opt2 = [0.3517043, 0.35126387]

In [75]:
# Scaled test features without the 'race' column.
X_test_scaled = X_test.drop(columns=["race"])

In [76]:
# Linear prediction in scaled space: features times coefficients (no intercept term).
y_pred_scaled_qndlr2 = X_test_scaled @ theta_opt2

In [77]:
# Suppose y_pred_scaled is a Pandas Series
y_pred_scaled_qndlr2_2d = y_pred_scaled_qndlr2.values.reshape(-1, 1)

# Then inverse scale
y_pred_qndlr2 = scaler.inverse_transform(y_pred_scaled_qndlr2_2d)

# Flatten back to 1D array if needed
y_pred_qndlr2 = y_pred_qndlr2.flatten()


In [78]:
# Overall test-set MAE of the second QNDLR model's predictions.
QNDLR_mae2 = mean_absolute_error(y_true=y_test, y_pred=y_pred_qndlr2)
QNDLR_mae2

0.8006676088961914

In [79]:
# MAE difference across the "race" groups for the second QNDLR model.
# (Despite the `lr_` prefix, this value is computed from y_pred_qndlr2.)
lr_mae_diff2 = difference(
    mean_absolute_error,
    y_test,
    y_pred_qndlr2,
    prot_attr="race",
)
lr_mae_diff2

0.017506674616229456