# Testing aspects of Minimax Group Fairness
URL: https://arxiv.org/abs/2011.03108

Try to simulate the "Two-Player Game Formulation" proposed in the article to see how I can fit it into the Temis package.

GENERAL DESCRIPTION:

Regulator: Tries to identify which group has the greatest loss and increases its weight through exponential-weights updates.

Learner: Minimizes the current weighted loss and seeks the optimal model for the current weights.

In [4]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, accuracy_score
from sklearn.datasets import make_classification
from sklearn.base import clone

In [22]:
'''
This implements the base class for the MinimaxFairness method presented
in the paper https://arxiv.org/abs/2011.03108.

It works by implementing a Two-Player Game formulation of Learner and Regulator.
Learner: Optimizes the objective function based on sample_weights and the base cost function.
Regulator: Adjusts sample_weights for the next turn of the game and the final implementation.


'''
class MinimaxFairness:
    """Two-player (Learner vs. Regulator) minimax group fairness wrapper.

    Each round the Learner fits a fresh clone of the base estimator using
    per-sample weights derived from the current group weights (lambdas), and
    the Regulator then raises the weight of each group multiplicatively
    according to that group's log loss. Predictions average the
    probabilities of every model produced during the game.

    Attributes:
        model_class: Unfitted estimator prototype; it is cloned every round,
            and each clone must accept ``fit(X, y, sample_weight=...)`` and
            provide ``predict_proba``.
        T: Number of rounds, set by ``fit`` to
            ``ceil(log(K) / (2 * eps**2))`` with a minimum of one round.
        lr: Learning rate of the most recent round, ``1 / sqrt(t)``.
        K: Number of distinct groups seen by ``fit``.
        eps: Approximation parameter that controls the number of rounds.
        verbose: When truthy, prints debugging information.
        lambdas: Current group weights (dict keyed by group id), kept
            normalised so they sum to 1.
        classes_: Class labels reported by the first fitted model, or
            ``None`` when unavailable.
        models: All models produced during the game, in round order.
        lambdas_history: Group weights used by the Learner in each round.
        group_losses_history: Per-group log loss measured in each round.
    """

    def __init__(self, model_class, eps=1e-1, verbose=False):
        self.model_class = model_class
        self.T = None
        self.lr = None
        self.K = None
        self.eps = eps
        self.verbose = verbose

        self.lambdas = None
        self.classes_ = None

        # Storage for models, lambdas, and group losses history.
        self.models = []
        self.lambdas_history = []
        self.group_losses_history = []

    def fit(self, X, y, groups):
        """Play the two-player game and store one model per round.

        Learner turn: fit a clone of ``model_class`` with the current
        sample weights. Regulator turn: multiply each group's lambda by
        ``exp(lr * group_loss)`` and renormalise, so that groups with a
        larger loss weigh more in the next round.

        Args:
            X: Feature matrix passed unchanged to the base estimator.
            y: Target labels, one per sample.
            groups: Group identifier of each sample, same length as ``y``.

        Returns:
            self

        Raises:
            ValueError: If ``eps`` is not positive, ``y`` is empty, or
                ``groups`` and ``y`` differ in length.
        """
        if self.verbose:
            print("Debugging fit information...")

        # Validate before touching any state so a failed call leaves a
        # previously fitted instance intact.
        if not self.eps > 0:
            raise ValueError(f"eps must be a positive number, got {self.eps!r}")

        # Boolean masking below needs real arrays (lists compare as a whole).
        y = np.asarray(y)
        groups = np.asarray(groups)
        n_samples = len(y)
        if n_samples == 0:
            raise ValueError("y must contain at least one sample")
        if len(groups) != n_samples:
            raise ValueError(
                f"groups has {len(groups)} entries but y has {n_samples}"
            )

        # A refit starts a new game; never mix in models from a previous fit.
        self.models = []
        self.lambdas_history = []
        self.group_losses_history = []
        self.classes_ = None

        unique_groups, counts = np.unique(groups, return_counts=True)
        self.K = len(unique_groups)

        if self.verbose:
            print(f"Number identified groups: {self.K}")
            print(f"Identified Groups: {unique_groups}")

        # Round count from the bound the original author attributes to the
        # paper. log(1) == 0 for a single group, so force at least one round
        # or there would be no model to predict with.
        self.T = max(1, int(np.ceil(np.log(self.K) / (2 * self.eps * self.eps))))

        group_counts = dict(zip(unique_groups, counts))
        if self.verbose:
            print(f"Group Counts: {group_counts}")

        # The first weights are each group's share of the samples.
        self.lambdas = {g: group_counts[g] / n_samples for g in unique_groups}
        if self.verbose:
            print(f"Initial Lambdas: {self.lambdas}")

        if self.verbose:
            print(f"Initializing game with {self.T} rounds...")

        # Group membership never changes, so build the masks once.
        masks = {g: groups == g for g in unique_groups}

        for t in range(1, self.T + 1):
            self.lr = 1 / np.sqrt(t)

            if self.verbose:
                print(f"Initialing round: {t}")

            # Learner turn. The sample weights must be rebuilt from the
            # *current* lambdas every round; otherwise the Regulator's
            # updates never influence the fitted model.
            sample_weights = np.empty(n_samples, dtype=float)
            for g in unique_groups:
                sample_weights[masks[g]] = self.lambdas[g]

            h_t = clone(self.model_class)
            h_t.fit(X, y, sample_weight=sample_weights)
            self.models.append(h_t)

            labels = getattr(h_t, "classes_", None)
            if self.classes_ is None:
                self.classes_ = labels

            probs = h_t.predict_proba(X)
            # Passing the labels explicitly keeps log_loss working when a
            # group's slice happens to contain a single class.
            group_losses = {
                g: log_loss(y[masks[g]], probs[masks[g]], labels=labels)
                for g in unique_groups
            }

            self.group_losses_history.append(group_losses)
            self.lambdas_history.append(self.lambdas.copy())

            # Regulator turn: exponential-weights update, then renormalise
            # so the weights stay a distribution over groups instead of
            # growing without bound across rounds.
            for g in unique_groups:
                self.lambdas[g] *= np.exp(self.lr * group_losses[g])
            total = sum(self.lambdas.values())
            for g in unique_groups:
                self.lambdas[g] /= total

        return self

    def predict_proba(self, X):
        """Return the mean of ``predict_proba(X)`` over all stored models.

        Raises:
            RuntimeError: If no model has been stored yet.
        """
        if self.verbose:
            print("debugging predict_proba information....")
        if not self.models:
            raise RuntimeError(
                "MinimaxFairness has no fitted models; call fit() first."
            )
        preds = np.array([h.predict_proba(X) for h in self.models])
        return np.mean(preds, axis=0)

    def predict(self, X):
        """Return the most probable class for each row of ``X``.

        Column indices are mapped through ``classes_`` when it is known;
        otherwise the raw argmax index is returned.
        """
        best = np.argmax(self.predict_proba(X), axis=1)
        classes = getattr(self, "classes_", None)
        if classes is None:
            return best
        return np.asarray(classes)[best]

In [24]:
# Experiment: compare a plain logistic regression against MinimaxFairness on
# synthetic data in which one group has deliberately noisy labels.
random_state = 42

np.random.seed(random_state)
n_samples = 1000
X, y = make_classification(n_samples=n_samples, n_features=20, n_informative=10, n_redundant=10, random_state=random_state)
groups = np.random.choice([0, 1], size=n_samples, p=[0.7, 0.3])

# Replace group 1's labels with coin flips to make that group harder to predict.
noise_idxs = np.where(groups == 1)[0]
y[noise_idxs] = np.random.choice([0, 1], size=len(noise_idxs))

# Shared by every log_loss call below: explicit labels keep the metric
# well-defined even if a group's slice ends up with a single class.
class_labels = np.unique(y)
group_0 = groups == 0
group_1 = groups == 1

# Baseline model; losses are measured on the training data itself.
baseline_model = LogisticRegression(solver='lbfgs', max_iter=1000)
baseline_model.fit(X, y)
baseline_preds = baseline_model.predict(X)

# Compute the probabilities once and slice them per group.
baseline_pred_probs = baseline_model.predict_proba(X)
baseline_loss_class_0 = log_loss(y[group_0], baseline_pred_probs[group_0], labels=class_labels)
baseline_loss_class_1 = log_loss(y[group_1], baseline_pred_probs[group_1], labels=class_labels)

print("--- Baseline (Standard Logistic Regression) ---")
print(f"Log Loss Grupo 0: {baseline_loss_class_0:.4f}")
print(f"Log Loss Grupo 1: {baseline_loss_class_1:.4f}")
print(f"Diferença de Erro: {abs(baseline_loss_class_0 - baseline_loss_class_1):.4f}")

# Minimax Fairness model, evaluated exactly like the baseline.
n_iter = 100
mm_model_class = LogisticRegression(solver='lbfgs', max_iter=n_iter)
mm_model = MinimaxFairness(mm_model_class, eps=1e-2, verbose=True)
mm_model.fit(X, y, groups)

mm_pred_probs = mm_model.predict_proba(X)
mm_pred_y0 = mm_pred_probs[:, 0]
mm_pred_y1 = mm_pred_probs[:, 1]
# Use the full probability matrix, as the baseline does, so both models are
# scored through the same code path.
mm_loss_class_0 = log_loss(y[group_0], mm_pred_probs[group_0], labels=class_labels)
mm_loss_class_1 = log_loss(y[group_1], mm_pred_probs[group_1], labels=class_labels)

print(f"\n--- Minimax Fair Model (Após {mm_model.T} iterações) ---")
print(f"Log Loss Grupo 0: {mm_loss_class_0:.4f}")
print(f"Log Loss Grupo 1: {mm_loss_class_1:.4f}")
print(f"Diferença de Erro: {abs(mm_loss_class_0 - mm_loss_class_1):.4f}")



--- Baseline (Standard Logistic Regression) ---
Log Loss Grupo 0: 0.4144
Log Loss Grupo 1: 0.8712
Diferença de Erro: 0.4568
Debugging fit information...
Number identified groups: 2
Identified Groups: [0 1]
Group Counts: {np.int64(0): np.int64(712), np.int64(1): np.int64(288)}
Initial Lambdas: {np.int64(0): np.float64(0.712), np.int64(1): np.float64(0.288)}
Initializing game with 3466 rounds...
Initialing round: 1
Initialing round: 2
Initialing round: 3
Initialing round: 4
Initialing round: 5
Initialing round: 6
Initialing round: 7
Initialing round: 8
Initialing round: 9
Initialing round: 10
Initialing round: 11
Initialing round: 12
Initialing round: 13
Initialing round: 14
Initialing round: 15
Initialing round: 16
Initialing round: 17
Initialing round: 18
Initialing round: 19
Initialing round: 20
Initialing round: 21
Initialing round: 22
Initialing round: 23
Initialing round: 24
Initialing round: 25
Initialing round: 26
Initialing round: 27
Initialing round: 28
Initialing round: 29
In