Copyright 2019 Google LLC

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

## MNIST Simulation

We investigate the behavior of our method on a variant of the well-known MNIST task.
We take the MNIST dataset under the standard train/test split, randomly select $20\%$ of the training data points, and change their labels to $2$, yielding a biased set of labels. On such a dataset, our method should be able to find appropriate weights so that training on the weighted dataset roughly corresponds to training on the true labels.
To this end, we train a classifier with a demographic-parity-like constraint on the predictions of digit $2$; i.e., we encourage a classifier to predict the digit $2$ at a rate of $10\%$, the rate appearing in the true labels. 

In [0]:
import tensorflow as tf 
from tensorflow.keras.datasets import mnist
import numpy as np
import copy

### Load data

In [0]:
# Fetch MNIST using the split returned by `mnist.load_data()`.
(train_xs, train_ys), (test_xs, test_ys) = mnist.load_data()

# Divide pixel values by 255 and flatten every 28x28 image into one row of
# 784 features, so each split becomes a 2-D (examples, features) array.
train_xs = (train_xs / 255.).reshape(-1, 28 * 28)
test_xs = (test_xs / 255.).reshape(-1, 28 * 28)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz


In [0]:
# Fraction of training examples carrying each digit label, before corruption.
print("Distribution Before")
for i in range(10):
  # Function-call form of print: identical output on Python 2 for a single
  # argument, and valid syntax on Python 3 (the bare statement form is not).
  print(np.mean(train_ys == i))

Distribution Before
0.09871666666666666
0.11236666666666667
0.0993
0.10218333333333333
0.09736666666666667
0.09035
0.09863333333333334
0.10441666666666667
0.09751666666666667
0.09915


In [0]:
# Build the biased label set: relabel a random fifth of the training points
# as digit 2, leaving the original `train_ys` untouched.
train_ys_corrupted = np.copy(train_ys)
np.random.seed(12345)  # fixed seed so the same points are selected on every run
n_train = len(train_ys_corrupted)
# Floor division keeps `size` an integer under both Python 2 and Python 3;
# plain `/` yields a float on Python 3, which np.random.choice rejects.
idxs = np.random.choice(range(n_train), size=n_train // 5, replace=False)
train_ys_corrupted[idxs] = 2

# Fraction of training examples carrying each digit label, after corruption.
print("Distribution After")
for i in range(10):
  print(np.mean(train_ys_corrupted == i))

Distribution After
0.07875
0.08973333333333333
0.2791
0.0819
0.07831666666666667
0.07266666666666667
0.07966666666666666
0.08318333333333333
0.07741666666666666
0.07926666666666667


## Neural Network

In [0]:
def weight_variable(shape, name="weight_variable"):
  """Returns a `tf.Variable` of `shape` initialised from a truncated normal.

  Args:
    shape: shape of the variable to create.
    name: name passed to `tf.Variable`.

  Returns:
    A `tf.Variable` whose initial value is `tf.truncated_normal(shape)` with
    standard deviation 0.1.
  """
  return tf.Variable(tf.truncated_normal(shape, stddev=0.1), name=name)


def bias_variable(shape, name="bias_variable"):
  """Returns a `tf.Variable` of `shape` with every entry initialised to 0.1.

  Args:
    shape: shape of the variable to create.
    name: name passed to `tf.Variable`.

  Returns:
    A `tf.Variable` whose initial value is the constant 0.1 in every position.
  """
  return tf.Variable(tf.constant(0.1, shape=shape), name=name)


def run_simple_NN(X,
                  y,
                  X_test,
                  y_test,
                  weights,
                  num_iter=10000,
                  learning_rate=0.001,
                  batch_size=128,
                  display_steps=1000,
                  n_layers=1):
  """Trains a ReLU network on weighted samples and returns its predictions.

  The network has three hidden layers of 512 units followed by a softmax
  output. Each training step draws 50 rows of `X` (with replacement) with
  probability proportional to `weights`, and takes one Adam step on the
  cross-entropy between the softmax output and the one-hot labels.

  Every call builds its network in a fresh `tf.Graph`, so repeated calls do
  not pile variables and ops into the global default graph.

  Args:
    X: 2-D array of training features, one row per example.
    y: integer training labels; the number of classes is `max(y) + 1`.
    X_test: 2-D array of test features with the same number of columns as `X`.
    y_test: integer test labels; must not exceed `max(y)`.
    weights: one non-negative weight per row of `X`; normalised internally.
    num_iter: number of training steps.
    learning_rate: learning rate passed to the Adam optimizer.
    batch_size: accepted for interface compatibility but currently unused;
      the number of rows sampled per step is fixed at 50.
    display_steps: accuracy on all of `X` and `X_test` is printed every
      `display_steps` steps.
    n_layers: accepted for interface compatibility but currently unused; the
      architecture is fixed.

  Returns:
    A tuple `(training_prediction, testing_prediction)` holding the argmax
    class index for every row of `X` and `X_test` respectively.
  """
  n_labels = int(np.max(y)) + 1
  n_features = X.shape[1]
  n_train = len(X)
  # Normalise so the weights form a sampling distribution over training rows.
  weights_ = weights / (1. * np.sum(weights))

  def one_hot(ns):
    """Maps integer labels to rows of the `n_labels` x `n_labels` identity."""
    return np.eye(n_labels)[ns]

  y_onehot = one_hot(y)
  y_test_onehot = one_hot(y_test)

  N = 512  # width of every hidden layer

  # Build in a private graph: without this, each call would append another
  # copy of the network to the process-wide default graph, and the variable
  # initializer below would also re-initialise every earlier copy.
  with tf.Graph().as_default():
    x = tf.placeholder(tf.float32, [None, n_features])
    y_ = tf.placeholder(tf.float32, [None, n_labels])

    W_1 = weight_variable([n_features, N])
    b_1 = bias_variable([N])
    h_1 = tf.nn.relu(tf.matmul(x, W_1) + b_1)

    W_2 = weight_variable([N, N])
    b_2 = bias_variable([N])
    h_2 = tf.nn.relu(tf.matmul(h_1, W_2) + b_2)

    W_3 = weight_variable([N, N])
    b_3 = bias_variable([N])
    h_3 = tf.nn.relu(tf.matmul(h_2, W_3) + b_3)

    W_4 = weight_variable([N, n_labels])
    b_4 = bias_variable([n_labels])

    # Softmax output, i.e. class probabilities rather than raw logits.
    class_probs = tf.nn.softmax(tf.matmul(h_3, W_4) + b_4)

    # Cross-entropy; the 1e-6 keeps log() finite when a probability is 0.
    loss = -tf.reduce_mean(
        tf.reduce_sum(y_ * tf.log(class_probs + 1e-6), 1), 0)
    train_step = tf.train.AdamOptimizer(
        learning_rate=learning_rate).minimize(loss)

    # Built once and reused for both the accuracy metric and the final
    # predictions.
    predicted_label = tf.argmax(class_probs, 1)
    correct_prediction = tf.equal(predicted_label, tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    with tf.Session() as sess:
      sess.run(tf.global_variables_initializer())
      for i in range(num_iter):
        # NOTE(review): the number of rows per step is fixed at 50 rather than
        # `batch_size`; left as is so results match the recorded runs.
        ns = np.random.choice(n_train, size=50, replace=True, p=weights_)
        if (i + 1) % display_steps == 0:
          train_accuracy = accuracy.eval(feed_dict={x: X, y_: y_onehot})
          test_accuracy = accuracy.eval(
              feed_dict={x: X_test, y_: y_test_onehot})

          print("step %d, training accuracy %g, test accuracy %g" %
                (i + 1, train_accuracy, test_accuracy))
        train_step.run(
            feed_dict={x: X[ns, :], y_: y_onehot[ns, :]})

      testing_prediction = predicted_label.eval(feed_dict={x: X_test})
      training_prediction = predicted_label.eval(feed_dict={x: X})
      return training_prediction, testing_prediction


  
  

## Training on unbiased dataset

In [0]:
# Uniform weights: every training point is equally likely to be sampled.
weights = np.ones(len(train_ys))
# run_simple_NN returns (training predictions, test predictions); unpack both
# so `test_predictions` really holds only the test-set predictions.
train_predictions, test_predictions = run_simple_NN(
    train_xs, train_ys, test_xs, test_ys, weights)

step 1000, training accuracy 0.969, test accuracy 0.9647
step 2000, training accuracy 0.977817, test accuracy 0.9674
step 3000, training accuracy 0.98295, test accuracy 0.9698
step 4000, training accuracy 0.986883, test accuracy 0.9738
step 5000, training accuracy 0.986067, test accuracy 0.9745
step 6000, training accuracy 0.98915, test accuracy 0.9762
step 7000, training accuracy 0.991067, test accuracy 0.9775
step 8000, training accuracy 0.99195, test accuracy 0.9794
step 9000, training accuracy 0.993867, test accuracy 0.9774
step 10000, training accuracy 0.992733, test accuracy 0.9786


## Baseline (unconstrained)

In [0]:
# Uniform weights on the corrupted labels: the unconstrained baseline.
weights = np.ones(len(train_ys))
# run_simple_NN returns (training predictions, test predictions); unpack both
# so `test_predictions` really holds only the test-set predictions.
train_predictions, test_predictions = run_simple_NN(
    train_xs, train_ys_corrupted, test_xs, test_ys, weights)

step 1000, training accuracy 0.784717, test accuracy 0.9269
step 2000, training accuracy 0.799067, test accuracy 0.9415
step 3000, training accuracy 0.796417, test accuracy 0.9149
step 4000, training accuracy 0.813567, test accuracy 0.945
step 5000, training accuracy 0.816767, test accuracy 0.9502
step 6000, training accuracy 0.8257, test accuracy 0.9578
step 7000, training accuracy 0.826517, test accuracy 0.947
step 8000, training accuracy 0.835667, test accuracy 0.938
step 9000, training accuracy 0.83125, test accuracy 0.9314
step 10000, training accuracy 0.845467, test accuracy 0.9423


## Our method

In [0]:
def debias_weights(original_labels, protected_attributes, multipliers,
                   target_label=2):
  """Computes per-example weights from the current multipliers.

  For each example an exponent `e = -sum_i multipliers[i] *
  protected_attributes[i]` is accumulated, and the base weight is
  `exp(e) / (exp(e) + exp(-e))`. Examples whose label equals `target_label`
  receive `1 - weight` instead.

  Args:
    original_labels: 1-D array-like of labels, one per example.
    protected_attributes: sequence of 1-D arrays (one per multiplier), each
      with one entry per example.
    multipliers: sequence of scalar multipliers.
    target_label: label whose examples get the complementary weight.

  Returns:
    A 1-D float array of weights, one per example, each in [0, 1].
  """
  original_labels = np.asarray(original_labels)
  exponents = np.zeros(len(original_labels))
  for i, m in enumerate(multipliers):
    exponents -= m * protected_attributes[i]
  # exp(e) / (exp(e) + exp(-e)) == (1 + tanh(e)) / 2. The tanh form saturates
  # cleanly, whereas the literal ratio evaluates to inf / inf = nan once
  # exp(e) overflows.
  weights = 0.5 * (1. + np.tanh(exponents))
  weights = np.where(original_labels == target_label, 1 - weights, weights)
  return weights

In [0]:
multipliers = np.zeros(1)
learning_rate = 1.
n_iters = 100
# Rate at which the classifier is encouraged to predict digit 2.
target_rate = 0.1
protected_train = [(train_ys_corrupted == 2)]

# Alternate between (1) reweighting the training set from the current
# multiplier and retraining, and (2) moving the multiplier against the
# constraint violation measured on the training predictions.
for it in range(n_iters):
  # Single pre-formatted strings print identically on Python 2 and Python 3;
  # multi-argument print(...) shows up as a tuple repr on Python 2.
  print("Iteration %d multiplier %s" % (it + 1, multipliers))
  weights = debias_weights(train_ys_corrupted, protected_train, multipliers)
  weights = weights / np.sum(weights)
  print("Weights for 2 %s" % np.sum(weights[train_ys_corrupted == 2]))
  train_prediction, test_predictions = run_simple_NN(
      train_xs, train_ys_corrupted, test_xs, test_ys, weights)
  # Positive when digit 2 is predicted more often than the target rate.
  violation = np.mean(train_prediction == 2) - target_rate
  multipliers -= learning_rate * violation
  print("")
  print("")


('Iteration', 1, 'multiplier', array([0.]))
('Weights for 2', 0.27910000000000007)
step 1000, training accuracy 0.777633, test accuracy 0.9201
step 2000, training accuracy 0.803583, test accuracy 0.9507
step 3000, training accuracy 0.80525, test accuracy 0.95
step 4000, training accuracy 0.813167, test accuracy 0.9505
step 5000, training accuracy 0.82015, test accuracy 0.9499
step 6000, training accuracy 0.823233, test accuracy 0.9512
step 7000, training accuracy 0.827567, test accuracy 0.9415
step 8000, training accuracy 0.833467, test accuracy 0.9337
step 9000, training accuracy 0.840083, test accuracy 0.9347
step 10000, training accuracy 0.8462, test accuracy 0.9407
()
()
('Iteration', 2, 'multiplier', array([-0.05111667]))
('Weights for 2', 0.2686755189012749)
step 1000, training accuracy 0.762033, test accuracy 0.8753
step 2000, training accuracy 0.791167, test accuracy 0.922
step 3000, training accuracy 0.809883, test accuracy 0.9596
step 4000, training accuracy 0.8132, test accu