In [1]:
import json
import numpy as np
import os
import paramiko
import sklearn
import sys
import subprocess
import tensorflow as tf

# Record the interpreter and library versions this notebook was executed with.
for version_string in (sys.version, np.__version__, sklearn.__version__, tf.__version__):
    print(version_string)

3.7.10 (default, Feb 26 2021, 10:16:00) 
[Clang 10.0.0 ]
1.18.5
0.24.1
1.15.5


## Deep Bayesian Bandits Reproducibility

This notebook explores the reproducibility around the [Deep Bayesian Bandits](https://github.com/tensorflow/models/tree/archive/research/deep_contextual_bandits) work by Google. We look at the LinTS implementation, which forms the baseline of their experiments.

In order to run these experiments, please perform the steps below:
- Create two environments using the versions above, one using `conda install`, and one using `pip install`. This ensures that one of the environments uses MKL and the other uses OpenBLAS.
- Download mushroom data from [readme](https://github.com/tensorflow/models/tree/archive/research/deep_contextual_bandits#real-world-datasets) and place in this directory.
- Clone the [tensorflow models repo](https://github.com/tensorflow/models), switch to the `archive` branch, and copy `models/research/deep_contextual_bandits/bandits` folder to this directory.
- Run the cell below to overwrite the LinTS implementation file. This updates the multivariate sampling such that we can select the method to use while sampling, between SVD and Cholesky. Note that the SVD method was used in the original code.

In [2]:
%%writefile bandits/algorithms/linear_full_posterior_sampling.py
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Contextual algorithm that keeps a full linear posterior for each arm."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
from scipy.stats import invgamma

from bandits.core.bandit_algorithm import BanditAlgorithm
from bandits.core.contextual_dataset import ContextualDataset


class LinearFullPosteriorSampling(BanditAlgorithm):
  """Thompson Sampling with independent linear models and unknown noise var.

  The multivariate normal sampler is selectable through `hparams.method`:
  'default' draws with the legacy global `np.random.multivariate_normal`,
  any other value is forwarded as the `method` argument of
  `Generator.multivariate_normal` on a generator seeded with `hparams.seed`.
  """

  def __init__(self, name, hparams):
    """Initialize posterior distributions and hyperparameters.

    Assume a linear model for each action i: reward = context^T beta_i + noise
    Each beta_i has a Gaussian prior (lambda parameter), each sigma2_i (noise
    level) has an inverse Gamma prior (a0, b0 parameters). Mean, covariance,
    and precision matrices are initialized, and the ContextualDataset created.

    Args:
      name: Name of the algorithm.
      hparams: Hyper-parameters of the algorithm. Reads `num_actions`,
        `context_dim`, `lambda_prior`, `a0`, `b0`, `initial_pulls`, and the
        optional `seed` and `method` fields.
    """

    self.name = name
    self.hparams = hparams
    # `seed` is optional so that hparams built without it (e.g. by code written
    # before the sampler became selectable) still work; None lets NumPy pick
    # fresh entropy.
    self.rng = np.random.default_rng(getattr(self.hparams, 'seed', None))

    # Gaussian prior for each beta_i
    self._lambda_prior = self.hparams.lambda_prior

    # The extra "+ 1" dimension holds the intercept term.
    self.mu = [
        np.zeros(self.hparams.context_dim + 1)
        for _ in range(self.hparams.num_actions)
    ]

    self.cov = [(1.0 / self.lambda_prior) * np.eye(self.hparams.context_dim + 1)
                for _ in range(self.hparams.num_actions)]

    self.precision = [
        self.lambda_prior * np.eye(self.hparams.context_dim + 1)
        for _ in range(self.hparams.num_actions)
    ]

    # Inverse Gamma prior for each sigma2_i
    self._a0 = self.hparams.a0
    self._b0 = self.hparams.b0

    self.a = [self._a0 for _ in range(self.hparams.num_actions)]
    self.b = [self._b0 for _ in range(self.hparams.num_actions)]

    self.t = 0
    self.data_h = ContextualDataset(hparams.context_dim,
                                    hparams.num_actions,
                                    intercept=True)

  def action(self, context):
    """Samples beta's from posterior, and chooses best action accordingly.

    Args:
      context: Context for which the action need to be chosen.

    Returns:
      action: Selected action for the context.
    """

    # Round robin until each action has been selected "initial_pulls" times
    if self.t < self.hparams.num_actions * self.hparams.initial_pulls:
      return self.t % self.hparams.num_actions

    # Sample sigma2, and beta conditional on sigma2
    sigma2_s = [
        self.b[i] * invgamma.rvs(self.a[i])
        for i in range(self.hparams.num_actions)
    ]

    # Missing `method` falls back to the legacy global sampler.
    method = getattr(self.hparams, 'method', 'default')

    try:
      if method == 'default':
        beta_s = [
            np.random.multivariate_normal(self.mu[i], sigma2_s[i] * self.cov[i])
            for i in range(self.hparams.num_actions)
        ]
      else:
        beta_s = [
            self.rng.multivariate_normal(
                self.mu[i], sigma2_s[i] * self.cov[i], method=method)
            for i in range(self.hparams.num_actions)
        ]
    except np.linalg.LinAlgError as e:
      # Sampling could fail if covariance is not positive definite
      print('Exception when sampling from {}.'.format(self.name))
      # Exceptions have no `.message` attribute on Python 3; format the
      # exception itself so this handler cannot raise AttributeError.
      print('Details: {} | {}.'.format(e, e.args))
      d = self.hparams.context_dim + 1
      beta_s = [
          np.random.multivariate_normal(np.zeros((d)), np.eye(d))
          for i in range(self.hparams.num_actions)
      ]

    # Compute sampled expected values, intercept is last component of beta
    vals = [
        np.dot(beta_s[i][:-1], context.T) + beta_s[i][-1]
        for i in range(self.hparams.num_actions)
    ]

    return np.argmax(vals)

  def update(self, context, action, reward):
    """Updates action posterior using the linear Bayesian regression formula.

    Args:
      context: Last observed context.
      action: Last observed action.
      reward: Last observed reward.
    """

    self.t += 1
    self.data_h.add(context, action, reward)

    # Update posterior of action with formulas: \beta | x,y ~ N(mu_q, cov_q)
    x, y = self.data_h.get_data(action)

    # The algorithm could be improved with sequential update formulas (cheaper)
    s = np.dot(x.T, x)

    # Some terms are removed as we assume prior mu_0 = 0.
    precision_a = s + self.lambda_prior * np.eye(self.hparams.context_dim + 1)
    cov_a = np.linalg.inv(precision_a)
    mu_a = np.dot(cov_a, np.dot(x.T, y))

    # Inverse Gamma posterior update
    a_post = self.a0 + x.shape[0] / 2.0
    b_upd = 0.5 * (np.dot(y.T, y) - np.dot(mu_a.T, np.dot(precision_a, mu_a)))
    b_post = self.b0 + b_upd

    # Store new posterior distributions
    self.mu[action] = mu_a
    self.cov[action] = cov_a
    self.precision[action] = precision_a
    self.a[action] = a_post
    self.b[action] = b_post

  @property
  def a0(self):
    """Shape parameter of the inverse Gamma prior on the noise variance."""
    return self._a0

  @property
  def b0(self):
    """Scale parameter of the inverse Gamma prior on the noise variance."""
    return self._b0

  @property
  def lambda_prior(self):
    """Precision of the Gaussian prior placed on each beta_i."""
    return self._lambda_prior


Overwriting bandits/algorithms/linear_full_posterior_sampling.py


We replicate the [quick start example](https://github.com/tensorflow/models/blob/archive/research/deep_contextual_bandits/example_main.py) from the Deep Contextual Bandits research repo below, focusing on just LinTS, and evaluating the cumulative reward at the end. Note that all the seeds are set, such that the cumulative reward should be the same when the code is run in the same environment.

In [3]:
%%writefile example_dcb.py

import os
import sys
import time

import numpy as np
import tensorflow as tf

from tensorflow.python.util import deprecation

from bandits.data.data_sampler import sample_mushroom_data
from bandits.core.contextual_bandit import run_contextual_bandit
from bandits.algorithms.linear_full_posterior_sampling import LinearFullPosteriorSampling

# When tf.contrib is not a plain module object, clear its `_warning` attribute.
# NOTE(review): presumably this silences the contrib deprecation notice so it
# does not end up in the captured output -- confirm against the TF version used.
if type(tf.contrib) is not type(tf):
    tf.contrib._warning = None

# Seed the legacy global NumPy RNG so runs in one environment are repeatable.
np.random.seed(42)

# Sampling method passed on to the algorithm through `hparams.method`.
random_option = sys.argv[1]

def sample_data(num_contexts):
    """Sample the mushroom bandit problem from `mushroom.data` next to this script.

    Args:
        num_contexts: Number of contexts, forwarded to `sample_mushroom_data`.

    Returns:
        Tuple `(dataset, opt_rewards, opt_actions, num_actions, context_dim)`
        where `num_actions` is 2 and `context_dim` is 117.
    """
    data_path = os.path.join(os.path.dirname(__file__), 'mushroom.data')
    dataset, optimal = sample_mushroom_data(data_path, num_contexts)
    opt_rewards, opt_actions = optimal

    # Fixed dimensions used for the mushroom problem.
    num_actions, context_dim = 2, 117
    return dataset, opt_rewards, opt_actions, num_actions, context_dim

# Problem parameters
num_contexts = 2000

# Create dataset
dataset, opt_rewards, opt_actions, num_actions, context_dim = sample_data(num_contexts)

# LinTS hyper-parameters; `seed` and `method` configure the selectable sampler.
hparams_linear = tf.contrib.training.HParams(num_actions=num_actions,
                                             context_dim=context_dim,
                                             a0=6,
                                             b0=6,
                                             lambda_prior=0.25,
                                             initial_pulls=2,
                                             seed=42,
                                             method=random_option)

algos = [
      LinearFullPosteriorSampling('LinFullPost', hparams_linear),
]
_, h_rewards = run_contextual_bandit(context_dim, num_actions, dataset, algos)

# Cumulative reward of the only algorithm in `algos`
# (presumably `h_rewards` has one column per algorithm -- verify against
# `run_contextual_bandit`). The printed value is what the caller parses.
reward = np.sum(h_rewards[:, 0])
print(reward)

Overwriting example_dcb.py


In [4]:
# Connection settings for the remote host; `test_envs` reads the keys
# `host_name`, `username` and `key_filename` from this mapping.
with open('ssh_config.json') as config_file:
    conf = json.load(config_file)

# Marker configuration for environments that run on this machine.
local_conf = {'host_name': 'localhost'}

# Local paths are expanded here; remote paths keep `$HOME` so the remote shell
# expands it.
local_env_root = os.path.expanduser('~/Tools/miniconda3/envs')
remote_env_root = '$HOME/Tools/miniconda3/envs'

# Each entry: (label, connection config, python interpreter to run the script).
envs = [
    ('mac_openblas', local_conf, local_env_root + '/dcb/bin/python'),
    ('mac_mkl', local_conf, local_env_root + '/dcb2/bin/python'),
    ('linux_openblas', conf, remote_env_root + '/dcb/bin/python'),
    ('linux_mkl', conf, remote_env_root + '/dcb2/bin/python'),
]

def test_envs(env_lis, script, option):
    all_vals = []
    for env_name, conf, python_exec in env_lis:
        print(f'Running {env_name}...')
        if conf['host_name'] == 'localhost':
            res = subprocess.run([python_exec, script, option], capture_output=True).stdout.decode('utf-8').strip()
            all_vals.append(eval(res))
        else:
            with paramiko.SSHClient() as ssh:
                ssh.load_system_host_keys()
                ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
                ssh.connect(hostname=conf['host_name'], username=conf['username'], key_filename=os.path.expanduser(conf['key_filename']))
                with ssh.open_sftp() as ftp:
                    ftp.put(script, f'/tmp/{script}')
                    ftp.put('mushroom.data', '/tmp/mushroom.data')
                    for root, _, files in os.walk('bandits'):
                        if not root.endswith('__pycache__'):
                            try:
                                ftp.mkdir(f'/tmp/{root}')
                            except OSError:
                                pass
                            for file in files:
                                local_file = os.path.join(root, file)
                                ftp.put(local_file, f'/tmp/{local_file}')

                _, stdout, stderr = ssh.exec_command(f'{python_exec} /tmp/{script} {option}', get_pty=True)

                # Log the stdout as it comes
                res = ''.join([line.strip() for line in stdout.readlines()])
                all_vals.append(eval(res))

    return all_vals, np.unique(all_vals, axis=0)

### Option 1
We use the default implementation, which uses `np.random.multivariate_normal`, and set the global seed to ensure reproducibility in a single environment. Note that this is the same as using `np.random.RandomState`, as the global seed sets the random state.

In [5]:
# Legacy global sampler (np.random.multivariate_normal).
test_envs(env_lis=envs, script='example_dcb.py', option='default')

Running mac_openblas...
Running mac_mkl...
Running linux_openblas...
Running linux_mkl...


([3830.0, 3925.0, 3590.0, 3360.0], array([3360., 3590., 3830., 3925.]))

The default implementation shows the problem: even though every seed is fixed, running the same code in different environments produces a different cumulative reward in each one.

### Option 2
We use the new `Generator` class with default parameters, which internally uses SVD for decomposition:

In [6]:
# Seeded Generator with method='svd'.
test_envs(env_lis=envs, script='example_dcb.py', option='svd')

Running mac_openblas...
Running mac_mkl...
Running linux_openblas...
Running linux_mkl...


([3830.0, 3865.0, 3730.0, 3605.0], array([3605., 3730., 3830., 3865.]))

Again, running this code on different environments produces different results.

### Option 3
We use Cholesky decomposition with the new `Generator` class. Our hypothesis is that this will produce reproducible results across different environments.

In [7]:
# Seeded Generator with method='cholesky'.
test_envs(env_lis=envs, script='example_dcb.py', option='cholesky')

Running mac_openblas...
Running mac_mkl...
Running linux_openblas...
Running linux_mkl...


([4025.0, 4025.0, 4025.0, 4025.0], array([4025.]))

As expected, using Cholesky decomposition leads to the same cumulative reward across all environments, alleviating the problem.