Social Curiosity Module implementation and MOA fixes #179

Merged
merged 539 commits into from
Mar 19, 2021
Changes from 250 commits
539 commits
37291c4
Linting using black library (https://pypi.org/project/black/).
internetcoffeephone Dec 6, 2019
01e3730
Updated flake8 to abide by black line length.
internetcoffeephone Dec 6, 2019
7c40b83
Fixed rollout.py to work with new config file/args.
internetcoffeephone Dec 6, 2019
d99baf4
Change test_envs action_space reference from agent to env.
internetcoffeephone Dec 6, 2019
5a4f2c3
Removed unused/undefined variables.
internetcoffeephone Dec 6, 2019
e339ac4
Updated memory experiment run script.
internetcoffeephone Dec 6, 2019
3fef2de
Merge remote-tracking branch 'origin/curiosity' into curiosity
internetcoffeephone Dec 6, 2019
ecca1da
Fixed the filename string of the file generated in test_rollout.py
internetcoffeephone Dec 6, 2019
470f1ec
Merge remote-tracking branch 'origin/curiosity' into curiosity
internetcoffeephone Dec 6, 2019
c7a1c18
Removed deprecated key from travis.yml
internetcoffeephone Dec 6, 2019
74e67bf
Did TODO: original dimension size calculation instead of hardcoded va…
internetcoffeephone Dec 7, 2019
b51be1b
Linting.
internetcoffeephone Dec 7, 2019
4f35423
Made trainer name more clear in a3c_aux. No longer uses model name.
internetcoffeephone Dec 9, 2019
6824e10
Fixed agents being able to walk through switches or closed doors.
internetcoffeephone Dec 9, 2019
e46a1be
Changed rgb_arr so that it no longer has to be re-initialized on ever…
internetcoffeephone Dec 10, 2019
ff252b3
Added comment in plot_results.
internetcoffeephone Dec 10, 2019
f237f14
Black linting.
internetcoffeephone Dec 10, 2019
cc6cd57
Added isort and pre-commit.
internetcoffeephone Dec 10, 2019
cb621de
Added pyproject.toml to configure Black.
internetcoffeephone Dec 10, 2019
1b4e7fd
isort linting. Added isort.cfg
internetcoffeephone Dec 10, 2019
b43316e
Updated pyproject.toml config file to allow line length 101.
internetcoffeephone Dec 11, 2019
3453a43
Added lr to default_args and training scripts for debugging purposes.
internetcoffeephone Dec 11, 2019
a6c2f1c
Changed --use_gpu_for_driver to default to False.
internetcoffeephone Dec 12, 2019
e92e4d8
Increased default num_envs_per_worker. This is more efficient as it b…
internetcoffeephone Dec 12, 2019
05c5d12
Removed needless casts, enforced float32 across all parameters.
internetcoffeephone Dec 12, 2019
8c48d93
Added simple experiment runfile, used for local debugging.
internetcoffeephone Dec 16, 2019
8d17e80
Merged train_moa and train_curiosity scripts.
internetcoffeephone Dec 23, 2019
96f3602
Fixed a bug where conv_to_fcnet_v2.py would not use the hidden layers…
internetcoffeephone Dec 30, 2019
e075275
Fixed incorrect comments in curiosity_model.py
internetcoffeephone Jan 1, 2020
59b1c7d
Locked pytz requirement version.
internetcoffeephone Jan 1, 2020
275a861
Removed unused import.
internetcoffeephone Jan 1, 2020
176d8cf
Renamed conv_to_fcnet_v2.py to baseline_model.py, fixed it to work.
internetcoffeephone Jan 1, 2020
abc1dd8
Fixed A3C_aux curiosity trainer_name bug. It was not assigned in the …
internetcoffeephone Jan 1, 2020
9fb6475
Added support for A3C baseline training.
internetcoffeephone Jan 1, 2020
26c29ae
Added IMPALA baseline training support.
internetcoffeephone Jan 1, 2020
8b253ad
Updated simple_exp.py.
internetcoffeephone Jan 1, 2020
8f5cf3e
Added run_baseline_cleanup.sh training script.
internetcoffeephone Jan 1, 2020
d760797
Updated ray to 0.8.0, specified extra [rllib] dependencies.
internetcoffeephone Jan 2, 2020
aed2dde
Changed default sample/train batch size to 1000, default algorithm to…
internetcoffeephone Jan 2, 2020
2166a14
Updated run_baseline_cleanup.sh script to fix batch size/gpu issues.
internetcoffeephone Jan 2, 2020
a735579
Fixed float casting bug.
internetcoffeephone Jan 5, 2020
1a558a7
Updated moa_weight to behave consistently with aux_loss_weight.
internetcoffeephone Jan 5, 2020
6e77755
Implemented aux ppo.
internetcoffeephone Jan 5, 2020
96ad9dd
Renamed ppo_causal to ppo_aux.
internetcoffeephone Jan 5, 2020
fa4f7dc
Removed unused function/import that wasn't common, but specific to PPO.
internetcoffeephone Jan 6, 2020
db52757
Removed grid_search from parameters - these would cause deep copy to …
internetcoffeephone Jan 6, 2020
dec1cea
Simplified moa_model lines, functionality is equivalent.
internetcoffeephone Jan 8, 2020
5895d87
Renamed curriculum to schedule. These parameters were schedules all a…
internetcoffeephone Jan 10, 2020
4101799
Set default memory parameter to None, as ray can auto-detect this value.
internetcoffeephone Jan 10, 2020
346ae90
Fixed default memory parameter casting error.
internetcoffeephone Jan 10, 2020
d9f977c
Added run_baseline_switch.sh
internetcoffeephone Jan 10, 2020
43118db
Fixed typo.
internetcoffeephone Jan 10, 2020
933ab95
Changed num_cpus/num_gpus into more specific arguments.
internetcoffeephone Jan 12, 2020
9993235
Added ppo_sgd_minibatch_size as a config parameter.
internetcoffeephone Jan 12, 2020
3b1d240
Fixed wrongly named parameters in run scripts.
internetcoffeephone Jan 12, 2020
aeb8d59
Fixed gpus parameter to allow for fractional values.
internetcoffeephone Jan 12, 2020
b98e5ca
run_baseline_cleanup.sh updated to reflect optimal (speed-wise) test …
internetcoffeephone Jan 12, 2020
2eab6e5
Added uint8 preprocessor, and changed observations to uint8.
internetcoffeephone Jan 19, 2020
094be96
Fixed comment typo.
internetcoffeephone Jan 19, 2020
eb7f2de
Changed chars in map_env to actual chars, using 4x less memory.
internetcoffeephone Jan 20, 2020
eb904d0
Removed unused models. Moved KerasRNN to its own file: lstm.py.
internetcoffeephone Jan 20, 2020
cd50625
Converted forgotten strings to chars.
internetcoffeephone Jan 21, 2020
6653619
Combined rotation and map to color functions to prevent allocation of…
internetcoffeephone Jan 21, 2020
21fc9c2
Added forgotten converted char bytes in cleanup.py.
internetcoffeephone Jan 21, 2020
28d4169
Added rotation unit test for new combined rotation/map to color funct…
internetcoffeephone Jan 21, 2020
64153b7
Removed double opencv requirement, only one should be present at any …
internetcoffeephone Jan 21, 2020
46a0e9f
Attempt to fix test_rollout.py
internetcoffeephone Jan 21, 2020
1e0a6e2
Attempt #2 to fix test_rollout
internetcoffeephone Jan 21, 2020
9b44f56
Attempt #3 to fix test_rollout on travis.
internetcoffeephone Jan 21, 2020
ca1af2d
Fixed rollout.py to work with new env creation.
internetcoffeephone Jan 21, 2020
32bfd7d
Attempt number 4 to fix test_rollout.
internetcoffeephone Jan 21, 2020
eff1f7a
Upgrade to ray 0.8.3.
internetcoffeephone Mar 28, 2020
e2686c3
Update requirements to use newest versions of all libraries.
internetcoffeephone Mar 28, 2020
d3735ee
Remove libraries from requirements.txt that are already installed thr…
internetcoffeephone Mar 28, 2020
d3099e3
Remove replace_rnn_sequencing.py as the custom change has been merged…
internetcoffeephone Mar 30, 2020
b6d85da
Change sample_batch_size to rollout_fragment_length in run scripts, t…
internetcoffeephone Mar 30, 2020
410ab2b
Fix unrecognized webui_host.
internetcoffeephone Mar 30, 2020
40bf92f
Remove parameters incompatible with newest version of ray in ppo_aux.py.
internetcoffeephone Apr 2, 2020
242e18b
Change erroneous cast from uint8 to int32.
internetcoffeephone Apr 2, 2020
fc9a676
Refactor observation_space out of individual environments and into th…
internetcoffeephone Apr 8, 2020
136f72f
Removed custom preprocessor, moved uint8->float32 conversion to model…
internetcoffeephone Apr 10, 2020
ecf70df
Fix tests failing due to removed/added arguments in environment creat…
internetcoffeephone Apr 10, 2020
f0a46c2
Simplify casting logic, add names to layers for debugging, change str…
internetcoffeephone Apr 13, 2020
5141667
Fix moa to work with num_envs > 1.
internetcoffeephone Apr 13, 2020
447b1b0
Change padding order so that agents' own actions come first in counte…
internetcoffeephone Apr 13, 2020
f6abd4b
Input actions to moa as one hot rather than their absolute value. Sim…
internetcoffeephone Apr 14, 2020
3f6b8a5
Add ray patch script for incorrect hardcoded float32 values. Temporar…
internetcoffeephone Apr 14, 2020
ae3b604
Remove duplicate line.
internetcoffeephone Apr 14, 2020
d56bcbb
Refactor moa_model.py in preparation of scm.
internetcoffeephone Apr 14, 2020
9f08a95
Rewrite curiosity model into SCM, SCM is not functional yet. Refactor…
internetcoffeephone Apr 19, 2020
7e46737
Add run_baseline_harvest.sh.
internetcoffeephone Apr 19, 2020
cba6085
Fix wrong default_args.py commit.
internetcoffeephone Apr 19, 2020
59095c5
Set final baseline cleanup experiment parameters.
internetcoffeephone Apr 20, 2020
a7e6427
Add bounds check to agent action, and simplify bounds check expressio…
internetcoffeephone Apr 20, 2020
13caf24
Remove agent.get_pos() function, replace with direct property access.
internetcoffeephone Apr 20, 2020
89b9c69
Deduplicate code.
internetcoffeephone Apr 21, 2020
ac9bd34
Change switch tile representation from s,S to w,W.
internetcoffeephone Apr 21, 2020
6f88f5f
Prevent DEFAULT_COLORS from changing when using multiple envs.
internetcoffeephone Apr 21, 2020
1141fe3
Fix typos.
internetcoffeephone Apr 21, 2020
d15ade2
Move view_len initialization to map_env.
internetcoffeephone Apr 22, 2020
1189fbd
Separate the logical map from the color map.
internetcoffeephone Apr 22, 2020
2c3de24
Change world color map updating so that firing beams cover agents
internetcoffeephone Apr 22, 2020
ea3f552
Fix wrong beam comparison, it is now correctly compared to the tile b…
internetcoffeephone Apr 24, 2020
41fec1e
Optimize spawning of apples/waste in harvest and cleanup.
internetcoffeephone Apr 24, 2020
e2f40df
Add environment profiling scripts.
internetcoffeephone Apr 24, 2020
892967a
Update MOA run script parameters for use on remote machine.
internetcoffeephone Apr 27, 2020
80230da
Remove aux generalization, parameters now refer explicitly to MOA or …
internetcoffeephone Apr 27, 2020
d5c85ef
Fix hparams that were incorrectly renamed.
internetcoffeephone Apr 27, 2020
7493e93
Change plotting so that only reward plots have a fixed bottom of 0.
internetcoffeephone Apr 27, 2020
44181ba
Fix run scripts to incorporate the earlier aux -> moa/influence hpara…
internetcoffeephone Apr 27, 2020
43225d7
Add SCM hparams.
internetcoffeephone Apr 27, 2020
34d2b08
Change default args to use 8 num_envs_per_worker and default env to c…
internetcoffeephone Apr 27, 2020
8329bad
Remove superfluous NotImplementedError.
internetcoffeephone Apr 27, 2020
c4680f0
Remove superfluous parameter in scm_model.py.
internetcoffeephone Apr 27, 2020
7abdd06
Rename postprocessed_input to preprocessed_input in moa/scm models.
internetcoffeephone Apr 27, 2020
29290a2
Moved add_time_dimension to last possible moment for baseline_model.py.
internetcoffeephone Apr 27, 2020
7483394
Rename lstm.py to actor_critic_lstm.py to better reflect its contents.
internetcoffeephone Apr 28, 2020
dac0408
Refactor CNN/FC layer construction into their own methods.
internetcoffeephone Apr 28, 2020
a59b803
Refactor CNN/FC layer construction into common_layers.py, both baseli…
internetcoffeephone Apr 28, 2020
00a0ca3
"Hide" unused return value with _.
internetcoffeephone Apr 28, 2020
637aad8
Add several SCM hparams to train.py. Incomplete.
internetcoffeephone Apr 28, 2020
0b5e88d
SCM model building. Incomplete.
internetcoffeephone Apr 28, 2020
48dad39
Extra comments in moa_model.py to clarify model evaluation architecture.
internetcoffeephone Apr 30, 2020
85592d4
Change hparams of moa experiment run scripts. Reward schedule is unce…
internetcoffeephone Apr 30, 2020
712e3f8
Remove unused dictionary, simplify observation_space in map_env.py.
internetcoffeephone May 1, 2020
d068ae6
Change string occurrences of "total_influence_reward" with SOCIAL_INF…
internetcoffeephone May 1, 2020
daeb9a0
Clarified method description for _reshaped_one_hot_actions.
internetcoffeephone May 1, 2020
9397661
Fix broken plotting, add extrinsic_reward to metrics to be plotted.
internetcoffeephone May 1, 2020
dd551f7
Change MOA reward evaluation to be done in moa_model.py forward rathe…
internetcoffeephone May 4, 2020
3abbbd0
Rename trajectory to sample_batch in common_funcs_moa.py to enforce c…
internetcoffeephone May 5, 2020
b6077a0
Add comment to moa_model.py. Unpack unused elements with *_ instead o…
internetcoffeephone May 5, 2020
0d1c48d
Fix incorrect loss reporting so that weight is no longer applied twice.
internetcoffeephone May 5, 2020
3f2c6a2
Add model names, useful when debugging.
internetcoffeephone May 5, 2020
ebb589d
Update ray[rllib] requirement from 0.8.3 to 0.8.4.
internetcoffeephone May 5, 2020
1b35fb6
Make capitalization consistent in run_scripts.
internetcoffeephone May 5, 2020
27a61da
Change train.py so that local mode is automatically turned on upon de…
internetcoffeephone May 5, 2020
9a4e958
Fix comment.
internetcoffeephone May 5, 2020
8d1770e
Parameterize EXTRINSIC_REWARD dictionary key. Update a previously mis…
internetcoffeephone May 5, 2020
7d660cd
Complete SCM, it now runs without errors. Reshuffle functions to call…
internetcoffeephone May 5, 2020
51275dd
Change auto-local mode so that it no longer relies on __debug__ but o…
internetcoffeephone May 6, 2020
f5015c5
Rename provile.sh to profile_env.sh.
internetcoffeephone May 6, 2020
85d98aa
Add script for profiling train.py initialization.
internetcoffeephone May 6, 2020
6b90939
Remove unused run script parameter in run_baseline_switch.sh
internetcoffeephone May 8, 2020
2a18385
Clarify default_args.py comment.
internetcoffeephone May 8, 2020
d4e9680
Change --exp_name parameter to be None by default, change automatical…
internetcoffeephone May 8, 2020
c3c24c5
Remove commented out code.
internetcoffeephone May 8, 2020
23037e8
Add method description to batched_mse in scm_model.py
internetcoffeephone May 8, 2020
1fef555
Stop SCM backpropagation through MOA LSTM output in scm_model.py
internetcoffeephone May 8, 2020
00aa359
Add scm_forward_vs_inverse_loss_weight and moa/scm config validation.
internetcoffeephone May 8, 2020
d844684
Remove unused experiment.
internetcoffeephone May 8, 2020
37aa9e7
Remove two more unused experiments.
internetcoffeephone May 8, 2020
30aa230
Add SCM run scripts.
internetcoffeephone May 8, 2020
67cd736
Move cluster experiments to their own subdirectory in run_scripts.
internetcoffeephone May 8, 2020
526ef4a
Add code that generates LaTeX tables from run_scripts parameters.
internetcoffeephone May 11, 2020
9745a4f
Merge upstream changes.
internetcoffeephone May 11, 2020
6120f87
Merge curiosity into master.
internetcoffeephone May 11, 2020
aeea1b5
Save debug experiments in their own folder in ray_results.
internetcoffeephone May 12, 2020
ae53264
Move test_map to test_envs.py.
internetcoffeephone May 12, 2020
2ed5a46
Remove unused moa losses fetch, saving memory.
internetcoffeephone May 12, 2020
26abd73
Remove unused encoded observation scm losses fetch.
internetcoffeephone May 12, 2020
a49315c
Put __main__ code into a function, so that it can be profiled. Put un…
internetcoffeephone May 13, 2020
e482ead
Add DiscreteWithDType class so that map_env.action_space can have the…
internetcoffeephone May 13, 2020
d9b6cfe
Add to ray patch script to make it use correct sample dtypes for obse…
internetcoffeephone May 13, 2020
d3f1c29
Remove unused variable. rgb_arr is now a view object on the fully dra…
internetcoffeephone May 14, 2020
4621c00
Lint.
internetcoffeephone May 14, 2020
fc1b4e8
Disable ray dashboard/webui because it leaks memory.
internetcoffeephone May 14, 2020
4c1a315
Group default_args.py by type, fix --num_envs_per_worker dtype (from …
internetcoffeephone May 14, 2020
f626a6b
Update and expand setup/run instructions, fix broken link. Add credit…
internetcoffeephone May 18, 2020
94d78c6
Fix typo and link formatting.
internetcoffeephone May 18, 2020
f0fb009
Fix patch script typo.
internetcoffeephone May 18, 2020
18f3156
Change setup patch script to accommodate all versions of Python.
internetcoffeephone May 18, 2020
0b1a905
Escape spaces in ray patch script.
internetcoffeephone May 18, 2020
714d278
Add a fix for slow initialization speed to ray patch.
internetcoffeephone May 18, 2020
d497405
Change stop_at_episode_reward_min default value in default_args.py to…
internetcoffeephone May 19, 2020
7f41cfd
Refactor train.py to more cleanly separate the purpose of functions.
internetcoffeephone May 19, 2020
7055b3e
Modify the args train_batch_size and ppo_sgd_minibatch_size to be aut…
internetcoffeephone May 19, 2020
c94787a
Modify run scripts to use rollout_fragment_length=64 and num_envs_per…
internetcoffeephone May 19, 2020
8cdcb16
Remove ppo_sgd_minibatch_size from run script, as it is now automatic…
internetcoffeephone May 19, 2020
c3e1d9a
Add train_multiple_experiments.py, which can train run multiple diffe…
internetcoffeephone May 19, 2020
61d97e3
Fix incorrect stats fetch in ppo_scm.py.
internetcoffeephone May 20, 2020
600b700
Disambiguate mixin function so that calls from outside the class are …
internetcoffeephone May 20, 2020
2e03fc7
Add plottable stats to plot_results.py.
internetcoffeephone May 20, 2020
10be324
Fix bug where in case of automatic train_batch_size calculation, it w…
internetcoffeephone May 21, 2020
6644762
Change scm_forward_vs_inverse_loss_weight to value used by Burda et al.
internetcoffeephone May 21, 2020
bb57677
Revert erroneous moa_loss_weight change from https://github.com/inter…
internetcoffeephone May 21, 2020
cbaef1b
Add --resume arg to default_args, experiments can now be resumed.
internetcoffeephone May 21, 2020
261f954
Remove newlines from args while parsing args in train_multiple_experi…
internetcoffeephone May 21, 2020
5db37e2
Add docstring for update_nested_dict.
internetcoffeephone May 29, 2020
a4e2099
Upgrade ray from 0.8.4 to 0.8.5 to fix tensorflow device assignment e…
internetcoffeephone May 29, 2020
840864f
Replace tune hparam grid search with population-based training. Enabl…
internetcoffeephone Jun 1, 2020
fbb9a5c
Update run scripts for tuning.
internetcoffeephone Jun 2, 2020
530560e
Reduce checkpoint frequency, a checkpoint every 50 iterations is too …
internetcoffeephone Jun 2, 2020
7fe7d71
Add baseline experiments to train_multiple_experiments.py default val…
internetcoffeephone Jun 2, 2020
5be4cff
Enable unlimited retries on experiments that give errors.
internetcoffeephone Jun 4, 2020
3e99cf0
Re-add lr_schedule and influence_schedule to moa run scripts.
internetcoffeephone Jun 4, 2020
6f370cc
Change checkpoint frequency to 100, 500 was too few in practice.
internetcoffeephone Jun 4, 2020
153a498
Fix MOA loss reporting in a3c_moa.py by preventing moa_loss_weight fr…
internetcoffeephone Jun 9, 2020
b9d679d
Change PPO vf_loss_coeff hparam from 1e-4 to 0.5, which gives higher …
internetcoffeephone Jun 9, 2020
63f9074
Fix rotation view bug.
internetcoffeephone Jun 10, 2020
6d3910a
Fix bug where agent visibility was not calculated correctly.
internetcoffeephone Jun 10, 2020
ce7cf6c
Add test_agent_visibility to test_envs.py.
internetcoffeephone Jun 10, 2020
f6eda77
Change plot_results.py from using transparency to using a different l…
internetcoffeephone Jun 10, 2020
d9c082e
Set vf_share_layers to be True in the base PPO config dict. Previousl…
internetcoffeephone Jun 10, 2020
f0d1fa2
Fix plot_results.py unhandled exception error, remove try/except.
internetcoffeephone Jun 10, 2020
0fd8331
Remove old comment that is no longer applicable. Ray switched its pro…
internetcoffeephone Jun 10, 2020
694df1f
Add single env multiple model plotting. plot_results.py now draws the…
internetcoffeephone Jun 10, 2020
70e4495
Change plot_results.py filename outputs to include env and model wher…
internetcoffeephone Jun 10, 2020
cfc768c
Rename SOCIAL_INFLUENCE_REWARD parameter from total_influence_reward …
internetcoffeephone Jun 16, 2020
0314419
Removed ALL_ACTIONS from sample_batch. The only way in which it was s…
internetcoffeephone Jun 16, 2020
0275ec4
Fix influence reward off-by-one error.
internetcoffeephone Jun 16, 2020
aa29b7d
Rename variables in moa_model.py for clarity.
internetcoffeephone Jun 16, 2020
0bc5c7d
Fix bug where MOA would use wrong observation: it would use the obser…
internetcoffeephone Jun 16, 2020
227e3be
Fix bug where wrong agent visibility was used for calculating the inf…
internetcoffeephone Jun 23, 2020
766129c
Add default value for agent.prev_visible_agents.
internetcoffeephone Jun 23, 2020
33c00ab
Remove ternary operator, as this is ambiguous with arrays and throws …
internetcoffeephone Jun 23, 2020
6df722d
Fix off-by-one error in MOA loss calculation.
internetcoffeephone Jun 23, 2020
37ff551
Fix marginalized action probability calculation.
internetcoffeephone Jun 24, 2020
3f732f8
Clean up unused public variables in scm_model.py.
internetcoffeephone Jun 24, 2020
4045502
Refactor scm_model.py method so that it becomes static.
internetcoffeephone Jun 24, 2020
f2774e3
Add missing parameters, needed due to create_action_input_layer stati…
internetcoffeephone Jun 24, 2020
ef5ae78
Fix off-by-one error in curiosity reward. Make comments more clear in…
internetcoffeephone Jun 24, 2020
b7aa922
Change ppo_sgd_minibatch_size and PPO vf_loss_coeff to the values pre…
internetcoffeephone Jun 24, 2020
e978c73
Fix influence reward calculation.
internetcoffeephone Jun 25, 2020
860f886
Add a small mountain of docstrings. Some minor comment changes, and a…
internetcoffeephone Jun 25, 2020
93c3b96
Add color view checks to test_envs.py.
internetcoffeephone Jun 30, 2020
0526be1
Change plotting to only run if __name__ == "__main__".
internetcoffeephone Jun 30, 2020
851b4a7
Updated checkpoint rendering to work with new models/new version of ray.
internetcoffeephone Jun 30, 2020
6ad4926
Move video upscaling to correct place in rendering chain.
internetcoffeephone Jul 1, 2020
67e29a7
Clarify patch script with comments.
internetcoffeephone Jul 2, 2020
26bfe33
Change ray_autoscale.yaml default parameters.
internetcoffeephone Jul 2, 2020
18b6f70
Update run scripts for final experiments.
internetcoffeephone Jul 30, 2020
742af66
Change hparam tuning to tune minimal sets of hparams.
internetcoffeephone Jul 30, 2020
eba9bb5
Improve plotting.
internetcoffeephone Aug 13, 2020
e7eaeab
Update run_baseline_* scripts to reflect final experiment settings.
internetcoffeephone Aug 14, 2020
1c7a80c
Change latex table generation to only output ssd experiments.
internetcoffeephone Aug 18, 2020
7abf8f6
Change individual reward plot colors so that each model has its own c…
internetcoffeephone Aug 18, 2020
8b7fde6
Fix hparam plot legend to show both mean and individual experiments.
internetcoffeephone Aug 18, 2020
a748dd1
Change latex hparam table generation to correctly center overfull tab…
internetcoffeephone Aug 18, 2020
0106ef5
Add sliding window means/confidence intervals to plotting.
internetcoffeephone Sep 2, 2020
d10f28e
Change plotting to print a warning and use a default color instead of…
internetcoffeephone Sep 3, 2020
74a529e
Add arg option to train using collective reward.
internetcoffeephone Sep 3, 2020
4629636
Change rollout_fragment_length default value to 1000.
internetcoffeephone Sep 3, 2020
0ad2c28
Remove small_model arg, no longer needed.
internetcoffeephone Sep 3, 2020
66ef439
Update README.md.
internetcoffeephone Sep 4, 2020
5cdd605
Print progress while plotting.
internetcoffeephone Sep 9, 2020
9f43bcf
Change plotting to calculate 99.5% confidence interval rather than 2 …
internetcoffeephone Sep 10, 2020
45f0aae
Change standard deviation calculation to take into account that it's …
internetcoffeephone Sep 11, 2020
9c53fc3
Simplify plotting labels by removing CI from legend.
internetcoffeephone Sep 24, 2020
4aa693f
Refactor plotting code.
internetcoffeephone Sep 27, 2020
71324ab
Merge branch 'master' into master
eugenevinitsky Nov 19, 2020
1 change: 1 addition & 0 deletions .flake8
@@ -1,2 +1,3 @@
[flake8]
max-line-length = 101
extend-ignore = E203 # See https://github.com/PyCQA/pycodestyle/issues/373
6 changes: 6 additions & 0 deletions .isort.cfg
@@ -0,0 +1,6 @@
[settings]
line_length = 101
multi_line_output = 3
include_trailing_comma = True
known_third_party = cv2,gym,matplotlib,numpy,pandas,pytz,ray,setuptools
use_parentheses=True
18 changes: 18 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,18 @@
repos:
- repo: https://github.com/asottile/seed-isort-config
rev: v1.9.3
hooks:
- id: seed-isort-config
- repo: https://github.com/pre-commit/mirrors-isort
rev: v4.3.21
hooks:
- id: isort
- repo: https://github.com/ambv/black
rev: stable
hooks:
- id: black
language_version: python3.6
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v2.3.0
hooks:
- id: flake8
25 changes: 3 additions & 22 deletions .travis.yml
@@ -3,34 +3,16 @@ language: python
cache: pip

python:
- "3.5"
- "3.6.8"

os: linux

dist: trusty

sudo: required

before_install:
- sudo apt-get update
# Setup conda (needed for opencv, ray dependency)
# WARNING: enforces py3.5
- wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh;
- bash miniconda.sh -b -p $HOME/miniconda
- export PATH="$HOME/miniconda/bin:$PATH"
- hash -r
- conda config --set always_yes yes --set changeps1 no
- conda update -q conda
- conda info -a
- python -V

# Set up requirements for running tests
- conda env create -f environment.yml
- source activate causal
dist: bionic

install:
- pip install flake8 .
- pip install pytest
- pip install -r requirements.txt

before_script:
- flake8 --version
@@ -39,4 +21,3 @@ before_script:
script:
- python setup.py install
- python -m pytest

41 changes: 32 additions & 9 deletions README.md
@@ -3,7 +3,7 @@
# Sequential Social Dilemma Games
This repo is an open-source implementation of DeepMind's Sequential Social Dilemma (SSD) multi-agent game-theoretic environments [[1]](https://arxiv.org/abs/1702.03037). SSDs can be thought of as analogous to spatially and temporally extended Prisoner's Dilemma-like games. The reward structure poses a dilemma because individual short-term optimal strategies lead to poor long-term outcomes for the group.

The implemented environments are structured to be compatible with OpenAIs gym environments (https://github.com/openai/gym) as well as RLlib's Multiagent Environment (https://github.com/ray-project/ray/blob/master/python/ray/rllib/env/multi_agent_env.py)
The implemented environments are structured to be compatible with [OpenAI's gym environments](https://github.com/openai/gym) as well as [RLlib's Multiagent Environment](https://github.com/ray-project/ray/blob/master/rllib/env/multi_agent_env.py)

## Implemented Games

@@ -29,15 +29,38 @@ The above plot shows the empirical Schelling diagrams for both Cleanup (A) and Harvest (B)


# Setup instructions
* Create `causal` virtual environment: `conda env create -n causal environment.yml`
* Run `python setup.py develop`
* Activate your environment by running `source activate causal`, or `conda activate causal`.
```
git clone -b master https://github.com/internetcoffeephone/sequential_social_dilemma_games
cd sequential_social_dilemma_games
python3 -m venv venv # Create a Python virtual environment
. venv/bin/activate
pip3 install --upgrade pip setuptools wheel
python3 setup.py develop
pip3 install -r requirements.txt
. ray_uint8_patch.sh # Ray patch due to https://github.com/ray-project/ray/issues/7946
cd run_scripts
```

After the setup, you can run experiments like so:
- To train with default parameters (baseline model cleanup with 2 agents):
`python3 train.py`

- To train the MOA with 5 agents:
`python3 train.py --model moa --num_agents 5`

Many more options are available which can be found in [default_args.py](config/default_args.py). A collection of preconfigured training scripts can be found in [run_scripts](run_scripts).

Note that the initialization time grows with the number of agents and can be rather high (up to 12 minutes), possibly due to a [Ray bug](https://github.com/ray-project/ray/issues/5982#issuecomment-629217172).

# CUDA, cuDNN and tensorflow-gpu

To then set up the branch of Ray on which we have built the causal influence code, clone the repo to your desired folder:
`git clone https://github.com/natashamjaques/ray.git`.
If you run into any CUDA errors, make sure you have a [compatible set](https://www.tensorflow.org/install/source#tested_build_configurations) of CUDA/cuDNN/TensorFlow versions installed. However, beware of the following:
>The compatibility table given in the tensorflow site does not contain specific minor versions for cuda and cuDNN. However, if the specific versions are not met, there will be an error when you try to use tensorflow. [source](https://stackoverflow.com/a/53727997)

Next, go to the rllib folder:
` cd ray/python/ray/rllib ` and run the script `python setup-rllib-dev.py`. This will copy the rllib folder into the pip install of Ray and allow you to use the version of RLlib that is in your local folder by creating a softlink.
A configuration that works for me is:
- CUDA 10.1.105
- cuDNN 7.6.5
- tensorflow-gpu 2.1.0 (this is automatically installed with the above setup script, see [requirements.txt](requirements.txt))

# Tests
Tests are located in the test folder and can be run individually or all at once with `python -m pytest`. Many of the less obvious rules of the games can be understood by reading the tests, each of which outlines some aspect of the game.
@@ -65,4 +88,4 @@ Every environment that subclasses MapEnv probably needs to implement the following

# Contributors

This code base was developed by Eugene Vinitsky and Natasha Jaques; help with reproduction was provided by Joel Leibo, Antonio Castenada, and Edward Hughes.
This code base was developed by Eugene Vinitsky and Natasha Jaques; help with reproduction was provided by Joel Leibo, Antonio Castenada, and Edward Hughes. Additional development was done by Hugo Heemskerk.
Empty file added algorithms/__init__.py
Empty file.
17 changes: 17 additions & 0 deletions algorithms/a3c_baseline.py
@@ -0,0 +1,17 @@
from __future__ import absolute_import, division, print_function

from ray.rllib.agents.a3c.a3c import get_policy_class, make_async_optimizer, validate_config
from ray.rllib.agents.a3c.a3c_tf_policy import A3CTFPolicy
from ray.rllib.agents.trainer_template import build_trainer


def build_a3c_baseline_trainer(config):
a3c_trainer = build_trainer(
name="A3C",
default_config=config,
default_policy=A3CTFPolicy,
get_policy_class=get_policy_class,
validate_config=validate_config,
make_policy_optimizer=make_async_optimizer,
)
return a3c_trainer
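
For orientation, the trainer class returned by `build_a3c_baseline_trainer` is ultimately handed to `ray.tune`. The snippet below is a minimal usage sketch, not the PR's actual `train.py`; the `"cleanup_env"` registration name, worker counts, and stop criterion are illustrative assumptions.

```python
# Hypothetical usage sketch; the real wiring lives in run_scripts/train.py.
import ray
from ray import tune
from ray.rllib.agents.a3c.a3c import DEFAULT_CONFIG

from algorithms.a3c_baseline import build_a3c_baseline_trainer

ray.init()

# Start from the stock A3C config and point it at an already-registered
# multi-agent env (assumed registration name "cleanup_env").
config = DEFAULT_CONFIG.copy()
config["env"] = "cleanup_env"
config["num_workers"] = 2
config["rollout_fragment_length"] = 1000

trainer_cls = build_a3c_baseline_trainer(config)

# Stop criterion is illustrative only.
tune.run(trainer_cls, config=config, stop={"training_iteration": 10})
```
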
164 changes: 164 additions & 0 deletions algorithms/a3c_moa.py
@@ -0,0 +1,164 @@
"""Note: Keep in sync with changes to VTraceTFPolicy."""

from __future__ import absolute_import, division, print_function

from ray.rllib.agents.a3c.a3c import validate_config
from ray.rllib.agents.a3c.a3c_tf_policy import postprocess_advantages
from ray.rllib.agents.trainer_template import build_trainer
from ray.rllib.evaluation.postprocessing import Postprocessing
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.policy.tf_policy import LearningRateSchedule
from ray.rllib.policy.tf_policy_template import build_tf_policy
from ray.rllib.utils import try_import_tf
from ray.rllib.utils.explained_variance import explained_variance
from ray.rllib.utils.tf_ops import make_tf_callable

from algorithms.common_funcs_moa import (
EXTRINSIC_REWARD,
SOCIAL_INFLUENCE_REWARD,
get_moa_mixins,
moa_fetches,
moa_postprocess_trajectory,
setup_moa_loss,
setup_moa_mixins,
)

tf = try_import_tf()


class A3CLoss(object):
def __init__(
self, action_dist, actions, advantages, v_target, vf, vf_loss_coeff=0.5, entropy_coeff=0.01,
):
log_prob = action_dist.logp(actions)

# The "policy gradients" loss
self.pi_loss = -tf.reduce_sum(log_prob * advantages)

delta = vf - v_target
self.vf_loss = 0.5 * tf.reduce_sum(tf.square(delta))
self.entropy = tf.reduce_sum(action_dist.entropy())
self.total_loss = self.pi_loss + self.vf_loss * vf_loss_coeff - self.entropy * entropy_coeff


def postprocess_a3c_moa(policy, sample_batch, other_agent_batches=None, episode=None):
"""Adds the policy logits, VF preds, and advantages to the trajectory."""

batch = moa_postprocess_trajectory(policy, sample_batch)
batch = postprocess_advantages(policy, batch)
return batch


def actor_critic_loss(policy, model, dist_class, train_batch):
logits, _ = model.from_batch(train_batch)
action_dist = dist_class(logits, model)
policy.loss = A3CLoss(
action_dist,
train_batch[SampleBatch.ACTIONS],
train_batch[Postprocessing.ADVANTAGES],
train_batch[Postprocessing.VALUE_TARGETS],
model.value_function(),
policy.config["vf_loss_coeff"],
policy.config["entropy_coeff"],
)

moa_loss = setup_moa_loss(logits, policy, train_batch)
policy.loss.total_loss += moa_loss.total_loss

# store this for future statistics
policy.moa_loss = moa_loss.total_loss

return policy.loss.total_loss


def add_value_function_fetch(policy):
fetch = {SampleBatch.VF_PREDS: policy.model.value_function()}
fetch.update(moa_fetches(policy))
return fetch


class ValueNetworkMixin(object):
def __init__(self):
@make_tf_callable(self.get_session())
def value(ob, prev_action, prev_reward, *state):
model_out, _ = self.model(
{
SampleBatch.CUR_OBS: tf.convert_to_tensor([ob]),
SampleBatch.PREV_ACTIONS: tf.convert_to_tensor([prev_action]),
SampleBatch.PREV_REWARDS: tf.convert_to_tensor([prev_reward]),
"is_training": tf.convert_to_tensor(False),
},
[tf.convert_to_tensor([s]) for s in state],
tf.convert_to_tensor([1]),
)
return self.model.value_function()[0]

self._value = value


def stats(policy, train_batch):
base_stats = {
"cur_lr": policy.cur_lr,
"policy_loss": policy.loss.pi_loss,
"policy_entropy": policy.loss.entropy,
"var_gnorm": tf.global_norm([x for x in policy.model.trainable_variables()]),
"vf_loss": policy.loss.vf_loss,
"cur_influence_reward_weight": tf.cast(
policy.cur_influence_reward_weight_tensor, tf.float32
),
SOCIAL_INFLUENCE_REWARD: train_batch[SOCIAL_INFLUENCE_REWARD],
EXTRINSIC_REWARD: train_batch[EXTRINSIC_REWARD],
"moa_loss": policy.moa_loss,
}
return base_stats


def grad_stats(policy, train_batch, grads):
return {
"grad_gnorm": tf.global_norm(grads),
"vf_explained_var": explained_variance(
train_batch[Postprocessing.VALUE_TARGETS], policy.model.value_function()
),
}


def clip_gradients(policy, optimizer, loss):
grads_and_vars = optimizer.compute_gradients(loss, policy.model.trainable_variables())
grads = [g for (g, v) in grads_and_vars]
grads, _ = tf.clip_by_global_norm(grads, policy.config["grad_clip"])
clipped_grads = list(zip(grads, policy.model.trainable_variables()))
return clipped_grads


def setup_mixins(policy, obs_space, action_space, config):
ValueNetworkMixin.__init__(policy)
LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"])
setup_moa_mixins(policy, obs_space, action_space, config)


def build_a3c_moa_trainer(moa_config):
tf.keras.backend.set_floatx("float32")
trainer_name = "MOAA3CTrainer"
moa_config["use_gae"] = False

a3c_tf_policy = build_tf_policy(
name="A3CAuxTFPolicy",
get_default_config=lambda: moa_config,
loss_fn=actor_critic_loss,
stats_fn=stats,
grad_stats_fn=grad_stats,
gradients_fn=clip_gradients,
postprocess_fn=postprocess_a3c_moa,
extra_action_fetches_fn=add_value_function_fetch,
before_loss_init=setup_mixins,
mixins=[ValueNetworkMixin, LearningRateSchedule] + get_moa_mixins(),
)

trainer = build_trainer(
name=trainer_name,
default_policy=a3c_tf_policy,
default_config=moa_config,
validate_config=validate_config,
)

return trainer
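
For reference, the objective assembled by `actor_critic_loss` above is the standard A3C loss with the MOA loss added on top. Restating the code in LaTeX, with advantages $A_t$, value targets $V^{\text{target}}_t$, and the coefficients `vf_loss_coeff` and `entropy_coeff` written as $c_{\text{vf}}$ and $c_{\text{ent}}$:

$$
\mathcal{L}_{\text{total}}
= \underbrace{-\sum_t \log \pi_\theta(a_t \mid s_t)\, A_t}_{\text{pi\_loss}}
\;+\; c_{\text{vf}} \underbrace{\tfrac{1}{2}\sum_t \big(V_\theta(s_t) - V^{\text{target}}_t\big)^2}_{\text{vf\_loss}}
\;-\; c_{\text{ent}} \underbrace{\sum_t \mathcal{H}\big[\pi_\theta(\cdot \mid s_t)\big]}_{\text{entropy}}
\;+\; \mathcal{L}_{\text{MOA}},
$$

where $\mathcal{L}_{\text{MOA}}$ is the model-of-other-agents loss returned by `setup_moa_loss` (defined in `common_funcs_moa.py`, which is not part of the excerpt shown here).
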
13 changes: 13 additions & 0 deletions algorithms/common_funcs_baseline.py
@@ -0,0 +1,13 @@
class BaselineResetConfigMixin(object):
@staticmethod
def reset_policies(policies, new_config):
for policy in policies:
policy.entropy_coeff_schedule.value = lambda _: new_config["entropy_coeff"]
policy.config["entropy_coeff"] = new_config["entropy_coeff"]
policy.lr_schedule.value = lambda _: new_config["lr"]
policy.config["lr"] = new_config["lr"]

def reset_config(self, new_config):
self.reset_policies(self.optimizer.policies.values(), new_config)
self.config = new_config
return True
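
The `reset_config` hook above is what allows `tune` to reuse trainer actors when a scheduler such as population-based training (introduced later in this PR, commit 840864f) perturbs `lr` or `entropy_coeff`. The sketch below shows one plausible way to attach the mixin and run PBT; the class name, mutation values, metric, and env name are assumptions rather than the PR's actual setup.

```python
# Hypothetical sketch of combining BaselineResetConfigMixin with PBT; the
# actual setup lives in run_scripts/train.py and may differ.
from ray import tune
from ray.rllib.agents.a3c.a3c import DEFAULT_CONFIG
from ray.tune.schedulers import PopulationBasedTraining

from algorithms.a3c_baseline import build_a3c_baseline_trainer
from algorithms.common_funcs_baseline import BaselineResetConfigMixin

config = DEFAULT_CONFIG.copy()
config["env"] = "cleanup_env"  # assumed registration name

base_trainer_cls = build_a3c_baseline_trainer(config)


class A3CBaselineTrainer(BaselineResetConfigMixin, base_trainer_cls):
    """Trainer that can absorb a new lr/entropy_coeff without being rebuilt."""


pbt = PopulationBasedTraining(
    time_attr="training_iteration",
    metric="episode_reward_mean",
    mode="max",
    perturbation_interval=10,
    hyperparam_mutations={
        "lr": [1e-3, 1e-4, 1e-5],
        "entropy_coeff": [1e-2, 1e-3, 1e-4],
    },
)

# reuse_actors=True makes tune call reset_config() on perturbation instead of
# tearing the trainer down and recreating it.
tune.run(A3CBaselineTrainer, config=config, scheduler=pbt, reuse_actors=True)
```
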