```
Copyright 2022 DeepMind Technologies Limited.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
```

# Melting Pot Evaluation Results

This Colab plots results of the MAPLA evaluations outlined in the [Melting Pot 2.0 Tech Report](https://arxiv.org/abs/2211.13746).

1.  Click "Connect" in the top right corner.
2.  Select "Runtime -> Run all".

In [None]:
# @title Installs

# Third-party packages used by this notebook. Each is installed with its own
# %pip magic; --quiet keeps pip's output short.
%pip install --quiet colabtools
%pip install --quiet matplotlib
%pip install --quiet numpy
%pip install --quiet pandas
%pip install --quiet seaborn

In [None]:
# @title Imports

import dataclasses
import re
import sys
from unittest import mock
import urllib

import IPython
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from google.colab import widgets

In [None]:
# @title Setup

def no_vertical_scrollbar():
  """Asks Colab to size the output frame to its content.

  Emits a Javascript snippet that calls google.colab.output.setIframeHeight
  with a maximum height of 9999, so that output is not shown inside a nested
  scrolling area.
  """
  resize_output = IPython.display.Javascript(
      'google.colab.output.setIframeHeight(0, true, {interactive: true, maxHeight: 9999})'
  )
  display(resize_output)


# No vertical scrollbars: run no_vertical_scrollbar on every 'pre_run_cell'
# event, i.e. before each cell is executed.
get_ipython().events.register('pre_run_cell', no_vertical_scrollbar)

# Allow higher resolution plots by requesting the 'retina' figure format.
IPython.display.set_matplotlib_formats('retina')

In [None]:
# @title Utilities

def display(dataframe):
  """Shows a dataframe in full, without truncating rows, columns or cells.

  The pandas display limits are lifted only for the duration of this call.

  Args:
    dataframe: the dataframe to render.
  """
  unlimited = (
      'display.max_rows', None,
      'display.max_columns', None,
      'display.max_colwidth', None,
  )
  with pd.option_context(*unlimited):
    IPython.display.display(dataframe)


def _heatmap(data, **kwargs):
  """Draws an annotated seaborn heatmap with tilted x tick labels.

  Unless overridden through kwargs, the colorbar is hidden, cells are
  separated by a line of width 1, every cell is annotated, and the annotation
  number format is chosen from the largest absolute (non-NaN) value in `data`.

  Args:
    data: values to plot.
    **kwargs: forwarded to sns.heatmap; explicit values win over the defaults
      set here.

  Returns:
    The axes of the heatmap.
  """
  peak = np.nanmax(np.abs(data))

  for option, value in (('cbar', False), ('linewidth', 1), ('annot', True)):
    kwargs.setdefault(option, value)

  # Larger magnitudes get fewer decimals; the for/else falls through to two
  # decimals when no threshold is reached (including a NaN peak).
  for threshold, number_format in ((10000, '.1g'), (100, '.0f'), (10, '.1f')):
    if peak >= threshold:
      break
  else:
    number_format = '.2f'
  kwargs.setdefault('fmt', number_format)

  ax = sns.heatmap(data, **kwargs)

  # Hide all tick marks; only the labels are kept.
  plt.tick_params(
      which='both', left=False, right=False, bottom=False, top=False)

  # label1 / label2 are the two label slots of each x tick; tilt both, each
  # anchored on the side facing the plot.
  for label_slot, alignment in (('label1', 'right'), ('label2', 'left')):
    tick_labels = [
        getattr(tick, label_slot) for tick in ax.xaxis.get_major_ticks()
    ]
    plt.setp(tick_labels,
             rotation=45,
             ha=alignment,
             va='center',
             rotation_mode='anchor')
  return ax


def heatmap(data, left=None, right=None, top=None, bottom=None, **kwargs):
  """Plots `data` as a heatmap, optionally framed by margin tables.

  Each non-empty margin is attached to the corresponding side of `data`,
  separated from it by one blank (NaN) row or column. Corners where a
  left/right margin meets a top/bottom margin are filled with NaN. Margins
  that are None or empty are skipped.

  Args:
    data: dataframe to plot; its index and columns supply the tick labels.
    left: optional dataframe placed left of `data`. Expected to have as many
      rows as `data`.
    right: optional dataframe placed right of `data`. Expected to have as many
      rows as `data`.
    top: optional dataframe placed above `data`. Expected to have as many
      columns as `data`.
    bottom: optional dataframe placed below `data`. Expected to have as many
      columns as `data`.
    **kwargs: forwarded to _heatmap. vmin, vmax and cmap default to 0, 1 and
      'coolwarm'.

  Returns:
    The axes of the heatmap.
  """
  row_labels = list(data.index)
  col_labels = list(data.columns)
  values = data.to_numpy()

  # Rows contributed above/below the main table (margin plus separator). They
  # must be defined even when a margin is skipped, because the left/right
  # margins are padded by these amounts.
  top_rows = 0
  bottom_rows = 0

  if top is not None and not top.empty:
    row_labels = list(top.index) + [''] + row_labels
    values = np.vstack([
        top.to_numpy(),
        np.full((1, top.shape[1]), np.nan),
        values,
    ])
    top_rows = top.shape[0] + 1

  if bottom is not None and not bottom.empty:
    row_labels = row_labels + [''] + list(bottom.index)
    values = np.vstack([
        values,
        np.full((1, bottom.shape[1]), np.nan),
        bottom.to_numpy(),
    ])
    bottom_rows = bottom.shape[0] + 1

  if left is not None and not left.empty:
    col_labels = list(left.columns) + [''] + col_labels
    values = np.hstack([
        np.vstack([
            np.full((top_rows, left.shape[1]), np.nan),
            left.to_numpy(),
            np.full((bottom_rows, left.shape[1]), np.nan),
        ]),
        np.full((values.shape[0], 1), np.nan),
        values,
    ])

  # The emptiness check must look at `right` itself, not at `top`.
  if right is not None and not right.empty:
    col_labels = col_labels + [''] + list(right.columns)
    values = np.hstack([
        values,
        np.full((values.shape[0], 1), np.nan),
        np.vstack([
            np.full((top_rows, right.shape[1]), np.nan),
            right.to_numpy(),
            np.full((bottom_rows, right.shape[1]), np.nan),
        ]),
    ])

  with plt.rc_context({
      'font.size': 15,
      'xtick.labeltop': True,
      'xtick.labelbottom': True,
      'ytick.labelleft': True,
      'ytick.labelright': True,
  }):
    # One unit of width per column and 0.6 units of height per row.
    plt.figure(figsize=(values.shape[1], values.shape[0] * 0.6))
    kwargs.setdefault('vmin', 0)
    kwargs.setdefault('vmax', 1)
    kwargs.setdefault('cmap', 'coolwarm')
    return _heatmap(
        data=values,
        xticklabels=col_labels,
        yticklabels=row_labels,
        **kwargs,
    )

In [None]:
# @title Load results
path = 'https://storage.googleapis.com/dm-meltingpot/meltingpot-results-2.1.0.feather'  # @param {type: 'string'}
results = pd.read_feather(path)
# Drop every column whose name is one of the substrate names; errors='ignore'
# skips substrate names that are not columns.
scenario_results = results.drop(
    labels=set(results.substrate.unique()),
    axis=1,
    errors='ignore')

## Calculate scores

In [None]:
# @title Aggregate focal_per_capita_return over episodes

def summarize(results):
  """Aggregates focal_per_capita_return within each training run.

  Args:
    results: dataframe with 'scenario', 'substrate', 'mapla', 'training_run'
      and 'focal_per_capita_return' columns.

  Returns:
    A dataframe indexed by (scenario, substrate, mapla, training_run) with
    'count', 'mean', 'std' and 'sem' columns describing
    focal_per_capita_return for each group.
  """
  # Group the dataframe that was passed in, not a notebook-level global.
  grouped = results.groupby(
      ['scenario', 'substrate', 'mapla', 'training_run'])
  return grouped.focal_per_capita_return.agg(['count', 'mean', 'std', 'sem'])


# Summary statistics of focal_per_capita_return for every
# (scenario, substrate, mapla, training_run); the bare name on the last line
# makes the notebook show the table.
performance_per_run = summarize(scenario_results)
performance_per_run

In [None]:
# @title Normalize focal_per_capita_return statistics

def normalize(performance_per_run):
  """Min-max normalizes the per-run statistics within each scenario.

  For every (scenario, substrate) row, the smallest and largest 'mean' across
  all mapla and training runs define the range. 'mean' is shifted and scaled
  by that range, 'std' and 'sem' are only scaled, and other columns (such as
  'count') are left as they are.

  Args:
    performance_per_run: dataframe indexed by
      (scenario, substrate, mapla, training_run) with 'mean', 'std' and 'sem'
      columns.

  Returns:
    The normalized dataframe, with the same index levels, sorted by index.
  """
  wide = performance_per_run.unstack(['mapla', 'training_run'])
  means = wide['mean']

  # The small offset keeps the range strictly positive even when every mean
  # in a row is identical.
  floor = means.min(axis=1) - 1e-8
  span = means.max(axis=1) - floor

  rescaled = {
      'mean': means.sub(floor, axis=0).div(span, axis=0),
      'std': wide['std'].div(span, axis=0),
      'sem': wide['sem'].div(span, axis=0),
  }
  long = wide.assign(**rescaled).stack(['mapla', 'training_run'])
  return long.sort_index()


# Per-run statistics rescaled per scenario by normalize(); the bare name on
# the last line makes the notebook show the table.
scenario_scores_per_run = normalize(performance_per_run)
scenario_scores_per_run

In [None]:
# @title Make assumptions about missing prosocial runs

# Tell the reader why prosocial scores for the collective-return substrates
# are copied from the non-prosocial runs rather than measured.
print("""
NOTE: For the collective-return substrates, the prosocial MAPLA receive rewards 
identical to those received by a non-prosocial variants (except for a scale
factor). Thus, for these substrates, the prosocial MAPLA is identical to the
non-prosocial variant, and we expect they would therefore achieved the same
performance. We therefore copy the non-prosocial scores for this situation.
""")

# Substrates for which prosocial scores are copied from the non-prosocial runs.
_COLLECTIVE_RETURN_SUBSTATES = frozenset([
    'collaborative_cooking__asymmetric',
    'collaborative_cooking__circuit',
    'collaborative_cooking__cramped',
    'collaborative_cooking__crowded',
    'collaborative_cooking__figure_eight',
    'collaborative_cooking__forced',
    'collaborative_cooking__ring',
])


def _add_prosocial_performance(scenario_scores_per_run):
  """Appends '<mapla>_prosocial' copies of the 'acb' and 'opre' rows.

  Only rows whose substrate is in _COLLECTIVE_RETURN_SUBSTATES are copied;
  the copies differ from their source rows only in the mapla name.

  Args:
    scenario_scores_per_run: dataframe indexed by
      (scenario, substrate, mapla, training_run).

  Returns:
    The input rows plus the copied rows, sorted by index.
  """
  flat = scenario_scores_per_run.reset_index()
  wanted = (
      flat.substrate.isin(_COLLECTIVE_RETURN_SUBSTATES)
      & flat.mapla.isin(['acb', 'opre'])
  )
  copies = flat[wanted]
  renamed = copies.mapla.map(lambda name: name + '_prosocial')
  copies = copies.assign(mapla=renamed)
  copies = copies.set_index(['scenario', 'substrate', 'mapla', 'training_run'])
  combined = pd.concat([scenario_scores_per_run, copies])
  return combined.sort_index()


# Extend the scores with the copied prosocial rows; the bare name on the last
# line makes the notebook show the table.
scenario_scores_per_run = _add_prosocial_performance(scenario_scores_per_run)
scenario_scores_per_run

In [None]:
# @title Aggregate per-scenario scores over training runs

# Take the per-run normalized 'mean' score and summarize it across training
# runs for every (scenario, substrate, mapla) combination.
scores_per_scenario = scenario_scores_per_run['mean']
scores_per_scenario = scores_per_scenario.groupby(['scenario', 'substrate', 'mapla']).agg(['count', 'mean', 'std', 'sem'])
scores_per_scenario

In [None]:
# @title Count number of scenarios for each substrate

# Keep 'substrate' as the only index level, then count the distinct scenario
# names found under each substrate.
scenarios_per_substrate = scores_per_scenario.reset_index(['mapla', 'scenario']).scenario
scenarios_per_substrate = scenarios_per_substrate.groupby(level='substrate').nunique()
scenarios_per_substrate

In [None]:
# @title Aggregate per-substrate scores over training runs

# Per training run, average the scenario scores within each substrate: spread
# the scenarios across columns, sum each row, and divide by the number of
# scenarios of that row's substrate.
# NOTE(review): the divisor is the substrate's full scenario count, so a run
# with a missing scenario score is effectively given 0 for it - confirm this
# is intended.
scores_per_substrate = scenario_scores_per_run['mean']
scores_per_substrate = scores_per_substrate.unstack(['scenario'])
scores_per_substrate = scores_per_substrate.sum(axis=1).div(scenarios_per_substrate)
# Summarize across training runs for every (substrate, mapla) combination.
scores_per_substrate = scores_per_substrate.groupby(['substrate', 'mapla']).agg(['count', 'mean', 'std', 'sem'])
scores_per_substrate

In [None]:
# @title Aggregate overall scores over training runs

# Step 1: per training run, average the scenario scores within each substrate
# (row sum over scenarios divided by the substrate's scenario count).
overall_scores = scenario_scores_per_run['mean']
overall_scores = overall_scores.unstack(['scenario'])
overall_scores = overall_scores.sum(axis=1).div(scenarios_per_substrate)
# Step 2: average those substrate scores across substrates (row sum over
# substrates divided by the number of substrates).
overall_scores = overall_scores.unstack(['substrate'])
overall_scores = overall_scores.sum(axis=1).div(len(scenarios_per_substrate))
# Step 3: summarize across training runs for every mapla.
overall_scores = overall_scores.groupby(['mapla']).agg(['count', 'mean', 'std', 'sem'])
overall_scores

## Plot results

In [None]:
# @title Plot scores

def _plot_scores():
  """Renders the score heatmaps inside a tabbed Colab widget.

  Reads the notebook-level overall_scores, scores_per_substrate and
  scores_per_scenario tables. Columns (one per mapla) are ordered by
  descending overall score.

  The 'summary' tab shows the per-substrate scores under an 'overall score'
  row. The 'breakdown' tab holds one sub-tab per substrate, showing that
  substrate's per-scenario scores under the overall and per-substrate rows.
  """
  overall = overall_scores['mean'].sort_values(ascending=False)
  ranked_maplas = overall.index

  def _by_mapla(scores):
    # One column per mapla, ordered from best to worst overall score.
    return scores['mean'].unstack('mapla').reindex(columns=ranked_maplas)

  substrate_table = _by_mapla(scores_per_substrate)
  scenario_table = _by_mapla(scores_per_scenario)

  tabs = widgets.TabBar(['summary', 'breakdown'])

  with tabs.output_to('summary'):
    summary_header = pd.DataFrame.from_dict(
        {'overall score': overall}, orient='index')
    heatmap(substrate_table, top=summary_header)

  with tabs.output_to('breakdown', select=False):
    substrate_names = list(scenario_table.index.unique('substrate'))
    subtabs = widgets.TabBar(substrate_names)
    for name, rows in scenario_table.groupby(level='substrate'):
      with subtabs.output_to(name, select=False):
        header = pd.DataFrame.from_dict({
            'all substrates': overall,
            name: substrate_table.loc[name]
        }, orient='index')
        heatmap(rows.droplevel('substrate'), top=header)


# Build and show the tabbed score heatmaps.
_plot_scores()