In [1]:
import copy
import pathlib
import os.path
import json
import hashlib
import itertools
import collections
from typing import Any, Sequence, Mapping, Set

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf

2024-08-08 12:40:42.277525: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-08-08 12:40:42.352900: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-08-08 12:40:42.355094: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
import matplotlib as mpl
from matplotlib import pyplot as plt
import seaborn as sns

In [4]:
from rlplg import envsuite, core
from daaf import estimator_metrics

In [5]:
ENVS_MAPPING = {
    (
        "IceWorld",
        "4KE3ASUFQGGUPERSDDRQAZAMA46CI2CMCJHGWJ7MRNI64JMEBETNDXFFPYWTQJF46S5BJ4NXXCHNMJSLII3ROYXI76DFOC3VAABGNVA=",
    ): {"args": '{"map_name": "4x4"}', "name": "4x4"},
    ("ABCSeq", "2"): {
        "args": '{"length": 3, "distance_penalty": false}',
        "name": "n=3",
    },    
    ("ABCSeq", "3"): {
        "args": '{"length": 3, "distance_penalty": false}',
        "name": "n=3",
    },    
    ("ABCSeq", "10"): {
        "args": '{"length": 10, "distance_penalty": false}',
        "name": "n=10",
    },
    (
        "RedGreenSeq",
        "NNLHYJFTC5ENMMDZWRNQ37B6VVDXQ7WHB5EJOPXYZFLMJEZOYLTSLB4ID4WHQG57XQPNUHGZCFDCWHYGXWSBW7FBWYRZGAGBW4J7MEQ=",
    ): {
        "args": '{"cure": ["red", "green", "wait", "green", "red", "red", "green", "wait"]}',
        "name": "n=9",
    },
    (
        "FrozenLake-v1",
        "U75ZLQLLXYRFQE5KOJJGNVQZGQ65U5RVVN3ZV5F4UNYQVK6NGTAAU62O2DKMOEGACNNUQOSWGYYOV7LQHK7GAWG2CL3U3RZJFIEIB5I=",
    ): {"args": '{"is_slippery": false, "map_name": "4x4"}', "name": "4x4"},
    ("TowerOfHanoi", "4"): {"args": '{"num_disks": 4}', "name": "disks=4"},
    ("ABCSeq", "7"): {
        "args": '{"length": 7, "distance_penalty": false}',
        "name": "n=7",
    },
    (
        "IceWorld",
        "JKNDNWGM45FELU53ZLLVJEPY2SFZBCX54PSACOQOFMTDUAK5VNQ4KE45QZINGYFU5GR6D7F3GJMW7EC4TAY5PHCYRN5GPGP7YNACHEI=",
    ): {"args": '{"map_name": "8x8"}', "name": "8x8"},
    (
        "GridWorld",
        "P3VJZBIJ7PNUOFG2SCF532NH5AQ6NOBZEZ6UZNZ7D3AU3GQZSLKURMS2SRPEUF6O65F3ETJXEFNTR3UYS73TUCIIU3YIONXHAR6WE5A=",
    ): {
        "args": '{"grid": "oooooooooooo\\noooooooooooo\\noooooooooooo\\nsxxxxxxxxxxg"}',
        "name": "4x12",
    },
}

In [6]:
EXCLUDED_ENVS = set(["FrozenLake-v1"])

## Load Data

In [7]:
PATH = str(pathlib.Path.home() / "fs/daaf/exp/reward-recovery/1723120236-report.json")

In [8]:
def read_data(path: str) -> pd.DataFrame:
    return pd.read_json(path)

In [9]:
df_raw = read_data(PATH)

In [10]:
df_raw

Unnamed: 0,spec,method,output
0,"{'name': 'ABCSeq', 'args': {'length': 3, 'dist...",factor-ts,"{'least': [[-1.1666666667, -0.75, -1.083333333..."
1,"{'name': 'ABCSeq', 'args': {'length': 2, 'dist...",factor-ts,"{'least': [[-1.0, -1.0], [-1.0, -1.0]], 'ols-e..."
2,"{'name': 'RedGreenSeq', 'args': {'cure': ['red...",factor-ts,"{'least': [[-1.0, -1.0, -1.0], [-1.0, -1.0, -1..."
3,"{'name': 'RedGreenSeq', 'args': {'cure': ['red...",plain,"{'least': [[-1.0, -1.0, -1.0], [-1.0, -1.0, -1..."
4,"{'name': 'ABCSeq', 'args': {'length': 3, 'dist...",plain,"{'least': [[-1.0, -1.0, -1.0], [-1.0, -1.0, -1..."
5,"{'name': 'ABCSeq', 'args': {'length': 2, 'dist...",plain,"{'least': [[-1.0, -1.0], [-1.0, -1.0], [0.0, 0..."


In [11]:
df_raw.iloc[0]["spec"]

{'name': 'ABCSeq', 'args': {'length': 3, 'distance_penalty': False}}

In [12]:
def get_env_level(env_spec):
    loaded_env_spec = envsuite.load(env_spec["name"], **env_spec["args"])
    return loaded_env_spec.level

In [13]:
get_env_level(df_raw.iloc[0]["spec"])

'3'

In [14]:
def proc_data(df_raw: pd.DataFrame) -> pd.DataFrame:
    rows = []
    for row in df_raw.to_dict("records"):
        new_row = copy.deepcopy(row)
        
        # add env id;
        new_row["env_name"] = new_row["spec"]["name"]
        new_row["env_level"] = get_env_level(new_row["spec"])
        eid = (new_row["spec"]["name"], new_row["env_level"])
        new_row["env_label"] = ENVS_MAPPING[eid]["name"]
        rows.append(new_row)
    return pd.DataFrame(rows)

In [15]:
df_results = proc_data(df_raw)

In [16]:
df_results

Unnamed: 0,spec,method,output,env_name,env_level,env_label
0,"{'name': 'ABCSeq', 'args': {'length': 3, 'dist...",factor-ts,"{'least': [[-1.1666666667, -0.75, -1.083333333...",ABCSeq,3,n=3
1,"{'name': 'ABCSeq', 'args': {'length': 2, 'dist...",factor-ts,"{'least': [[-1.0, -1.0], [-1.0, -1.0]], 'ols-e...",ABCSeq,2,n=3
2,"{'name': 'RedGreenSeq', 'args': {'cure': ['red...",factor-ts,"{'least': [[-1.0, -1.0, -1.0], [-1.0, -1.0, -1...",RedGreenSeq,NNLHYJFTC5ENMMDZWRNQ37B6VVDXQ7WHB5EJOPXYZFLMJE...,n=9
3,"{'name': 'RedGreenSeq', 'args': {'cure': ['red...",plain,"{'least': [[-1.0, -1.0, -1.0], [-1.0, -1.0, -1...",RedGreenSeq,NNLHYJFTC5ENMMDZWRNQ37B6VVDXQ7WHB5EJOPXYZFLMJE...,n=9
4,"{'name': 'ABCSeq', 'args': {'length': 3, 'dist...",plain,"{'least': [[-1.0, -1.0, -1.0], [-1.0, -1.0, -1...",ABCSeq,3,n=3
5,"{'name': 'ABCSeq', 'args': {'length': 2, 'dist...",plain,"{'least': [[-1.0, -1.0], [-1.0, -1.0], [0.0, 0...",ABCSeq,2,n=3


In [18]:
df_results.iloc[0]["method"], df_results.iloc[0]["output"]["matrix"], df_results.iloc[0]["output"]["rhs"], df_results.iloc[0]["output"]["least"]

('factor-ts',
 [[1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
  [1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0],
  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 3.0, 0.0],
  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 1.0],
  [1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0],
  [1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0],
  [1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0],
  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4.0, 0.0],
  [0.0, 1.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]],
 [-4.0, -4.0, -4.0, -3.0, -4.0, -4.0, -4.0, -4.0, -4.0],
 [[-1.1666666667, -0.75, -1.0833333333],
  [-1.0, -0.75, -1.0833333333],
  [-1.0, -1.0, -1.0]])

The factor-ts matrix above excludes terminal states.

In [19]:
df_results.iloc[4]["method"], df_results.iloc[4]["output"]["matrix"], df_results.iloc[4]["output"]["rhs"], df_results.iloc[4]["output"]["least"]

('plain',
 [[1.0, 2.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
  [0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0],
  [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
  [0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0],
  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 1.0, 0.0, 0.0, 0.0],
  [0.0, 1.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
  [1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0],
  [1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0],
  [0.0, 0.0, 0.0, 2.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
  [1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0]],
 [-4.0, -4.0, -4.0, -4.0, -4.0, -4.0, -4.0, -3.0, -4.0, -3.0, -4.0, -3.0],
 [[-1.0, -1.0, -1.0],
  [-1.0, -1.0, -1.0],
  [-1.0, -1.0, -1.0],
  [3.7021965659999997e-16, 2.908664673e-16, -0.0]])

The `factor-ts` estimate above is an example of cases where reward recovery has multiple possible solutions. Without any compensating strategies, the rewards learned in such cases can be incorrect.

The `plain` estimate has correct values for most state-action pairs - this is because visitations to the terminal state anchor the values of non-terminal states visited before it.
Still, there are some minor errors in the values estimates of rewards for the terminal states, which can be manually corrected since we know them.