### Startup

In [39]:
import itertools

PREFIX = "Using the given Super Mario Bros. Gym environment, give a reward function that "
PREFIX_VARIANTS = ["encourages", "incentivizes", "motivates"]
permutations = []

### X Position Rewards


In [40]:
mains = [
    [
        ["go", "move", "walk", "run"],
        ["right", "forward", "to the right"]
    ],
    [
        ["travel", "reach"],
        [
            "as far as possible",
            "the end of the level",
            "the flagpole",
            "the finish",
            "the goal",
        ],
    ]
]
codes = [10, 50, 100, 200]

for w in itertools.product(PREFIX_VARIANTS, *mains[0]):
    main = f"{w[0]} the agent to {w[1]} {w[2]} by rewarding it for increasing its x-position."
    code = f"""class XReward(gym.Wrapper):
    def __init__(self, env):
        super(XReward, self).__init__(env)
        self._prev_x_pos = 0

    def step(self, action):
        state, reward, terminated, truncated, info = self.env.step(action)
        x_reward = max(info['x_pos'] - self._prev_x_pos, 0)
        self._prev_x_pos = info['x_pos']
        reward += x_reward
        return state, reward, terminated, truncated, info
    """
    permutations.append((PREFIX + main, code))

for w in itertools.product(PREFIX_VARIANTS, *mains[1], codes):
    main = f"{w[0]} the agent to {w[1]} {w[2]} by rewarding it for increasing its x-position."
    code = f"""class MaxXReward(gym.Wrapper):
    def __init__(self, env):
        super(MaxXReward, self).__init__(env)
        self._prev_x_pos = 0

    def step(self, action):
        state, reward, terminated, truncated, info = self.env.step(action)
        x_reward = max(info['x_pos'] - self._prev_x_pos, 0)
        self._prev_x_pos = info['x_pos']
        reward += x_reward
        if terminated:
            reward += info['x_pos'] / {w[3]}
        return state, reward, terminated, truncated, info
    """
    permutations.append((PREFIX + main, code))

print(len(permutations))

156


### Time Rewards and Penalties

In [41]:
mains = [["complete", "finish", "beat"], ["the level", "the game", "the stage"], ["as fast as possible", "quickly"]]
codes = [0, 1, 10, 20]

for w in itertools.product(PREFIX_VARIANTS, *mains, codes):
    main = f"{w[0]} the agent to {w[1]} {w[2]} {w[3]} by penalizing it for letting the time tick down{' and rewarding it with the time remaining if the agent completes ' + w[2] if w[4] else ''}."
    code = (
        f"""class TimeReward(gym.Wrapper):
    def __init__(self, env):
        super(TimeReward, self).__init__(env)
        self._current_time = 400

    def step(self, action):
        state, reward, terminated, truncated, info = self.env.step(action)
        time_reward = info["time"] - self._current_time
        self._current_time = info["time"]
        reward += time_reward
        if terminated:
            reward += info['time'] / {w[4]}
        return state, reward, terminated, truncated, info
    """
        if w[4] else f"""class TimePenalty(gym.Wrapper):
    def __init__(self, env):
        super(TimePenalty, self).__init__(env)
        self._current_time = 400

    def step(self, action):
        state, reward, terminated, truncated, info = self.env.step(action)
        time_penalty = info["time"] - self._current_time
        self._current_time = info["time"]
        penalty += time_penalty
        return state, reward, terminated, truncated, info
    """
    )
    permutations.append((PREFIX + main, code))

print(len(permutations))

372


### Powerup Rewards

In [42]:
mains = ["get", "grab", "use", "utilize", "obtain", "acquire", "pick up"]
codes = [[10, 20, 50, 100], [0, 1]]

for w in itertools.product(PREFIX_VARIANTS, mains, *codes):
    main = f"{w[0]} the agent to {w[1]} powerups by rewarding the agent when it powers up{' and penalizes the agent when it loses its powerup status' if w[3] else ''}."
    code = f"""class PowerupReward(gym.Wrapper):
    def __init__(self, env):
        super(PowerupReward, self).__init__(env)
        self._prev_status = "small"

    def step(self, action):
        state, reward, terminated, truncated, info = self.env.step(action)
        if self._prev_status == 'small' and info["status"] != 'small':
            reward += {w[2]}
        elif self._prev_status != 'small' and info["status"] == 'small':
            reward -= {w[2]}
        self._prev_status = info["status"]
        return state, reward, terminated, truncated, info
    """ if w[3] else f"""class PowerupReward(gym.Wrapper):
    def __init__(self, env):
        super(PowerupReward, self).__init__(env)
        self._prev_status = "small"

    def step(self, action):
        state, reward, terminated, truncated, info = self.env.step(action)
        if self._prev_status == 'small' and info["status"] != 'small':
            reward += {w[2]}
        self._prev_status = info["status"]
        return state, reward, terminated, truncated, info
    """

    permutations.append((PREFIX + main, code))

print(len(permutations))

540


### Score Rewards

In [43]:
mains = ["maximize", "accumulate", "go for the most"]
codes = [list(range(30, 51, 5)), list(range(5, 21, 5))]

for w in itertools.product(PREFIX_VARIANTS, mains, *codes):
    main = f"{w[0]} the agent to {w[1]} points by rewarding it for increasing the score."
    code = f"""class HighScoreReward(gym.Wrapper):
    def __init__(self, env):
        super(HighScoreReward, self).__init__(env)
        self._current_score = 0

    def step(self, action):
        state, reward, terminated, truncated, info = self.env.step(action)
        reward += (info['score'] - self._current_score) / {w[2]}
        self._current_score = info['score']
        return state, reward / {w[3]}, terminated, truncated, info
    """
    permutations.append((PREFIX + main, code))

print(len(permutations))

720


### Coin Rewards and Penalties

In [44]:
mains = [
        ("collect", "ignore"),
        ("accumulate", "avoid"),
        ("get", "avoid getting"),
        ("grab", "dodge"),
        ("prioritize", "skip"),
    ]
codes = list(range(20, 0, 5)) + [-1, 1] + list(range(5, 21, 5))

for w in itertools.product(PREFIX_VARIANTS, mains, codes):
    main = f"{w[0]} the agent to {w[1][0 if w[2] > 0 else 1]} coins by {'rewarding' if w[2] > 0 else 'penalizing'} it for increasing the coin count."
    code = f"""class CoinReward(gym.Wrapper):
    def __init__(self, env):
        super(CoinReward, self).__init__(env)
        self._current_coins = 0

    def step(self, action):
        state, reward, terminated, truncated, info = self.env.step(action)
        coin_reward = info["coins"] - self._current_coins
        self._current_coins = info["coins"]
        reward += coin_reward * {w[2]}
        return state, reward, terminated, truncated, info
    """
    permutations.append((PREFIX + main, code))

print(len(permutations))

810


### Jump Rewards and Penalties

In [45]:
mains = [
        ("higher", "lower"),
        ("more", "less"),
        ("as much as possible", "as little as possible"),
        ("constantly", "almost never"),
    ]
codes = list(range(-5, 0)) + list(range(1, 6))

for w in itertools.product(PREFIX_VARIANTS, mains, codes):
    main = f"{w[0]} the agent to jump {w[1][0 if w[2] > 0 else 1]} by {'rewarding' if w[2] > 0 else 'penalizing'} it for increasing its y-position."
    code = f"""class JumpReward(gym.Wrapper):
    def __init__(self, env):
        super(JumpReward, self).__init__(env)
        self._prev_y_pos = 0

    def step(self, action):
        state, reward, terminated, truncated, info = self.env.step(action)
        y_reward = max({min(0, w[2])}, min(info['y_pos'] - self._prev_y_pos, {max(0, w[2])}))
        self._prev_y_pos = info['y_pos']
        return state, reward, terminated, truncated, info
    """
    permutations.append((PREFIX + main, code))

print(len(permutations))

930


### Exploration Rewards

In [46]:
mains = [
    ["explore", "discover", "find"],
    ["areas", "locations", "places"],
]
codes = list(range(1, 6))

for w in itertools.product(PREFIX_VARIANTS, *mains, codes):
    main = f"{w[0]} the agent to {w[1]} new {w[2]} by rewarding it for visiting new (x, y) positions in the level."
    code = f"""class ExplorationReward(gym.Wrapper):
    def __init__(self, env):
        super(ExplorationReward, self).__init__(env)
        self._visited = set()

    def step(self, action):
        state, reward, terminated, truncated, info = self.env.step(action)
        x_pos, y_pos = info['x_pos'], info['y_pos']
        pos = (x_pos, y_pos)
        if pos not in self._visited:
            self._visited.add(pos)
            reward += {w[3]}
        return state, reward, terminated, truncated, info
    """
    permutations.append((PREFIX + main, code))

print(len(permutations))

1065


### Save to file

In [47]:
import pandas as pd
df = pd.DataFrame(permutations, columns=["Instruction", "Code"])
df.to_csv("dataset.csv", index=False)