In [1]:
import os

import gym
from gym import Wrapper
import crafter
import numpy as np
import pandas as pd

from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import SubprocVecEnv, VecMonitor
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.results_plotter import *

In [2]:
class AchievementInfoWrapper(gym.Wrapper):
    """Wrapper that enriches the step ``info`` dict with achievement summaries.

    After forwarding the action to the wrapped environment, ``step`` adds:

    * ``info['achievement_count']`` - how many entries of
      ``info['achievements']`` have a value greater than zero,
    * ``info['achievement_frac']`` - that count divided by the number of
      entries in ``info['achievements']``,
    * one top-level ``info[name]`` entry per achievement, holding the same
      value as ``info['achievements'][name]``.

    Observation, reward and done flag are passed through untouched.
    """

    def step(self, action):
        """Step the wrapped env and return ``(obs, reward, done, info)``."""
        obs, rew, finished, info = self.env.step(action)

        per_achievement = info['achievements']
        values = np.array(list(per_achievement.values()))

        info['achievement_count'] = (values > 0).sum()
        info['achievement_frac'] = info['achievement_count'] / len(per_achievement)

        # Flatten the nested achievements mapping into the top level of `info`.
        for name, value in per_achievement.items():
            info[name] = value

        return obs, rew, finished, info

# Directory handed to VecMonitor in make_env() and used as the default
# log_dir of TensorboardCallback, which reads the results back from it.
LOG_DIR = 'tmp/'
# Create the directory up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(LOG_DIR, exist_ok=True)

def make_env(env_name='CrafterReward-v1', data_dir='data/', n_envs=8):
    """Build a monitored, multi-process vectorized environment.

    Each of the ``n_envs`` worker processes constructs its *own* environment
    through a factory, rather than every factory returning one shared,
    already-stepped instance created in the parent process.

    Args:
        env_name: Gym environment id passed to ``gym.make``.
        data_dir: Output directory for the optional ``crafter.Recorder``
            wrapper. The recorder is currently disabled, so this argument is
            unused; it is kept so existing callers keep working.
        n_envs: Number of parallel environments (one subprocess each).

    Returns:
        A ``VecMonitor`` wrapping a ``SubprocVecEnv``. The monitor writes to
        ``LOG_DIR`` and records ``achievement_count``, ``achievement_frac``
        and one column per key of ``info['achievements']``.
    """
    def _make_single_env():
        # NOTE(review): presumably importing `crafter` is what makes the
        # Crafter env ids known to gym; importing it here ensures that also
        # happens inside freshly started worker processes - confirm.
        import crafter  # noqa: F401

        single_env = AchievementInfoWrapper(gym.make(env_name))
        # Optional recording, currently disabled:
        # single_env = crafter.Recorder(
        #     single_env, data_dir,
        #     save_stats=True,
        #     save_episode=False,
        #     save_video=False,
        # )
        return single_env

    # Throw-away environment used only to discover the achievement names
    # that should become monitor columns.
    probe_env = _make_single_env()
    try:
        probe_env.reset()
        info_sample = probe_env.step(0)[3]
    finally:
        probe_env.close()

    info_keywords = ('achievement_count', 'achievement_frac') + \
        tuple(info_sample['achievements'].keys())

    # One independent factory call per worker.
    vec_env = SubprocVecEnv([_make_single_env for _ in range(n_envs)])
    return VecMonitor(vec_env, LOG_DIR, info_keywords=info_keywords)

class TensorboardCallback(BaseCallback):
    """Log achievement statistics of newly finished episodes after each rollout.

    At the end of every rollout the monitor results in ``log_dir`` are
    reloaded, the rows that were not seen before are selected, and one scalar
    per remaining column is recorded on the SB3 logger under
    ``achievement/<column>``:

    * columns whose name contains ``'achievement'`` -> mean of the values,
    * every other column -> fraction of rows whose value is greater than zero.

    Args:
        log_dir: Directory containing the monitor file(s) to read.
        verbose: Verbosity level forwarded to ``BaseCallback``.
    """

    def __init__(self, log_dir=LOG_DIR, verbose=0):
        super().__init__(verbose)
        self.log_dir = log_dir
        # Number of monitor rows already consumed by previous rollouts.
        self.df_idx = 0

    def _on_step(self) -> bool:
        # Nothing to do per step; returning True lets training continue.
        return True

    def _on_rollout_end(self):
        results = load_results(self.log_dir)
        n_rows = results.shape[0]

        # Keep only rows added since the last rollout and drop the columns
        # that are not achievement statistics. `drop` returns a new frame,
        # so the slice is never mutated in place.
        new_episodes = results.iloc[self.df_idx:n_rows].drop(
            columns=['index', 'r', 't', 'l'])
        self.df_idx = n_rows

        # No episode finished during this rollout: averaging an empty slice
        # would only produce NaN (and a RuntimeWarning), so log nothing.
        if new_episodes.empty:
            return

        for key in new_episodes.columns:
            values = new_episodes[key].values
            if 'achievement' in key:
                val = np.mean(values)
            else:
                val = np.mean(values > 0)
            self.logger.record('achievement/' + key, val)



In [3]:
# Run configuration shared by the training cell below.
n_steps = 200000  # total environment steps to train for (2e5)
env_name = 'CrafterReward-v1'  # gym id of the environment to train on

In [4]:
# if __name__ == '__main__':
#   env = make_env(env_name)
# obs = env.reset()
# obs, reward, done, info = env.step(env.action_space.sample())
# info

In [169]:
# Train a PPO baseline on the configured environment and save the result.
# `env_name` is passed explicitly so the environment that is trained on always
# matches the name used in the saved model path.
env = make_env(env_name)
model = PPO('CnnPolicy', env, verbose=1, tensorboard_log='./logs/')
# TensorboardCallback records the achievement statistics after every rollout.
model.learn(total_timesteps=n_steps, callback=TensorboardCallback())
model.save('models/ppo_baseline_' + env_name)

Using cuda device
Wrapping the env in a VecTransposeImage.
Logging to ./logs/PPO_3
-------------------------------------
| achievement/          |           |
|    achievement_count  | 2.43      |
|    achievement_frac   | 0.111     |
|    collect_coal       | 0         |
|    collect_diamond    | 0         |
|    collect_drink      | 0.261     |
|    collect_iron       | 0         |
|    collect_sapling    | 0.565     |
|    collect_stone      | 0         |
|    collect_wood       | 0.217     |
|    defeat_skeleton    | 0         |
|    defeat_zombie      | 0         |
|    eat_cow            | 0         |
|    eat_plant          | 0         |
|    make_iron_pickaxe  | 0         |
|    make_iron_sword    | 0         |
|    make_stone_pickaxe | 0         |
|    make_stone_sword   | 0         |
|    make_wood_pickaxe  | 0         |
|    make_wood_sword    | 0         |
|    place_furnace      | 0         |
|    place_plant        | 0.435     |
|    place_stone        | 0         |
|    

In [None]:

# # Load the trained agent
# # NOTE: if you have loading issue, you can pass `print_system_info=True`
# # to compare the system on which the model was trained vs the current one
# # model = DQN.load("dqn_lunar", env=env, print_system_info=True)
# model = DQN.load("dqn_lunar", env=env)

# # Evaluate the agent
# # NOTE: If you use wrappers with your environment that modify rewards,
# #       this will be reflected here. To evaluate with original rewards,
# #       wrap environment in a "Monitor" wrapper before other wrappers.
# mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)

# # Enjoy trained agent
# obs = env.reset()
# for i in range(1000):
#     action, _states = model.predict(obs, deterministic=True)
#     obs, rewards, dones, info = env.step(action)
#     env.render()