##Dependencies

In [1]:
! pip3 install gym
! pip3 install torch
! pip install pygame
! pip install moviepy==1.0.3
! pip install numpy==1.23.5
! pip install imageio_ffmpeg
! pip install pyvirtualdisplay
! pip install crafter
! apt-get install -y xvfb python3-opengl ffmpeg
! unzip dataset.zip

Collecting pyvirtualdisplay
  Downloading PyVirtualDisplay-3.0-py3-none-any.whl.metadata (943 bytes)
Downloading PyVirtualDisplay-3.0-py3-none-any.whl (15 kB)
Installing collected packages: pyvirtualdisplay
Successfully installed pyvirtualdisplay-3.0
Collecting crafter
  Downloading crafter-1.8.3.tar.gz (107 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.6/107.6 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting opensimplex (from crafter)
  Downloading opensimplex-0.4.5.1-py3-none-any.whl.metadata (10 kB)
Collecting ruamel.yaml (from crafter)
  Downloading ruamel.yaml-0.18.10-py3-none-any.whl.metadata (23 kB)
Collecting ruamel.yaml.clib>=0.2.7 (from ruamel.yaml->crafter)
  Downloading ruamel.yaml.clib-0.2.12-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.7 kB)
Downloading opensimplex-0.4.5.1-py3-none-any.whl (267 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m 

##Imports and Displaing Video

In [6]:
import torch
from torch import nn
import numpy as np
from torch.utils.data import Dataset, DataLoader

import random
from tqdm import tqdm

import gym
import crafter

import argparse

import glob
import io
import base64
from IPython.display import HTML
from IPython import display as ipythondisplay

import imageio

In [7]:
def show_video():
  mp4list = glob.glob('./*.mp4')
  if len(mp4list) > 0:
    mp4 = mp4list[0]
    video = io.open(mp4, 'r+b').read()
    encoded = base64.b64encode(video)
    ipythondisplay.display(HTML(data='''<video alt="test" autoplay
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
  else:
    print("Could not find video")

##Model Definition

In [8]:
class Agent(nn.Module):

    def __init__(self, action_size, hidden_size=64, input_size=(3, 64, 64)):
        super().__init__()

        #network for extracting features from the input image of the game
        #(try's to learn internal state of game from looking at image)
        self.convolutional_sequence = nn.Sequential(
            nn.Conv2d(3, 8, kernel_size=3, stride=4),
            nn.ReLU(),
            nn.Conv2d(8, 16, kernel_size=3, stride=2),
            nn.ReLU(),
            nn.Conv2d(16, 32, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.Flatten(),
        )

        #pass dummy input through the convolutional layer to get out the input to the value and action networks
        with torch.no_grad():
            dummy_input = torch.zeros(1, *input_size)  # input_size = (C, H, W)
            conv_out = self.convolutional_sequence(dummy_input)
            flattened_size = conv_out.shape[1]


        #network for training on predicted game state to make an action
        self.action_network = nn.Sequential(
            nn.Linear(flattened_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, action_size)
        )

        #network for training on predicted game state to create a value for a given state
        self.value_network = nn.Sequential(
            nn.Linear(flattened_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 1),
        )

    def get_value(self, state):
        features = self.convolutional_sequence(state)
        return self.value_network(features)

    def get_logits(self, state):
        features = self.convolutional_sequence(state)
        return self.action_network(features)

    def get_action_dist(self, state):
        logits = self.get_logits(state)
        return torch.softmax(logits, dim=1)

In [9]:
# Dataset that wraps memory for a dataloader
class RLDataset(Dataset):
  def __init__(self, data):
    super().__init__()
    self.data = []
    for d in data:
      self.data.append(d)

  def __getitem__(self, index):
    return self.data[index]

  def __len__(self):
    return len(self.data)

##Preprocessing State Helper Functions

In [28]:
def preprocess(state):
  state = state.transpose(2, 0, 1)    # change the shape form (H, W, C) -> (C, H, W)
  state = state / 255.0   # normalize the input data so that it is no longer in the range 255
  state = torch.from_numpy(state).float().to(device)    # take the numpy input and make it a tensor
  return state

##Behavior Cloning Helper Functions


In [11]:
#imports the data stored in the dataset folder
def get_expert_data(device, filename='dataset'):
  expert_episodes = glob.glob(filename + '/*.npz')
  expert_data = []

  for episode in expert_episodes:
    episode_data = np.load(episode)
    observations, actions = episode_data['image'], episode_data['action']
    expert_data += list(zip(observations, actions))

  return expert_data

In [25]:
class ExpertDataset(Dataset):
  def __init__(self, data):
    super().__init__()
    self.data = []
    for observation, state in data:
      self.data.append((preprocess(observation), state))

  def __getitem__(self, index):
    return self.data[index]

  def __len__(self):
    return len(self.data)

In [31]:
#takes in data from expert players and has the model try and match their actions in a given state
def behavior_clone(epochs, agent, optim, data_loader, device):
  loop = tqdm(total=epochs, position=0, leave=False)
  agent.train()

  for epoch in range(epochs):
    total_loss = 0
    for state, action in data_loader:
      action = action.to(device)

      #forward pass
      logits = agent.get_logits(state)

      loss = torch.nn.functional.cross_entropy(logits, action)
      total_loss += loss.item()

      #backprop
      optim.zero_grad()
      loss.backward()
      optim.step()

    loop.update(1)
    loop.set_description("Epochs {}, Loss: {}".format(epoch, total_loss))

#PPO Helper Functions

In [14]:
#calculate the return for a given episode
def calculate_return(memory, rollout, gamma):
  returns = []
  R = 0
  for state, action, action_dist, reward in reversed(rollout):
    R = reward + gamma * R
    returns.insert(0, (state, action, action_dist, R))

  return memory + returns

In [15]:
#sample actions from the state
@torch.no_grad()
def get_action(agent, state):
  state = state.unsqueeze(0)  #add batch dimension to input into the network
  action_dist = agent.get_action_dist(state).squeeze(0)
  action = torch.distributions.Categorical(action_dist).sample()
  return action.item(), action_dist

In [16]:
#learning loop for ppo
def learn_ppo(optim, agent, memory_dataloader, epsilon, policy_epochs):
    for _ in range(policy_epochs):
        for batch in memory_dataloader:
            state, action, action_dist, return_tensor = batch
            action = action.to(device)
            action_dist = action_dist.to(device)
            return_tensor = return_tensor.float().to(device)

            Vst = agent.get_value(state)
            Ahat_t = (return_tensor - Vst).detach()

            pi_prime = agent.get_action_dist(state)[torch.arange(len(action)), action]
            pi = action_dist[torch.arange(len(action)), action]

            ratio = torch.divide(pi_prime, pi)
            clipped_ratio = torch.clamp(ratio, 1 - epsilon, 1 + epsilon)
            policy_loss = -torch.mean(torch.min(ratio * Ahat_t, clipped_ratio * Ahat_t))

            value_loss = torch.mean((return_tensor - Vst) ** 2)

            loss = value_loss + policy_loss

            optim.zero_grad()
            loss.backward()
            optim.step()

##Hyper Parameters

In [110]:
#Behavior Cloning
bc_batch_size = 32
bc_epochs = 20

##PPO
epochs = 60
gym_id = "CrafterReward-v1"
lr = 1e-3
seed = 0
max_steps_per_episode = 1024
num_episodes = 100
discount_factor = 0.9
batch_size = 512
epsilon = 0.2
policy_epochs = 5

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

##Training Setup

In [18]:
#set up our seeds
random.seed(seed)
np.random.seed(seed)
torch.random.manual_seed(seed)

#set up our device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#set up our environment
#MUST BE A DISCRETE ACTION SPACE
env = gym.make(gym_id)
action_size = env.action_space.n
env = crafter.Recorder(
    env, './results',
    save_stats=True,
    save_video=False,
    save_episode=False,
)

#create our agent
agent = Agent(action_size).to(device)

#init our optimizer
adam_optim = torch.optim.Adam(agent.parameters(), lr=lr)

  deprecation(
  deprecation(


##Behavior Cloning


In [34]:
expert_data = get_expert_data(device)
bc_data_loader = DataLoader(ExpertDataset(expert_data), batch_size=bc_batch_size, shuffle=True)

behavior_clone(bc_epochs, agent, adam_optim, bc_data_loader, device)



##PPO Training

In [111]:
#progress bar
loop = tqdm(total=epochs, position=0, leave=False)
results_ppo = []

for epoch in range(epochs):

    #STORAGE Variables (reset each epoch)
    memory = []
    rewards = []

    #gather data from a number of episodes
    for episode in range(num_episodes):

        #reset the environment for the rollout
        state = preprocess(env.reset())
        done = False
        rollout = []
        cum_reward = 0 #track reward gained during the episode
        step = 0 #track the number of steps in the episode

        while not done and step < max_steps_per_episode:
            #get an action
            action, action_dist = get_action(agent, state)

            #take a step
            next_state, reward, done, _ = env.step(action)
            next_state = preprocess(next_state)

            #store the step
            rollout.append((state, action, action_dist, reward))

            cum_reward += reward
            step += 1
            state = next_state

        #calculate the returns for the episode and add it to the memory
        memory = calculate_return(memory, rollout, discount_factor)
        rewards.append(cum_reward)

        env.close()

    #train on the memory for this epoch
    # Train
    dataset = RLDataset(memory)
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    learn_ppo(adam_optim, agent, loader, epsilon, policy_epochs)

    #update progress bar
    results_ppo.extend(rewards)
    loop.update(1)
    loop.set_description("Epochs {}, Rewards: {}".format(epoch, cum_reward))

Epochs 59, Rewards: 8.099999999999998: 100%|██████████| 60/60 [1:20:18<00:00, 82.88s/it]

##Display Last Iteration

In [123]:
#perform one last episode and record it
env = gym.make('CrafterReward-v1')  # Or CrafterNoReward-v1
env = crafter.Recorder(
    env, './results',
    save_stats=True,
    save_video=False,
    save_episode=False,
)

done = False
frames = []
state = env.reset()
frames.append(state)
state = preprocess(state)
cum_reward = 0

while not done:
    #get an action
    action, action_dist = get_action(agent, state)

    #take a step
    next_state, reward, done, _ = env.step(action)
    frames.append(next_state)
    next_state = preprocess(next_state)

    cum_reward += reward
    state = next_state

env.close()

print("Reward for Final Episode:", cum_reward)

imageio.mimsave('./final_run.mp4', frames, fps=15)

show_video()

Reward for Final Episode: 10.100000000000003
