<a href="https://colab.research.google.com/github/dogukartal/ML-RoadMap/blob/main/RL/Hugging%20Face/PixelCopter_v1/Reinforce_PixelCopter_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
# Setup cell: installs system packages and Python dependencies, then starts a
# virtual display. `%%capture` suppresses the output of everything in this cell.

# System packages installed through apt.
!apt install python-opengl
!apt install ffmpeg
!apt install xvfb
# Python packages; pyglet is pinned to 1.5.1 and the remaining requirements
# come from the deep-rl-class unit 4 requirements file.
!pip install pyvirtualdisplay
!pip install pyglet==1.5.1
!pip install -r https://raw.githubusercontent.com/huggingface/deep-rl-class/main/notebooks/unit4/requirements-unit4.txt

from pyvirtualdisplay import Display

# Start a non-visible 1400x900 virtual display — presumably so environment
# rendering works on a machine without a physical screen (e.g. Colab); confirm.
virtual_display = Display(visible=0, size=(1400, 900))
virtual_display.start()

In [2]:
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline
import torch
import torch.nn as nn
from torch.nn import functional
import torch.optim as optim
from torch.distributions import Categorical
import gym
import gym_pygame
from huggingface_hub import notebook_login
import imageio

class Policy(nn.Module):
    """Three-layer fully connected policy network that outputs action probabilities.

    Args:
        s_size: Number of input features (length of one observation vector).
        a_size: Number of outputs (one probability per discrete action).
        h_size: Width of the first hidden layer; the second hidden layer is
            twice as wide.
    """

    def __init__(self, s_size, a_size, h_size):
        super().__init__()
        self.fc1 = nn.Linear(s_size, h_size)
        self.fc2 = nn.Linear(h_size, h_size * 2)
        self.fc3 = nn.Linear(h_size * 2, a_size)

    def forward(self, x):
        """Map a batch of states to action probabilities.

        Args:
            x: Tensor of shape (batch, s_size).

        Returns:
            Tensor of shape (batch, a_size); each row is a softmax over dim 1.
        """
        x = functional.relu(self.fc1(x))
        x = functional.relu(self.fc2(x))
        return functional.softmax(self.fc3(x), dim=1)

    def act(self, state):
        """Sample an action for a single state.

        Args:
            state: NumPy array holding one observation (no batch dimension).

        Returns:
            Tuple ``(action, log_prob)`` where ``action`` is a Python int and
            ``log_prob`` is a CPU tensor of shape (1,) that keeps its autograd
            history so it can be used in a policy-gradient loss.
        """
        # Run on whatever device the network's weights live on instead of
        # assuming CUDA, so the policy also works on CPU-only machines.
        device = next(self.parameters()).device
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        # Probabilities are moved to the CPU before sampling, so the returned
        # log-probability is a CPU tensor.
        probs = self.forward(state).cpu()
        m = Categorical(probs)
        action = m.sample()
        return action.item(), m.log_prob(action)

def reinforce(policy, optimizer, n_training_episodes, max_t, gamma, print_every, env=None):
    """Train ``policy`` with the REINFORCE (Monte Carlo policy gradient) algorithm.

    Each episode is rolled out with ``policy.act``; the discounted returns are
    computed, standardized, and used to weight the negative log-probabilities
    of the taken actions, followed by one optimizer step per episode.

    Args:
        policy: Object exposing ``act(state) -> (action, log_prob)`` where
            ``log_prob`` is a 1-element tensor attached to the autograd graph.
        optimizer: Optimizer over the policy's parameters.
        n_training_episodes: Number of episodes to run.
        max_t: Maximum number of steps per episode.
        gamma: Discount factor applied to future rewards.
        print_every: Print the 100-episode moving average every this many episodes.
        env: Environment with ``reset()`` returning a state and ``step(action)``
            returning ``(state, reward, done, info)``. When omitted, the
            module-level ``env`` is used, as earlier versions of this function did.

    Returns:
        List with the undiscounted sum of rewards of every episode.

    Raises:
        NameError: If ``env`` is omitted and no module-level ``env`` exists.
    """
    if env is None:
        # Backward compatibility: callers that do not pass an environment get
        # the global one this function used to read implicitly.
        try:
            env = globals()["env"]
        except KeyError:
            raise NameError("name 'env' is not defined") from None

    scores_deque = deque(maxlen=100)  # window for the printed moving average
    scores = []
    eps = np.finfo(np.float32).eps.item()  # avoids division by zero below

    for i_episode in range(1, n_training_episodes + 1):
        # Roll out one episode, remembering log-probabilities and rewards.
        saved_log_probs = []
        rewards = []
        state = env.reset()
        for _ in range(max_t):
            action, log_prob = policy.act(state)
            saved_log_probs.append(log_prob)
            state, reward, done, _info = env.step(action)
            rewards.append(reward)
            if done:
                break

        episode_score = sum(rewards)
        scores_deque.append(episode_score)
        scores.append(episode_score)

        # An episode without any step (max_t == 0) has nothing to learn from.
        if saved_log_probs:
            # Discounted return G_t = r_t + gamma * G_{t+1}, accumulated from
            # the last step backwards and then put back in time order.
            discounted = []
            running_return = 0.0
            for reward in reversed(rewards):
                running_return = reward + gamma * running_return
                discounted.append(running_return)
            discounted.reverse()

            returns = torch.tensor(discounted, dtype=torch.float32)
            # Standardize returns to stabilize training. The standard deviation
            # of a single value is NaN, which would turn the loss (and, after
            # the optimizer step, the weights) into NaN, so one-step episodes
            # keep their raw return.
            if returns.numel() > 1:
                returns = (returns - returns.mean()) / (returns.std() + eps)

            # Gradient ascent on expected return == gradient descent on the
            # negated, return-weighted log-probabilities.
            policy_loss = -(torch.cat(saved_log_probs) * returns).sum()

            optimizer.zero_grad()
            policy_loss.backward()
            optimizer.step()

        if i_episode % print_every == 0:
            print("Episode {}\tAverage Score: {:.2f}".format(i_episode, np.mean(scores_deque)))

    return scores

def evaluate_agent(env, max_steps, n_eval_episodes, policy):
    """Run the policy for several episodes and summarize the episode rewards.

    Actions are obtained from ``policy.act``, so a stochastic policy is
    evaluated by sampling, exactly as it acts during training.

    Args:
        env: Environment with ``reset()`` returning a state and ``step(action)``
            returning ``(state, reward, done, info)``.
        max_steps: Maximum number of steps per evaluation episode.
        n_eval_episodes: Number of episodes to run.
        policy: Object exposing ``act(state) -> (action, log_prob)``.

    Returns:
        Tuple ``(mean_reward, std_reward)`` over the total reward of each episode.
    """
    episode_rewards = []
    # No parameter update happens here, so skip autograd bookkeeping.
    with torch.no_grad():
        for _ in range(n_eval_episodes):
            state = env.reset()
            total_rewards_ep = 0

            for _step in range(max_steps):
                action, _log_prob = policy.act(state)
                state, reward, done, _info = env.step(action)
                total_rewards_ep += reward
                if done:
                    break

            episode_rewards.append(total_rewards_ep)

    mean_reward = np.mean(episode_rewards)
    std_reward = np.std(episode_rewards)

    return mean_reward, std_reward

# Two instances of the same environment: `env` is the one read by `reinforce`
# during training, `eval_env` is passed to `evaluate_agent` afterwards.
env_id = "Pixelcopter-PLE-v0"
env = gym.make(env_id)
eval_env = gym.make(env_id)

# Network input/output sizes are taken from the environment's spaces.
s_size = env.observation_space.shape[0]
a_size = env.action_space.n

# Use the GPU when one is available and fall back to the CPU otherwise,
# instead of requiring CUDA unconditionally.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

pixelcopter_hyperparameters = {
    "h_size": 64,  # width of the policy's first hidden layer
    "n_training_episodes": 50000,
    "n_evaluation_episodes": 10,
    "max_t": 10000,  # step cap per episode, used for training and evaluation
    "gamma": 0.99,  # discount factor
    "lr": 1e-4,  # Adam learning rate
    "env_id": env_id,
    "state_space": s_size,
    "action_space": a_size,
}

pixelcopter_policy = Policy(
    pixelcopter_hyperparameters["state_space"],
    pixelcopter_hyperparameters["action_space"],
    pixelcopter_hyperparameters["h_size"],
).to(device)
pixelcopter_optimizer = optim.Adam(pixelcopter_policy.parameters(), lr=pixelcopter_hyperparameters["lr"])

# How often (in episodes) `reinforce` prints the moving-average score.
PRINT_EVERY = 1000

scores = reinforce(
    pixelcopter_policy,
    pixelcopter_optimizer,
    pixelcopter_hyperparameters["n_training_episodes"],
    pixelcopter_hyperparameters["max_t"],
    pixelcopter_hyperparameters["gamma"],
    PRINT_EVERY,
)

# Last expression of the cell: displays (mean_reward, std_reward).
evaluate_agent(
    eval_env, pixelcopter_hyperparameters["max_t"], pixelcopter_hyperparameters["n_evaluation_episodes"], pixelcopter_policy
)

couldn't import doomish
Couldn't import doom


  deprecation(
  deprecation(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(f"{pre} is not within the observation space.")
  logger.deprecation(
  if not isinstance(done, (bool, np.bool8)):
  logger.warn(
  logger.warn(f"{pre} is not within the observation space.")


Episode 1000	Average Score: -5.00
Episode 2000	Average Score: -5.00
Episode 3000	Average Score: 3.26
Episode 4000	Average Score: 6.35
Episode 5000	Average Score: 7.80
Episode 6000	Average Score: 10.89
Episode 7000	Average Score: 13.98
Episode 8000	Average Score: 12.00
Episode 9000	Average Score: 14.05
Episode 10000	Average Score: 14.64
Episode 11000	Average Score: 15.57
Episode 12000	Average Score: 17.79
Episode 13000	Average Score: 17.48
Episode 14000	Average Score: 17.43
Episode 15000	Average Score: 17.83
Episode 16000	Average Score: 21.32
Episode 17000	Average Score: 19.56
Episode 18000	Average Score: 19.60
Episode 19000	Average Score: 14.95
Episode 20000	Average Score: 18.87
Episode 21000	Average Score: 14.33
Episode 22000	Average Score: 19.24
Episode 23000	Average Score: 25.53
Episode 24000	Average Score: 19.94
Episode 25000	Average Score: 19.23
Episode 26000	Average Score: 22.60
Episode 27000	Average Score: 25.99
Episode 28000	Average Score: 23.00
Episode 29000	Average Score: 24.

(33.2, 25.110953785151214)