In [None]:
# System packages: xvfb, python-opengl and ffmpeg (ffmpeg is invoked later in this
# notebook to encode the rollout videos). All apt output is discarded.
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
# Python-side dependencies.
!pip install -U colabgymrender
# NOTE(review): imageio is pinned to 2.4.1; the reason for the pin is not visible
# in this notebook -- confirm it is still required.
!pip install imageio==2.4.1
# Install AutoROM and run it with the license-acceptance flag.
!pip install --upgrade AutoROM
!AutoROM --accept-license
# Gym with the `atari` and `accept-rom-license` extras.
!pip install gym[atari,accept-rom-license]

In [2]:
import gym
import os
import torch
import random
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import numpy as np
import torch.nn.functional as f
from collections import namedtuple, deque
from colabgymrender.recorder import Recorder
%matplotlib inline

  np.bool8: (False, True),

  from scipy.ndimage.filters import sobel



In [6]:
from IPython import display as ipythondisplay
from IPython.display import Video

In [7]:
# Hyperparameters read by Agent and ReplayBuffer further down.

# Number of transitions returned by one ReplayBuffer.sample() call; Agent.step only
# learns once the buffer holds more than this many transitions.
BATCH_SIZE = 128
# Maximum number of transitions kept in the replay deque (used as its maxlen).
BUFFER_SIZE = 100000
# Discount factor passed to Agent.learn.
GAMMA = 0.99
# Learning rate of the Adam optimizer that trains the primary network.
LR = 0.001
# Agent.step runs a learning update on every FREQUENCY-th call.
FREQUENCY = 4
# Interpolation factor of the soft target-network update in Agent.update.
TAU = 0.001

In [8]:
class ReplayBuffer:
  """Bounded FIFO store of transitions with uniform random mini-batch sampling.

  Transitions are kept as ``Experience`` namedtuples in a ``deque`` whose
  ``maxlen`` is ``buffer_size``, so the oldest entries are dropped once the
  buffer is full.
  """

  def __init__(self, action_size, buffer_size, batch_size, seed):
    """Create an empty buffer.

    Args:
      action_size: number of actions; stored on the instance only.
      buffer_size: maximum number of transitions retained.
      batch_size: number of transitions returned by ``sample``.
      seed: value used to seed Python's global ``random`` module.
    """
    self.action_size = action_size
    self.memory = deque(maxlen=buffer_size)
    self.batch_size = batch_size

    self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
    # random.seed() returns None, so assigning its result left self.seed empty.
    # Seed the global generator (as before) and remember the seed value itself.
    random.seed(seed)
    self.seed = seed

  def add(self, state, action, reward, next_state, done):
    """Append one transition to the buffer."""
    self.memory.append(self.experience(state, action, reward, next_state, done))

  def sample(self):
    """Draw ``batch_size`` transitions uniformly at random.

    Returns:
      Tuple ``(states, actions, rewards, next_states, dones)`` of float32
      tensors, each built by vertically stacking the corresponding field of
      the sampled transitions (one row per transition).

    Raises:
      ValueError: raised by ``random.sample`` when the buffer holds fewer
        than ``batch_size`` transitions.
    """
    batch = [e for e in random.sample(self.memory, k=self.batch_size) if e is not None]

    def stack(field):
      # One row per sampled transition for the requested namedtuple field.
      return np.vstack([getattr(e, field) for e in batch])

    states = torch.from_numpy(stack("state")).float()
    actions = torch.from_numpy(stack("action")).float()
    rewards = torch.from_numpy(stack("reward")).float()
    next_states = torch.from_numpy(stack("next_state")).float()
    # Booleans are converted to uint8 first so torch.from_numpy accepts the array.
    dones = torch.from_numpy(stack("done").astype(np.uint8)).float()

    return (states, actions, rewards, next_states, dones)

  def __len__(self):
    """Return the number of transitions currently stored."""
    return len(self.memory)


In [9]:
class QNetworkType1(nn.Module):
  """Dueling Q-network with two shared hidden layers of 128 units.

  A scalar state-value head and a per-action advantage head sit on top of the
  shared trunk; the output is ``V + (A - mean(A))`` along the action axis.
  """

  def __init__(self, state_size, action_size, seed):
    """Build the layers after seeding torch's global generator with ``seed``."""
    super().__init__()
    self.seed = torch.manual_seed(seed)

    hidden_units = 128
    # The layers are created in the same order as before so that a given seed
    # yields the same initial weights.
    self.layer1 = nn.Linear(state_size, hidden_units)
    self.layer2 = nn.Linear(hidden_units, hidden_units)
    self.value_function = nn.Linear(hidden_units, 1)
    self.advantage = nn.Linear(hidden_units, action_size)

  def forward(self, state):
    """Return one Q-value per action for every row of ``state``."""
    hidden = f.relu(self.layer1(state))
    hidden = f.relu(self.layer2(hidden))

    state_value = self.value_function(hidden)
    advantages = self.advantage(hidden)
    centered = advantages - advantages.mean(dim=1, keepdim=True)

    return state_value + centered

In [10]:
class QNetworkType2(nn.Module):
  """Second dueling Q-network variant.

  In this notebook its layers and forward pass are the same as
  ``QNetworkType1``: two 128-unit ReLU layers feeding a value head and an
  advantage head, combined as ``V + (A - mean(A))``.
  """

  def __init__(self, state_size, action_size, seed):
    """Seed torch's global generator with ``seed`` and create the layers."""
    super().__init__()
    self.seed = torch.manual_seed(seed)

    width = 128
    # Shared trunk (creation order is significant for seeded initialisation).
    self.layer1 = nn.Linear(in_features=state_size, out_features=width)
    self.layer2 = nn.Linear(in_features=width, out_features=width)
    # Heads: scalar state value and one advantage per action.
    self.value_function = nn.Linear(in_features=width, out_features=1)
    self.advantage = nn.Linear(in_features=width, out_features=action_size)

  def forward(self, state):
    """Map a batch of states to a batch of per-action Q-values."""
    features = f.relu(self.layer2(f.relu(self.layer1(state))))
    value = self.value_function(features)
    adv = self.advantage(features)
    mean_adv = adv.mean(dim=1, keepdim=True)
    return value + (adv - mean_adv)

In [11]:
class Agent():
  """Q-learning agent with a primary and a target dueling network.

  Transitions are stored in a ``ReplayBuffer``; every ``FREQUENCY`` calls to
  ``step`` a mini-batch is sampled, the primary network is trained on it and
  the target network is moved towards the primary one by a soft update.
  """

  def __init__(self, state_size, action_size, seed):
    """Create the networks, optimizer and replay buffer.

    Args:
      state_size: length of the observation vector fed to the networks.
      action_size: number of discrete actions.
      seed: seed for Python's ``random`` module; also forwarded to the
        networks and the replay buffer.
    """
    self.state_size = state_size
    self.action_size = action_size
    # random.seed() returns None, so keep the seed value itself on the instance.
    random.seed(seed)
    self.seed = seed

    self.qnetworkPrimary = QNetworkType1(state_size, action_size, seed)
    self.qnetworkTarget = QNetworkType1(state_size, action_size, seed)

    self.optimizer = optim.Adam(self.qnetworkPrimary.parameters(), lr=LR)

    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
    self.timestep = 0

  def step(self, state, action, reward, next_state, done):
    """Store one transition and learn on every ``FREQUENCY``-th call.

    Learning is skipped until the buffer holds more than ``BATCH_SIZE``
    transitions.
    """
    self.memory.add(state, action, reward, next_state, done)

    self.timestep = (self.timestep + 1) % FREQUENCY
    if self.timestep == 0 and len(self.memory) > BATCH_SIZE:
      self.learn(self.memory.sample(), GAMMA)

  def learn(self, experience, gamma):
    """Run one gradient step on a sampled batch, then soft-update the target.

    Args:
      experience: tuple ``(states, actions, rewards, next_states, dones)`` as
        returned by ``ReplayBuffer.sample``.
      gamma: discount factor applied to the bootstrapped next-state value.
    """
    states, actions, rewards, next_states, dones = experience

    # The target is a constant with respect to the optimisation, so it is
    # computed without tracking gradients.
    with torch.no_grad():
      q_nextTarget = self.qnetworkTarget(next_states).max(1)[0].unsqueeze(1)
      # (1 - dones) removes the bootstrap term for transitions flagged done.
      q_Target = rewards + (gamma*q_nextTarget*(1-dones))

    # The buffer returns actions as floats; gather needs integer indices.
    actions = actions.type(torch.int64)
    q_Expected = self.qnetworkPrimary(states).gather(1, actions)

    loss = f.mse_loss(q_Expected, q_Target)

    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

    self.update(self.qnetworkPrimary, self.qnetworkTarget, TAU)

  def update(self, primaryNetwork, targetNetwork, tau):
    """Soft update: ``target <- tau * primary + (1 - tau) * target``."""
    with torch.no_grad():
      for target_param, local_param in zip(targetNetwork.parameters(), primaryNetwork.parameters()):
        target_param.copy_(tau*local_param + (1 - tau)*target_param)

  def move(self, state, eps=0):
    """Choose an action epsilon-greedily.

    Args:
      state: observation as a NumPy array.
      eps: exploration threshold; a random action is taken when the drawn
        uniform number is not greater than ``eps``.

    Returns:
      Index of the chosen action.
    """
    # Draw first: when exploring, the forward pass is not needed at all. The
    # network consumes no random numbers, so the random stream is unchanged.
    if random.random() <= eps:
      return random.choice(np.arange(self.action_size))

    state = torch.from_numpy(state).float().unsqueeze(0)
    self.qnetworkPrimary.eval()
    with torch.no_grad():
      action_values = self.qnetworkPrimary(state)
    self.qnetworkPrimary.train()

    return np.argmax(action_values.cpu().numpy())

In [12]:
# Banner for the first environment.
print("######## Environment 1: Acrobot ############", " ", sep="\n")

# Create and seed the Acrobot environment used by the following cells.
env = gym.make('Acrobot-v1')
env.seed(0)

# Report the observation/action spaces and the reward threshold from the env spec.
print(
    f"Observation Space: {env.observation_space}\n",
    f"Observation Space Shape: {env.observation_space.shape}\n",
    f"Action Space: {env.action_space}\n",
    f"Number of Actions: {env.action_space.n}\n",
    f"Reward Threshold: {env.spec.reward_threshold}",
    sep="\n",
)

# Agent sized from the environment's observation vector and action count.
acrobot_agent_type1 = Agent(
    state_size=env.observation_space.shape[0],
    action_size=env.action_space.n,
    seed=0,
)

  deprecation(

  deprecation(

  deprecation(



######## Environment 1: Acrobot ############
 
Observation Space: Box([ -1.        -1.        -1.        -1.       -12.566371 -28.274334], [ 1.        1.        1.        1.       12.566371 28.274334], (6,), float32)

Observation Space Shape: (6,)

Action Space: Discrete(3)

Number of Actions: 3

Reward Threshold: -100.0


In [14]:
def DDQN(episodes = 2000, max_timestep = 1000, ep_start = 1.0, ep_end = 0.01, ep_decay = 0.995,
         agent = None, environment = None, weights_path = 'acrobot_model_weights.pth'):
  """Train an agent with an epsilon-greedy policy and return the episode scores.

  Args:
    episodes: maximum number of training episodes.
    max_timestep: maximum number of environment steps per episode.
    ep_start: initial epsilon.
    ep_end: lower bound for epsilon.
    ep_decay: factor applied to epsilon after every episode.
    agent: object exposing ``move(state, eps)``, ``step(...)`` and
      ``qnetworkPrimary``; defaults to the notebook's ``acrobot_agent_type1``.
    environment: object exposing ``reset()``, ``step(action)`` (4-tuple) and
      ``spec.reward_threshold``; defaults to the notebook's global ``env``.
    weights_path: file the primary network's state dict is saved to once the
      environment counts as solved.

  Returns:
    List with the summed reward of every episode that was run.
  """
  # Resolve the defaults at call time so the notebook globals stay optional.
  agent = acrobot_agent_type1 if agent is None else agent
  environment = env if environment is None else environment
  reward_threshold = environment.spec.reward_threshold

  scores = []
  scores_window = deque(maxlen = 100)
  epsilon = ep_start

  for episode in range(1, episodes+1):
    state = environment.reset()
    score = 0

    for _ in range(max_timestep):
      action = agent.move(state, epsilon)
      next_state, reward, done, _info = environment.step(action)
      agent.step(state, action, reward, next_state, done)
      score+=reward
      state = next_state
      if done:
        break

    scores_window.append(score)
    scores.append(score)
    epsilon = max(ep_end, ep_decay*epsilon)

    # One progress line per episode (the extra line every 100th episode was an
    # exact duplicate of this one).
    average = np.mean(scores_window)
    print(f'Episode {episode}    Average Score: {average}')

    # "Solved" refers to the mean over a full 100-episode window; a partially
    # filled window must not stop training. A missing threshold never stops it.
    window_full = len(scores_window) == scores_window.maxlen
    if reward_threshold is not None and window_full and average >= reward_threshold:
      print(f"Environment solved in {episode} episodes !!!!   Average Score: {average}")
      torch.save(agent.qnetworkPrimary.state_dict(), weights_path)
      break

  return scores


In [17]:
# Train with the default settings; `scores` receives the list of per-episode
# returns that DDQN() hands back.
scores = DDQN()

Episode 1    Average Score: -500.0
Episode 2    Average Score: -500.0
Episode 3    Average Score: -500.0
Episode 4    Average Score: -500.0
Episode 5    Average Score: -500.0
Episode 6    Average Score: -500.0
Episode 7    Average Score: -487.14285714285717
Episode 8    Average Score: -488.75
Episode 9    Average Score: -490.0
Episode 10    Average Score: -491.0
Episode 11    Average Score: -491.8181818181818
Episode 12    Average Score: -492.5
Episode 13    Average Score: -493.0769230769231
Episode 14    Average Score: -492.92857142857144
Episode 15    Average Score: -493.4
Episode 16    Average Score: -486.375
Episode 17    Average Score: -480.7647058823529
Episode 18    Average Score: -481.8333333333333
Episode 19    Average Score: -482.7894736842105
Episode 20    Average Score: -483.65
Episode 21    Average Score: -484.42857142857144
Episode 22    Average Score: -478.72727272727275
Episode 23    Average Score: -479.5652173913044
Episode 24    Average Score: -475.7083333333333
Episo

In [18]:
import subprocess
from IPython.display import Video

# Restore the weights written by the training cell.
acrobot_agent_type1.qnetworkPrimary.load_state_dict(torch.load('acrobot_model_weights.pth'))

# Roll out five greedy episodes (eps defaults to 0), capped at 500 steps each,
# and collect one RGB frame per step.
frames = []
for _ in range(5):
    state = env.reset()
    for _ in range(500):
        action = acrobot_agent_type1.move(state)
        state, _, done, _ = env.step(action)
        frames.append(env.render(mode='rgb_array'))
        if done:
            break

image_dir = './acrobot_images'
os.makedirs(image_dir, exist_ok=True)
# Drop frames left over from an earlier, longer run; otherwise ffmpeg would
# append them to the new sequence.
for stale_name in os.listdir(image_dir):
    if stale_name.startswith('frame_') and stale_name.endswith('.png'):
        os.remove(os.path.join(image_dir, stale_name))
for i, frame in enumerate(frames):
    image_path = os.path.join(image_dir, f'frame_{i}.png')
    plt.imsave(image_path, frame)

video_path = './acrobot_video.mp4'
# -y overwrites an existing video so re-running the cell refreshes it;
# check=True surfaces an ffmpeg failure instead of showing a stale/missing file.
subprocess.run(
    ['ffmpeg', '-y', '-framerate', '25', '-i', os.path.join(image_dir, 'frame_%d.png'),
     '-c:v', 'libx264', '-pix_fmt', 'yuv420p', video_path],
    check=True,
)

# Display the video
Video(video_path)


See here for more information: https://www.gymlibrary.ml/content/api/
  deprecation(



In [19]:
# Banner for the second environment.
print("######## Environment 2: Cartpole ############", " ", sep="\n")

# Replace the global `env` with a seeded CartPole environment.
env = gym.make('CartPole-v1')
env.seed(0)

# Report the observation/action spaces and the reward threshold from the env spec.
print(
    f"Observation Space: {env.observation_space}\n",
    f"Observation Space Shape: {env.observation_space.shape}\n",
    f"Action Space: {env.action_space}\n",
    f"Number of Actions: {env.action_space.n}\n",
    f"Reward Threshold: {env.spec.reward_threshold}",
    sep="\n",
)

# Agent sized from the environment's observation vector and action count.
cartpole_agent_type1 = Agent(
    state_size=env.observation_space.shape[0],
    action_size=env.action_space.n,
    seed=0,
)

  deprecation(

  deprecation(

  deprecation(



######## Environment 2: Cartpole ############
 
Observation Space: Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)

Observation Space Shape: (4,)

Action Space: Discrete(2)

Number of Actions: 2

Reward Threshold: 475.0


In [20]:
def DDQN(episodes = 2000, max_timestep = 1000, ep_start = 1.0, ep_end = 0.01, ep_decay = 0.995,
         agent = None, environment = None, weights_path = 'cartpole_model_weights.pth'):
  """Train an agent with an epsilon-greedy policy and return the episode scores.

  Args:
    episodes: maximum number of training episodes.
    max_timestep: maximum number of environment steps per episode.
    ep_start: initial epsilon.
    ep_end: lower bound for epsilon.
    ep_decay: factor applied to epsilon after every episode.
    agent: object exposing ``move(state, eps)``, ``step(...)`` and
      ``qnetworkPrimary``; defaults to the notebook's ``cartpole_agent_type1``.
    environment: object exposing ``reset()``, ``step(action)`` (4-tuple) and
      ``spec.reward_threshold``; defaults to the notebook's global ``env``.
    weights_path: file the primary network's state dict is saved to once the
      environment counts as solved.

  Returns:
    List with the summed reward of every episode that was run.
  """
  # Resolve the defaults at call time so the notebook globals stay optional.
  agent = cartpole_agent_type1 if agent is None else agent
  environment = env if environment is None else environment
  reward_threshold = environment.spec.reward_threshold

  scores = []
  scores_window = deque(maxlen = 100)
  epsilon = ep_start

  for episode in range(1, episodes+1):
    state = environment.reset()
    score = 0

    for _ in range(max_timestep):
      action = agent.move(state, epsilon)
      next_state, reward, done, _info = environment.step(action)
      agent.step(state, action, reward, next_state, done)
      score+=reward
      state = next_state
      if done:
        break

    scores_window.append(score)
    scores.append(score)
    epsilon = max(ep_end, ep_decay*epsilon)

    # One progress line per episode (the extra line every 100th episode was an
    # exact duplicate of this one).
    average = np.mean(scores_window)
    print(f'Episode {episode}    Average Score: {average}')

    # "Solved" refers to the mean over a full 100-episode window; a partially
    # filled window must not stop training. A missing threshold never stops it.
    window_full = len(scores_window) == scores_window.maxlen
    if reward_threshold is not None and window_full and average >= reward_threshold:
      print(f"Environment solved in {episode} episodes !!!!   Average Score: {average}")
      torch.save(agent.qnetworkPrimary.state_dict(), weights_path)
      break

  return scores


In [None]:
# Train with the default settings of the most recently defined DDQN(); `scores`
# receives the list of per-episode returns it hands back.
scores = DDQN()

In [None]:
import subprocess
from IPython.display import Video

# Restore the weights written by the training cell.
cartpole_agent_type1.qnetworkPrimary.load_state_dict(torch.load('cartpole_model_weights.pth'))

# Roll out five greedy episodes (eps defaults to 0), capped at 500 steps each,
# and collect one RGB frame per step.
frames = []
for _ in range(5):
    state = env.reset()
    for _ in range(500):
        action = cartpole_agent_type1.move(state)
        state, _, done, _ = env.step(action)
        frames.append(env.render(mode='rgb_array'))
        if done:
            break

image_dir = './cartpole_images'
os.makedirs(image_dir, exist_ok=True)
# Drop frames left over from an earlier, longer run; otherwise ffmpeg would
# append them to the new sequence.
for stale_name in os.listdir(image_dir):
    if stale_name.startswith('frame_') and stale_name.endswith('.png'):
        os.remove(os.path.join(image_dir, stale_name))
for i, frame in enumerate(frames):
    image_path = os.path.join(image_dir, f'frame_{i}.png')
    plt.imsave(image_path, frame)

video_path = './cartpole_video.mp4'
# -y overwrites an existing video so re-running the cell refreshes it;
# check=True surfaces an ffmpeg failure instead of showing a stale/missing file.
subprocess.run(
    ['ffmpeg', '-y', '-framerate', '25', '-i', os.path.join(image_dir, 'frame_%d.png'),
     '-c:v', 'libx264', '-pix_fmt', 'yuv420p', video_path],
    check=True,
)

# Display the video
Video(video_path)
