# Solution to Rendering an OpenAI Gym Environment Inline
The aim of this notebook is to demystify Reinforcement Learning: to show how simple it really is to get going, and to make these systems as accessible as possible to developers without an AI/ML background.
 
### Resources
- https://gym.openai.com/docs/#environments
- https://keras-rl.readthedocs.io/en/latest/agents/overview/
 
### TODO
- Refactor this notebook into nicely abstracted utilities that we can use to spin up RL agents with ease for any environment / problem domain.
- Create a custom environment and showcase an RL agent navigating it.

### Install Dependencies

In [10]:
import sys

!{sys.executable} -m pip install -U pip
!{sys.executable} -m pip install -U andas
!{sys.executable} -m pip install -U rllib3
!{sys.executable} -m pip install -U plotly
!{sys.executable} -m pip install -U opencv-python
!{sys.executable} -m pip install -U opencv-contrib-python
!{sys.executable} -m pip install -U av
!{sys.executable} -m pip install -U pyvirtualdisplay
!{sys.executable} -m pip install -U pyglet
!{sys.executable} -m pip install -U ale-py
!{sys.executable} -m pip install -U pyopengl
!{sys.executable} -m pip install -U matplotlib
!{sys.executable} -m pip install -U box2d-kengz --user



ERROR: Could not find a version that satisfies the requirement andas (from versions: none)
ERROR: No matching distribution found for andas
ERROR: Could not find a version that satisfies the requirement rllib3 (from versions: none)
ERROR: No matching distribution found for rllib3


Collecting matplotlib
  Downloading matplotlib-3.4.3-cp39-cp39-win_amd64.whl (7.1 MB)
Collecting kiwisolver>=1.0.1
  Downloading kiwisolver-1.3.2-cp39-cp39-win_amd64.whl (52 kB)
Collecting cycler>=0.10
  Downloading cycler-0.10.0-py2.py3-none-any.whl (6.5 kB)
Collecting pillow>=6.2.0
  Downloading Pillow-8.3.2-cp39-cp39-win_amd64.whl (3.2 MB)
Installing collected packages: pillow, kiwisolver, cycler, matplotlib
Successfully installed cycler-0.10.0 kiwisolver-1.3.2 matplotlib-3.4.3 pillow-8.3.2


### Setup Virtual Display

In [11]:
import matplotlib.pyplot as plt
from pyvirtualdisplay import Display

# Start an off-screen 1400x900 display. The handle is kept under its own name so the
# `display` module import below does not overwrite it (it is needed to stop the display later).
virtual_display = Display(visible=0, size=(1400, 900))
virtual_display.start()

from IPython import display

EasyProcessError: start error <EasyProcess cmd_param=['Xvfb', '-help'] cmd=['Xvfb', '-help'] oserror=[WinError 2] The system cannot find the file specified return_code=None stdout="None" stderr="None" timeout_happened=False>

### Setup Utilities

In [None]:
class OpenAIGymSessionVideo:
    """Buffers rendered frames of an environment and plays them back inline as a video.

    Call `capture()` after each environment step to buffer a frame, then
    `renderAndCapture(epoch_id)` to write the buffered frames to
    `<epoch_id>.mp4`, transcode them to `<epoch_id>.web.mp4` with ffmpeg,
    embed the result in the notebook output and clear the buffer.
    """

    def __init__(self, environment):
        """Store the environment to record.

        Args:
            environment: Object exposing `render(mode='rgb_array')`.
        """
        self.environment = environment
        self.frames = []

    def _render_bgr_frame(self):
        """Render the environment once and return the frame in BGR channel order."""
        import numpy as np

        # Assumes render() yields a uint8 height x width x 3 RGB array — TODO confirm
        # for environments other than the ones exercised in this notebook.
        rgb_frame = np.asarray(self.environment.render(mode='rgb_array'))
        # OpenCV works in BGR, so reverse the channel axis. The reversed view has a
        # negative stride, hence the copy into a contiguous array.
        return np.ascontiguousarray(rgb_frame[..., ::-1])

    def renderAndCapture(self, epoch_id):
        """Encode the buffered frames to a video, display it inline and clear the buffer.

        Args:
            epoch_id: Identifier used as the stem of the generated file names.

        Raises:
            FileNotFoundError: If the `ffmpeg` executable cannot be found.
            subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
        """
        import base64
        import subprocess

        import cv2

        raw_path = f'{epoch_id}.mp4'
        web_path = f'{epoch_id}.web.mp4'

        # This render is only used to learn the frame size; it is not added to the video.
        frame_height, frame_width = self._render_bgr_frame().shape[:2]

        video_fps = 30
        video_codec = cv2.VideoWriter_fourcc(*'MP4V')
        video_output = cv2.VideoWriter(raw_path, video_codec, video_fps, (frame_width, frame_height))
        try:
            for frame in self.frames:
                video_output.write(frame)
        finally:
            # Always finalize the container, even if writing a frame fails.
            video_output.release()
        self.frames = []

        # Convert the video to codecs web supports. An argument list avoids shell
        # interpolation of the file names; -y overwrites a stale output instead of
        # blocking on an interactive prompt.
        subprocess.run(
            ['ffmpeg', '-y', '-i', raw_path, '-vcodec', 'libx264', web_path],
            check=True,
        )

        with open(web_path, 'rb') as video_file:
            base64_video = base64.b64encode(video_file.read()).decode('utf-8')
        video_tag =f'<video controls loop autoplay width="250px" height="200px"><source src="data:video/mp4;base64,{base64_video}" type="video/mp4" /></video>'

        try:
            # Some hosted notebook platforms (e.g. Databricks) inject this helper.
            render_html = displayHTML
        except NameError:
            # Plain Jupyter/IPython: use the standard rich-display machinery.
            from IPython.display import HTML, display as show_rich
            show_rich(HTML(video_tag))
        else:
            render_html(video_tag)

    def capture(self):
        """Render the current state of the environment and append it to the frame buffer."""
        self.frames.append(self._render_bgr_frame())


In [None]:
class OpenAIGymSession:
    """Runs episodes of a Gym environment with random actions and shows a video of each."""

    def __init__(self, environment_name):
        """Remember which environment to create.

        Args:
            environment_name: Environment id passed to `gym.make`.
        """
        self.environment_name = environment_name

    def start(self, episodes, max_epochs=-1):
        """Play `episodes` episodes with randomly sampled actions, recording each one.

        Every step is captured as a video frame; after each episode the frames
        are encoded and displayed through `OpenAIGymSessionVideo`.

        Args:
            episodes: Number of episodes to run.
            max_epochs: Maximum number of steps per episode; any value below 0
                (the default) means "run until the environment reports done".
                At least one step is always taken.
        """
        import gym
        import uuid

        env = gym.make(self.environment_name)
        env_display = OpenAIGymSessionVideo(env)

        try:
            # For each iteration we want to run.
            for _episode in range(episodes):
                env.reset()

                episode_id = uuid.uuid4()
                current_epoch = 0
                done = False

                # Keep stepping until the environment is done or the step budget is spent.
                while not done:
                    action = env.action_space.sample()
                    _observation, _reward, done, _info = env.step(action)
                    env_display.capture()

                    current_epoch += 1
                    if max_epochs > -1 and current_epoch >= max_epochs:
                        break

                # Encode and show this episode's frames.
                env_display.renderAndCapture(episode_id)
        finally:
            # Release the environment even when an episode is cut short or fails.
            env.close()

In [None]:
class OpenAIGymSessionForModel:
    """Runs episodes of a Gym environment with a model choosing the actions, recording each."""

    def __init__(self, environment_name):
        """Remember which environment to create.

        Args:
            environment_name: Environment id passed to `gym.make`.
        """
        self.environment_name = environment_name

    def start(self, model, episodes=1, max_epochs=-1):
        """Play `episodes` episodes, asking `model` for every action, recording each one.

        Every step is captured as a video frame; after each episode the frames
        are encoded and displayed through `OpenAIGymSessionVideo`.

        Args:
            model: Object exposing `forward(observation)` that returns an action
                accepted by the environment's `step`.
            episodes: Number of episodes to run.
            max_epochs: Maximum number of steps per episode; any value below 0
                (the default) means "run until the environment reports done".
                At least one step is always taken.
        """
        import gym
        import uuid

        env = gym.make(self.environment_name)
        env_display = OpenAIGymSessionVideo(env)

        try:
            # For each iteration we want to run.
            for _episode in range(episodes):
                observation = env.reset()

                episode_id = uuid.uuid4()
                current_epoch = 0
                done = False

                # Keep stepping until the environment is done or the step budget is spent.
                while not done:
                    # The model picks the action from the most recent observation.
                    action = model.forward(observation)
                    observation, _reward, done, _info = env.step(action)
                    env_display.capture()

                    current_epoch += 1
                    if max_epochs > -1 and current_epoch >= max_epochs:
                        break

                # Encode and show this episode's frames before the next episode starts,
                # so that each episode gets its own video.
                env_display.renderAndCapture(episode_id)
        finally:
            # Release the environment even when an episode is cut short or fails.
            env.close()

### Setup Environment
Our utilities support rendering various OpenAI Gym environments. See https://gym.openai.com/envs/#classic_control for more.

In [None]:
# Play a single LunarLander episode with randomly sampled actions and show the recording.
env_name = "LunarLander-v2"
episodes = 1

session = OpenAIGymSession(env_name)
session.start(episodes)

In [None]:
# Play a single CartPole episode with randomly sampled actions and show the recording.
env_name = "CartPole-v1"
episodes = 1

session = OpenAIGymSession(env_name)
session.start(episodes)

### Custom Environment

In [None]:
import gym
from gym import spaces

# Simple environment whose discrete action moves an integer counter down, nowhere, or up.
class AdditionCompetitionEnv(gym.Env):
    """Custom Environment that follows gym interface.

    The state is a single integer counter that starts at 0. Each step moves it
    by -1, 0 or +1 depending on the action, and the episode ends once the
    counter is `STATE_LIMIT` away from zero in either direction. The reward
    returned by `step` is the counter value itself.
    """
    metadata = {'render.modes': ['human']}

    # Distance from zero at which an episode terminates.
    STATE_LIMIT = 200

    def __init__(self):
        super().__init__()

        self.state = 0
        # 0) Subtract 1, 1) Leave unchanged, 2) Add 1
        self.action_space = gym.spaces.Discrete(3)
        # NOTE(review): Discrete(1) only contains the value 0, while `state` moves
        # between -STATE_LIMIT and STATE_LIMIT. Left as is because widening it changes
        # the observation type seen by agents — confirm the intended space.
        self.observation_space = gym.spaces.Discrete(1)

    def step(self, action):
        """Apply `action` to the counter.

        Args:
            action: 0 subtracts 1, 2 adds 1, any other value leaves the counter unchanged.

        Returns:
            Tuple `(observation, reward, done, info)` where both observation and
            reward are the updated counter and `info` is an empty dict.
        """
        if action == 0:
            self.state -= 1
        elif action == 2:
            self.state += 1

        # Done only once a boundary is reached on either side of zero.
        done = self.state <= -self.STATE_LIMIT or self.state >= self.STATE_LIMIT
        info = {}

        return self.state, self.state, done, info

    def reset(self):
        """Reset the state of the environment to its initial state and return it."""
        self.state = 0

        return self.state

### Introducing a Deep Q Network

In [8]:
import numpy as np
import gym

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Flatten
from tensorflow.keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

ENV_NAME = 'LunarLander-v2'
# Single seed shared by NumPy and the environment so runs are repeatable.
SEED = 123

# Get the environment and extract the number of actions.
env = gym.make(ENV_NAME)
np.random.seed(SEED)
env.seed(SEED)
nb_actions = env.action_space.n

# Next, we build a very simple model. This is the network structure: the observation
# window is flattened, passed through three 16-unit ReLU layers, and mapped to one
# linear output per action.
model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
# summary() prints the table itself; wrapping it in print() only adds a stray "None".
model.summary()

# The input window above is (1,) + observation shape, matching window_length=1 here.
memory = SequentialMemory(limit=50000, window_length=1)
policy = BoltzmannQPolicy()
dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10, target_model_update=1e-2, policy=policy)
dqn.compile(Adam(learning_rate=1e-3), metrics=['mae'])

# Start the training.
dqn.fit(env, nb_steps=50000, visualize=False, verbose=2)

# Persist the model state.
dqn.save_weights(f'dqn_{ENV_NAME}_weights.h5f', overwrite=True)

# Test the model
dqn.test(env, nb_episodes=5, visualize=False)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten (Flatten)            (None, 8)                 0         
_________________________________________________________________
dense (Dense)                (None, 16)                144       
_________________________________________________________________
activation (Activation)      (None, 16)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                272       
_________________________________________________________________
activation_1 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 16)                272       
_________________________________________________________________
activation_2 (Activation)    (None, 16)                0



   185/50000: episode: 1, duration: 2.522s, episode steps: 185, steps per second:  73, episode reward: -341.377, mean reward: -1.845 [-100.000,  7.102], mean action: 1.876 [0.000, 3.000],  loss: 1.598832, mae: 1.011014, mean_q: 1.554121
   364/50000: episode: 2, duration: 1.109s, episode steps: 179, steps per second: 161, episode reward: -387.553, mean reward: -2.165 [-100.000,  3.587], mean action: 1.743 [0.000, 3.000],  loss: 21.422327, mae: 1.202801, mean_q: 1.899935
   488/50000: episode: 3, duration: 0.793s, episode steps: 124, steps per second: 156, episode reward: -65.967, mean reward: -0.532 [-100.000, 40.776], mean action: 1.548 [0.000, 3.000],  loss: 26.604265, mae: 1.886929, mean_q: 1.912362
   777/50000: episode: 4, duration: 1.891s, episode steps: 289, steps per second: 153, episode reward: -153.809, mean reward: -0.532 [-100.000, 35.946], mean action: 1.803 [0.000, 3.000],  loss: 25.640825, mae: 2.505280, mean_q: 2.503996
   915/50000: episode: 5, duration: 0.842s, episod

<keras.callbacks.History at 0x20e331b0bb0>

In [None]:
# Get the initial observation from a freshly reset environment (i.e. from its initial state).
initial_observation = env.reset()
# Ask the agent which action to take next for this observation. The returned action can be
# passed to env.step, which in turn yields a new observation; repeating that cycle until the
# environment reports done forms the control loop.
next_action = dqn.forward(initial_observation)

In [None]:
# Display the observation returned by env.reset().
initial_observation

In [None]:
# Display the action the agent chose for the initial observation.
next_action

In [None]:
# Take the action suggested by the model. env.step hands back the new observation together
# with the reward, the done flag and an info dict, which is what a control loop consumes.
action = dqn.forward(initial_observation)
step_result = env.step(action)
observation, reward, done, info = step_result

# Show the outcome of that same step instead of advancing the environment a second time.
print(step_result)

In [None]:
# Play one LunarLander episode with the trained agent choosing every action,
# and show the recording.
env_name = "LunarLander-v2"

session = OpenAIGymSessionForModel(env_name)
session.start(dqn)