<a href="https://colab.research.google.com/github/swastiknath/Reinforcement_Learning_With_Swastik/blob/master/RL_With_Swastik_Tabular_Q_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyvirtualdisplay
!apt-get install -y ffmpeg xvfb python-opengl
!apt-get update
!apt-get install cmake
!pip install --upgrade setuptools
!pip install ez_setup
!pip install gym[atari]
import gym
from gym import logger as logstat
logstat.set_level(gym.logger.WARN)
import tensorflow
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
from gym.wrappers import Monitor
from gym.wrappers.monitoring.video_recorder import VideoRecorder
import math
import glob
import io
import os
import base64
from IPython.display import HTML
from IPython import display as ipythondisplay
from pyvirtualdisplay import Display


Reading package lists... Done
Building dependency tree       
Reading state information... Done
python-opengl is already the newest version (3.1.0+dfsg-1).
ffmpeg is already the newest version (7:3.4.6-0ubuntu0.18.04.1).
xvfb is already the newest version (2:1.19.6-1ubuntu4.4).
0 upgraded, 0 newly installed, 0 to remove and 84 not upgraded.
Hit:1 http://security.ubuntu.com/ubuntu bionic-security InRelease
Hit:2 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/ InRelease
Ign:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Hit:4 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic InRelease
Hit:5 http://archive.ubuntu.com/ubuntu bionic InRelease
Ign:6 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release
Hit:8 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Rel

In [2]:
def record_interval(n, episode_interval=100):
    """Decide whether episode number ``n`` should be recorded.

    Intended to be passed as a ``video_callable``; it is called with the
    episode index only, so ``episode_interval`` keeps its default there.

    Args:
        n: Episode index (integer).
        episode_interval: Record every ``episode_interval``-th episode.
            Defaults to 100, the value that used to be hard-coded.

    Returns:
        True when ``n`` is a multiple of ``episode_interval``, else False.

    Raises:
        ValueError: If ``episode_interval`` is not a positive number.
    """
    if episode_interval <= 0:
        # Guard against a ZeroDivisionError / meaningless negative modulus.
        raise ValueError("episode_interval must be a positive integer")
    return n % episode_interval == 0

def wrap_env(env):
  '''
  Wrap ``env`` for video capture.

  The environment is wrapped in a ``Monitor`` that targets the './video'
  directory, uses ``record_interval`` as its ``video_callable`` and is
  created with ``force=True``. A ``VideoRecorder`` (enabled) is then built
  on top of the monitored environment.

  Returns a ``(monitored_env, video_recorder)`` tuple.
  '''
  monitored = Monitor(env, './video', video_callable=record_interval, force=True)
  return monitored, VideoRecorder(monitored, enabled=True)
# Create a non-visible (visible=0) virtual display of 1400x900 so that
# rendering can work on a machine without a physical screen.
display = Display(visible=0, size=(1400,900))
# Start the virtual display; as the last expression of the cell its return
# value is also what the notebook shows as the cell output.
display.start()


xdpyinfo was not found, X start can not be checked! Please install xdpyinfo!


<Display cmd_param=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', '0', '1400x900x24', ':1005'] cmd=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', '0', '1400x900x24', ':1005'] oserror=None return_code=None stdout="None" stderr="None" timeout_happened=False>

In [0]:
import gym
import collections
import torch
import torch.nn as nn 

class Player:
  '''
  Tabular Q-learning agent that owns its own environment instance.

  Attributes:
    env        -- environment built with gym.make(env_name)
    state      -- latest observation of ``env``
    values     -- defaultdict(float) keyed by (state, action); unseen
                  pairs therefore read as 0.0
    dbg_info_s -- every ``info`` object returned by ``env.step`` in init_env
  '''
  def __init__(self, env_name):
    self.values = collections.defaultdict(float)
    self.dbg_info_s = []
    self.env = gym.make(env_name)
    self.state = self.env.reset()

  def init_env(self):
    '''
    Take a single randomly sampled step in the agent's own environment.

    The stored state advances to the new observation, or to a fresh
    ``reset()`` observation when the step ended the episode.

    Returns (state_before_step, action, reward, state_after_step); the last
    element is the observation returned by ``step`` even when the episode
    ended and the environment was reset.
    '''
    prev_state = self.state
    chosen = self.env.action_space.sample()
    next_state, reward, finished, info = self.env.step(chosen)
    if finished:
      self.state = self.env.reset()
    else:
      self.state = next_state
    self.dbg_info_s.append(info)
    return (prev_state, chosen, reward, next_state)

  def max_action_value_and_action(self, state):
    '''
    Find the greedy action for ``state`` in the Q-table.

    Every action index in range(action_space.n) is looked up; on ties the
    lowest index wins. Returns (best_value, best_action), or (None, None)
    when the action space is empty.
    '''
    greedy = max(
        range(self.env.action_space.n),
        key=lambda candidate: self.values[(state, candidate)],
        default=None,
    )
    if greedy is None:
      return None, None
    return self.values[(state, greedy)], greedy

  def bellman_update(self, old_state, action, new_state, reward, GAMMA, learningRate):
    '''
    Blend the Q-value of (old_state, action) towards the Bellman target.

      Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * (r + GAMMA * max_a' Q(s', a'))

    Parameters:
      old_state    = s, state the action was taken in
      action       = a, action that was taken
      new_state    = s', state reached after the action
      reward       = r, reward received for the transition
      GAMMA        = discount factor applied to the best next-state value
      learningRate = alpha, blending weight of the new target
    '''
    best_next, _ = self.max_action_value_and_action(new_state)
    target = reward + (GAMMA * best_next)
    key = (old_state, action)
    current = self.values[key]
    self.values[key] = (1 - learningRate) * current + target * learningRate

  def play_game_episodes(self, env):
    '''
    Run one full episode in ``env`` acting greedily on the Q-table.

    The Q-table is not updated by this method (beyond the default entries
    created by lookups). Returns the sum of rewards over the episode.
    '''
    episode_return = 0.0
    observation = env.reset()
    finished = False
    while not finished:
      _, greedy = self.max_action_value_and_action(observation)
      observation, step_reward, finished, _info = env.step(greedy)
      episode_return += step_reward
    return episode_return

In [0]:
from tqdm import tqdm
if __name__ == "__main__":
  # Training driver: alternate one random exploratory step (with a Q-table
  # update) and an evaluation of the greedy policy over TEST_EPS episodes,
  # until the average evaluation reward exceeds SOLVED_REWARD.
  ENV_NAME = "FrozenLake-v0"
  GAMMA = 0.9          # discount factor
  ALPHA = 0.2          # learning rate used to blend Q-value updates
  TEST_EPS = 20        # evaluation episodes per training iteration
  SOLVED_REWARD = 0.9  # mean evaluation reward that ends training

  test_env, video_record = wrap_env(gym.make(ENV_NAME))
  player = Player(ENV_NAME)
  # The number of training iterations is not known up front, so the bar has
  # no total (it was previously set to TEST_EPS, which is unrelated).
  progress = tqdm()
  iter_num = 0
  max_reward = 0.0
  try:
    while True:
      iter_num += 1
      # One random step in the player's own environment, then learn from it.
      s, a, r, nxt_s = player.init_env()
      progress.update(1)
      player.bellman_update(s, a, nxt_s, r, GAMMA, ALPHA)
      video_record.capture_frame()

      # Evaluate the current greedy policy on the separate test environment.
      reward = 0.0
      for _ in range(TEST_EPS):
        reward += player.play_game_episodes(test_env)
        progress.set_description("Episode Reward Collected: %.2f" % reward)
      reward /= TEST_EPS

      if reward > max_reward:
        video_record.capture_frame()
        old_reward = max_reward
        max_reward = reward
        print("Reward Updated from {} --> {}".format(old_reward, reward))
      if reward > SOLVED_REWARD:
        video_record.capture_frame()
        print("Environment Solved with reward : {} after {} iterations".format(max_reward, iter_num))
        break
  finally:
    # Release everything even when training is interrupted or fails, so the
    # progress bar, recorder output and both environments are shut down.
    progress.close()
    video_record.close()
    test_env.close()
    player.env.close()

Episode Reward Collected: 0.00: : 51it [00:02, 22.23it/s]