**A) Installing the packages needed for OpenAI Gym to work on Colab**

In [3]:
%%bash
# A) Installing gym packages
#
# NOTE: the %%bash cell magic has to be the very first line of the cell, so the
# heading comment sits below it (bash treats it as an ordinary comment).

# install required system dependencies
apt-get install -y xvfb x11-utils

# install required python dependencies (might need to install additional gym extras depending
# on which environments are used).
# The requirement specifiers are quoted so that bash never tries to glob-expand the
# "[box2d]" and "*" characters against files in the current working directory.
pip install "gym[box2d]==0.17.*" "pyvirtualdisplay==0.2.*" "PyOpenGL==3.1.*" "PyOpenGL-accelerate==3.1.*"

Reading package lists...
Building dependency tree...
Reading state information...
The following additional packages will be installed:
  libxxf86dga1
Suggested packages:
  mesa-utils
The following NEW packages will be installed:
  libxxf86dga1 x11-utils xvfb
0 upgraded, 3 newly installed, 0 to remove and 40 not upgraded.
Need to get 994 kB of archives.
After this operation, 2,981 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/main amd64 libxxf86dga1 amd64 2:1.1.4-1 [13.7 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/main amd64 x11-utils amd64 7.7+3build1 [196 kB]
Get:3 http://archive.ubuntu.com/ubuntu bionic-updates/universe amd64 xvfb amd64 2:1.19.6-1ubuntu4.9 [784 kB]
Fetched 994 kB in 1s (987 kB/s)
Selecting previously unselected package libxxf86dga1:amd64.
(Reading database ... (Reading database ... 5%(Reading database ... 10%(Reading database ... 15%(Reading database ... 20%(Reading database ... 25%(Reading database ... 30%(Read

In [4]:

import pyvirtualdisplay
# Create and start a virtual display of 1400x900 pixels.
# Presumably this is what lets env.render() work on a headless Colab VM - confirm.
_display = pyvirtualdisplay.Display(
    visible=False,  # use False with Xvfb
    size=(1400, 900),
)
_ = _display.start()

**B) Trying out the ready-made example environments that ship with OpenAI Gym**

In [5]:
# B) Trying out examples already present

import gym

N_EPISODES = 20              # how many random-policy episodes to play
MAX_STEPS_PER_EPISODE = 100  # upper bound on the steps taken inside one episode

env = gym.make('CartPole-v0')
try:
    for i_episode in range(N_EPISODES):
        observation = env.reset()
        for t in range(MAX_STEPS_PER_EPISODE):
            env.render()
            print(observation)
            # Random policy: sample uniformly from the action space.
            action = env.action_space.sample()
            observation, reward, done, info = env.step(action)
            if done:
                print("Episode finished after {} timesteps".format(t+1))
                break
finally:
    # Always release the environment, even if a step or render call raises.
    env.close()

[ 0.00105027 -0.04555951  0.01046626 -0.04611013]
[ 1.39079272e-04 -2.40829967e-01  9.54405721e-03  2.49856537e-01]
[-0.00467752 -0.4360869   0.01454119  0.54553451]
[-0.01339926 -0.63141012  0.02545188  0.84276333]
[-0.02602746 -0.82687004  0.04230714  1.14334028]
[-0.04256486 -1.02251846  0.06517395  1.44898491]
[-0.06301523 -1.21837827  0.09415365  1.76129846]
[-0.0873828  -1.41443122  0.12937962  2.08171565]
[-0.11567142 -1.22083445  0.17101393  1.83167916]
[-0.14008811 -1.02796922  0.20764751  1.59663477]
Episode finished after 10 timesteps
[ 0.03386634 -0.04677948  0.01652247 -0.0414463 ]
[ 0.03293075  0.1481017   0.01569355 -0.32887078]
[ 0.03589278 -0.04724011  0.00911613 -0.03128044]
[ 0.03494798 -0.2424916   0.00849052  0.26426473]
[ 0.03009815 -0.4377337   0.01377582  0.55961351]
[ 0.02134347 -0.24280779  0.02496809  0.27130235]
[ 0.01648732 -0.04805086  0.03039413 -0.01340214]
[ 0.0155263   0.14662232  0.03012609 -0.29634253]
[ 0.01845875 -0.04891586  0.02419924  0.00568738

In [7]:
import gym

env = gym.make('CartPole-v0')

# Print the action space, then the observation space.
# With the gym version used in this notebook the recorded output is:
#   Discrete(2)
#   Box(-3.4028234663852886e+38, 3.4028234663852886e+38, (4,), float32)
for space in (env.action_space, env.observation_space):
    print(space)

# Print the upper bounds, then the lower bounds, of the observation space.
# Recorded output:
#   [ 4.8000002e+00  3.4028235e+38  4.1887903e-01  3.4028235e+38]
#   [-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38]
for bound in (env.observation_space.high, env.observation_space.low):
    print(bound)

Discrete(2)
Box(-3.4028234663852886e+38, 3.4028234663852886e+38, (4,), float32)
[4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38]
[-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38]


In [8]:
import gym

# Use the `.env` attribute of what gym.make returns.
# NOTE(review): presumably this bypasses the step-limit wrapper - confirm.
env = gym.make("Taxi-v3").env
env.render()  # board as it is right after construction

env.reset()  # reset environment to a new, random state
env.render()

# Report the sizes of the action and state spaces.
print(f"Action Space {env.action_space}")
print(f"State Space {env.observation_space}")

+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : : : |
|[43m [0m| : | : |
|Y| : |[34;1mB[0m: |
+---------+

+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | :[43m [0m|
|Y| : |[34;1mB[0m: |
+---------+

Action Space Discrete(6)
State Space Discrete(500)


In [9]:
# Build the illustration state from its four components.
taxi_row, taxi_column, passenger_index, destination_index = 3, 1, 2, 0
state = env.encode(taxi_row, taxi_column, passenger_index, destination_index)
print("State:", state)

# Put the environment into that state and show it.
env.s = state
env.render()

# Transition table of state 328: action -> [(probability, next state, reward, done)]
env.P[328]

State: 328
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| |[43m [0m: | : |
|[34;1mY[0m| : |B: |
+---------+



{0: [(1.0, 428, -1, False)],
 1: [(1.0, 228, -1, False)],
 2: [(1.0, 348, -1, False)],
 3: [(1.0, 328, -1, False)],
 4: [(1.0, 328, -10, False)],
 5: [(1.0, 328, -10, False)]}

Randomized method - without reinforcement learning

In [10]:
# Start from the illustration state.
env.s = 328

frames = []  # one record per step, used for the animation
epochs = 0
penalties = 0
reward = 0
done = False

# Act uniformly at random until the environment reports the episode as done.
while not done:
    action = env.action_space.sample()
    state, reward, done, info = env.step(action)

    # A reward of -10 is counted as a penalty.
    penalties += int(reward == -10)

    # Keep the rendered frame together with the step's details.
    step_record = {
        'frame': env.render(mode='ansi'),
        'state': state,
        'action': action,
        'reward': reward,
    }
    frames.append(step_record)

    epochs += 1


print(f"Timesteps taken: {epochs}")
print(f"Penalties incurred: {penalties}")

Timesteps taken: 196
Penalties incurred: 64


In [11]:
from IPython.display import clear_output
from time import sleep

def print_frames(frames, delay=.1):
    """Replay recorded steps as a simple text animation.

    Parameters
    ----------
    frames : iterable of dict
        One dict per step with the keys 'frame', 'state', 'action' and
        'reward' (as collected by the random-agent cell).
    delay : float, optional
        Seconds to pause after each frame. Defaults to 0.1.
    """
    for i, frame in enumerate(frames):
        clear_output(wait=True)

        # The rendered board used to be skipped entirely because the only line
        # printing it was commented out. Accept both a plain string and a
        # StringIO-like object, since what render(mode='ansi') returns appears
        # to depend on the gym version - confirm against the installed one.
        rendered = frame['frame']
        if hasattr(rendered, 'getvalue'):
            rendered = rendered.getvalue()
        print(rendered)

        print(f"Timestep: {i + 1}")
        print(f"State: {frame['state']}")
        print(f"Action: {frame['action']}")
        print(f"Reward: {frame['reward']}")
        sleep(delay)

print_frames(frames)

Timestep: 196
State: 0
Action: 5
Reward: 20


Reinforcement learning agent

In [12]:
import numpy as np
# Q-table: one row per state, one column per action, every value starting at zero.
n_states, n_actions = env.observation_space.n, env.action_space.n
q_table = np.zeros((n_states, n_actions))

In [13]:
%%time
"""Training the agent"""

import random
from IPython.display import clear_output

# Hyperparameters
alpha = 0.1
gamma = 0.6
epsilon = 0.1

# For plotting metrics
all_epochs = []
all_penalties = []

for i in range(1, 100001):
    state = env.reset()

    epochs, penalties, reward, = 0, 0, 0
    done = False
    
    while not done:
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample() # Explore action space
        else:
            action = np.argmax(q_table[state]) # Exploit learned values

        next_state, reward, done, info = env.step(action) 
        
        old_value = q_table[state, action]
        next_max = np.max(q_table[next_state])
        
        new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
        q_table[state, action] = new_value

        if reward == -10:
            penalties += 1

        state = next_state
        epochs += 1
        
    if i % 100 == 0:
        clear_output(wait=True)
        print(f"Episode: {i}")

print("Training finished.\n")

Episode: 100000
Training finished.

CPU times: user 1min 9s, sys: 15.9 s, total: 1min 25s
Wall time: 1min 12s


In [14]:
# Learned action-values for state 328 (the illustration state used earlier),
# one entry per action.
q_table[328]

array([ -2.40651492,  -2.27325184,  -2.40951211,  -2.35190513,
       -10.96367769, -11.03164519])

In [15]:
"""Evaluate agent's performance after Q-learning"""

episodes = 100
total_epochs = 0
total_penalties = 0

for _ in range(episodes):
    state = env.reset()
    epochs = penalties = reward = 0
    done = False

    # Follow the greedy policy (highest Q-value) until the episode ends.
    while not done:
        action = np.argmax(q_table[state])
        state, reward, done, info = env.step(action)

        # A reward of -10 is counted as a penalty.
        penalties += int(reward == -10)
        epochs += 1

    total_epochs += epochs
    total_penalties += penalties

print(f"Results after {episodes} episodes:")
print(f"Average timesteps per episode: {total_epochs / episodes}")
print(f"Average penalties per episode: {total_penalties / episodes}")

Results after 100 episodes:
Average timesteps per episode: 12.95
Average penalties per episode: 0.0


**3) Making the environment for convex optimization based on the problem statement**

In [29]:
# 3) Making own environment based on the problem statement

class Convex_env(gym.Env):
    """Gym environment that performs gradient descent on a 2-D quadratic.

    The objective is

        f(x, y) = a*x**2 + b*y**2 + c*x*y + d*x + e*y

    where ``array = [a, b, c, d, e]``. The agent starts at the origin and at
    every step chooses between two actions:

    * ``ACTION_DESCEND`` (0): move one gradient-descent step of size
      ``step_size``.
    * ``ACTION_STOP`` (1): stay where it is and end the episode.

    The reward is ``1 / (f_optimal - f(x, y))**2``, so it grows as the current
    point approaches the minimiser. The episode ends when the agent stops or
    when the cost is within ``TOLERANCE`` of the optimal cost.

    The quadratic is assumed to be strictly convex (``4*a*b - c**2 > 0`` with
    ``a > 0``); only the degenerate case ``4*a*b == c**2`` is rejected
    explicitly.

    Both the current gym method names (``reset``/``step``/``render``) and the
    original underscore-prefixed ones are provided.
    """
    metadata = {'render.modes': ['human']}

    ACTION_DESCEND = 0
    ACTION_STOP = 1
    TOLERANCE = 0.0001          # |cost - optimal cost| at which the episode ends
    _MIN_SQUARED_GAP = 1e-12    # floor for the reward denominator (avoids 1/0)

    def __init__(self, array, step_size):
        """Store the problem definition and declare the gym spaces.

        Parameters
        ----------
        array : sequence of 5 numbers
            Coefficients ``[a, b, c, d, e]`` of the quadratic.
        step_size : float
            Step size multiplied with the gradient on every descent step.

        Raises
        ------
        ValueError
            If ``array`` does not hold exactly five coefficients.
        """
        # Imported here so the class does not rely on a notebook-level import.
        from gym import spaces

        super(Convex_env, self).__init__()
        if len(array) != 5:
            raise ValueError("array must hold the 5 coefficients [a, b, c, d, e]")
        self.array = array
        self.step_size = step_size
        self.x = 0.0
        self.y = 0.0
        self.cost = 0.0
        self.current_step = 0
        self.reward = 0.0
        self.stopped = False
        self.action_space = spaces.Discrete(2) # GO down the minima and STOP once limit is reached.
        # Observation space = (x, y). float32 is used instead of float16 so the
        # reported coordinates keep enough precision close to the optimum.
        self.observation_space = spaces.Box(
            low=np.array([-100.0, -100.0], dtype=np.float32),
            high=np.array([100.0, 100.0], dtype=np.float32),
            dtype=np.float32,
        )

    def _cost_at(self, x, y):
        """Return the value of the quadratic at the point (x, y)."""
        a, b, c, d, e = self.array
        return a*x*x + b*y*y + c*x*y + d*x + e*y

    def _optimal_cost(self):
        """Return the cost at the stationary point of the quadratic.

        The stationary point solves ``2a*x + c*y = -d`` and
        ``c*x + 2b*y = -e``. Both coordinates are written in closed form over
        the determinant, so a zero cross term ``c`` is handled correctly.

        Raises
        ------
        ValueError
            If ``4*a*b - c**2`` is zero (no unique stationary point).
        """
        a, b, c, d, e = self.array
        determinant = 4*a*b - c*c
        if determinant == 0:
            raise ValueError("quadratic has no unique stationary point (4ab - c^2 == 0)")
        optimal_x = (c*e - 2*b*d) / determinant
        optimal_y = (c*d - 2*a*e) / determinant
        return self._cost_at(optimal_x, optimal_y)

    def _observation(self):
        """Return the current position as a float32 array ``[x, y]``."""
        return np.array([self.x, self.y], dtype=np.float32)

    def _reset(self):
        """Move back to the origin, clear the episode state and return the observation."""
        # Resetting state to initial state
        self.x = 0.0 # Resetting x coordinate
        self.y = 0.0 # Resetting y coordinate
        self.cost = self._cost_at(self.x, self.y)
        self.current_step = 0
        self.reward = 0.0
        self.stopped = False
        return self._observation()

    def _step(self, action):
        """Apply ``action`` and advance the environment by one step.

        Parameters
        ----------
        action : int
            ``ACTION_DESCEND`` (0) or ``ACTION_STOP`` (1).

        Returns
        -------
        ob, reward, episode_over, info : tuple
            ob (numpy.ndarray) :
                the current position ``[x, y]``.
            reward (float) :
                ``1 / (optimal cost - current cost)**2``, with the denominator
                floored to avoid a division by zero.
            episode_over (bool) :
                True once the agent has stopped or the cost is within
                ``TOLERANCE`` of the optimal cost.
            info (dict) :
                always empty.

        Raises
        ------
        ValueError
            If ``action`` is not one of the two supported actions.
        """
        self._take_action(action)
        self.current_step += 1
        self.reward = self._get_reward()
        # Compare with the optimal cost rather than with zero: the cost at the
        # origin is always 0, so a "cost <= tolerance" test would end every
        # episode on its very first step.
        reached_optimum = abs(self.cost - self._optimal_cost()) <= self.TOLERANCE
        episode_over = bool(self.stopped or reached_optimum)
        return self._observation(), self.reward, episode_over, {}

    def _take_action(self, action):
        """Update the position and cost according to ``action``."""
        if action == self.ACTION_STOP:
            self.stopped = True
            return
        if action != self.ACTION_DESCEND:
            raise ValueError("unknown action: {!r}".format(action))

        a, b, c, d, e = self.array
        # Gradient of the quadratic at the current point.
        grad_x = 2*a*self.x + c*self.y + d
        grad_y = 2*b*self.y + c*self.x + e
        self.x = self.x - grad_x*self.step_size
        self.y = self.y - grad_y*self.step_size
        # Evaluate the cost at the new point so that position and cost agree.
        self.cost = self._cost_at(self.x, self.y)

    def _get_reward(self):
        """Return the inverse squared gap between the optimal and the current cost."""
        gap = self._optimal_cost() - self.cost
        return 1.0 / max(gap*gap, self._MIN_SQUARED_GAP)

    def reset(self):
        """Gym entry point; see ``_reset``."""
        return self._reset()

    def step(self, action):
        """Gym entry point; see ``_step``."""
        return self._step(action)

    def render(self, mode='human', close=False):
        """Gym entry point; see ``_render``."""
        return self._render(mode=mode, close=close)

    def _render(self, mode='human', close=False):
        """Print the step counter, position, cost and the last reward."""
        print(f'Step: {self.current_step}')
        print(f'Co-ordinates: ({self.x},{self.y})')
        print(f'Function value: {self.cost}')
        print(f'Reward for this step: {self.reward}')