# Reinforcement Learning using OpenAI gym

We can earn a maximum of 200 points in this game if we keep the pole balanced for the entire episode.
First, we'll explore the random playing strategy.

In [4]:
import gym
import time
# Create the CartPole-v0 environment and reset it; reset() returns the
# initial observation, which the notebook displays as the cell output.
env = gym.make('CartPole-v0')
env.reset()

array([0.04909453, 0.01366191, 0.00240834, 0.0348062 ])

In [5]:
# Inspect the state stored on the unwrapped environment (displayed as the cell output).
env.unwrapped.state

array([0.04909453, 0.01366191, 0.00240834, 0.0348062 ])

In [6]:
# Apply action 1 for a single timestep; the result is a (state, reward, done, info) tuple.
env.step(1)

(array([ 0.04936777,  0.20874924,  0.00310447, -0.25711589]), 1.0, False, {})

In [13]:
# Baseline: play 20 episodes choosing every action uniformly at random,
# rendering each frame so the behaviour can be watched.
for _episode in range(20):
    env.reset()
    while True:
        # Only the termination flag matters here; the other values are discarded.
        _, _, done, _ = env.step(env.action_space.sample())
        env.render()
        if done:
            break
env.close()

For installation and getting started, [go here](https://gym.openai.com/docs/).
## Observations
If we ever want to do better than take random actions at each step, it’d probably be good to actually know what our actions are doing to the environment.

The environment’s step function returns exactly what we need. In fact, step returns four values. These are:

state (object): an environment-specific object representing your observation of the environment. For example, pixel data from a camera, joint angles and joint velocities of a robot, or the board state in a board game.

reward (float): amount of reward achieved by the previous action. The scale varies between environments, but the goal is always to increase your total reward.

done (boolean): whether it’s time to reset the environment again. Most (but not all) tasks are divided up into well-defined episodes, and done being True indicates the episode has terminated. (For example, perhaps the pole tipped too far, or you lost your last life.)

info (dict): diagnostic information useful for debugging. It can sometimes be useful for learning (for example, it might contain the raw probabilities behind the environment’s last state change). However, official evaluations of your agent are not allowed to use this for learning.

This is just an implementation of the classic “agent-environment loop”. Each timestep, the agent chooses an action, and the environment returns an observation and a reward.


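The process gets started by calling reset(), which returns an initial observation, and each episode then runs until the done flag is set, as the loop above does. Next, instead of taking purely random actions, we search for a weight vector that turns each observation into an action (a simple linear policy) and keep the weights that balance the pole the longest: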

In [16]:
import numpy as np
import time

In [17]:
# Best average episode length found so far, and the weight vector that achieved it.
best_time = 0
best_weights = np.zeros(4)

In [11]:
# Random search over linear policies: try 100 random weight vectors and score
# each one by its average episode length over 100 episodes (10000 episodes in
# total), keeping the best-scoring weights.
for trial in range(100):
    candidate = np.random.uniform(-1, 1, 4)
    durations = []
    for _ in range(100):
        state = env.reset()
        steps = 0
        done = False
        while not done:
            steps += 1
            # To watch the policy play, add here: env.render(); time.sleep(0.1)
            # Choose action 1 ("right") when the weighted sum of the state is
            # positive, otherwise action 0 ("left").
            action = 1 if np.dot(state, candidate) > 0 else 0
            state, reward, done, info = env.step(action)
        # The episode has ended; record how many steps it lasted.
        durations.append(steps)
    mean_duration = np.sum(durations) / len(durations)
    if mean_duration > best_time:
        best_time, best_weights = mean_duration, candidate
    # Report progress every tenth candidate.
    if trial % 10 == 0:
        print(best_time)

env.close()

9.27
94.88
193.06
193.06
193.06
193.06
193.06
193.06
200.0
200.0


In [20]:
import numpy as np
import matplotlib.pyplot as plt
import os
from collections import deque
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import random

In [33]:
class Agent:
    """Deep Q-learning agent with an epsilon-greedy policy and experience replay.

    A small fully connected network approximates the Q-value of every action
    for a given state. Experiences are stored in a bounded replay memory and
    replayed in random minibatches by ``train``.

    States passed to ``act`` and stored via ``remember`` are fed straight to
    the Keras model, so they are expected to already carry a leading batch
    axis of size one (i.e. shape ``(1, state_size)``).
    """

    def __init__(self, state_size, action_size):
        """Set the hyper-parameters and build the Q-network.

        Args:
            state_size: Number of values in one observation.
            action_size: Number of discrete actions available.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)  # replay memory; oldest entries are dropped first
        self.gamma = 0.95  # discount factor
        self.epsilon = 1.0  # exploration rate: probability of acting randomly
        self.epsilon_decay = 0.995  # multiplicative decay applied after each training call
        self.epsilon_min = 0.01  # exploration rate is not decayed once at or below this
        self.learning_rate = 0.001
        self.model = self._create_model()

    def _create_model(self):
        """Build and compile the network that approximates the Q-value function."""
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))  # 1st hidden layer
        model.add(Dense(24, activation='relu'))  # 2nd hidden layer
        model.add(Dense(self.action_size, activation='linear'))  # one Q-value per action
        # `learning_rate` replaces the deprecated `lr` keyword, which newer
        # Keras releases no longer accept.
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one (state, action, reward, next_state, done) experience to memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action for ``state`` using the epsilon-greedy rule.

        With probability ``epsilon`` a random action is returned (exploration);
        otherwise the action with the highest predicted Q-value is returned
        (exploitation).

        Returns:
            The chosen action index as a plain ``int``.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state, verbose=0)  # predicted Q-value per action
        return int(np.argmax(act_values[0]))

    def train(self, batch_size=32):
        """Fit the network on a random minibatch drawn from the replay memory.

        For every sampled experience the target of the taken action is the
        reward, plus the discounted best predicted Q-value of the next state
        when the episode did not end on that step. The exploration rate is
        decayed once per call.

        Does nothing when the memory holds fewer than ``batch_size``
        experiences, since a full minibatch cannot be sampled yet.

        Args:
            batch_size: Number of experiences to replay.
        """
        if len(self.memory) < batch_size:
            return

        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                next_q = self.model.predict(next_state, verbose=0)[0]
                target = reward + self.gamma * np.amax(next_q)
            # Keep the current predictions for the other actions so that only
            # the taken action contributes to the loss.
            target_f = self.model.predict(state, verbose=0)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load(self, name):
        """Load network weights from the file ``name``."""
        self.model.load_weights(name)

    def save(self, name):
        """Save the network weights to the file ``name``."""
        self.model.save_weights(name)

In [34]:
n_episodes = 1000  # number of training episodes
output_dir = "cartpole_model/"  # directory that receives the periodic weight checkpoints

# Create the checkpoint directory up front; saving weights into a directory
# that does not exist would otherwise fail part-way through training.
os.makedirs(output_dir, exist_ok=True)

In [35]:
# Problem dimensions and replay minibatch size used by the training loop.
state_size = 4
action_size = 2
batch_size = 32
done = False
# Build the agent from the sizes defined above.
agent = Agent(state_size=state_size, action_size=action_size)

In [37]:
# Train the DQN agent: play n_episodes episodes, store every transition in the
# replay memory, replay one minibatch after each episode and checkpoint the
# weights every 50 episodes.
agent = Agent(state_size, action_size)  # initialise agent
done = False
try:
    for e in range(n_episodes):
        state = env.reset()
        state = np.reshape(state, [1, state_size])  # add the batch axis the model expects

        # The step counter is named `step` rather than `time` so that it does
        # not shadow the imported `time` module.
        for step in range(5000):
            env.render()
            action = agent.act(state)  # action is 0 or 1
            next_state, reward, done, other_info = env.step(action)
            # Penalise the step that ends the episode.
            # NOTE(review): `done` appears to be set as well when the episode
            # reaches its step limit, so a fully balanced episode is penalised
            # too - confirm this is intended.
            reward = reward if not done else -10
            next_state = np.reshape(next_state, [1, state_size])
            agent.remember(state, action, reward, next_state, done)
            state = next_state

            if done:
                # `step` is zero-based, so the printed score is one less than
                # the number of steps the episode lasted.
                print("Game Episode :{}/{}, High Score:{},Exploration Rate:{:.2}".format(e, n_episodes, step, agent.epsilon))
                break

        # Replay only once the memory holds more than one minibatch.
        if len(agent.memory) > batch_size:
            agent.train(batch_size)

        if e % 50 == 0:
            agent.save(output_dir + "weights_" + '{:04d}'.format(e) + ".hdf5")
finally:
    # Always release the environment, even if training is interrupted or fails.
    env.close()
# As the number of episodes progresses, a majority of the printed scores are 199,
# i.e. 200 steps, because the step counter starts at 0.

Game Episode :0/1000, High Score:15,Exploration Rate:1.0
Game Episode :1/1000, High Score:16,Exploration Rate:1.0
Game Episode :2/1000, High Score:30,Exploration Rate:0.99
Game Episode :3/1000, High Score:11,Exploration Rate:0.99
Game Episode :4/1000, High Score:43,Exploration Rate:0.99
Game Episode :5/1000, High Score:50,Exploration Rate:0.98
Game Episode :6/1000, High Score:15,Exploration Rate:0.98
Game Episode :7/1000, High Score:19,Exploration Rate:0.97
Game Episode :8/1000, High Score:15,Exploration Rate:0.97
Game Episode :9/1000, High Score:27,Exploration Rate:0.96
Game Episode :10/1000, High Score:20,Exploration Rate:0.96
Game Episode :11/1000, High Score:21,Exploration Rate:0.95
Game Episode :12/1000, High Score:26,Exploration Rate:0.95
Game Episode :13/1000, High Score:25,Exploration Rate:0.94
Game Episode :14/1000, High Score:35,Exploration Rate:0.94
Game Episode :15/1000, High Score:45,Exploration Rate:0.93
Game Episode :16/1000, High Score:19,Exploration Rate:0.93
Game Epis