In [1]:
import gym
from gym import envs

In [2]:
# Number of entries in gym's environment registry.
print(len(envs.registry.all()))

859


In [3]:
# Build the CartPole-v0 environment; reset() returns the initial observation,
# which is what this cell displays.
env = gym.make("CartPole-v0")
env.reset()

array([-0.03139016,  0.02794889, -0.04401998,  0.02013752])

In [4]:
# Inspect the action space of the environment.
env.action_space

Discrete(2)

In [5]:
# Number of discrete actions available to the agent.
env.action_space.n

2

In [6]:
# Inspect the observation space (bounds, shape and dtype).
env.observation_space

Box(-3.4028234663852886e+38, 3.4028234663852886e+38, (4,), float32)

In [7]:
# Shape of a single observation vector.
env.observation_space.shape

(4,)

In [8]:
# Watch a random policy for 1000 rendered steps.
env.reset()

try:
    for t in range(1000):
        env.render()
        random_action = env.action_space.sample()
        observation, reward, done, other_info = env.step(random_action)

        # Once an episode has finished, further step() calls on it are not
        # meaningful, so start a fresh episode before continuing.
        if done:
            env.reset()
finally:
    # Always release the render window, even if the cell is interrupted.
    env.close()



In [9]:
# Draw one random action from the action space and show it.
random_action = env.action_space.sample()
print(random_action)

1


In [20]:
# Baseline: play short episodes with a purely random policy and report how
# many steps each one lasted before the environment signalled `done`.
total_episodes = 20
max_timesteps = 50

for game_episode in range(total_episodes):
    observation = env.reset()

    for timesteps in range(max_timesteps):
        env.render()

        # Sample a random action and apply it in one go.
        observation, reward, done, other_info = env.step(env.action_space.sample())

        if not done:
            continue

        print("Game Episode: {}/{} and High Score is: {}".format(game_episode, total_episodes, timesteps))
        break

    # Last transition seen in this episode.
    print(observation, reward, done, other_info)

env.close()
print("All 20 episodes over!")

Game Episode: 0/20 and High Score is: 22
[-0.00235619 -0.23200582  0.21244283  0.97276862] 1.0 True {}
Game Episode: 1/20 and High Score is: 14
[ 0.13257581  0.64573249 -0.21309266 -1.32279238] 1.0 True {}
Game Episode: 2/20 and High Score is: 17
[ 0.15239779  0.45460986 -0.22693169 -1.0282224 ] 1.0 True {}
Game Episode: 3/20 and High Score is: 25
[-0.07834063 -0.35494073  0.23032277  0.92595248] 1.0 True {}
Game Episode: 4/20 and High Score is: 13
[ 0.10894059  0.38012041 -0.21387581 -1.00471022] 1.0 True {}
Game Episode: 5/20 and High Score is: 23
[ 0.17231007  0.45602003 -0.2239744  -1.1073463 ] 1.0 True {}
Game Episode: 6/20 and High Score is: 20
[ 0.18342353  1.38206447 -0.21165518 -2.14023524] 1.0 True {}
Game Episode: 7/20 and High Score is: 26
[-0.20001621 -0.58749505  0.2228952   1.19982945] 1.0 True {}
Game Episode: 8/20 and High Score is: 11
[ 0.08275216  0.36205401 -0.22159364 -0.93886645] 1.0 True {}
Game Episode: 9/20 and High Score is: 21
[-0.24776151 -1.58236232  0.2521

In [21]:
import numpy as np
import matplotlib.pyplot as plt

from collections import deque
import random

from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

## Agent Design and Neural Model

In [23]:
class Agent:
    """Deep Q-Learning agent with a replay buffer and an epsilon-greedy policy.

    The agent keeps a bounded memory of past transitions, approximates the
    action-value function with a small fully connected network, and anneals
    its exploration rate ``epsilon`` after every training call.
    """

    def __init__(self, state_size, action_size, deque_size=2000, gamma=0.95, learning_rate=0.001):
        """Set up hyper-parameters, the replay buffer and the Q-network.

        Args:
            state_size: Length of the observation vector fed to the network.
            action_size: Number of discrete actions (network output width).
            deque_size: Maximum number of transitions kept in the replay buffer;
                the oldest entries are dropped once it is full.
            gamma: Discount factor applied to the estimated future return.
            learning_rate: Learning rate handed to the Adam optimizer.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=deque_size)
        self.gamma = gamma

        # Exploration schedule: start fully random, decay multiplicatively
        # after each train() call, and never drop below epsilon_min.
        self.epsilon = 1.0
        self.epsilon_decay = 0.995
        self.epsilon_min = 0.01
        self.learning_rate = learning_rate

        self.model = self._create_model()

    def _create_model(self):
        """Build and compile the Q-network: two hidden ReLU layers of 24 units
        and a linear output with one value per action, trained with MSE loss."""
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))

        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one transition ``(state, action, reward, next_state, done)``
        to the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def action(self, state):
        """Choose an action with the epsilon-greedy rule.

        Args:
            state: Observation shaped ``(1, state_size)``, as passed to
                ``model.predict``.

        Returns:
            int: A random action index with probability ``epsilon``, otherwise
            the index of the highest predicted Q-value.
        """
        # Strict '<' so that epsilon == 0 is always greedy
        # (np.random.rand() draws from [0, 1)).
        if np.random.rand() < self.epsilon:
            return random.randrange(self.action_size)

        # verbose=0 keeps Keras from printing a progress bar on every call.
        act_values = self.model.predict(state, verbose=0)
        return int(np.argmax(act_values[0]))

    def train(self, batch_size=32):
        """Run one round of experience replay and decay ``epsilon``.

        A minibatch of ``batch_size`` transitions is sampled from memory and
        the network is fitted on each of them in turn. If the buffer holds
        fewer than ``batch_size`` transitions the call is a no-op (nothing is
        fitted and ``epsilon`` is left untouched) instead of raising
        ``ValueError`` from ``random.sample``.

        Args:
            batch_size: Number of transitions to replay.
        """
        if len(self.memory) < batch_size:
            return

        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            if done:
                # Terminal transition: there is no future return to bootstrap.
                expected_discount_return = reward
            else:
                # Bellman target: reward plus the discounted best next Q-value.
                next_q_values = self.model.predict(next_state, verbose=0)[0]
                expected_discount_return = reward + self.gamma * np.amax(next_q_values)

            # Only the Q-value of the action actually taken is moved towards
            # the target; the other outputs keep their current predictions.
            target_f = self.model.predict(state, verbose=0)
            target_f[0][action] = expected_discount_return

            self.model.fit(state, target_f, epochs=1, verbose=0)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load(self, name):
        """Load network weights from the file ``name``."""
        self.model.load_weights(name)

    def save(self, name):
        """Save network weights to the file ``name``."""
        self.model.save_weights(name)

## Training the DQN Agent (Deep Q-Learner)

In [25]:
# Hyper-parameters and environment dimensions used by the training cell.
n_episodes = 1000                              # number of training episodes
batch_size = 32                                # replay minibatch size passed to Agent.train
state_size = env.observation_space.shape[0]    # length of one observation vector
action_size = env.action_space.n               # number of discrete actions
print(state_size , action_size)

4 2


In [None]:
agent = Agent(state_size , action_size)
done = False

try:
    for episode in range(n_episodes):
        # The network expects a batch dimension, hence the (1, state_size) reshape.
        state = env.reset()
        state = np.reshape(state , [1, state_size])

        for t in range(500):
            env.render()
            action = agent.action(state)
            next_state, reward, done , other_info = env.step(action)

            next_state = np.reshape(next_state , [1, state_size])
            agent.remember(state, action, reward, next_state, done)
            state = next_state

            if done:
                print("Game Episode :{}/{}, High Score: {}, Exploration Rate: {:.2f}".format(episode, n_episodes, t, agent.epsilon))
                break

        # One replay round per episode, once enough transitions are stored.
        if len(agent.memory) > batch_size:
            agent.train(batch_size)

        print(next_state, reward, done, other_info)

    print('Deep Q-Learner Model Trained - All episodes over!')
finally:
    # Training is long-running: make sure the render window is released even
    # when the cell is interrupted or an exception is raised.
    env.close()

### Playing the game with the learned weights

In [29]:
done = False
try:
    for e in range(20):

        state = env.reset()
        state = np.reshape(state,[1,state_size])

        for t in range(500):
            env.render()
            # Always act greedily here: no exploration while evaluating.
            # verbose=0 keeps Keras from printing a progress bar per step.
            action = np.argmax(agent.model.predict(state, verbose=0)[0])
            next_state, reward, done, other_info = env.step(action)
            next_state = np.reshape(next_state,[1,state_size])
            state = next_state

            if done:
                print("Game Episode :{}/{} High Score :{}".format(e,20,t))
                break
finally:
    # Release the render window even if evaluation is interrupted.
    env.close()
print("All 20 episodes over!")

Game Episode :0/20 High Score :181
Game Episode :1/20 High Score :199
Game Episode :2/20 High Score :199
Game Episode :3/20 High Score :199
Game Episode :4/20 High Score :199
Game Episode :5/20 High Score :193
Game Episode :6/20 High Score :199
Game Episode :7/20 High Score :199
Game Episode :8/20 High Score :199
Game Episode :9/20 High Score :191
Game Episode :10/20 High Score :199
Game Episode :11/20 High Score :199
Game Episode :12/20 High Score :194
Game Episode :13/20 High Score :199
Game Episode :14/20 High Score :199
Game Episode :15/20 High Score :199
Game Episode :16/20 High Score :199
Game Episode :17/20 High Score :199
Game Episode :18/20 High Score :199
Game Episode :19/20 High Score :199
All 20 episodes over!
