In [1]:
import numpy as np
import pandas as pd 
import datetime as datetime
import gym
from gym import envs

from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import Dense, Embedding, Reshape
from tensorflow.keras.optimizers import Adam    

import os
import time

import random 
from collections import deque

envs.registry.all()
env = gym.make('CartPole-v0')




In [2]:
class Agent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        
        #experience replay
        self.memory = deque(maxlen = 20000)
        
        #discount rate
        self.gamma = 0.95
        
        #epsilon-greedy params
        self.epsilon = 1.0
        self.epsilon_decay = 0.999
        self.epsilon_min = 0.0001
        
        self.learning_rate = 0.001
        
        self.model = self._build_model()
        self.target = self._build_model()
        self.alighn_target_model()
    
    def set_test_mode(self,value,gamma = 0.95):
    
        if value:
            self.gamma = 0
        else:
            self.gamma = gamma
            
    
    def _build_model(self):
          
        model = Sequential()
        
        #hyper params to tune
        model.add(Dense(30, input_dim = self.state_size, activation = 'relu'))
        model.add(Dense(24, activation = 'relu'))
        model.add(Dense(self.action_size, activation = 'linear'))
        
        model.compile(loss = 'mse', optimizer = Adam(learning_rate = self.learning_rate))
        
        return model
    
    def alighn_target_model(self):
        self.target.set_weights(self.model.get_weights())
    
    def remember(self, state, action,reward,next_state,done):
        self.memory.append((state, action, reward, next_state, done))
        
    def act(self, state):
        #epsilon-greedy choice of the action to perform
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])
    
    def replay(self, batch_size):  
        
        states = np.ndarray((0,4))
        next_states = np.ndarray((0,4))
        actions = []
        rewards =[]
        terminateds = []

        minibatch = random.sample(self.memory, batch_size)
        
        for state, action, reward, next_state, terminated in minibatch:     
            states = np.vstack((states, state))
            next_states = np.vstack((next_states, next_state))
            actions.append(action)
            rewards.append(reward)
            terminateds.append(terminated)
        
        pred_Q = self.model.predict(states) #predicted q-values
        target_Q = self.target.predict(next_states)
        
        for i in range(len(pred_Q)):         
            if terminateds[i]:
                pred_Q[i,int(actions[i])] = rewards[i]
            else:
                pred_Q[i,int(actions[i])] = rewards[i] + self.gamma * np.amax(target_Q[i])
                
                
        self.model.fit(states, pred_Q, epochs =1, verbose = 0)        
        
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
        
        
    def load(self, path):
        self.model.load_weights(path)
        
    def save(self, path):
        self.model.save_weights(path)

In [13]:
#sort of main, to define as a class "controller"



env = gym.make('CartPole-v1')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
batch_size = 32

n_episode = 50
UPDATE_FREQ = 16
NETW_UPDATE_FREQ = 200
loop_number = 0

output_dir = 'model_output/cartepole'

if not os.path.exists(output_dir):
    os.makedirs(output_dir)

#agent = Agent(state_size, action_size)

for e in range(n_episode):
    
    state = env.reset()
    state = np.reshape(state, [1,state_size])
    done = False
    score = 0
    
    while not done:
        env.render()
        score+=1
        action = agent.act(state)
        
        next_state, reward, done, _ = env.step(action)
        next_state = np.reshape(next_state, [1,state_size])
        
        agent.remember(state, action, reward, next_state, done)
        
        state = next_state
        
        if loop_number % UPDATE_FREQ == 0 and loop_number > batch_size:
            agent.replay(batch_size)
            pass
            
        if loop_number % NETW_UPDATE_FREQ == 0:
            
            agent.alighn_target_model()
            
        loop_number += 1
        if done:
            print("episode : {}/{}, score : {}, e : {:.2}".format(e, n_episode, score, agent.epsilon))
           
            break
        
    if e%50 == 0:
        #agent.save(output_dir + "weights_" + '{:04d}'.format(e) + ".hdf5")
        pass

env.close()
    
        

KeyboardInterrupt: 

In [None]:
# Evaluate for 100 episodes
agent.set_test_mode(True)

sum_rewards = 0.0
nbr_episode = 1
for _ in range(nbr_episode):
    state = env.reset()
    state = np.reshape(state, [1,state_size])
    done = False
    while not done:
        env.render()
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        next_state = np.reshape(next_state, [1,state_size])
        sum_rewards += reward
        
        state = next_state
    print(sum_rewards)
print('Mean evaluation return:', sum_rewards / nbr_episode)

# Close agent and environment

env.close()

In [17]:
state = np.array([[1,1,1,1]])
env.close()

states = np.ndarray((0,4))
next_states = np.ndarray((0,4))
pred_Q = np.ndarray((0,2))
target_Q = np.ndarray((0,2))

actions = np.ndarray((0,1))
rewards = np.ndarray((0,1))

states = np.vstack((states, state))
states = np.vstack((states, state))
states = np.vstack((states, state))

next_states = np.vstack((next_states,state))
next_states = np.vstack((next_states,state))
next_states = np.vstack((next_states,state))

actions = np.vstack((actions,np.array([[0]])))
actions = np.vstack((actions,np.array([[1]])))
actions = np.vstack((actions,np.array([[0]])))

rewards = np.vstack((rewards,np.array([[1]])))
rewards = np.vstack((rewards,np.array([[1]])))
rewards = np.vstack((rewards,np.array([[1]])))

pred_Q = np.vstack((pred_Q,np.array([[0.3,0.5]])))
pred_Q = np.vstack((pred_Q,np.array([[0.3,0.5]])))
pred_Q = np.vstack((pred_Q,np.array([[0.3,0.5]])))

target_Q = np.vstack((target_Q,np.array([[0,2]])))
target_Q = np.vstack((target_Q,np.array([[1,0]])))
target_Q = np.vstack((target_Q,np.array([[1,3]])))


print(states,"\n")
print(next_states,"\n")
print(actions,"\n")
print(rewards,"\n")
print(pred_Q[i,int(actions[i,0])])



for i in range(len(pred_Q)):
    pred_Q[i,int(actions[i,0])] = rewards[i,0] + 0.9 * np.amax(target_Q[i])
print("test : \n" ,pred_Q) 




[[1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]] 

[[1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]] 

[[0.]
 [1.]
 [0.]] 

[[1.]
 [1.]
 [1.]] 



NameError: name 'i' is not defined