In [1]:
import numpy as np
import pandas as pd 
import datetime as datetime
import gym
from gym import envs

from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import Dense, Embedding, Reshape
from tensorflow.keras.optimizers import Adam    

import os
import time

import random 
from collections import deque

# Create the CartPole environment.
# (A bare `envs.registry.all()` call used to sit here; its result was discarded
# because it was not the last expression of the cell, and the call fails on Gym
# releases where `registry` is a plain dict, so it has been dropped.)
env = gym.make('CartPole-v0')

In [21]:
class Agent:
    """Deep Q-learning agent with experience replay and epsilon-greedy exploration.

    The agent stores transitions in a bounded replay buffer, picks actions
    epsilon-greedily from a small fully connected Q-network, and trains that
    network on randomly sampled past transitions.

    Args:
        state_size: Number of input features of the Q-network.
        action_size: Number of discrete actions (size of the network output).
        memory_size: Maximum number of transitions kept in the replay buffer.
        gamma: Discount rate applied to the estimated future value.
        epsilon: Initial probability of taking a random action.
        epsilon_decay: Multiplicative decay applied to epsilon after each replay.
        epsilon_min: Floor below which epsilon is not decayed.
        learning_rate: Learning rate handed to the Adam optimizer.
    """

    def __init__(self, state_size, action_size, memory_size=2000, gamma=0.95,
                 epsilon=1.0, epsilon_decay=0.995, epsilon_min=0.005,
                 learning_rate=0.001):
        self.state_size = state_size
        self.action_size = action_size

        # experience replay: bounded buffer, oldest transitions fall out first
        self.memory = deque(maxlen=memory_size)

        # discount rate
        self.gamma = gamma

        # epsilon-greedy params
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min

        self.learning_rate = learning_rate

        self.model = self._build_model()

    def _build_model(self):
        """Build and compile the Q-network (two hidden ReLU layers, linear output).

        Returns:
            A compiled Keras ``Sequential`` model mapping a state to one value
            per action, trained with mean squared error and Adam.
        """
        model = Sequential()

        # hyper params to tune
        model.add(Dense(30, input_dim=self.state_size, activation='relu'))
        model.add(Dense(30, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))

        # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))

        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one transition tuple to the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action for ``state`` using an epsilon-greedy policy.

        Args:
            state: Network input for a single state. The first row of the
                prediction is used, so a leading batch axis of size 1 is
                expected (the training loop reshapes to ``[1, state_size]``).

        Returns:
            The chosen action index as a plain Python ``int``.
        """
        # explore: uniformly random action with probability epsilon
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        # exploit: action with the highest predicted value
        act_values = self.model.predict(state, verbose=0)
        return int(np.argmax(act_values[0]))

    def replay(self, batch_size):
        """Train the network on ``batch_size`` randomly sampled transitions.

        For every sampled transition the target for the taken action is the
        reward, plus the discounted maximum predicted value of the next state
        when the transition is not terminal. The network is fitted on one
        transition at a time. Afterwards epsilon is decayed once.

        Args:
            batch_size: Number of transitions to sample from the buffer.

        Raises:
            ValueError: Propagated from ``random.sample`` when ``batch_size``
                is larger than the number of stored transitions.
        """
        # Time the replay locally instead of reading a notebook-level `start`
        # global, so the method also works when called on its own.
        replay_start = time.time()

        batch = random.sample(self.memory, batch_size)

        for state, action, reward, next_state, done in batch:
            target = reward
            if not done:
                next_values = self.model.predict(next_state, verbose=0)[0]
                target = reward + self.gamma * np.amax(next_values)

            # Only the taken action's value is moved towards the target; the
            # other outputs keep their current prediction.
            target_f = self.model.predict(state, verbose=0)
            target_f[0][action] = target

            self.model.fit(state, target_f, epochs=1, verbose=0)

        print('It took', (time.time() - replay_start), 'seconds.')

        # Decay exploration, never stepping below the configured floor. An
        # epsilon already at or below the floor is left untouched.
        if self.epsilon > self.epsilon_min:
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def load(self, path):
        """Load network weights from ``path``."""
        self.model.load_weights(path)

    def save(self, path):
        """Save network weights to ``path``."""
        self.model.save_weights(path)

In [23]:
#sort of main, to define as a class "controller"
# Train the agent on CartPole: play one episode, store every transition,
# run one replay pass, and checkpoint the weights every 50 episodes.
env = gym.make('CartPole-v0')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
batch_size = 32
n_episode = 1000
output_dir = 'model_output/cartepole'

# exist_ok makes the cell safe to re-run without a separate existence check
os.makedirs(output_dir, exist_ok=True)

agent = Agent(state_size, action_size)

done = False
for e in range(n_episode):
    start = time.time()
    state = env.reset()
    # the network expects a leading batch axis
    state = np.reshape(state, [1, state_size])

    # upper bound on steps per episode; the loop exits early once `done` is set
    for t in range(1000):

        action = agent.act(state)

        next_state, reward, done, _ = env.step(action)
        next_state = np.reshape(next_state, [1, state_size])

        agent.remember(state, action, reward, next_state, done)

        state = next_state

        if done:
            print("episode : {}/{}, score : {}, e : {:.2}".format(e, n_episode, t, agent.epsilon))
            break

    # only train once the buffer holds more transitions than one batch
    if len(agent.memory) > batch_size:
        agent.replay(batch_size)

    if e%50 == 0:
        # join with the directory so checkpoints land inside output_dir
        # (plain string concatenation wrote them next to it instead)
        agent.save(os.path.join(output_dir, "weights_" + '{:04d}'.format(e) + ".hdf5"))
    print('It took', (time.time()-start), 'seconds.')
        

episode : 0/100, score : 61, e : 1.0
It took 2.9200339317321777 seconds.
It took 2.9356813430786133 seconds.
episode : 1/100, score : 20, e : 1.0
It took 2.274090051651001 seconds.
It took 2.274090051651001 seconds.
episode : 2/100, score : 14, e : 1.0
It took 2.3833136558532715 seconds.
It took 2.3833136558532715 seconds.
episode : 3/100, score : 16, e : 0.99
It took 2.448948860168457 seconds.
It took 2.4499452114105225 seconds.
episode : 4/100, score : 14, e : 0.99
It took 2.3178343772888184 seconds.
It took 2.3178343772888184 seconds.
episode : 5/100, score : 34, e : 0.99
It took 2.5685486793518066 seconds.
It took 2.5685486793518066 seconds.
episode : 6/100, score : 17, e : 0.99
It took 2.469534397125244 seconds.
It took 2.469534397125244 seconds.
episode : 7/100, score : 20, e : 0.99
It took 2.2785542011260986 seconds.
It took 2.2785542011260986 seconds.
episode : 8/100, score : 28, e : 0.98
It took 2.4056668281555176 seconds.
It took 2.4056668281555176 seconds.
episode : 9/100, s

It took 2.6319289207458496 seconds.
It took 2.6319289207458496 seconds.
episode : 76/100, score : 46, e : 0.86
It took 2.7447614669799805 seconds.
It took 2.7447614669799805 seconds.
episode : 77/100, score : 13, e : 0.86
It took 2.4365732669830322 seconds.
It took 2.4365732669830322 seconds.
episode : 78/100, score : 21, e : 0.86
It took 2.634970188140869 seconds.
It took 2.634970188140869 seconds.
episode : 79/100, score : 14, e : 0.85
It took 2.689075469970703 seconds.
It took 2.689075469970703 seconds.
episode : 80/100, score : 20, e : 0.85
It took 2.5743978023529053 seconds.
It took 2.5743978023529053 seconds.
episode : 81/100, score : 48, e : 0.85
It took 2.637932777404785 seconds.
It took 2.637932777404785 seconds.
episode : 82/100, score : 23, e : 0.85
It took 2.5641653537750244 seconds.
It took 2.5641653537750244 seconds.
episode : 83/100, score : 14, e : 0.85
It took 2.616396427154541 seconds.
It took 2.616396427154541 seconds.
episode : 84/100, score : 9, e : 0.85
It took 2.