### importing modules

In [2]:
import matplotlib.pyplot as plt

%matplotlib inline
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (20, 10)
plt.rcParams['font.size'] = 15

In [3]:
import numpy as np
import gym
from collections import deque
import random
import pickle
import time

In [4]:
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

Using TensorFlow backend.


### some constants and variables

In [5]:
# variables

discount = 0.9
epsilon = 0.1
memory_cap = 999999999

## classes for Replays

In [None]:
class Replay_Default():
    
    def __init__(self, buffer_size=50):
        self.buffer_size = buffer_size
        self.experience = deque(maxlen=memory_cap)
        
    def add_experience(self, s, a, r, s_next, terminal):
        self.experience.append([s, a, r, s_next, terminal])
        
    def experience_replay(self):
        min_buffer_len = np.min([len(self.experience), self.buffer_size])
        exp_batch = random.sample(self.experience, min_buffer_len)
        
        for s, a, r, s_next, terminal in exp_batch:
            q_update = r
            
            if not terminal:
                q_update = (r + discount * np.amax(self.clf.predict(s_next)[0]))
            
            q_vals = self.clf.predict(s)
            q_vals[0][a] = q_update
            self.clf.fit(s, q_vals, verbose=0)
            
    def reset(self):
        self.buffer_size = buffer_size
        self.experience = deque(maxlen=memory_cap)

In [None]:
class Replay_HER():
    
    def __init__(self, buffer_size=50):
        self.buffer_size = buffer_size
        self.experience = deque(maxlen=memory_cap)
        self.her_experience = deque(maxlen=memory_cap)
        
    def add_experience(self, s, a, r, s_next, terminal):
        self.experience.append([s, a, r, s_next, terminal])
        
    def add_her_experience(self, s, a, r, s_next, terminal):
        self.her_experience.append([s, a, r, s, terminal])
        
    def modify_her_list(self):
        new_her_experience = copy.deepcopy(self.her_experience)
        new_her_len = len(new_her_experience)
        her_goal = self.her_experience[-1][3]
        
        for i in range(new_her_len):
            new_her_experience[-1-i][0] = her_goal
            new_her_experience[-1-i][2] = -1.0
            new_her_experience[-1-i][3] = her_goal
            new_her_experience[-1-i][4] = False
            
            if (np.sum(np.abs((new_her_experience[-1-i][4] - her_goal))) == 0):
                new_her_experience[-1-i][2] = 0.0
                new_her_experience[-1-i][4] = True
        
        for hx in new_her_experience:
            self.experience.append(hx)
            
    def reset(self):
        self.buffer_size = buffer_size
        self.experience = deque(maxlen=memory_cap)
        self.her_experience = deque(maxlen=memory_cap)

In [6]:
class Replay_PER():
    
    def __init__(self, buffer_size=50):
        self.buffer_size = buffer_size
        self.experience = deque(maxlen=memory_cap)
        
    def add_experience(self, s, a, r, s_next, terminal):
        self.experience.append([s, a, r, s_next, terminal])
            
    def reset(self):
        self.buffer_size = buffer_size
        self.experience = deque(maxlen=memory_cap)

## class for DQN

In [10]:
class DQN:
    
    def __init__(self, o_space, a_space, lr):
        self.a_space = a_space
        self.lr = lr
        
        self.clf = Sequential()
        self.clf.add(Dense(64, input_shape=(o_space,), activation="relu"))
        self.clf.add(Dense(64, activation="relu"))
        self.clf.add(Dense(self.a_space, activation="linear"))
        self.clf.compile(loss="mse", optimizer=Adam(lr=lr))
        
    def get_action(self, s):
        random_explore = np.random.rand()
        
        if random_explore < epsilon:
            return random.randrange(self.a_space)
        
        q_vals = self.clf.predict(s)
        
        return np.argmax(q_vals[0])
    
    def experience_replay(self, replay):
        min_buffer_len = np.min([len(replay.experience), replay.buffer_size])
        exp_batch = random.sample(replay.experience, min_buffer_len)
        
        for s, a, r, s_next, terminal in exp_batch:
            q_update = r
            
            if not terminal:
                q_update = (r + discount * np.amax(self.clf.predict(s_next)[0]))
            
            q_vals = self.clf.predict(s)
            q_vals[0][a] = q_update
            self.clf.fit(s, q_vals, verbose=0)

### main

In [284]:
# learning_rates = [0.5, 0.1, 0.01, 0.001, 0.0001]
# buffers = [50, 100, 250, 500]
# methods = ['sarsa', 'exp_sarsa', 'q']
# seeds = np.arange(20)

# max_episodes = 100

# more doable (recommended by Veronica)

learning_rates = [0.1, 0.01, 0.001]
buffers = [50, 100, 250, 500]
methods = ['sarsa', 'exp_sarsa', 'q']
seeds = np.arange(5)
max_episodes = 100

# learning_rates = [0.1, 0.01, 0.001]
# learning_rates = [0.1, 0.01]
# buffers = [50]
# methods = ['exp_sarsa']
# seeds = np.arange(1)

# max_episodes = 25

In [8]:
# default values to start off with

buffer = 50
lr = 0.01
seed = 0
max_episodes = 250

In [12]:
%%time

print(buffer, lr, seed)
np.random.seed(seed)

eps_steps = []
env = gym.make('CartPole-v1')
o_space = env.observation_space.shape[0]
a_space = env.action_space.n

replay = Replay_Default(buffer_size=buffer)
clf = DQN(o_space, a_space, lr)

num_episode = 0

for num_episode in range(1, max_episodes+1):# while True:
    start = time.time()
    s = env.reset()
    s = np.reshape(s, [1, o_space])

    num_step = 0
    while True:
        num_step += 1
        a = clf.get_action(s)
        s_next, r, terminal, _ = env.step(a)
        s_next = np.reshape(s_next, [1, o_space])

        if terminal:
            r = -r
        else:
            r = r
        
        replay.add_experience(s, a, r, s_next, terminal)

        s = s_next

        if terminal:
            print("num_episode: {num_episode}; terminal_step: {num_step}".format(num_episode=num_episode, num_step=num_step))
            eps_steps.append(num_step)
            break
    
    clf.experience_replay(replay)
        
    end = time.time()
    elapsed = end - start
    
#     print("elapsed", elapsed)

50 0.01 0
num_episode: 1; terminal_step: 44
num_episode: 2; terminal_step: 9
num_episode: 3; terminal_step: 9
num_episode: 4; terminal_step: 29
num_episode: 5; terminal_step: 11
num_episode: 6; terminal_step: 9
num_episode: 7; terminal_step: 10
num_episode: 8; terminal_step: 25
num_episode: 9; terminal_step: 10
num_episode: 10; terminal_step: 27
num_episode: 11; terminal_step: 16
num_episode: 12; terminal_step: 15
num_episode: 13; terminal_step: 24
num_episode: 14; terminal_step: 26
num_episode: 15; terminal_step: 15
num_episode: 16; terminal_step: 11
num_episode: 17; terminal_step: 8
num_episode: 18; terminal_step: 25
num_episode: 19; terminal_step: 31
num_episode: 20; terminal_step: 15
num_episode: 21; terminal_step: 51
num_episode: 22; terminal_step: 10
num_episode: 23; terminal_step: 17
num_episode: 24; terminal_step: 11
num_episode: 25; terminal_step: 25
num_episode: 26; terminal_step: 18
num_episode: 27; terminal_step: 16
num_episode: 28; terminal_step: 28
num_episode: 29; termin

num_episode: 232; terminal_step: 85
num_episode: 233; terminal_step: 500
num_episode: 234; terminal_step: 500
num_episode: 235; terminal_step: 500
num_episode: 236; terminal_step: 9
num_episode: 237; terminal_step: 500
num_episode: 238; terminal_step: 500
num_episode: 239; terminal_step: 500
num_episode: 240; terminal_step: 500
num_episode: 241; terminal_step: 10
num_episode: 242; terminal_step: 151
num_episode: 243; terminal_step: 193
num_episode: 244; terminal_step: 500
num_episode: 245; terminal_step: 10
num_episode: 246; terminal_step: 10
num_episode: 247; terminal_step: 27
num_episode: 248; terminal_step: 500
num_episode: 249; terminal_step: 500
num_episode: 250; terminal_step: 500
CPU times: user 41.1 s, sys: 9.79 s, total: 50.9 s
Wall time: 29.9 s


In [None]:
##################################