In [1]:
import copy
import numpy as np
import mgym
from keras.layers import Dense
from keras.optimizers import Adam
from keras.models import Sequential

Using TensorFlow backend.


In [2]:
EPISODES = 2500

In [3]:
class DeepSARSAgent:
    def __init__(self, env):
        self.env = env
        
        self.state_size = 4
        self.discount_factor = 0.99
        self.learning_rate = 0.001
        
        self.epsilon = 1.
        self.epsilon_decay = 0.9999
        self.epsilon_min = 0.01
        self.model = self._build_model()
        
    def _build_model(self):
        model = Sequential()
        model.add(Dense(30, input_dim=self.state_size, activation='relu'))
        model.add(Dense(30, activation='relu'))
        model.add(Dense(self.env.action_space.n, activation='linear'))
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        return model
    
    def get_action(self, state):
        if np.random.rand() <= self.epsilon:
            return self.env.action_space.sample()
        else:
            state = np.float32(state)
            q_values = self.model.predict(state)
            return np.argmax(q_values[0])
        
    def train_model(self, state, action, reward, next_state, next_action, done):
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
            
        state = np.float32(state)
        next_state = np.float32(next_state)
        target = self.model.predict(state)[0]
        
        if done:
            target[action] = reward
        else:
            target[action] = (reward + self.discount_factor * self.model.predict(next_state)[0][next_action])
            
        target = np.reshape(target, [1, -1])
        
        self.model.fit(state, target, epochs=1, verbose=0)

In [4]:
env = mgym.make("5x5moving")

In [5]:
agent = DeepSARSAgent(env)

In [6]:
scores, steps = [], []

for e in range(EPISODES):
    done = False
    score = 0
    state = env.reset()
    state = np.reshape(state, [1, -1])
    step = 0
    
    while not done:
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        next_state = np.reshape(next_state, [1, -1])
        next_action = agent.get_action(next_state)
        
        agent.train_model(state, action, reward, next_state, next_action, done)
        
        state = next_state
        score += reward
        state = copy.deepcopy(next_state)
        step += 1
        
        if done:
            scores.append(score)
            steps.append(step)
            
    if e % 10 == 0:
        print(e, score, step)

0 -27 195
10 -9 114
20 -10 94
30 1 47
40 -3 14
50 -1 37
60 -2 27
70 1 22
80 -2 74
90 0 39
100 -1 32
110 -7 31
120 -2 37
130 -1 21
140 0 45
150 1 12
160 -7 97
170 -2 27
180 0 16
190 0 12
200 -1 19
210 1 18
220 1 11
230 -6 66
240 -2 11
250 1 13
260 0 15
270 1 8
280 0 12
290 1 18
300 -7 162
310 0 15
320 1 23
330 -2 13
340 1 9
350 1 12
360 1 9
370 0 19
380 1 12
390 0 69
400 -1 19
410 -2 75
420 0 9
430 0 11
440 0 11
450 1 10
460 0 11
470 1 9
480 1 25
490 0 15
500 1 9
510 1 12
520 1 9
530 1 17
540 1 12
550 1 44
560 0 36
570 0 10
580 0 20
590 1 8
600 1 25
610 1 8
620 1 41
630 -6 80
640 0 98
650 1 8
660 1 9
670 1 9
680 1 21
690 0 11
700 1 10
710 1 24
720 1 27
730 1 58
740 1 12
750 1 8
760 0 11
770 0 10
780 0 12
790 1 10
800 1 8
810 1 10
820 1 8
830 1 8
840 1 43
850 0 11
860 1 38
870 1 10
880 1 16
890 1 15
900 1 8
910 1 8
920 1 12
930 1 8
940 1 12
950 1 8
960 0 12
970 -2 4707
980 -2 352
990 -14 14840


KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt

In [None]:
plt.plot(scores)