In [4]:
import numpy as np
import threading
import random
import time
from skimage.transform import resize
from skimage.color import rgb2gray
from collections import deque

import gym
import pandas as pd
import tensorflow as tf

In [81]:
# https://www.oreilly.com/learning/introduction-to-reinforcement-learning-and-openai-gym
# https://github.com/coreylynch/async-rl/blob/master/async_dqn.py

In [5]:
# Create the Taxi environment; this binds the notebook-global `env` used by the cells below.
env = gym.make("Taxi-v2")

In [6]:
# Start a new episode; the returned value (displayed below) is the initial state.
env.reset()

242

In [7]:
# Draw the current Taxi grid as text.
env.render()

+---------+
|[34;1mR[0m: | : :G|
| : : : : |
| : :[43m [0m: : |
| | : | : |
|[35mY[0m| : |B: |
+---------+



In [8]:
# Inspect the state space of the Taxi environment.
env.observation_space

Discrete(500)

In [76]:
# Inspect the action space of the Taxi environment.
env.action_space

Discrete(6)

In [49]:
# Overwrite the `s` attribute of the inner `env.env` object with 200 and redraw,
# to look at one specific configuration of the grid.
# NOTE(review): this reaches into environment internals rather than using the
# public reset()/step() API - confirm it is still supported by the installed gym.
env.env.s = 200
env.render()

+---------+
|[35m[34;1mR[0m[0m: | : :G|
| : : : : |
|[43m [0m: : : : |
| | : | : |
|Y| : |B: |
+---------+



In [56]:
# Step 0 - South
# Apply action 0 and redraw; the renderer labels the last move under the grid.
env.step(0)
env.render()

+---------+
|[35m[34;1mR[0m[0m: | : :G|
| : : : : |
|[43m [0m: : : : |
| | : | : |
|Y| : |B: |
+---------+
  (South)


In [59]:
# env.step() returns a 4-tuple: the new state, the reward for the action,
# a flag telling whether the episode is over, and an info dict.
state,reward,done,info = env.step(0)
print(state,reward,done,info)

132 -1 False {'prob': 1.0}


In [62]:
# Random Solver
# Take uniformly random actions until the +20 reward is observed and count
# how many steps that took.
state = env.reset()
counter = 0
reward = None
while reward != 20:
    state, reward, done, info = env.step(env.action_space.sample())
    counter += 1
    # An episode can end without the +20 reward; calling step() again after
    # done=True is outside the Gym API contract, so start a fresh episode
    # and keep counting.
    if done and reward != 20:
        state = env.reset()

print(counter)

2689


In [65]:
# Tabular Q-learning on `env`: one row per state, one column per action.
# The policy is purely greedy (argmax over the current row) and the update
# uses no discount factor, i.e. the bootstrap term is taken at full weight.
Q = np.zeros((env.observation_space.n, env.action_space.n))
G = 0
alpha = 0.618  # step size of the Q update

for episode in range(1, 1001):
    # Fresh episode: reset the environment and the running return G.
    state, G, reward, done = env.reset(), 0, 0, False
    while not done:
        action = Q[state].argmax()  # 1: greedy action for the current state
        state2, reward, done, info = env.step(action)  # 2: act
        td_target = reward + Q[state2].max()
        Q[state, action] += alpha * (td_target - Q[state, action])  # 3: move estimate toward target
        G += reward
        state = state2
    if not episode % 50:
        print('Episode {} Total Reward: {}'.format(episode,G))

Episode 50 Total Reward: -157
Episode 100 Total Reward: -73
Episode 150 Total Reward: -8
Episode 200 Total Reward: 14
Episode 250 Total Reward: 6
Episode 300 Total Reward: 12
Episode 350 Total Reward: 7
Episode 400 Total Reward: 12
Episode 450 Total Reward: -7
Episode 500 Total Reward: 8
Episode 550 Total Reward: 8
Episode 600 Total Reward: 10
Episode 650 Total Reward: 8
Episode 700 Total Reward: 9
Episode 750 Total Reward: 6
Episode 800 Total Reward: 8
Episode 850 Total Reward: 10
Episode 900 Total Reward: 10
Episode 950 Total Reward: 5
Episode 1000 Total Reward: 6


## PacMan
* https://github.com/tflearn/tflearn/blob/master/examples/reinforcement_learning/atari_1step_qlearning.py

In [10]:
# Switch to Ms. Pac-Man. This rebinds the notebook-global `env` and `state`,
# so the Taxi environment is no longer reachable through `env`.
env = gym.make("MsPacman-v0")
state = env.reset()

In [11]:
# Render the current frame of the game.
env.render()

In [68]:
# Number of discrete actions available in this environment.
env.action_space.n

9

In [69]:
# Human-readable names of the actions, read from the inner `env.env` object.
env.env.get_action_meanings()

['NOOP',
 'UP',
 'RIGHT',
 'LEFT',
 'DOWN',
 'UPRIGHT',
 'UPLEFT',
 'DOWNRIGHT',
 'DOWNLEFT']

### Atari

In [30]:
# Switch to Space Invaders (rebinds the notebook-global `env`).
env = gym.make('SpaceInvaders-v0')
observation_n = env.reset()  # reset() returns the first observation

[2017-10-31 22:04:39,528] Making new env: SpaceInvaders-v0


In [21]:
# Render the current frame of the game.
env.render()

In [31]:
# Inspect the observation space of the Space Invaders environment.
env.observation_space

Box(210, 160, 3)

In [17]:
# while True:
#     action_n = [[('KeyEvent','ArrowLeft',True)] for ob in observation_n]

In [15]:
# Take one step and unpack the 4-tuple returned by env.step():
#   observation_n : observation of the environment after the action
#   reward_n      : reward received for the action
#   done_n        : whether the game (episode) is over
#   info          : additional diagnostic information
#
# `action_n` is not assigned by any executed cell (its only assignment is
# commented out), so on a fresh kernel this cell raised NameError. Sample a
# valid random action so the cell runs on its own.
action_n = env.action_space.sample()
observation_n, reward_n, done_n, info = env.step(action_n)


array([80, 89, 22], dtype=uint8)

In [4]:
# Play 100 random actions, rendering the frame before each one.
for _ in range(100):
    env.render()
    _obs, _reward, episode_over, _info = env.step(env.action_space.sample())
    # step() must not be called again once an episode has ended, so start a
    # new episode whenever the game is over.
    if episode_over:
        env.reset()
    

## Reinforcement Learning

In [12]:
# -*- coding: utf-8 -*-
import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam


Using TensorFlow backend.


In [13]:

# Number of training episodes; bounds the training loop and appears in its progress message.
EPISODES = 1000


class DQNAgent:
    """Deep Q-learning agent with experience replay and an epsilon-greedy policy.

    The agent keeps a bounded buffer of past transitions, a small dense
    network that maps a state to one Q-value per action, and an exploration
    rate that is decayed every time `replay` trains on a minibatch.
    """

    def __init__(self, state_size, action_size):
        """Create the agent and build its Q-network.

        Args:
            state_size: Number of input features of the network.
            action_size: Number of actions, i.e. number of network outputs.
        """
        self.state_size = state_size
        self.action_size = action_size
        # Replay buffer; once 2000 transitions are stored the oldest are dropped.
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95    # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01  # floor for the exploration rate
        self.epsilon_decay = 0.995  # multiplicative decay applied per replay() call
        self.learning_rate = 0.001
        self.model = self._build_model()

    def _build_model(self):
        """Build and compile the Q-network.

        Two hidden Dense layers of 24 ReLU units followed by a linear output
        layer with one unit per action, trained with MSE loss and Adam.

        Returns:
            The compiled Keras model.
        """
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse',
                      optimizer=Adam(lr=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one transition tuple to the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action with an epsilon-greedy policy.

        With probability `epsilon` a uniformly random action index is
        returned; otherwise the index of the largest predicted Q-value.

        Args:
            state: Input passed straight to `model.predict`; the first row of
                the prediction is used.
                # assumes shape (1, state_size) - TODO confirm against callers

        Returns:
            The chosen action index.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])  # returns action

    def replay(self, batch_size):
        """Train the network on a random minibatch of remembered transitions.

        For every sampled transition the target for the taken action is the
        reward, plus the discounted maximum predicted Q-value of the next
        state when the transition is not terminal. The network is fitted on
        one sample at a time. Afterwards the exploration rate is decayed,
        never dropping below `epsilon_min`.

        Args:
            batch_size: Desired number of transitions to sample. If fewer are
                stored, all stored transitions are used instead of raising;
                if none are stored the call is a no-op.
        """
        if not self.memory:
            # Nothing to learn from yet; leave the exploration rate untouched.
            return
        # random.sample raises ValueError when asked for more items than exist,
        # so never request more than the buffer holds.
        sample_size = min(batch_size, len(self.memory))
        minibatch = random.sample(self.memory, sample_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                target = (reward + self.gamma *
                          np.amax(self.model.predict(next_state)[0]))
            # Keep the current predictions for the other actions so that only
            # the taken action contributes to the loss.
            target_f = self.model.predict(state)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            # Clamp so that the decay step cannot undershoot the floor.
            self.epsilon = max(self.epsilon_min,
                               self.epsilon * self.epsilon_decay)

    def load(self, name):
        """Load network weights from the file `name`."""
        self.model.load_weights(name)

    def save(self, name):
        """Save network weights to the file `name`."""
        self.model.save_weights(name)



  

In [None]:
# Train a DQNAgent on CartPole-v1 for EPISODES episodes of at most 500 steps.
env = gym.make('CartPole-v1')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
agent = DQNAgent(state_size, action_size)
# agent.load("./save/cartpole-dqn.h5")
batch_size = 32

for e in range(EPISODES):
    state = env.reset()
    # The agent feeds states straight to model.predict, so keep a batch axis.
    state = np.reshape(state, [1, state_size])
    # The step counter is named `step`, not `time`, so that it does not shadow
    # the `time` module imported at the top of the notebook.
    for step in range(500):
        # env.render()
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        # Replace the reward of the step that ends the episode with a penalty.
        reward = reward if not done else -10
        next_state = np.reshape(next_state, [1, state_size])
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        if done:
            print("episode: {}/{}, score: {}, e: {:.2}".format(e, EPISODES, step, agent.epsilon))
            break
        # Start learning only once the buffer holds more than one minibatch.
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)
        # if e % 10 == 0:
        #     agent.save("./save/cartpole-dqn.h5")


episode: 0/1000, score: 19, e: 1.0
episode: 1/1000, score: 18, e: 0.97
episode: 2/1000, score: 15, e: 0.9
episode: 3/1000, score: 19, e: 0.82
episode: 4/1000, score: 20, e: 0.74
episode: 5/1000, score: 10, e: 0.7
episode: 6/1000, score: 15, e: 0.65
episode: 7/1000, score: 23, e: 0.58
episode: 8/1000, score: 10, e: 0.55
episode: 9/1000, score: 10, e: 0.53
episode: 10/1000, score: 11, e: 0.5
episode: 11/1000, score: 16, e: 0.46
episode: 12/1000, score: 18, e: 0.42
episode: 13/1000, score: 11, e: 0.4
episode: 14/1000, score: 9, e: 0.38
episode: 15/1000, score: 10, e: 0.36
episode: 16/1000, score: 14, e: 0.34
episode: 17/1000, score: 10, e: 0.32
episode: 18/1000, score: 10, e: 0.3
episode: 19/1000, score: 9, e: 0.29
episode: 20/1000, score: 12, e: 0.27
episode: 21/1000, score: 9, e: 0.26
episode: 22/1000, score: 11, e: 0.25
episode: 23/1000, score: 9, e: 0.24
episode: 24/1000, score: 9, e: 0.23
episode: 25/1000, score: 12, e: 0.21
episode: 26/1000, score: 28, e: 0.19
episode: 27/1000, scor