**You may need to install [OpenCV](https://pypi.python.org/pypi/opencv-python) and [scikit-video](http://www.scikit-video.org/stable/).**

In [1]:
import keras
import numpy as np
import io
import base64
from IPython.display import HTML
import skvideo.io
import cv2
import json
from collections import deque

from keras.models import Sequential,model_from_json
from keras.layers.core import Dense
from keras.optimizers import sgd
from keras.layers import Conv2D, MaxPooling2D, Activation, AveragePooling2D,Reshape,BatchNormalization, Flatten

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
size = 13
T = 200

In [3]:
class DQN(object):
    def __init__(self, grid_size,  epsilon = 0.1, n_action = 4, memory_size=100, batch_size = 16, n_state=2):
        self.n_action = n_action
        self.epsilon = epsilon
        # Discount for Q learning
        self.discount = 0.99
        self.grid_size = grid_size
        # number of state
        self.n_state = n_state
        # Memory
        self.memory = deque(maxlen = memory_size)
        # Batch size when learning
        self.batch_size = batch_size

    def set_epsilon(self,e):
        self.epsilon = e
    
    def act(self,s):
        """ This function should return the next action to do:
        an integer between 0 and 4 (not included) with a random exploration of epsilon"""
        if np.random.rand() <= self.epsilon:
            a = np.random.randint(0, self.n_action, size=1)[0]
        else:
            a = self.learned_act(s)

        return a
    
    def learned_act(self, s):
        return np.argmax(self.model.predict(s[None])[0])

    def reinforce(self, s_, n_s_, a_, r_, game_over_):
        self.memory.append([s_, n_s_, a_, r_, game_over_])
        input_states = np.zeros((self.batch_size, 5, 5, self.n_state))
        target_q = np.zeros((self.batch_size, 4))
        
        replays = np.random.choice(np.arange(len(self.memory)), 
                                   size = self.batch_size, 
                                   replace = False)
        
        for i, n in enumerate(replays):
            s_, n_s_, a_, r_, game_over_ = self.memory[n]
            input_states[i] = s_
            if game_over_: target_q[i, a_] = r_
            else:
                pred = self.model.predict(n_s_[None])
                target_q[i, a_] = r_ + self.discount*np.max(pred)
                
        # HINT: Clip the target to avoid exploiding gradients.. -- clipping is a bit tighter
        target_q = np.clip(target_q, -3, 3)
        l = self.model.train_on_batch(input_states, target_q)
        return l

    def save(self,name_weights='model.h5',name_model='model.json'):
        self.model.save_weights(name_weights, overwrite=True)
        with open(name_model, "w") as outfile:
            json.dump(self.model.to_json(), outfile)
            
    def load(self,name_weights='model.h5',name_model='model.json'):
        with open(name_model, "r") as jfile:
            model = model_from_json(json.load(jfile))
        model.load_weights(name_weights)
        model.compile("sgd", "mse")
        self.model = model

In [4]:
class DQN_CNN(DQN):
    def __init__(self, *args,lr=0.1,**kwargs):
        super(DQN_CNN, self).__init__(*args,**kwargs)
        
        ###### FILL IN
        model = Sequential()
        model.add(Conv2D(32, kernel_size = 2, activation = 'relu', input_shape = (5,5,self.n_state)))
        model.add(Conv2D(64, kernel_size = 2, activation = 'relu'))
        model.add(Flatten())
        model.add(Dense(4))
        model.compile(sgd(lr=lr, decay=1e-4, momentum=0.0), "mse")
        self.model = model

In [5]:
def init_memory(agent, env):
    # Render the environment
    state = env.reset()
    game_over = False
    for i in range(agent.batch_size):
        action = np.random.randint(0,4)

        prev_state = state
        state, reward, game_over = env.act(action)

        agent.memory.append([prev_state, state, action, reward, game_over])

In [6]:
def test(agent, env, epochs, prefix=''):
    # Number of won games
    score = 0
    for e in range(epochs):
        
        state = env.reset()
        # This assumes that the games will end
        game_over = False

        win = lose = 0

        while not game_over:
            # The agent performs an action
            action = agent.act(state)

            # Apply an action to the environment, get the next state, the reward
            # and if the games end
            prev_state = state
            state, reward, game_over = env.act(action, train = False)

            # Update the counters
            if reward > 0: win = win + reward
            if reward < 0: lose = lose - reward
        
        # Save as a mp4
        env.draw('video/test/' + prefix + str(e))

        # Update stats
        score += win-lose

        print("Win/lose count {}/{}. Average score ({})"
              .format(win, lose, score/(1+e)))
    print('Final score: '+str(score/epochs))

In [7]:
def train(agent, env, epoch, prefix='', e_start = 1.0, e_end = 0.01, decay_rate = 0.001):
    score = loss = 0
    decay_step = 0
    
    init_memory(agent, env)

    for e in range(epoch):
        # At each epoch, we restart to a fresh game and get the initial state
        state = env.reset()
        # This assumes that the games will terminate
        game_over = False
        win = lose = 0
        
        while not game_over:
            # The agent performs an action
            decay_step += 1
            epsilon = e_end + (e_start - e_end) * np.exp(-decay_rate * decay_step)
            agent.set_epsilon(epsilon)
            action = agent.act(state)

            # Apply an action to the environment, get the next state, the reward
            # and if the games end
            prev_state = state
            state, reward, game_over = env.act(action)

            # Update the counters
            if reward > 0: win += reward
            if reward < 0: lose -= reward

            # Apply the reinforcement strategy
            loss = agent.reinforce(prev_state, state,  action, reward, game_over)

        # Save as a mp4
        if e % 10 == 0: env.draw('video/train/' + prefix+str(e))

        # Update stats
        score += win-lose

        print("Epoch {:03d}/{:03d} | Loss {:.4f} | Win/lose count {}/{} ({})"
              .format(e, epoch, loss, win, lose, win-lose))
        agent.save(name_weights='model/' + prefix + '_model.h5',name_model='model/' + prefix + '_model.json')

In [8]:
class EnvironmentExploring(object):
    def __init__(self, grid_size=10, max_time=500, temperature=0.1):
        grid_size = grid_size+4
        self.grid_size = grid_size
        self.max_time = max_time
        self.temperature = temperature

        self.scale=16

        self.to_draw = np.zeros((max_time+2, grid_size*self.scale, grid_size*self.scale, 3))


    def draw(self,e):
        skvideo.io.vwrite(str(e) + '.mp4', self.to_draw)

    def get_frame(self,t):
        b = np.zeros((self.grid_size,self.grid_size,3))+128
        b[self.board>0,0] = 256
        b[self.board < 0, 2] = 256
        b[self.x,self.y,:]=256
        b[-2:,:,:]=100
        b[:,-2:,:]=100
        b[:2,:,:]=100
        b[:,:2,:]=100
        
        b =  cv2.resize(b, None, fx=self.scale, fy=self.scale, interpolation=cv2.INTER_NEAREST)

        self.to_draw[t,:,:,:]=b


    def act(self, action, train = True):
        """This function returns the new state, reward and decides if the
        game ends."""

        self.get_frame(int(self.t))

        self.position = np.zeros((self.grid_size, self.grid_size))

        self.position[0:2,:]= -1
        self.position[:,0:2] = -1
        self.position[-2:, :] = -1
        self.position[-2:, :] = -1

        self.position[self.x, self.y] = 1
        if action == 0:
            if self.x == self.grid_size-3:
                self.x = self.x-1
            else:
                self.x = self.x + 1
        elif action == 1:
            if self.x == 2:
                self.x = self.x+1
            else:
                self.x = self.x-1
        elif action == 2:
            if self.y == self.grid_size - 3:
                self.y = self.y - 1
            else:
                self.y = self.y + 1
        elif action == 3:
            if self.y == 2:
                self.y = self.y + 1
            else:
                self.y = self.y - 1
        else:
            RuntimeError('Error: action not recognized')

        self.t = self.t + 1
        if train:
            reward = self.malus_position[self.x, self.y] + self.board[self.x, self.y]
        else:
            reward = self.board[self.x, self.y]
            
        self.board[self.x, self.y] = 0
        self.malus_position[self.x, self.y] = -0.1

        game_over = self.t > self.max_time
        state = np.concatenate((self.malus_position.reshape(self.grid_size, self.grid_size,1),
                                self.board.reshape(self.grid_size, self.grid_size,1),
                                self.position.reshape(self.grid_size, self.grid_size,1)),axis=2)
        state = state[self.x-2:self.x+3,self.y-2:self.y+3,:]

        return state, reward, game_over

    def reset(self):
        """This function resets the game and returns the initial state"""

        self.x = np.random.randint(3, self.grid_size-3, size=1)[0]
        self.y = np.random.randint(3, self.grid_size-3, size=1)[0]


        bonus = 0.5*np.random.binomial(1,self.temperature,size=self.grid_size**2)
        bonus = bonus.reshape(self.grid_size,self.grid_size)

        malus = -1.0*np.random.binomial(1,self.temperature,size=self.grid_size**2)
        malus = malus.reshape(self.grid_size, self.grid_size)

        self.to_draw = np.zeros((self.max_time+2, self.grid_size*self.scale, self.grid_size*self.scale, 3))


        malus[bonus>0]=0

        self.board = bonus + malus

        self.position = np.zeros((self.grid_size, self.grid_size))
        self.position[0:2,:]= -1
        self.position[:,0:2] = -1
        self.position[-2:, :] = -1
        self.position[-2:, :] = -1
        self.board[self.x,self.y] = 0
        self.t = 0

        self.malus_position = np.zeros((self.grid_size, self.grid_size))
        self.malus_position[self.x, self.y] = -0.1

        state = np.concatenate((self.malus_position.reshape(self.grid_size, self.grid_size,1),
                                self.board.reshape(self.grid_size, self.grid_size,1),
                                self.position.reshape(self.grid_size, self.grid_size,1)),axis=2)

        state = state[self.x - 2:self.x + 3, self.y - 2:self.y + 3, :]
        return state

In [12]:
# Training
env = EnvironmentExploring(grid_size=size, max_time=T, temperature=0.3)
agent = DQN_CNN(size, lr=.1, epsilon = 0.1, memory_size=2000, batch_size = 32,n_state=3)
train(agent, env, 50, prefix='explore')

Epoch 000/050 | Loss 0.0057 | Win/lose count 6.0/21.40000000000003 (-15.40000000000003)
Epoch 001/050 | Loss 0.0101 | Win/lose count 8.0/30.500000000000103 (-22.500000000000103)
Epoch 002/050 | Loss 0.0039 | Win/lose count 11.5/23.100000000000026 (-11.600000000000026)
Epoch 003/050 | Loss 0.0103 | Win/lose count 9.0/20.200000000000024 (-11.200000000000024)
Epoch 004/050 | Loss 0.0052 | Win/lose count 10.0/22.300000000000065 (-12.300000000000065)
Epoch 005/050 | Loss 0.0039 | Win/lose count 14.0/20.6 (-6.600000000000001)
Epoch 006/050 | Loss 0.0538 | Win/lose count 20.5/17.29999999999999 (3.20000000000001)
Epoch 007/050 | Loss 0.0169 | Win/lose count 14.5/20.50000000000003 (-6.000000000000028)
Epoch 008/050 | Loss 0.0059 | Win/lose count 20.5/16.499999999999975 (4.000000000000025)
Epoch 009/050 | Loss 0.0054 | Win/lose count 15.5/14.799999999999967 (0.700000000000033)
Epoch 010/050 | Loss 0.0423 | Win/lose count 20.0/16.299999999999972 (3.7000000000000277)
Epoch 011/050 | Loss 0.0535 | 

In [10]:
env = EnvironmentExploring(grid_size=size, max_time=T, temperature=0.3)
agent = DQN_CNN(size, lr=.1, epsilon = 0.1, memory_size=2000, batch_size = 32, n_state = 3)
agent.load(name_weights='model/model.h5',name_model='model/model.json')
print('Test of the CNN')
#test(agent,env,5,prefix='explore')

Test of the CNN


In [41]:
prefix = 'win3'
c = 0
max_score = 0
min_score = 100
scores = []
while c < 1000:

    state = env.reset()
    # This assumes that the games will end
    game_over = False

    win = lose = 0

    while not game_over:
        # The agent performs an action
        action = agent.act(state)

        # Apply an action to the environment, get the next state, the reward
        # and if the games end
        prev_state = state
        state, reward, game_over = env.act(action, train = False)

        # Update the counters
        if reward > 0: win = win + reward
        if reward < 0: lose = lose -reward
            
    c += 1
    scores.append(score)
    if 0.5 not in env.board[2:-2,2:-2] and win-lose>score:
        score = win-lose
        print(score, c)
        env.draw('video/test/' + prefix)

20.5 45
24.0 570


***