In [3]:
from matplotlib import pyplot as plt
from scipy import signal
import numpy as np
import random

In [4]:
%matplotlib inline

In [14]:
# Board dimensions (rows, columns); used to build the game, to reshape the
# network output and to size the HTML table.
shape = (13,13)

# Implement game logic

In [15]:
class Gomoku:
    """Five-in-a-row game on a rectangular board stored as a 3-channel array.

    ``board[r, c]`` is a one-hot vector of length 3: channel 0 marks a stone of
    player 0, channel 1 a stone of player 1 and channel 2 an empty cell.
    """

    def __init__(self, shape):
        """Create a game whose board has the given ``(rows, columns)`` shape."""
        self.shape = shape
        self.reset()

    def reset(self):
        """Empty the board and forget the previous move."""
        # Starts at 1 so that the first take_action() flips it to player 0.
        self.last_player = 1
        self.board = np.stack(
            (np.zeros(self.shape), np.zeros(self.shape), np.ones(self.shape)),
            axis=2,
        )
        self.previous_board = np.copy(self.board)
        # Initialised here so get_last_action() is usable before the first
        # move and does not report a move from a previous game.
        self.last_action = None

    def draw(self):
        """Show the board as an RGB image (one colour channel per board channel)."""
        plt.imshow(self.board)

    def list_actions(self):
        """Return the free cells as a list of ``[row, col]`` lists."""
        return np.transpose(np.nonzero(self.board[:, :, 2])).tolist()

    def take_action(self, action):
        """Place a stone for the next player at ``action`` and return the reward.

        Args:
            action: ``(row, col)`` pair; any two-element sequence is accepted.

        Returns:
            1 if the move completes five in a row for the mover, otherwise 0.
        """
        # NumPy treats a list index such as [r, c] as "rows r and c", which
        # would overwrite two whole rows; a tuple addresses the single cell.
        action = tuple(action)
        self.previous_board = np.copy(self.board)
        self.last_player = 1 - self.last_player
        pixel = np.zeros((3))
        pixel[self.last_player] = 1
        self.board[action] = pixel
        self.last_action = action
        return self.__revard()

    def get_last_action(self):
        """Return the last ``(row, col)`` played, or ``None`` if no move was made since the last reset."""
        return self.last_action

    def get_state(self):
        """Return a copy of the board converted for the player who moved last."""
        return self.convert_state_for_player(self.board, self.last_player)

    def get_raw_state(self):
        """Return the board itself (not a copy), with channels in absolute player order."""
        return self.board

    def get_previous_state(self):
        """Return a copy of the board as it was before the last move, converted for the player who made that move."""
        return self.convert_state_for_player(self.previous_board, self.last_player)

    def convert_state_for_player(self, board, player):
        """Return a copy of ``board`` with channels 0 and 1 swapped when ``player`` is 1.

        For ``player == 0`` the copy is returned unchanged, so in the result
        channel 0 always holds the stones of ``player``.
        """
        result = np.copy(board)

        if player == 1:
            result[:, :, [0, 1]] = result[:, :, [1, 0]]

        return result

    def game_over(self):
        """Return True when either player has five in a row or no empty cell is left."""
        return self.__won(0) or self.__won(1) or np.count_nonzero(self.board[:, :, 2]) == 0

    def __revard(self):
        """Reward for the move just played: 1 if the last mover has won, else 0."""
        return 1 if self.__won(self.last_player) else 0

    def __won(self, player):
        """Return True if ``player`` has five stones in a line (either diagonal, a row or a column)."""
        board = self.board[:, :, player]
        return (
            self.__has_five_by(np.identity(5), board) or
            self.__has_five_by(np.fliplr(np.identity(5)), board) or
            self.__has_five_by(np.ones((1, 5)), board) or
            self.__has_five_by(np.ones((5, 1)), board)
        )

    def __has_five_by(self, mask, board):
        """Return True if the 0/1 ``board`` contains the five-cell pattern ``mask``.

        The convolution sums the stones under the mask at every offset; a sum
        of 5 means all five masked cells are occupied.
        """
        return bool(np.any(signal.convolve2d(mask, board) == 5))
    
# Module-level game instance shared by the move helpers, the HTML renderer
# and the training loop.
game = Gomoku(shape)

# Deep Q learning

In [16]:
from keras.models import Sequential
from keras.layers.core import Dropout
from keras.layers.convolutional import Conv2D
from keras.layers.normalization import BatchNormalization
from keras.optimizers import Nadam

from IPython import display

In [17]:
# Passed as the first argument (number of filters) of every hidden Conv2D
# layer; despite the name it is not the number of layers.
hidden_layer_count = 40

In [18]:
# Fully convolutional Q-network: three stages of three 3x3 ReLU convolutions,
# each stage followed by batch normalisation over the channel axis, and a
# final single-filter 3x3 convolution so the output has one value per cell.
model = Sequential()

for stage in range(3):
    for position in range(3):
        if stage == 0 and position == 0:
            # Only the very first layer declares the input shape.
            model.add(Conv2D(hidden_layer_count, (3,3), padding='same', activation='relu',
                             input_shape=game.get_state().shape))
        else:
            model.add(Conv2D(hidden_layer_count, (3,3), padding='same', activation='relu'))
    model.add(BatchNormalization(axis=3))

# Linear output layer: no activation.
model.add(Conv2D(1, (3,3), padding='same'))

opt = Nadam(lr=1e-5)
model.compile(loss='mean_squared_error', optimizer=opt)

In [19]:
# Start from previously saved weights.
# NOTE(review): absolute, machine-specific path; the notebook fails here if the file is missing.
model.load_weights('/data/trained_models/gomoku_q_learn/initial_v1.h5')

In [20]:
# epsilon: probability that make_move() plays a uniformly random free cell.
epsilon = 0.01
# delta: probability, applied repeatedly, that make_move() steps from the
# current best-ranked move down to the next-ranked one.
delta = 0.6

In [21]:
def make_move():
    """Play one model-driven move on the global ``game`` and record the transition.

    Does nothing when the game is already over. With probability ``epsilon``
    a random free cell is played. Otherwise the free cells are ranked by the
    network's predicted value and, starting from the best one, the choice
    slides to the next-ranked cell with probability ``delta`` at each step.
    The transition ``[previous_state, action, reward, state, game_over]`` is
    appended to the global ``experiences`` list.

    NOTE(review): the state passed to ``model.predict`` comes from
    ``get_state()`` before the move is applied, while the stored experience
    uses ``get_previous_state()`` after it. The two are converted for different
    values of ``game.last_player``, so their first two channels appear to be
    swapped relative to each other. Confirm this is intended.
    """
    if game.game_over():
        return

    explore = np.random.rand() < epsilon
    if explore:
        action = tuple(random.choice(game.list_actions()))
    else:
        batch = np.expand_dims(game.get_state(), axis=0)
        q = model.predict(batch).reshape(shape)

        # Highest predicted value first.
        ranked = sorted(game.list_actions(), key=lambda cell: -q[tuple(cell)])
        last_rank = len(ranked) - 1

        rank = 0
        while True:
            # Draw first, then check the bound, so the number of random draws
            # does not depend on how the loop is written.
            if not np.random.rand() < delta:
                break
            if not rank < last_rank:
                break
            rank += 1

        action = tuple(ranked[rank])

    revard = game.take_action(action)

    transition = [game.get_previous_state(), action, revard, game.get_state(), game.game_over()]
    experiences.append(transition)

def make_manual_move(i, j):
    """Play a human-chosen move at row ``i``, column ``j`` and record the transition.

    Does nothing when the game is already over. The transition is appended to
    the global ``experiences`` list in the same
    ``[previous_state, action, reward, state, game_over]`` layout that
    ``make_move`` uses.
    """
    if game.game_over():
        return

    # These two names were previously undefined, so every manual move raised
    # NameError after the board had already been changed.
    action = (i, j)
    revard = game.take_action(action)
    experiences.append([game.get_previous_state(), action, revard, game.get_state(), game.game_over()])

In [29]:
from IPython.core.display import display as core_display
from IPython.core.display import HTML 

def state_for_display():
    """Print the current board as an HTML table for the notebook front end.

    Each cell is coloured by the network's predicted value for that cell,
    rescaled to 0..255 (green rises and blue falls with the value). The last
    move additionally gets a full red component. Empty cells carry an
    ``onclick`` handler calling the JavaScript ``make_move(i, j)``. The markup
    is printed rather than returned because the front end reads it from the
    kernel's text output.
    """
    raw_state = game.get_raw_state()
    last_action = game.get_last_action()

    color_map = model.predict(np.expand_dims(game.get_state(), axis=0)).reshape(shape)
    color_map = color_map - np.min(color_map)
    peak = np.max(color_map)
    if peak > 0:
        color_map = (color_map / peak * 255).astype(int)
    else:
        # A constant prediction would divide 0 by 0 and turn NaN into
        # arbitrary integers; show a flat colour instead.
        color_map = np.zeros(shape, dtype=int)

    parts = ['<table>']
    for i in range(shape[0]):
        parts.append('<tr>')
        for j in range(shape[1]):
            red = '255' if last_action == (i, j) else '0'
            green = str(color_map[i, j])
            blue = str(255 - color_map[i, j])

            parts.append(
                "<td style='border:1px solid gray; width:25px; height:25px; font-weight:bold; text-align:center; "
                "background-color:rgb(" + red + ", " + green + ", " + blue + ");'"
            )
            if raw_state[i, j, 2]:
                # Only empty cells are clickable.
                parts.append(" onclick='make_move(" + str(i) + "," + str(j) + ")'")
            parts.append(">")

            if raw_state[i, j, 0] == 1:
                parts.append('X')
            if raw_state[i, j, 1] == 1:
                parts.append('O')
            if raw_state[i, j, 2] == 1:
                # Placeholder so empty cells are not rendered as collapsed.
                parts.append('&nbsp;')

            parts.append("</td>")
        parts.append('</tr>')

    parts.append('</table>')
    if game.game_over():
        parts.append('<h3>Game Over</h3>')
    print(''.join(parts))

# Start from an empty board; the script below calls next() as soon as it loads.
game.reset()



# Interactive front end. The JavaScript asks the kernel to run make_move(),
# make_manual_move(i, j), state_for_display() and game.reset(), and copies the
# text printed by state_for_display() into the 'display_div' element.
# NOTE(review): relies on the global `IPython.notebook.kernel` object being
# available in the browser; confirm for the front end in use.
html = """

<div id='display_div'></div>
<button onclick='next()'>Make Next AI Move</button>
<button onclick='play()'>Autoplay</button>
<button onclick='reset()'>New Game</button>

<script type="text/Javascript">

    function display_state(out) {
        document.getElementById('display_div').innerHTML = out.content.text;
    }

    function next() {
        var kernel = IPython.notebook.kernel;
        kernel.execute('make_move()');
        kernel.execute('state_for_display()', {"iopub" : {"output":display_state}});
    }
    
    function make_move(i,j) {
        var kernel = IPython.notebook.kernel;
        kernel.execute('make_manual_move('+i+', '+j+')');
        kernel.execute('state_for_display()', {"iopub" : {"output":display_state}});
        setTimeout(next, 300);
    }
    
    function play() {
        var kernel = IPython.notebook.kernel;
        next();
        kernel.execute('print(game.game_over(), end="")', {"iopub" : {"output":function(out) {
            if(out.content.text == "False") {
                setTimeout(play, 100);
            }
        }}});
        
    }
    
    function reset() {
        var kernel = IPython.notebook.kernel;
        kernel.execute('game.reset()');
        next()
    }
    
    next()
</script>
"""

# Render the widget in this cell's output.
core_display(HTML(html))

In [23]:
# Exploration settings for the training run; these replace the smaller values
# assigned earlier in the notebook.
# epsilon: probability of a uniformly random move in make_move().
epsilon = 0.2
# delta: probability of stepping down to the next-ranked move in make_move().
delta = 0.7

In [24]:
# Replay buffer. Each entry appended by make_move() is a list
# [previous_state, action, reward, state, game_over].
experiences = []

In [26]:
# NOTE(review): presumably meant to (re)set the optimizer's learning rate.
# This rebinds the attribute to a plain float; whether an already-compiled
# model picks the new value up depends on the Keras version. Confirm, or use
# the backend's set_value on the existing variable.
opt.lr = 1e-5

In [27]:
# Discount factor applied to the estimated future value when the training
# targets are computed.
discount = 0.9

In [28]:
# Self-play training: play one full game with make_move(), then run five
# training steps on random samples from the replay buffer.
# Each experience is [previous_state, action, reward, state, game_over].
for i in range(10000):
    game.reset()
    game_length = 0
    while not game.game_over():
        make_move()
        game_length+=1

    # experiences[-1] is the final move of the game and experiences[-2] the
    # opponent's move just before it. Penalise that move only when the final
    # move actually won; a game that ended because the board filled up has no
    # loser.
    if experiences[-1][2] == 1:
        experiences[-2][2] = -1
    
    for k in range(5):
        print('.', end='')
        seed_batch = random.sample(experiences, min(100,len(experiences)))

        # Current predictions for the sampled pre-move states; only the entry
        # of the action actually taken is overwritten below, so every other
        # cell contributes zero error.
        original_q = model.predict(np.array([item[0] for item in seed_batch]))
        
        new_q_max = []
        for index in range(len(seed_batch)):
            # Free cells of the post-move state, i.e. the opponent's possible replies.
            valid_counter_moves = np.transpose(np.nonzero(seed_batch[index][3][:,:,2])).tolist()

            next_valid_states = list(map(lambda m: np.copy(seed_batch[index][3]), valid_counter_moves))
            
            if len(next_valid_states) > 0:
                # Apply each reply on its own copy; the reply goes to channel 1
                # because channel 0 of the stored state holds the mover's stones.
                for state_index in range(len(valid_counter_moves)):
                    next_valid_states[state_index][tuple(valid_counter_moves[state_index])] = np.array([0,1,0])

                new_q = np.array(model.predict(np.array(next_valid_states)))

                # Best predicted value over the whole board for each reply,
                # then the worst case over all replies.
                # NOTE(review): the maximum also runs over occupied cells;
                # confirm whether those should be masked out.
                new_q_max.append(np.min(np.max(np.max(new_q, axis=1), axis=1), axis=0)[0])
            else:
                new_q_max.append(0)
        
        revards = np.array([item[2] for item in seed_batch])
        game_over = np.array([item[4] for item in seed_batch]).astype(int)
        # Terminal transitions keep only their immediate reward.
        calculated_q = (revards + discount * np.array(new_q_max) * (1-game_over)).tolist()

        desired_q = np.copy(original_q)
        for index in range(len(calculated_q)):
            # item[1] is the (row, col) tuple of the move that was played.
            desired_q[index][seed_batch[index][1]] = calculated_q[index]

        loss = model.train_on_batch(np.array([item[0] for item in seed_batch]), desired_q)
    
    display.clear_output(wait=True)
    print(i, game_length, loss, '(', len(experiences), ')')
    
    # Keep the buffer bounded by dropping a random subset once it grows too large.
    if len(experiences) > 30000:
        experiences = random.sample(experiences, 25000)
    

49 39 0.00742505 ( 2825 )
.

KeyboardInterrupt: 

In [2]:
# Scratch check of random.sample(): draws 10 values from range(100) without
# replacement. Not used by the rest of the notebook.
import random
random.sample(range(100), 10)

[55, 31, 49, 38, 68, 58, 33, 63, 6, 78]