# Mock AlphaGo (3) - Reinforcement Learning
In this notebook, we train the policy models by letting them compete against each other and learning from the winning games.

In [None]:
import os, numpy as np
from caffe2.python import core, model_helper, workspace, brew, utils
from caffe2.proto import caffe2_pb2

%matplotlib inline
from matplotlib import pyplot

# Number of games played side by side in this tournament.
# If it is greater than 1, make sure the choices are diversified between games.
GAMES_ITERS = 16

# Choose the device option used for every net and blob in this notebook.
if not workspace.has_gpu_support:
    device_opts = core.DeviceOption(caffe2_pb2.CPU, 0)
    print('Running in CPU mode')
else:
    device_opts = core.DeviceOption(caffe2_pb2.CUDA, workspace.GetDefaultGPUID())
    print('Running in GPU mode on default device {}'.format(workspace.GetDefaultGPUID()))

arg_scope = dict(order="NCHW")

# Base folder under which the per-player parameter folders are located.
ROOT_FOLDER = os.path.join(
    os.path.expanduser('~'), 'python', 'tutorial_data', 'go', 'param')

### Config for black player
BLACK_WORKSPACE = os.path.join(ROOT_FOLDER, 'black')
BLACK_CONV_LEVEL = 13
BLACK_FILTERS = 192
BLACK_PRE_TRAINED_ITERS = 1
# Folder the black parameters are loaded from before training.
BLACK_LOAD_FOLDER = os.path.join(
    ROOT_FOLDER,
    "RL-B-conv={}-k={}-iter={}".format(
        BLACK_CONV_LEVEL, BLACK_FILTERS, BLACK_PRE_TRAINED_ITERS))

### Config for white player
WHITE_WORKSPACE = os.path.join(ROOT_FOLDER, 'white')
WHITE_CONV_LEVEL = 13
WHITE_FILTERS = 192
WHITE_PRE_TRAINED_ITERS = 1
# Folder the white parameters are loaded from before training.
WHITE_LOAD_FOLDER = os.path.join(
    ROOT_FOLDER,
    "RL-W-conv={}-k={}-iter={}".format(
        WHITE_CONV_LEVEL, WHITE_FILTERS, WHITE_PRE_TRAINED_ITERS))

# BOARD_POSITION holds the SGF symbol of each row (or column) of the board,
# so it converts between 0,1,2,3... and a,b,c,d...
# The symbols [tt] and [] mean PASS in SGF and are therefore left out.
BOARD_POSITION = 'abcdefghijklmnopqrs'

# Summarise both players as conv level / filters / pre-trained iterations.
print('Black {}/{}/{} vs. White {}/{}/{}'.format(
    BLACK_CONV_LEVEL, BLACK_FILTERS, BLACK_PRE_TRAINED_ITERS,
    WHITE_CONV_LEVEL, WHITE_FILTERS, WHITE_PRE_TRAINED_ITERS))

> Training parameters

In [None]:
# Base learning rate, expected in (-0.01, 0).
# AlphaGo uses -0.003 and halves the number every 80m steps.
BASE_LR = -0.003

# Number of samples trained within one mini-batch; depends on your hardware.
TRAIN_BATCHES = 16
#PRE_TRAINED_ITERS = 60000 # [0, infinity) how many batches the model has been trained before
#SKIP_TRAINED_DATA = 0 # [0, infinity) if this is a resumed training, how many input data will be skipped
# Number of batches the model will be trained, in [0, infinity).
TRAIN_ITERS = 1

# Folder the black parameters are stored in after training.
BLACK_SAVE_FOLDER = os.path.join(
    ROOT_FOLDER,
    "RL-B-conv={}-k={}-iter={}".format(
        BLACK_CONV_LEVEL, BLACK_FILTERS, BLACK_PRE_TRAINED_ITERS + TRAIN_ITERS))

# Folder the white parameters are stored in after training.
WHITE_SAVE_FOLDER = os.path.join(
    ROOT_FOLDER,
    "RL-W-conv={}-k={}-iter={}".format(
        WHITE_CONV_LEVEL, WHITE_FILTERS, WHITE_PRE_TRAINED_ITERS + TRAIN_ITERS))

## AlphaGo Neural Network Architecture
Refer to the AlphaGo paper:
>  We also trained a faster but less accurate rollout policy $p_\pi(a|s)$, using a linear softmax of small pattern features (see Extended Data Table 4) with weights $\pi$; this achieved an accuracy of 24.2%, using just 2µs to select an action, rather than 3ms for the policy network.

In [None]:
from modeling import AddConvModel, AddTrainingOperators

## Build the actual network

In [None]:
import caffe2.python.predictor.predictor_exporter as pe

# Reusable mini-batch buffers (contents are uninitialised until filled).
# `data` holds TRAIN_BATCHES inputs of shape (48, 19, 19); `label` holds one int32 per input.
data = np.empty((TRAIN_BATCHES, 48, 19, 19), np.float32)
label = np.empty((TRAIN_BATCHES, 1), np.int32)

### Train Net Initialize
>Train Net: Blob('data','label') ==> Predict Net ==> Loss ==> Backward Propagation

In [None]:
# Initialize Black player: build and create its training net in the black workspace.
workspace.SwitchWorkspace(BLACK_WORKSPACE, True)  # True: create the workspace if it is missing
with core.DeviceScope(device_opts):
    black_train_model = model_helper.ModelHelper(name="black_train_model", arg_scope=arg_scope, init_params=True)
    # Feed placeholder blobs so that "data" and "label" exist before the net is created.
    workspace.FeedBlob("data", data, device_option=device_opts)
    predict = AddConvModel(black_train_model, "data", conv_level=BLACK_CONV_LEVEL, filters=BLACK_FILTERS)
    # Fix: the "label" blob must receive the int32 label buffer, not the float32 input tensor.
    workspace.FeedBlob("label", label, device_option=device_opts)
    AddTrainingOperators(black_train_model, "predict", "label", base_lr=BASE_LR)
    # Run the parameter initialisation once, then instantiate the training net.
    workspace.RunNetOnce(black_train_model.param_init_net)
    workspace.CreateNet(black_train_model.net, overwrite=True)

In [None]:
# Initialize White player: build and create its training net in the white workspace.
workspace.SwitchWorkspace(WHITE_WORKSPACE, True)  # True: create the workspace if it is missing
with core.DeviceScope(device_opts):
    white_train_model = model_helper.ModelHelper(name="white_train_model", arg_scope=arg_scope, init_params=True)
    # Feed placeholder blobs so that "data" and "label" exist before the net is created.
    workspace.FeedBlob("data", data, device_option=device_opts)
    predict = AddConvModel(white_train_model, "data", conv_level=WHITE_CONV_LEVEL, filters=WHITE_FILTERS)
    # Fix: the "label" blob must receive the int32 label buffer, not the float32 input tensor.
    workspace.FeedBlob("label", label, device_option=device_opts)
    AddTrainingOperators(white_train_model, "predict", "label", base_lr=BASE_LR)
    # Run the parameter initialisation once, then instantiate the training net.
    workspace.RunNetOnce(white_train_model.param_init_net)
    workspace.CreateNet(white_train_model.net, overwrite=True)

### Deploy Net
Build the deploy net and initialize it with the saved weights and biases.
>Predict Net: Blob('data') ==> Predict Net ==> Blob('predict')

In [None]:
# Black player: set up the inference (deploy) net inside the black workspace.
workspace.SwitchWorkspace(BLACK_WORKSPACE, True)
with core.DeviceScope(device_opts):
    black_deploy_model = model_helper.ModelHelper(
        name="black_policy_deploy", arg_scope=arg_scope, init_params=False)
    if BLACK_PRE_TRAINED_ITERS <= 0:
        # Nothing has been trained yet: build the net from the model definition.
        AddConvModel(black_deploy_model, "data", conv_level=BLACK_CONV_LEVEL, filters=BLACK_FILTERS)
        workspace.RunNetOnce(black_deploy_model.param_init_net)
        workspace.CreateNet(black_deploy_model.net, overwrite=True)
    else:
        # Otherwise load the previously exported prediction net from the minidb file.
        black_deploy_model.net = pe.prepare_prediction_net(
            os.path.join(BLACK_LOAD_FOLDER, "policy_model.minidb"),
            "minidb",
            device_option=device_opts)

In [None]:
# White player: set up the inference (deploy) net inside the white workspace.
workspace.SwitchWorkspace(WHITE_WORKSPACE, True)
with core.DeviceScope(device_opts):
    white_deploy_model = model_helper.ModelHelper(
        name="white_policy_deploy", arg_scope=arg_scope, init_params=False)
    if WHITE_PRE_TRAINED_ITERS <= 0:
        # Nothing has been trained yet: build the net from the model definition.
        AddConvModel(white_deploy_model, "data", conv_level=WHITE_CONV_LEVEL, filters=WHITE_FILTERS)
        workspace.RunNetOnce(white_deploy_model.param_init_net)
        workspace.CreateNet(white_deploy_model.net, overwrite=True)
    else:
        # Otherwise load the previously exported prediction net from the minidb file.
        white_deploy_model.net = pe.prepare_prediction_net(
            os.path.join(WHITE_LOAD_FOLDER, "policy_model.minidb"),
            "minidb",
            device_option=device_opts)

## Run the tournament and training

### Compete

In [None]:
from go import GameState, BLACK, WHITE, EMPTY
from preprocessing import Preprocess

# Feature names handed to Preprocess to encode a game state as the network input.
DEFAULT_FEATURES = [
    "board", "ones", "turns_since", "liberties", "capture_size",
    "self_atari_size", "liberties_after", "ladder_capture", "ladder_escape",
    "sensibleness", "zeros"]

# Per-game tournament state; index i always refers to game i.
game_state = [GameState() for i in range(GAMES_ITERS)]
game_result = [0] * GAMES_ITERS # 0 - Not Ended; BLACK - Black Wins; WHITE - White Wins
# NOTE(review): every entry refers to the same Preprocess instance; this assumes
# Preprocess keeps no per-game state - confirm.
p = [Preprocess(DEFAULT_FEATURES)] * GAMES_ITERS
history = [[] for i in range(GAMES_ITERS)]
# board before current move
board = [p[i].state_to_tensor(game_state[i]).astype(np.float32) for i in range(GAMES_ITERS)]

# Diversify the opening: evaluate the first position once with the black net and
# give game i the i-th best-rated move, so that the games differ from each other.
workspace.SwitchWorkspace(BLACK_WORKSPACE)
workspace.FeedBlob('data', board[0], device_option=device_opts)

workspace.RunNet(black_deploy_model.net)
init_move = np.reshape(workspace.FetchBlob('predict'), (-1))
init_sorted_move = np.argsort(-init_move) # flat move indices, best first

current_choice = init_sorted_move[0:GAMES_ITERS]
# Fix: floor division keeps the row index an integer on Python 3 as well;
# `/` would produce float coordinates there.
x = current_choice // 19 # row index of each game's opening move
y = current_choice % 19  # column index of each game's opening move

for i in range(GAMES_ITERS):
    # Record (colour tag, row, column, position before the move) for later training.
    history[i].append(('B',x[i],y[i],board[i]))
    game_state[i].do_move(action=(x[i],y[i]),color = BLACK)
    print('game({}) step({}) black move({},{})'.format(i, 0, x[i], y[i]))

In [None]:
# Play all games forward, one move per game and step, for at most 499 further steps.
# Black opened at step 0, so odd steps belong to white and even steps to black.
for step in range(1,500):

    # Stack the current position of every game into one batched input tensor.
    board = np.concatenate([p[i].state_to_tensor(game_state[i]).astype(np.float32) for i in range(GAMES_ITERS)])

    # Select the workspace, net and colour of the player to move.
    if step % 2 == 0:
        player_workspace, player_net, color, tag, side = BLACK_WORKSPACE, black_deploy_model.net, BLACK, 'B', 'black'
    else:
        player_workspace, player_net, color, tag, side = WHITE_WORKSPACE, white_deploy_model.net, WHITE, 'W', 'white'

    # Evaluate all positions in one forward pass of the current player's net.
    workspace.SwitchWorkspace(player_workspace)
    workspace.FeedBlob('data', board, device_option=device_opts)
    workspace.RunNet(player_net)
    move = np.reshape(workspace.FetchBlob('predict'), (GAMES_ITERS,-1))
    sorted_move = np.argsort(-move) # per game: flat move indices, best first

    for i in range(GAMES_ITERS):
        if game_result[i]:
            # Fix: skip only this finished game. The original `break` left the
            # loop and thereby also skipped every later game still in progress.
            continue
        legal_moves = [row*19+col for (row,col) in game_state[i].get_legal_moves(include_eyes=False)] # [59, 72, ...]
        if legal_moves: # at least 1 legal move
            mask = np.in1d(sorted_move[i], legal_moves) # [True, False, True, ...]
            current_choice = sorted_move[i][mask][0] # The top legal move
            # Fix: floor division keeps the row an integer on Python 3 as well.
            (x, y) = (current_choice // 19, current_choice % 19)
            history[i].append((tag,x,y,board[i]))
            game_state[i].do_move(action=(x,y),color = color)
            print('game({}) step({}) {} move({},{})'.format(i, step, side, x, y))
        else:
            # NOTE(review): no move (or pass) is played here; the game is only marked
            # finished if `is_end_of_game` is already truthy - confirm that GameState
            # sets it in this situation and that it is an attribute, not a method.
            game_result[i] = game_state[i].is_end_of_game

    # Stop as soon as every game has a result.
    if np.all(game_result):
        break

### Record the game in SGF format

In [None]:
from sgfutil import WriteBackSGF

# Writes one SGF record per game; comment this cell out for better performance.
for i in range(GAMES_ITERS):
    # File name: (black conv_filters_iters)vs(white conv_filters_iters)_<game index>
    sgf_name = '({}_{}_{})vs({}_{}_{})_{}'.format(
        BLACK_CONV_LEVEL, BLACK_FILTERS, BLACK_PRE_TRAINED_ITERS,
        WHITE_CONV_LEVEL, WHITE_FILTERS, WHITE_PRE_TRAINED_ITERS, i)
    filename = os.path.join(
        os.path.expanduser('~'), 'python', 'tutorial_files', 'selfplay', sgf_name)
    WriteBackSGF(game_state[i], history[i], filename)

## Learn from the winning games

In [None]:
# Collect the winner's moves of every game into mini-batches and train both
# players' nets on each full batch.
# Fix: the batch counter no longer shadows the builtin `iter`.
trained_batches = 0
k = 0  # number of samples currently stored in the batch buffers
for i in range(GAMES_ITERS):
    winner = game_state[i].get_winner()
    # History tag of the winning colour; None if the winner is neither BLACK nor WHITE.
    winner_tag = 'B' if winner == BLACK else ('W' if winner == WHITE else None)
    # Fix: report the actual winner. The original printed the tag of the game's
    # first move, which is always 'B'.
    # NOTE(review): the factor 32 is kept from the original; it may have been
    # meant to follow TRAIN_BATCHES - confirm.
    print('Learning {} steps in {} of {} games. {} wins'.format(
        trained_batches * 32, i, GAMES_ITERS, winner_tag or 'Nobody'))
    for record in history[i]:
        tag, row, col, position = record
        if winner_tag is None or tag != winner_tag:
            continue  # only moves played by the winner are learned
        data[k] = position
        label[k] = row*19+col  # flat index of the move on the board
        k += 1
        if k == TRAIN_BATCHES:
            # The batch is full: run one training step on each player's net.
            trained_batches += 1
            k = 0
            for player_workspace, train_model in ((BLACK_WORKSPACE, black_train_model),
                                                  (WHITE_WORKSPACE, white_train_model)):
                workspace.SwitchWorkspace(player_workspace)
                workspace.FeedBlob("data", data, device_option=device_opts)
                workspace.FeedBlob("label", label, device_option=device_opts)
                workspace.RunNet(train_model.net)
# Samples left in a partially filled batch (k < TRAIN_BATCHES) are not trained.
print('Finished')

### Save the RL model

In [None]:
def _export_policy(workspace_name, deploy_model, train_model, save_folder):
    """Export one player's prediction net and parameters to a minidb file.

    Args:
        workspace_name: workspace holding this player's blobs.
        deploy_model: ModelHelper whose ``net`` is the prediction net to export.
        train_model: ModelHelper of the same player, used as a fallback source
            of parameter names.
        save_folder: destination folder; created if it does not exist.

    Returns:
        The PredictorExportMeta that was written.
    """
    # Fix: save_to_db reads blobs from the current workspace, so switch to this
    # player's workspace first. The original exported both players from whichever
    # workspace happened to be active.
    workspace.SwitchWorkspace(workspace_name)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    # Fix: when the deploy net was loaded with prepare_prediction_net, the
    # ModelHelper's own `params` list is empty and no weights would be exported.
    # NOTE(review): the fallback assumes the train model uses the same parameter
    # blob names as the loaded net - confirm.
    parameters = [str(b) for b in deploy_model.params] or [str(b) for b in train_model.params]
    # construct the model to be exported
    meta = pe.PredictorExportMeta(
        predict_net=deploy_model.net.Proto(),
        parameters=parameters,
        inputs=["data"],
        outputs=["predict"],
    )
    pe.save_to_db("minidb", os.path.join(save_folder, "policy_model.minidb"), meta)
    print('Params saved to {}'.format(save_folder))
    return meta


# Export black first, then white; `pe_meta` ends up holding the white export meta.
pe_meta = _export_policy(BLACK_WORKSPACE, black_deploy_model, black_train_model, BLACK_SAVE_FOLDER)
pe_meta = _export_policy(WHITE_WORKSPACE, white_deploy_model, white_train_model, WHITE_SAVE_FOLDER)