# Mock AlphaGo (3) - Reinforcement Learning
In this notebook, we will train the models by letting them compete against each other.

In [None]:
import os, numpy as np
from caffe2.python import core, model_helper, workspace, brew, utils
from caffe2.proto import caffe2_pb2

%matplotlib inline
from matplotlib import pyplot

# Number of games played side by side in this tournament.
# When greater than 1, the games must not all make the same choices
# (the first move of every game is diversified for that reason).
GAMES_ITERS = 16

# Device used for every net in this notebook: the default GPU when caffe2
# reports GPU support, otherwise the CPU.
if not workspace.has_gpu_support:
    device_opts = core.DeviceOption(caffe2_pb2.CPU, 0)
    print('Running in CPU mode')
else:
    device_opts = core.DeviceOption(caffe2_pb2.CUDA, workspace.GetDefaultGPUID())
    print('Running in GPU mode on default device {}'.format(workspace.GetDefaultGPUID()))

# Default arguments handed to ModelHelper for the operators it creates.
arg_scope = dict(order="NCHW")

DATA_FOLDER = os.path.join(os.path.expanduser('~'), 'python', 'tutorial_data', 'go')
# Root of the per-player workspaces and parameter folders
# (original note: this folder also stores the loss/accuracy log).
ROOT_FOLDER = os.path.join(DATA_FOLDER, 'param')

### Config for black player
BLACK_WORKSPACE = os.path.join(ROOT_FOLDER, 'black')
BLACK_CONV_LEVEL = 4            # conv_level passed to AddConvModel
BLACK_FILTERS = 128             # filters passed to AddConvModel
BLACK_PRE_TRAINED_ITERS = 1     # > 0 means: load saved params before playing
# where the params are loaded from before training
BLACK_LOAD_FOLDER = os.path.join(
    ROOT_FOLDER,
    "RL-B-conv={}-k={}-iter={}".format(BLACK_CONV_LEVEL, BLACK_FILTERS, 'na'))
BLACK_LOAD_INIT_NET, BLACK_LOAD_PREDICT_NET = (
    os.path.join(BLACK_LOAD_FOLDER, pb_file)
    for pb_file in ("param_init_net.pb", "predict_net.pb"))

### Config for white player
WHITE_WORKSPACE = os.path.join(ROOT_FOLDER, 'white')
WHITE_CONV_LEVEL = 13           # conv_level passed to AddConvModel
WHITE_FILTERS = 192             # filters passed to AddConvModel
WHITE_PRE_TRAINED_ITERS = 1     # > 0 means: load saved params before playing
# where the params are loaded from before training
WHITE_LOAD_FOLDER = os.path.join(
    ROOT_FOLDER,
    "RL-W-conv={}-k={}-iter={}".format(WHITE_CONV_LEVEL, WHITE_FILTERS, 'na'))
WHITE_LOAD_INIT_NET, WHITE_LOAD_PREDICT_NET = (
    os.path.join(WHITE_LOAD_FOLDER, pb_file)
    for pb_file in ("param_init_net.pb", "predict_net.pb"))

# BOARD_POSITION holds the SGF symbol of each row (or column) of the board,
# so it converts between 0,1,2,3... and a,b,c,d...
# The symbols [tt] and [] mean PASS in SGF and are therefore left out.
BOARD_POSITION = 'abcdefghijklmnopqrs'

print('Black {}/{}/{} vs. White {}/{}/{}'.format(
    BLACK_CONV_LEVEL, BLACK_FILTERS, BLACK_PRE_TRAINED_ITERS,
    WHITE_CONV_LEVEL, WHITE_FILTERS, WHITE_PRE_TRAINED_ITERS))

> Training parameters

In [None]:
# Base learning rate, expected in (-0.01, 0). It is negative because the update
# step adds LR * gradient to each parameter.
# AlphaGo uses -0.003 and halves the number every 80m steps.
BASE_LR = -0.003

# Number of samples trained within one mini-batch; depends on your hardware.
TRAIN_BATCHES = 16
#PRE_TRAINED_ITERS = 60000 # [0, infinity) how many batches the model has been trained before
#SKIP_TRAINED_DATA = 0 # [0, infinity) if this is a resumed training, how many input data will be skipped
#TRAIN_ITERS = 40000 # [0, infinity) how many batches the model will be trained

# where black's params are stored after training
BLACK_SAVE_FOLDER = os.path.join(
    ROOT_FOLDER,
    "RL-B-conv={}-k={}-iter={}".format(BLACK_CONV_LEVEL, BLACK_FILTERS, 'na'))
BLACK_SAVE_INIT_NET, BLACK_SAVE_PREDICT_NET = (
    os.path.join(BLACK_SAVE_FOLDER, pb_file)
    for pb_file in ("param_init_net.pb", "predict_net.pb"))
# where white's params are stored after training
WHITE_SAVE_FOLDER = os.path.join(
    ROOT_FOLDER,
    "RL-W-conv={}-k={}-iter={}".format(WHITE_CONV_LEVEL, WHITE_FILTERS, 'na'))
WHITE_SAVE_INIT_NET, WHITE_SAVE_PREDICT_NET = (
    os.path.join(WHITE_SAVE_FOLDER, pb_file)
    for pb_file in ("param_init_net.pb", "predict_net.pb"))

## AlphaGo Neural Network Architecture
Refer to the AlphaGo paper:
>  We also trained a faster but less accurate rollout policy pπ(a|s), using a linear softmax of small pattern features (see Extended Data Table 4) with weights π; this achieved an accuracy of 24.2%, using just 2µs to select an action, rather than 3ms for the policy network.

### DCNN

In [None]:
def AddConvModel(model, data, conv_level=13, filters=192):
    ''' Add the policy network (DCNN) to `model` and return its output blob.

    The network is: one 5x5 convolution, a stack of 3x3 convolutions, and a
    final 1x1 convolution followed by softmax and flatten. Blob names
    (pad1/conv1/relu1 ... conv13/softmax/predict) are fixed, so saved
    parameters stay loadable. Black and white players don't share model
    params because each one is built in its own workspace.

    Args:
        model: ModelHelper that receives the operators.
        data: input blob (name or reference) with 48 feature planes.
        conv_level: total number of convolution layers including the first and
            the last one. Hidden 3x3 layers 2..(conv_level-1) are added, capped
            at layer 12 so that the final layer can keep the name 'conv13'.
            Values <= 2 produce only the first and the final convolution.
        filters: number of output channels of every layer except the last.

    Returns:
        The 'predict' blob (flattened softmax output).
    '''
    # Layer 1: 48 x 19 x 19 -pad-> 48 x 23 x 23 -conv-> filters x 19 x 19
    pad1 = model.PadImage(data, 'pad1', pad_t=2, pad_l=2, pad_b=2, pad_r=2, mode="constant", value=0.)
    conv1 = brew.conv(model, pad1, 'conv1', dim_in=48, dim_out=filters, kernel=5)
    # `relu` always names the output of the most recent layer; starting from
    # layer 1 keeps the final convolution valid even without hidden layers.
    relu = brew.relu(model, conv1, 'relu1')
    # Layer 2-12: filters x 19 x 19 -pad-> filters x 21 x 21 -conv-> filters x 19 x 19
    for level in range(2, min(conv_level, 13)):
        pad = model.PadImage(relu, 'pad{}'.format(level),
                             pad_t=1, pad_l=1, pad_b=1, pad_r=1, mode="constant", value=0.)
        conv = brew.conv(model, pad, 'conv{}'.format(level), dim_in=filters, dim_out=filters, kernel=3)
        relu = brew.relu(model, conv, 'relu{}'.format(level))
    # Layer 13: filters x 19 x 19 -conv-> 1 x 19 x 19 -softmax-> 361
    conv13 = brew.conv(model, relu, 'conv13', dim_in=filters, dim_out=1, kernel=1)
    ## todo: bias layer?
    softmax = brew.softmax(model, conv13, 'softmax')
    predict = model.Flatten(softmax, 'predict')
    return predict

### Training Operator (Backward Propagation)
The training operators are almost the same as in the MNIST tutorial. Refer to the AlphaGo paper:
>TBD

In [None]:
def AddTrainingOperators(model, softmax, label):
    """Append loss, gradient and plain-SGD update operators to `model`.

    Args:
        model: ModelHelper whose net receives the operators.
        softmax: blob with the predicted move distribution.
        label: blob with the index of the move that was actually played.

    The learning rate comes from the notebook-level BASE_LR.
    """
    # Cross entropy between prediction and label, averaged into one loss value.
    cross_entropy = model.LabelCrossEntropy([softmax, label], 'xent')
    mean_loss = model.AveragedLoss(cross_entropy, "loss")
    # (accuracy tracking is intentionally not added: AddAccuracy(model, softmax, label))
    # Gradient operators are derived from the averaged loss.
    model.AddGradientOperators([mean_loss])
    # Iteration counter that drives the learning-rate schedule.
    step_counter = brew.iter(model, "iter")
    # With policy="fixed", stepsize and gamma are ignored.
    learning_rate = model.LearningRate(
        step_counter, "LR", base_lr=BASE_LR, policy="fixed", stepsize=1, gamma=0.999)
    # Constant 1.0 used as the weight of the parameter itself in the update.
    # It only has to be created once, hence it lives in param_init_net.
    unit_weight = model.param_init_net.ConstantFill([], "ONE", shape=[1], value=1.0)
    # SGD step for every parameter: param = 1 * param + LR * grad.
    # ModelHelper keeps the param -> gradient mapping in param_to_grad.
    for weights in model.params:
        model.WeightedSum(
            [weights, unit_weight, model.param_to_grad[weights], learning_rate], weights)

## Build the actual network

In [None]:
import caffe2.python.predictor.predictor_exporter as pe

# Host-side mini-batch buffers shared by both players' training nets:
# TRAIN_BATCHES boards of 48 planes x 19 x 19, and one move index per board.
# np.empty leaves the contents uninitialised; they are filled before training.
data = np.empty((TRAIN_BATCHES, 48, 19, 19), dtype=np.float32)
label = np.empty((TRAIN_BATCHES, 1), dtype=np.int32)

### Train Net Initialize
>Train Net: Blob('data','label') ==> Predict Net ==> Loss ==> Backward Propagation

In [None]:
# Initialize Black player
# Black gets its own workspace (created on first use), so its parameter blobs
# never mix with white's identically named blobs.
workspace.SwitchWorkspace(BLACK_WORKSPACE, True)
with core.DeviceScope(device_opts):
    black_train_model = model_helper.ModelHelper(name="black_train_model", arg_scope=arg_scope, init_params=True)
    # Feed placeholder inputs so both blobs exist before the net is created.
    # "label" must receive the int32 label buffer, not the float32 board buffer.
    workspace.FeedBlob("data", data, device_option=device_opts)
    workspace.FeedBlob("label", label, device_option=device_opts)
    predict = AddConvModel(black_train_model, "data", conv_level=BLACK_CONV_LEVEL, filters=BLACK_FILTERS)
    AddTrainingOperators(black_train_model, predict, "label")
# Initialise the parameters once, then instantiate the training net for repeated runs.
workspace.RunNetOnce(black_train_model.param_init_net)
workspace.CreateNet(black_train_model.net, overwrite=True)

In [None]:
# Initialize White player
# White gets its own workspace (created on first use), so its parameter blobs
# never mix with black's identically named blobs.
workspace.SwitchWorkspace(WHITE_WORKSPACE, True)
with core.DeviceScope(device_opts):
    white_train_model = model_helper.ModelHelper(name="white_train_model", arg_scope=arg_scope, init_params=True)
    # Feed placeholder inputs so both blobs exist before the net is created.
    # "label" must receive the int32 label buffer, not the float32 board buffer.
    workspace.FeedBlob("data", data, device_option=device_opts)
    workspace.FeedBlob("label", label, device_option=device_opts)
    predict = AddConvModel(white_train_model, "data", conv_level=WHITE_CONV_LEVEL, filters=WHITE_FILTERS)
    AddTrainingOperators(white_train_model, predict, "label")
# Initialise the parameters once, then instantiate the training net for repeated runs.
workspace.RunNetOnce(white_train_model.param_init_net)
workspace.CreateNet(white_train_model.net, overwrite=True)

### Deploy Net
Build the Deploy Net and initialize it with the saved weights and biases.
>Predict Net: Blob('data') ==> Predict Net ==> Blob('predict')

In [None]:
def _build_deploy_model(workspace_name, model_name, conv_level, filters, pre_trained_iters, load_folder):
    """Create one player's deploy (prediction-only) model in its workspace.

    Switches to `workspace_name`, builds the convolution model without
    parameter initialisation, then either replaces the net by the one stored
    in `load_folder`/policy_model.minidb (when `pre_trained_iters` > 0) or
    runs the model's param_init_net. Finally the net is created in the
    workspace. Returns the ModelHelper.
    """
    workspace.SwitchWorkspace(workspace_name, True)
    with core.DeviceScope(device_opts):
        deploy_model = model_helper.ModelHelper(name=model_name, arg_scope=arg_scope, init_params=False)
        AddConvModel(deploy_model, "data", conv_level=conv_level, filters=filters)
    if pre_trained_iters > 0:
        minidb_path = os.path.join(load_folder, "policy_model.minidb")
        deploy_model.net = pe.prepare_prediction_net(minidb_path, "minidb", device_option=device_opts)
    else:
        workspace.RunNetOnce(deploy_model.param_init_net)
    workspace.CreateNet(deploy_model.net, overwrite=True)
    return deploy_model

# Initialize Black player
black_deploy_model = _build_deploy_model(
    BLACK_WORKSPACE, "black_policy_deploy",
    BLACK_CONV_LEVEL, BLACK_FILTERS, BLACK_PRE_TRAINED_ITERS, BLACK_LOAD_FOLDER)

# Initialize White player
white_deploy_model = _build_deploy_model(
    WHITE_WORKSPACE, "white_policy_deploy",
    WHITE_CONV_LEVEL, WHITE_FILTERS, WHITE_PRE_TRAINED_ITERS, WHITE_LOAD_FOLDER)

## Run the tournament and training

### Compete

In [None]:
from go import GameState, BLACK, WHITE, EMPTY
from preprocessing import Preprocess
from datetime import datetime

# Per-game bookkeeping; index i always refers to game i.
game_state = [GameState() for i in range(GAMES_ITERS)]
game_result = [0] * GAMES_ITERS # falsy - Not Ended; truthy - game is over
# NOTE(review): every entry refers to the same Preprocess instance; this assumes
# Preprocess keeps no per-game state - confirm.
p = [Preprocess()] * GAMES_ITERS
# history[i] collects (color tag, x, y, board before the move) for game i
history = [[] for i in range(GAMES_ITERS)]
# board before current move
board = [p[i].state_to_tensor(game_state[i]).astype(np.float32) for i in range(GAMES_ITERS)]

# Diversify the first step: all boards are identical at this point, so black's
# net is evaluated once and its GAMES_ITERS best-rated moves are dealt out,
# one per game.
workspace.SwitchWorkspace(BLACK_WORKSPACE)
workspace.FeedBlob('data', board[0], device_option=device_opts)

workspace.RunNet(black_deploy_model.net)
init_move = np.reshape(workspace.FetchBlob('predict'), (-1))
init_sorted_move = np.argsort(-init_move) # shape=(361,), best move first

current_choice = init_sorted_move[0:GAMES_ITERS]
# Floor division keeps the coordinates integral on Python 3 as well; with "/"
# they would become floats there and no longer be usable as board indices.
x = current_choice // 19 # tensor
y = current_choice % 19 # tensor

for i in range(GAMES_ITERS):
    history[i].append(('B',x[i],y[i],board[i]))
    game_state[i].do_move(action=(x[i],y[i]),color = BLACK)
    print('game({}) step({}) black move({},{})'.format(i, 0, x[i], y[i]))

In [None]:
# Play all games in lock-step: one move per unfinished game per step.
# Step 0 was black's opening move, so odd steps are white's and even steps black's.
for step in range(1,500):

    # Current position of every game, stacked into one batch for the policy net.
    board = np.concatenate([p[i].state_to_tensor(game_state[i]).astype(np.float32) for i in range(GAMES_ITERS)])

    # pass move = if all moves are illegal; cleared as soon as a game finds a move
    pass_move = [True] * GAMES_ITERS

    # Select the player to move; everything below is identical for both colors.
    if step % 2 == 0:
        mover_workspace, mover_net = BLACK_WORKSPACE, black_deploy_model.net
        mover_color, mover_tag, mover_name = BLACK, 'B', 'black'
    else:
        mover_workspace, mover_net = WHITE_WORKSPACE, white_deploy_model.net
        mover_color, mover_tag, mover_name = WHITE, 'W', 'white'

    workspace.SwitchWorkspace(mover_workspace)
    workspace.FeedBlob('data', board, device_option=device_opts)
    workspace.RunNet(mover_net)
    move = np.reshape(workspace.FetchBlob('predict'), (GAMES_ITERS,-1))
    sorted_move = np.argsort(-move) # per game: move indices, best rated first

    for i in range(GAMES_ITERS):
        if game_result[i]:
            # Only this game is over. `continue` (not `break`) so that the
            # remaining games still get their move in this step.
            continue
        # Take the best-rated move that is legal and does not fill an own eye.
        for current_choice in sorted_move[i]:
            # Floor division keeps the coordinates integral on Python 3 too.
            x = current_choice // 19
            y = current_choice % 19
            if game_state[i].is_legal(action=(x,y)) and not game_state[i].is_eye((x, y), mover_color):
                pass_move[i] = False
                history[i].append((mover_tag,x,y,board[i]))
                game_result[i] = game_state[i].do_move(action=(x,y),color = mover_color) # End of Game?
                print('game({}) step({}) {} move({},{})'.format(i, step, mover_name, x, y))
                break

    # A game is over once do_move reported the end or the mover had to pass.
    game_result = [game_result[i] or pass_move[i] for i in range(GAMES_ITERS)]

    if np.all(game_result):
        break

### Record the game in SGF format

In [None]:
import sgf
def _add_sgf_property(parser, ident, value):
    """Emit a single-valued SGF property `ident[value]` through `parser`."""
    parser.start_property(ident)
    parser.add_prop_value(value)
    parser.end_property()


def write_back_sgf(game_state, history, i):
    """Write one finished game to an SGF file under ~/python/tutorial_files/selfplay.

    Args:
        game_state: finished game; only get_winner() is used.
        history: list of (color tag 'B'/'W', x, y, board) tuples in play order.
        i: index of the game, used in the file name.

    The output directory is created when it does not exist yet.
    """
    parser = sgf.Parser()
    collection = sgf.Collection(parser)
    parser.start_gametree()

    # Root node: game meta data
    parser.start_node()
    _add_sgf_property(parser, 'FF', '4')     # SGF format version
    _add_sgf_property(parser, 'SZ', '19')    # Board Size = 19
    _add_sgf_property(parser, 'KM', '7.5')   # Komi = 7.5
    # NOTE(review): the '{}' placeholders below are written literally - they are
    # never formatted with a model description.
    _add_sgf_property(parser, 'PB', 'RL-{}') # Black Player = Supervised Learning / Reinforced Learning
    _add_sgf_property(parser, 'PW', 'SL-{}') # White Player = Supervised Learning / Reinforced Learning
    _add_sgf_property(parser, 'DT', datetime.now().strftime("%Y-%m-%d")) # Game Date
    # Result = B+, W+, T
    winner = game_state.get_winner()
    if winner == BLACK:
        winner = 'B+'
    elif winner == WHITE:
        winner = 'W+'
    else:
        winner = 'T'
    _add_sgf_property(parser, 'RE', winner)
    parser.end_node()

    # One node per move: B[xy] or W[xy]
    for step in history:
        parser.start_node()
        # int() keeps the lookup working when the coordinates are numpy
        # scalars or integral floats.
        coordinate = BOARD_POSITION[int(step[1])] + BOARD_POSITION[int(step[2])]
        _add_sgf_property(parser, step[0], coordinate)
        parser.end_node()

    parser.end_gametree()

    # record the game in SGF
    sgf_folder = os.path.join(os.path.expanduser('~'), 'python', 'tutorial_files','selfplay')
    if not os.path.exists(sgf_folder):
        os.makedirs(sgf_folder)
    sgf_name = '({}_{}_{})vs({}_{}_{})_{}_{}_{}.sgf'.format(
        BLACK_CONV_LEVEL, BLACK_FILTERS, BLACK_PRE_TRAINED_ITERS,
        WHITE_CONV_LEVEL, WHITE_FILTERS, WHITE_PRE_TRAINED_ITERS,
        winner,
        i,
        datetime.now().strftime("%Y-%m-%d"))
    with open(os.path.join(sgf_folder, sgf_name), "w") as f:
        collection.output(f)

# Dump every game to its own SGF file (comment this loop out for better performance).
for game_index in range(GAMES_ITERS):
    write_back_sgf(game_state[game_index], history[game_index], game_index)

## Learn from the winning games

In [None]:
# Train both players on the moves made by the winner of each game.
trained_batches = 0 # number of mini-batches trained so far
k = 0 # next free slot in the data/label buffers
for i in range(GAMES_ITERS):
    winner = game_state[i].get_winner()
    if winner == BLACK:
        winner_tag = 'B'
    elif winner == WHITE:
        winner_tag = 'W'
    else:
        winner_tag = 'Nobody'
    # Each trained batch holds TRAIN_BATCHES samples.
    print('Learning {} steps in {} of {} games. {} wins'.format(
        trained_batches * TRAIN_BATCHES, i, GAMES_ITERS, winner_tag))
    for step in history[i]:
        # Keep only the moves played by the side that won this game.
        if (step[0] == 'B' and winner == BLACK) or (step[0] == 'W' and winner == WHITE):
            data[k] = step[3]                # board before the move
            label[k] = step[1]*19+step[2]    # move as flat board index
            k += 1
            if k == TRAIN_BATCHES:
                # Buffer full: run one training step for each player, then refill.
                trained_batches += 1
                k = 0
                workspace.SwitchWorkspace(BLACK_WORKSPACE)
                workspace.FeedBlob("data", data, device_option=device_opts)
                workspace.FeedBlob("label", label, device_option=device_opts)
                workspace.RunNet(black_train_model.net)
                workspace.SwitchWorkspace(WHITE_WORKSPACE)
                workspace.FeedBlob("data", data, device_option=device_opts)
                workspace.FeedBlob("label", label, device_option=device_opts)
                workspace.RunNet(white_train_model.net)
# NOTE: samples left in a partially filled buffer (k > 0) are not trained.
print('Finished')

### Save the RL model

In [None]:
def _save_policy_model(workspace_name, deploy_model, save_folder):
    """Export one player's deploy model to `save_folder`/policy_model.minidb.

    The parameter blobs are read from the current workspace, so the player's
    own workspace is selected first. Both players use the same blob names, and
    without the switch the values of whichever workspace was active last would
    be saved.
    """
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    workspace.SwitchWorkspace(workspace_name)
    # construct the model to be exported
    pe_meta = pe.PredictorExportMeta(
        predict_net=deploy_model.net.Proto(),
        parameters=[str(b) for b in deploy_model.params],
        inputs=["data"],
        outputs=["predict"],
    )
    pe.save_to_db("minidb", os.path.join(save_folder, "policy_model.minidb"), pe_meta)
    print('Params saved to {}'.format(save_folder))

_save_policy_model(BLACK_WORKSPACE, black_deploy_model, BLACK_SAVE_FOLDER)
_save_policy_model(WHITE_WORKSPACE, white_deploy_model, WHITE_SAVE_FOLDER)