# Initialization

In [58]:
println("Loading dependencies")
using StatsBase;
using JSON;
using YAML;
using BenchmarkTools;
using ProgressMeter;
using SQLite;
using DataFrames;
using ThreadsX;
using Flux;
using CUDA;
include("../common/game.jl");

Loading dependencies




In [2]:
#Report how many threads this session was started with.
println(stdout, "Initializing training with $(Threads.nthreads()) threads")

Initializing training with 8 threads


In [80]:
#Load the top-level training configuration.
println("Loading training config from config.yaml")
config = YAML.load_file("config.yaml");

#The model-parameter file is whatever `config["model-parameters"]` points at, so
#report that path rather than a hard-coded file name.
println("Loading model parameters from $(config["model-parameters"])")
model = YAML.load_file(config["model-parameters"]);

Loading training config from config.yaml
Loading model parameters from model.yaml


# State Space

In [72]:
#Either generate the state space or load it from the JSON file named in the config.
if config["statespace"]["generate"]
    println("Generating state space")
    #NOTE(review): generation is not implemented in this cell; this branch only
    #prints its messages and does not assign `STATE_SPACE` or write any file.
    println("Saving to $(config["statespace"]["filepath"])")
    
else
    println("Loading state space from $(config["statespace"]["filepath"])")
    #Convert the parsed JSON array directly instead of splatting every element into
    #an array literal, which is very slow for large collections.
    STATE_SPACE = convert(
        Vector{Int64},
        JSON.parsefile(config["statespace"]["filepath"],use_mmap=false)
    );
end;

Loading state space from statespace.json


# ETL

In [8]:
#SQL selecting one row per game in the training set. The epoch bounds are read from
#`model["data"]` and interpolated when this string is built: an `epoch-start` of -Inf
#becomes 0 and an `epoch-end` of Inf becomes the current `time()`.
training_games_query = """
SELECT 
    games.game_id, winners.username as winner_username, losers.username as loser_username,
    winners.is_bot as winner_is_bot, losers.is_bot as loser_is_bot, COUNT(*)/2 as duration
FROM games
LEFT JOIN players winners
    on games.winner_id = winners.player_id
LEFT JOIN players losers
    on games.loser_id = losers.player_id
RIGHT JOIN breaths 
    on games.game_id = breaths.game_id
WHERE
    (games.game_id>=$(model["data"]["epoch-start"] == -Inf ? 0 : model["data"]["epoch-start"]))
AND
(games.game_id<=$(model["data"]["epoch-end"] == Inf ? time() : model["data"]["epoch-end"]))
AND 
    ((winner_is_bot=0) OR (loser_is_bot=0))
GROUP BY games.game_id
"""
#Replace winner/loser IDs with usernames, and indicate whether each is a bot.
#The breaths table is joined so that rows can be counted per game; `duration` is that
#count divided by two.
#Games in the training set must have at least one non-bot player, and their `game_id`
#must fall within the epoch bounds.
#NOTE(review): the bounds are compared against `game_id`, and the upper default is
#`time()`, so `game_id` is presumably a timestamp -- confirm against the schema.

#SQL selecting the individual breaths (state, action, is_winner) from training games.
#`is_bot` is taken from the winner's or the loser's flag depending on `is_winner`, and
#only rows with `is_bot=0` are kept.
training_breaths_query = """
SELECT
    training_games.game_id, breaths.state, breaths.action, breaths.is_winner,
    (CASE breaths.is_winner
        WHEN 1 THEN training_games.winner_is_bot
        ELSE training_games.loser_is_bot
    END) as is_bot
FROM ($(training_games_query)) training_games
RIGHT JOIN breaths
    on training_games.game_id = breaths.game_id
WHERE (is_bot=0)
"""
#The games query above is embedded as the `training_games` subquery.
#NOTE(review): a RIGHT JOIN keeps every row of `breaths`; breaths outside the training
#set are presumably dropped only because their computed `is_bot` is NULL and fails the
#WHERE filter. This assumes `is_bot` resolves to the CASE alias -- confirm.
;

In [74]:
println("Retrieving gameplay data")

#NOTE(security): this evaluates an arbitrary Julia expression taken from the config
#file in order to build the database connection. Only use trusted config files.
db = eval(Meta.parse(config["parless"]["database"]))
#Initialize connection to database.

#Materialize the query result as a DataFrame, and close the connection even if the
#query throws.
training_breaths = try
    DBInterface.execute(db,training_breaths_query) |> DataFrame
finally
    DBInterface.close!(db)
end

println("""Data contain $(nrow(training_breaths)) breaths.""")

#Compute the distinct visited states once and reuse them for both summaries.
visited_states = unique(training_breaths[:,"state"])

#`count` returns 0 when no starting state was visited, whereas summing an empty
#filtered generator throws.
n_starting_states = count(in(STARTING_STATES), visited_states)
println("""$(n_starting_states) of $(length(STARTING_STATES)) possible starting states ($(
    round(100*n_starting_states/length(STARTING_STATES),digits=2))%).""")

println("""$(length(visited_states)) distinct states of $(
    length(STATE_SPACE)) possible states visited ($(
    round(100*length(visited_states)/length(STATE_SPACE),digits=2)
    )% of state space).""")

Retrieving gameplay data
Data contain 7226 breaths.
987 of 2150 possible starting states (45.91%).
5361 distinct states of 668884 possible states visited (0.8% of state space).


# PARLESS

In [35]:
#Maps each observed state to a categorical distribution over its possible actions.
parless_strategies = Dict{Int64,Dict{Int64,Float64}}()

if model["parless"]["enabled"]
    println("Performing PARLESS reweighting")

    #Group the breaths by state in a single pass instead of rescanning the whole
    #table with a boolean mask for every distinct state.
    @showprogress for state_breaths in groupby(training_breaths, "state")
        state = first(state_breaths[:,"state"])
        action_counts = countmap(state_breaths[:,"action"])
        #Observed number of times each action was taken from this state.

        actions = possible_actions(state)
        prior_per_action = model["parless"]["prior-pseudocounts"]/length(actions)
        #The prior pseudocounts are split evenly over the possible actions.

        dirichlet_posterior = Dict(
            action => get(action_counts,action,0)+prior_per_action
            for action in actions
        )
        #Posterior pseudocounts: observed counts plus the prior share.

        sum_pseudocounts = sum(values(dirichlet_posterior))
        categorical_posterior = Dict(
            action => pseudocounts/sum_pseudocounts
            for (action,pseudocounts) in dirichlet_posterior
        )
        #Normalize the pseudocounts so that they sum to one.

        parless_strategies[state] = categorical_posterior
    end
end

Performing PARLESS reweighting


[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:07[39m


# PAWNN

In [36]:
println("Formatting data for PAWNN extrapolation")

#Build both matrices from the same list of distinct states so that row i of the
#strategy matrix corresponds to row i of the state matrix.
training_state_matrix, training_strategy_matrix = let
    training_states = unique(training_breaths[:,"state"])

    state_rows = [transpose(state_int2vector(state)) for state in training_states]
    #Each state is encoded by `state_int2vector` and transposed into a row.

    strategy_rows = [
        transpose(strategy_dict2vector(parless_strategies[state]))
        for state in training_states
    ]
    #Match strategy vectors to their corresponding states.

    #`reduce(vcat, rows)` stacks the rows without splatting thousands of arguments
    #into a single `vcat` call.
    (reduce(vcat, state_rows), reduce(vcat, strategy_rows))
end;

Formatting data for PAWNN


In [55]:
println("Initializing PAWNN network")

using Random: Random
#`Random.seed!` is used below; import the stdlib module here so this cell does not
#depend on another file having brought `Random` into scope.

Random.seed!(config["pawnn"]["random-seed"])
#Seed the global RNG before the layers are constructed.

#NOTE(security): the layers, loss metric and optimizer are Julia expressions stored as
#strings in the model file and are evaluated with `eval`. Only use trusted model files.
pawnn_network = Chain(
    (eval(Meta.parse(layer)) for layer in model["pawnn"]["network-structure"])...
)

loss_metric = eval(Meta.parse(model["pawnn"]["loss-metric"]))
loss(x,y) = loss_metric(pawnn_network(x), y)
#The loss compares the network's output for `x` with the target `y`.
optimizer = eval(Meta.parse(model["pawnn"]["optimizer"]));
#Initialize the model.

Initializing PAWNN network


In [81]:
#Optionally restore previously saved network parameters from a JSON file.
if !isnothing(config["pawnn"]["initial-weights-file"])
    println("Loading network weights from $(config["pawnn"]["initial-weights-file"])")

    loaded_weights = JSON.parsefile(config["pawnn"]["initial-weights-file"],use_mmap=false)

    reshaped_weights = map(loaded_weights) do layer
        #A layer stored as a list of lists is glued back together column by column;
        #a flat list is passed through unchanged.
        layer[1] isa Vector ? hcat(layer...) : layer
    end

    #Copy the restored parameters into the network.
    Flux.loadparams!(pawnn_network, reshaped_weights)
end

In [71]:
#Choose the device-transfer function: `gpu` only when it is both requested in the
#config and reported as functional by CUDA; `cpu` otherwise.
processor = cpu
if !config["pawnn"]["use-gpu"]
    println("Initializing training process on CPU")
elseif CUDA.functional()
    println("Initializing training process on GPU")
    processor = gpu
else
    println("No CUDA-capable GPU detected; initializing training process on CPU instead")
end

No CUDA-capable GPU detected; initializing training process on CPU instead


In [56]:
println("Training PAWNN network on $(size(training_strategy_matrix,1)) distinct PARLESS strategies.")

#NOTE(review): `processor` is not referenced in this cell, so the network and the
#data do not appear to be moved to the selected device here -- confirm intent.
let network_parameters = Flux.params(pawnn_network),
    training_pairs = zip(
        eachrow(training_state_matrix),
        eachrow(training_strategy_matrix)
    )
    #The `eachrow` calls are essential; they pair each state row with the strategy
    #row at the same index.

    @showprogress for epoch in 1:model["pawnn"]["n-epochs"]
        #One pass over every (state, strategy) pair per epoch.
        Flux.train!(loss, network_parameters, training_pairs, optimizer)
    end
end
#Train the model.

Training PAWNN network


[32mProgress: 100%|█████████████████████████████████████████| Time: 0:01:05[39m


In [None]:
#Optionally write the trained network parameters to a JSON file.
let weights_path = config["pawnn"]["weights-output-file"]
    if !isnothing(weights_path)
        println("Saving network weights to $(config["pawnn"]["weights-output-file"])")
        open(weights_path,"w") do io
            #One entry per parameter array of the network, in the order given by
            #`Flux.params`.
            JSON.print(io,[parameters for parameters in Flux.params(pawnn_network)])
        end
    end
end

# Transition Mapping

In [173]:
println("Calculating state-action transition probabilities: ");

progress = Progress(length(STATE_SPACE))

#Compute every state's transition map in parallel, then build the dictionary once.
#This avoids merging many single-entry dictionaries pairwise, which allocates and
#copies a new dictionary at each reduction step.
transitions = Dict{Int64, Dict{Int64, Dict{Int64, Float64}}}(
    zip(
        STATE_SPACE,
        ThreadsX.map(
            state -> begin
                state_transitions = transitionmap(state)
                next!(progress)
                #Advance the progress bar once the state's map has been computed.
                state_transitions
            end,
            STATE_SPACE
        )
    )
);

Calculating state-action transition probabilities: 


[30m 100%|███████████████████████████████████████████████████| Time: 0:38:35[39m                                                                            08[39m[30m  23%|████████████                                       |  ETA: 0:14:55[39m


# Value Iteration

In [83]:
"""
    state_reward(s::Int)

Return the reward for being in state `s`.

Negative states are rewarded: `-1` (a win) gives `+1.0` and any other negative
value (`-2`, a loss) gives `-1.0`. Every non-negative state gives `0.0`.

Values are always floats so that dictionaries built from them are float-valued.
"""
function state_reward(s::Int)
    #Read the argument `s` itself, not a global named `state`.
    s < 0 || return 0.0
    return (s == -1) ? +1.0 : -1.0
end

"""
    state_action_reward(state::Int, action::Int, transition_probabilities::Dict)

Return the immediate expected reward of taking `action` in `state`: the sum over the
successors listed in `transition_probabilities[state][action]` of each successor's
`state_reward` weighted by its transition probability.
"""
function state_action_reward(state::Int,action::Int,transition_probabilities::Dict)
    successor_distribution = transition_probabilities[state][action]
    weighted_rewards = [
        probability*state_reward(successor)
        for (successor,probability) in successor_distribution
    ]
    return sum(weighted_rewards)
end

"""
    initialize_Q_function(transition_probabilities)

Build an initial state-action value function: a nested dictionary in which every
`(state, action)` pair found in `transition_probabilities` maps to its immediate
expected reward, as computed by `state_action_reward`.
"""
function initialize_Q_function(transition_probabilities)
    return Dict(
        state => Dict(
            action => state_action_reward(state,action,transition_probabilities)
            for action in keys(action_maps)
        )
        for (state,action_maps) in transition_probabilities
    )
end

"""
    value_iteration!(Q_function::Dict, transition_probabilities::Dict; discount=0.95)

Perform one sweep of value iteration on `Q_function` using the given transition
probability map, and return the updated `Q_function`.

Note: this function mutates the `Q_function` argument.
"""
function value_iteration!(Q_function::Dict,transition_probabilities::Dict;discount=0.95)
    #Value of a state under the greedy policy: the value of its highest-valued action.
    function greedy_value(state)
        action_values = Q_function[state]
        return action_values[argmax(action_values)]
    end

    #Snapshot the state values for every state in `STATE_SPACE` before any entry of
    #the Q-function is overwritten.
    V_function = Dict(state => greedy_value(state) for state in STATE_SPACE)

    for (state, action_maps) in transition_probabilities
        for (action, successors) in action_maps
            #New value = immediate expected reward + discounted, probability-weighted
            #sum of the successors' state values.
            expected_future = sum([
                probability*V_function[successor]
                for (successor,probability) in successors
            ])
            expected_now = state_action_reward(state,action,transition_probabilities)
            Q_function[state][action] = expected_now + discount*expected_future
        end
    end

    return Q_function
end

"""
    Q_function_error(Q1::Dict, Q2::Dict; method=:rmse)

Compute the error between two Q-functions over every state (and action) in `Q1`.

With `method=:rmse` (the default) the result is the root-mean-square difference over
all `(state, action)` pairs of `Q1`, or `0.0` when `Q1` has no pairs. With any other
`method` the result is the number of states whose highest-valued action differs
between `Q1` and `Q2`.

Every state and action of `Q1` must also be present in `Q2`.
"""
function Q_function_error(Q1::Dict,Q2::Dict;method=:rmse)
    if method == :rmse
        squared_error = 0.0
        n_pairs = 0
        for (state,actions) in Q1
            for (action,value) in actions
                squared_error += (value - Q2[state][action])^2
                n_pairs += 1
            end
        end
        #Divide by the number of pairs so the result is a true root-MEAN-square error
        #and does not grow with the size of the state space.
        return n_pairs == 0 ? 0.0 : sqrt(squared_error/n_pairs)
    else
        #Count the states whose greedy action differs between Q1 and Q2.
        return count(state -> argmax(Q1[state]) != argmax(Q2[state]), keys(Q1))
    end
end
;