# MCTS POMDP

Reusing much of polar_pomdp0.45 for problem structure, particle filter, etc.

### Setup

In [2]:
using Plots
using ParticleFilters
using Distributions
using StaticArrays
using LinearAlgebra
using Random
using StatsBase
#using Reel
using SparseArrays
using GridInterpolations
using DataStructures
using DataFrames
using CSV
using Distributed

In [3]:
include("atan2.jl")
include("obs_rel.jl")
include("polargrid_rel_qual.jl")
;

In [4]:
rng = MersenneTwister(2)
TGT_SPD = 1
;

### Problem Structure

Function to randomly determine next target course

In [5]:
function next_crs(crs,rng)
    if rand(rng) < .9
        return crs
    end
    crs = (crs + rand(rng,[-1,1])*30) % 360
    if crs < 0 crs += 360 end
    return crs
end
;

True state transition function

In [6]:
# state as tuple (x, y, crs, spd) of target (spd of o/s)
function f(state, control, rng)
    r, θ, crs, spd = state
    θ += control[1]
    spd = control[2]
    if θ < 0 θ += 360 end
    θ = θ % 360
    crs -= control[1]
    if crs < 0 crs += 360 end
    crs = crs % 360
    x = r*cos(π/180*θ)
    y = r*sin(π/180*θ)
    pos = [x + TGT_SPD*cos(π/180*crs) - spd, y + 
        TGT_SPD*sin(π/180*crs)]
    crs = next_crs(crs,rng)
    r = sqrt(pos[1]^2 + pos[2]^2)
    θ = atan2(pos[1],pos[2])*180/π
    if θ < 0 θ += 360 end
    return (r, θ, crs, spd)::NTuple{4, Real}
end
;

Wrapper for f that returns vector rather than Tuple for particle filter

In [7]:
function f2(x, u, rng)
    temp = [i for i in f(x, u, rng)]
    return temp
end
;

Reward function

In [8]:
function r(s)
    range = s[1]
    if range > 150 return -.1 end  # reward to not lose track of contact
    if range <= 10 return -1 end  # collision avoidance
    return .1  # being in "sweet spot" maximizes reward
end
;

Action space and function to convert from action to index and vice versa

In [9]:
action_space = ((-30,1), (-30, 2), (0, 1), (0, 2), (30, 1), (30, 2))

action_to_index(a) = trunc(Int, 2*(a[1]/30+1) + a[2])

function index_to_action(a)
    if a % 2 == 0
        return ( trunc(Int,(((a - 2) / 2) - 1) * 30), 2)
    else
        return ( trunc(Int,(((a - 1) / 2) - 1) * 30), 1)
    end
end
;

### Particle Filter

Will be used for our belief state

In [10]:
num_particles = 500
model = ParticleFilterModel{Vector{Float64}}(f2, g)
pfilter = SIRParticleFilter(model, num_particles)
;

## MCTS Algorithm

#### MCTS Functions

Function to return index of optimal action using current Q values and possibly the exploration bonus

In [11]:
function arg_max_action(Q, N, history, c=nothing, exploration_bonus=false)
    
    # only need to compute if exploration possibility
    if exploration_bonus
        N_h = 0
        for action in action_to_index.(action_space)
            new_index = copy(history)
            append!(new_index, action)
            N_h += N[new_index]
        end    
    end
    
    values = Float64[]
    for action in action_to_index.(action_space)
        
        new_index = copy(history)
        append!(new_index, action)
        
        # best action with exploration possibility
        if exploration_bonus
            
            # ensure an action chosen zero times is always chosen
            if N[new_index] == 0
                return action
            end
            
            # compute exploration bonus, checking for zeroes (I don't think this will ever occur anyway...)
            if log(N_h) < 0
                numerator = 0
            else
                numerator = sqrt(log(N_h))
            end
            
            denominator = N[new_index]
            
            exp_bonus = c * numerator / denominator
            
            append!(values, Q[new_index] + exp_bonus)
        
        # strictly best action
        else
            append!(values, Q[new_index])
        end
    end
    
    return argmax(values)
    
end
;

Function to rollout with random actions until we reach satisfactory depth

In [12]:
function rollout_random(state, depth)
    
    if depth == 0 return 0 end
    
    # random action
    random_action_index = rand(rng,action_to_index.(action_space))
    action = index_to_action(random_action_index)
    
    # generate next state and reward with random action; observation doesn't matter
    state_prime = f2(state, action, rng)
    reward = r(Tuple(state_prime))
    
    return reward + lambda * rollout_random(state_prime, depth-1)
    
end
;

Simulate function includes search, expansion, and rollout

In [13]:
function simulate(Q, N, state, history, depth, c)
   
    if depth == 0 return 0 end
    
    
    # expansion
    test_index = copy(history)
    append!(test_index, 1)
    
    if !haskey(Q, test_index)
        for action in action_to_index.(action_space)
 
            # initialize Q and N to zeros
            new_index = copy(history)
            append!(new_index, action)
            Q[new_index] = 0
            N[new_index] = 0
            
        end
        
        return rollout_random(state, depth)
    end
    
    
    # search
    # find optimal action to explore
    search_action_index = arg_max_action(Q, N, history, c, true)
    action = index_to_action(search_action_index)
    
    # take action; get new state, observation, and reward
    state_prime = f2(state, action, rng)
    observation = h(state_prime, rng)
    reward = r(Tuple(state_prime))
    
    # recursive call after taking action and getting observation
    new_history = copy(history)
    append!(new_history, search_action_index)
    append!(new_history, observation)
    q = reward + lambda * simulate(Q, N, state_prime, new_history, depth-1, c)
    
    # update counts and values
    update_index = copy(history)
    append!(update_index, search_action_index)
    N[update_index] += 1
    Q[update_index] += ((q - Q[update_index]) / N[update_index])
    
    return (Q, N, q)
    
end
;

Main MCTS function; called by MCTS wrapper at each time step to choose an action

In [14]:
function select_action(Q, N, belief, depth, c)
    
    # empty history at top recursive call
    history = Int64[]
    
    # loop
    # timed loop, how long should intervals be?
    #start_time = time_ns()
    #while (time_ns() - start_time) / 1.0e9 < 1 # 1 second timer to start
    
    # counter for now, switch to time later
    counter = 0
    while counter < 100 # probably increase; small for debugging
        
        # draw state randomly based on belief state (pick a random particle)
        state = rand(rng,belief)
        
        # simulate
        simulate(Q, N, float(state), history, depth, c) # HERE!!!
        
        counter+=1
    end
    
    best_action_index = arg_max_action(Q, N, history)
    action = index_to_action(best_action_index)
    return (Q, N, action)
    
end
;

#### MCTS loop

Function to advance history tree after an action is chosen and observation is recorded

In [39]:
function modify_history_tree(last_action, last_obs)
    
    newQ = Dict{Array{Int64,1},Float64}()
    newN = Dict{Array{Int64,1},Float64}()
    
    for key in keys(Q)
        if length(key) > 2 && key[1] == action_to_index(last_action) && key[2] == last_obs
            newQ[key[3:length(key)]] = Q[key]
            newN[key[3:length(key)]] = N[key]
        else
            continue
        end
    end
    
    return (newQ, newN)
    
end
;

Perform a 500 time step trial with MCTS

In [48]:
function trial(depth, c)
    
    # Initialize true state and belief state (particle filter); we assume perfect knowledge at start of simulation (could experiment otherwise with random beliefs)
    
    # true state
    # for now state is [range, bearing, relative course, own speed]
    # assume a starting position within range of sensor and not too close
    true_state = [rand(rng, 100:150), rand(rng,0:359), rand(rng,0:11)*30, 1]

    # belief state
    # assume perfect knowledge at first time step
    #belief = ParticleCollection([true_state for i in 1:num_particles])
    # assume we just know it's between 100-150 distance
    belief = ParticleCollection([[rand(rng, 100:150), rand(rng,0:359), rand(rng,0:11)*30, 1] for i in 1:num_particles])
    
    
    
    # Simulation prep/initialization; for now we start with no prior knowledge for Q values/N values, could incorporate this later
    
    # global Q and N dictionaries, indexed by history (and optionally action to follow all in same array; using ints)
    Q = Dict{Array{Int64,1},Float64}()
    N = Dict{Array{Int64,1},Float64}()

    # global scope, not manipulating these parameters for now
    # lambda, discount factor
    #lambda = 0.95

    # experiment with different depth parameters 
    depth = depth
    # exploration factor, experiment with different values
    c = c
    
    action = nothing
    observation = nothing
    
    
    
    # run simulation
    
    total_reward = 0

    # 500 time steps with an action to be selected at each
    num_iters = 5#00
    
    for time_step = 1:num_iters
       
        #if time_step % 100 == 0 
        #    @show time_step
        #end
    
        # if action taken, modify history tree
        if action != nothing
            if true
                println("")
                println(N)
                println("")
                println(action_to_index(action))
                println(observation)
            end
            (Q,N) = modify_history_tree(action, observation)
            if true
                println("")
                println(N)
            end
        end
    
    
        # select an action
        #action = select_action(belief, depth, c)
        (Q, N, action) = select_action(Q, N, belief, depth, c)
    
        # take action; get next true state, obs, and reward
        next_state = f2(true_state, action, rng)
        observation = h(next_state, rng)
        reward = r(Tuple(next_state))
        true_state = next_state
    
        # update belief state (particle filter)
        belief = update(pfilter, belief, action, observation)
    
        # accumulate reward
        total_reward += reward
        # might want to keep track of each step, could use an array to track states, reward, actions, obs
    
    end
    
    return total_reward
    
end
;

#### Random Trial

500 time step random action simulation, for comparison to MCTS

In [49]:
function random_trial()
    
    # Initialize true state and belief state (particle filter); we assume perfect knowledge at start of simulation (could experiment otherwise with random beliefs)
    
    # true state
    # for now state is [range, bearing, relative course, own speed]
    # assume a starting position within range of sensor and not too close
    true_state = [rand(rng, 100:150), rand(rng,0:359), rand(rng,0:11)*30, 1]

    # belief state
    # assume perfect knowledge at first time step
    belief = ParticleCollection([true_state for i in 1:num_particles])
    
    
    
    # run simulation
    
    total_reward = 0

    # 500 time steps with an action to be selected at each
    num_iters = 500
    
    for time_step = 1:num_iters
    
        #if time_step % 100 == 0 
        #    @show time_step
        #end
    
        action = rand(rng, action_space)
    
        # take action; get next true state, obs, and reward
        next_state = f2(true_state, action, rng)
        observation = h(next_state, rng)
        reward = r(Tuple(next_state))
        true_state = next_state
    
        # accumulate reward
        total_reward += reward
    
    end
    
    return total_reward
    
end
;

In [50]:
# global scope params (that are not being experimented with)
lambda = 0.95

trial(2,20, Q, N)


Dict([3, 2, 5] => 0.0,[1, 0, 3] => 2.0,[1, 1, 3] => 0.0,[1, 3, 1] => 1.0,[3, 0, 4] => 1.0,[5, 1, 3] => 0.0,[3, 1, 2] => 0.0,[6, 1, 5] => 0.0,[1, 2, 6] => 0.0,[2, 3, 4] => 0.0,[2, 3, 1] => 0.0,[4, 2, 5] => 0.0,[3, 2, 1] => 0.0,[4, 0, 4] => 2.0,[4, 1, 5] => 0.0,[3, 1, 4] => 0.0,[1, 3, 2] => 1.0,[2] => 17.0,[2, 0, 6] => 2.0,[3, 3, 5] => 0.0,[5, 1, 6] => 0.0,[6, 0, 5] => 2.0,[5, 3, 1] => 1.0,[6, 2, 6] => 0.0,[4, 3, 3] => 0.0,[3, 0, 3] => 2.0,[4] => 16.0,[3, 0, 6] => 1.0,[4, 0, 2] => 2.0,[6, 0, 3] => 2.0,[6, 0, 6] => 1.0,[6, 3, 3] => 0.0,[2, 2, 2] => 0.0,[4, 1, 4] => 0.0,[2, 2, 3] => 0.0,[4, 2, 1] => 0.0,[6, 1, 6] => 0.0,[1, 2, 4] => 0.0,[5, 1, 5] => 0.0,[6, 2, 1] => 0.0,[2, 2, 6] => 0.0,[3, 0, 2] => 2.0,[6, 2, 2] => 0.0,[6, 0, 4] => 2.0,[2, 3, 3] => 0.0,[1, 3, 4] => 0.0,[3, 1, 5] => 0.0,[1, 1, 4] => 0.0,[5, 0, 6] => 1.0,[5, 3, 2] => 0.0,[6, 3, 2] => 0.0,[4, 0, 1] => 2.0,[4, 3, 2] => 0.0,[3] => 17.0,[2, 2, 4] => 0.0,[3, 2, 2] => 0.0,[1, 1, 1] => 0.0,[3, 3, 2] => 1.0,[5, 1, 2] => 1.0,[6, 3,

0.5

#### Trial Runs

Compare random reward to MCTS of various depths

Runs to find best c value

In [None]:
# global scope Q and N
Q = Dict{Array{Int64,1},Float64}()
N = Dict{Array{Int64,1},Float64}()
# global scope params (that are not being experimented with)
lambda = 0.95

# collect total reward for each trial
random_reward = Float64[]
mcts_depth1_c5_reward = Float64[]
mcts_depth5_c5_reward = Float64[]
mcts_depth10_c5_reward = Float64[]

# trials
for i = 1:100
    append!(random_reward, random_trial())
    
    append!(mcts_depth1_c5_reward, trial(1, 5))
    append!(mcts_depth5_c5_reward, trial(5, 5))
    append!(mcts_depth10_c5_reward, trial(10, 5))
    
    if i % 10 == 0
        println(i, " runs complete")
    end
end

# print results
println("")
println("Random reward")
println(random_reward)
println("Average: ", mean(random_reward))

println("")
println("MCTS depth 1 c 5 reward")
println(mcts_depth1_c5_reward)
println("Average: ", mean(mcts_depth1_c5_reward))

println("")
println("MCTS depth 5 c 5 reward")
println(mcts_depth5_c5_reward)
println("Average: ", mean(mcts_depth5_c5_reward))

println("")
println("MCTS depth 10 c 5 reward")
println(mcts_depth10_c5_reward)
println("Average: ", mean(mcts_depth10_c5_reward))

#### Plot Results

### Julia scratch space