# MCTS POMDP

Reusing much of polar_pomdp0.45 for problem structure, particle filter, etc.

### Setup

In [5]:
using Plots
using ParticleFilters
using Distributions
using StaticArrays
using LinearAlgebra
using Random
using StatsBase
#using Reel
using SparseArrays
using GridInterpolations
using DataStructures
using DataFrames
using CSV
using Distributed

┌ Info: Recompiling stale cache file /Users/dliedtka/.julia/compiled/v1.2/ParticleFilters/vVum1.ji for ParticleFilters [c8b314e2-9260-5cf8-ae76-3be7461ca6d0]
└ @ Base loading.jl:1240
┌ Info: Recompiling stale cache file /Users/dliedtka/.julia/compiled/v1.2/CSV/HHBkp.ji for CSV [336ed68f-0bac-5ca0-87d4-7b16caf5d00b]
└ @ Base loading.jl:1240


In [6]:
include("atan2.jl")
include("obs_rel.jl")
include("polargrid_rel_qual.jl")
;

In [7]:
rng = MersenneTwister(2)
TGT_SPD = 1
;

### Problem Structure

Function to randomly determine next target course

In [8]:
function next_crs(crs,rng)
    if rand(rng) < .9
        return crs
    end
    crs = (crs + rand(rng,[-1,1])*30) % 360
    if crs < 0 crs += 360 end
    return crs
end
;

True state transition function

In [9]:
# state as tuple (x, y, crs, spd) of target (spd of o/s)
function f(state, control, rng)
    r, θ, crs, spd = state
    θ += control[1]
    spd = control[2]
    if θ < 0 θ += 360 end
    θ = θ % 360
    crs -= control[1]
    if crs < 0 crs += 360 end
    crs = crs % 360
    x = r*cos(π/180*θ)
    y = r*sin(π/180*θ)
    pos = [x + TGT_SPD*cos(π/180*crs) - spd, y + 
        TGT_SPD*sin(π/180*crs)]
    crs = next_crs(crs,rng)
    r = sqrt(pos[1]^2 + pos[2]^2)
    θ = atan2(pos[1],pos[2])*180/π
    if θ < 0 θ += 360 end
    return (r, θ, crs, spd)::NTuple{4, Real}
end
;

Wrapper for f that returns vector rather than Tuple for particle filter

In [10]:
function f2(x, u, rng)
    temp = [i for i in f(x, u, rng)]
    return temp
end
;

Reward function

In [11]:
function r(s)
    range = s[1]
    if range > 150 return -.1 end  # reward to not lose track of contact
    if range <= 10 return -1 end  # collision avoidance
    return .1  # being in "sweet spot" maximizes reward
end
;

Action space and function to convert from action to index and vice versa

In [12]:
action_space = ((-30,1), (-30, 2), (0, 1), (0, 2), (30, 1), (30, 2))

action_to_index(a) = trunc(Int, 2*(a[1]/30+1) + a[2])

function index_to_action(a)
    if a % 2 == 0
        return ( trunc(Int,(((a - 2) / 2) - 1) * 30), 2)
    else
        return ( trunc(Int,(((a - 1) / 2) - 1) * 30), 1)
    end
end
;

### Particle Filter

Will be used for our belief state

In [13]:
num_particles = 500
model = ParticleFilterModel{Vector{Float64}}(f2, g)
pfilter = SIRParticleFilter(model, num_particles)
;

## MCTS Algorithm

#### MCTS Functions

Function to return index of optimal action using current Q values and possibly the exploration bonus

In [14]:
function arg_max_action(history, exploration_bonus=false)
    
    # only need to compute if exploration possibility
    if exploration_bonus
        N_h = 0
        for action in action_to_index.(action_space)
            new_index = copy(history)
            append!(new_index, action)
            N_h += N[new_index]
        end    
    end
    
    values = Float64[]
    for action in action_to_index.(action_space)
        
        new_index = copy(history)
        append!(new_index, action)
        
        # best action with exploration possibility
        if exploration_bonus
            append!(values, Q[new_index] + c * sqrt(log(N_h) / N([new_index])))
            
        # strictly best action
        else
            append!(values, Q[new_index])
        end
    end
    
    return argmax(values)
    
end
;

Function to rollout with random actions until we reach satisfactory depth

In [15]:
function rollout_random(state, depth)
    
    if depth == 0 return 0 end
    
    # random action
    random_action_index = rand(rng,action_to_index.(action_space))
    action = index_to_action(random_action_index)
    
    # generate next state and reward with random action; observation doesn't matter
    state_prime = f2(state, action, rng)
    reward = r(Tuple(state_prime))
    
    return reward + lambda * rollout_random(state_prime, depth-1)
    
end
;

Simulate function includes search, expansion, and rollout

In [16]:
function simulate(state, history, depth)
   
    if depth == 0 return 0 end
    
    
    # expansion
    test_index = copy(history)
    append!(test_index, 1)
    
    if !haskey(Q, test_index)
        for action in action_to_index.(action_space)
 
            # initialize Q and N to zeros
            new_index = copy(history)
            append!(new_index, action)
            Q[new_index] = 0
            N[new_index] = 0
            
        end
    end
    
    return rollout_random(state, depth)
    
    
    # search
    # find optimal action to explore
    search_action_index = arg_max_action(history, exploration_bonus=true)
    action = index_to_action(search_action_index)
    
    # take action; get new state, observation, and reward
    state_prime = f2(state, action, rng)
    observation = h(state_prime, rng)
    reward = r(Tuple(state_prime))
    
    # recursive call after taking action and getting observation
    new_history = copy(history)
    append!(new_history, search_action_index)
    append!(new_history, observation)
    q = reward + lambda * simulate(state_prime, new_history, depth-1)
    
    # update counts and values
    update_index = copy(history)
    append!(update_index, search_action_index)
    N[update_index] += 1
    Q[update_index] += ((q - Q[update_index]) / N[update_index])
    
    return q
    
end
;

Main MCTS function; called by MCTS wrapper at each time step to choose an action

In [31]:
function select_action(belief, depth)
    
    # empty history at top recursive call
    history = Int64[]
    
    # loop
    # timed loop, how long should intervals be?
    #start_time = time_ns()
    #while (time_ns() - start_time) / 1.0e9 < 1 # 1 second timer to start
    
    # counter for now, switch to time later
    counter = 0
    while counter < 100 # probably increase; small for debugging
        
        # draw state randomly based on belief state (pick a random particle)
        state = rand(rng,belief)
        
        # simulate
        simulate(state, history, depth)
        
        counter+=1
    end
    
    best_action_index = arg_max_action(history)
    action = index_to_action(best_action_index)
    return action
    
end
;

#### MCTS loop

Function to advance history tree after an action is chosen and observation is recorded

In [38]:
function modify_history_tree(last_action, last_obs)
    
    newQ = Dict{Array{Int64,1},Float64}()
    newN = Dict{Array{Int64,1},Float64}()
    
    for key in keys(Q)
        if key[1] == last_action && key[2] == last_obs
            newQ[key[3:length(key)]] = Q[key]
            newN[key[3:length(key)]] = N[key]
        else
            continue
        end
    end
    
    return (newQ, newN)
    
end
;

Initialize true state and belief state (particle filter); we assume perfect knowledge at start of simulation (could experiment otherwise with random beliefs)

In [33]:
# true state
# for now state is [range, bearing, relative course, own speed]
# assume a starting position within range of sensor and not too close
true_state = [rand(rng, 25:150), rand(rng,0:359), rand(rng,0:11)*30, 1]

# belief state
# assume perfect knowledge at first time step
belief = ParticleCollection([true_state for i in 1:num_particles])
;

Simulation prep/initialization; for now we start with no prior knowledge for Q values/N values, could incorporate this later

In [49]:
total_reward = 0

# global Q and N dictionaries, indexed by history (and optionally action to follow all in same array; using ints)
Q = Dict{Array{Int64,1},Float64}()
N = Dict{Array{Int64,1},Float64}()

# lambda, discount factor
lambda = 0.9

# exploration factor, experiment with different values
c = 1

# experiment with different depth parameters 
depth = 100 #5

# 500 time steps with an action to be selected at each
num_iters = 500 #500

action = nothing
observation = nothing
;

500 time step simulation

In [50]:
for time_step in 1:num_iters
    @show time_step
    
    # if action taken, modify history tree
    if action != nothing
        (Q,N) = modify_history_tree(action, observation)
    end
    
    
    # select an action
    action = select_action(belief, depth)
    
    # take action; get next true state, obs, and reward
    next_state = f2(true_state, action, rng)
    observation = h(next_state, rng)
    reward = r(Tuple(next_state))
    true_state = next_state
    
    # update belief state (particle filter)
    belief = update(pfilter, belief, action, observation)
    
    # accumulate reward
    total_reward += reward
    # might want to keep track of each step, could use an array to track states, reward, actions, obs
    
end
;

time_step = 1
time_step = 2
time_step = 3
time_step = 4
time_step = 5
time_step = 6
time_step = 7
time_step = 8
time_step = 9
time_step = 10
time_step = 11
time_step = 12
time_step = 13
time_step = 14
time_step = 15
time_step = 16
time_step = 17
time_step = 18
time_step = 19
time_step = 20
time_step = 21
time_step = 22
time_step = 23
time_step = 24
time_step = 25
time_step = 26
time_step = 27
time_step = 28
time_step = 29
time_step = 30
time_step = 31
time_step = 32
time_step = 33
time_step = 34
time_step = 35
time_step = 36
time_step = 37
time_step = 38
time_step = 39
time_step = 40
time_step = 41
time_step = 42
time_step = 43
time_step = 44
time_step = 45
time_step = 46
time_step = 47
time_step = 48
time_step = 49
time_step = 50
time_step = 51
time_step = 52
time_step = 53
time_step = 54
time_step = 55
time_step = 56
time_step = 57
time_step = 58
time_step = 59
time_step = 60
time_step = 61
time_step = 62
time_step = 63
time_step = 64
time_step = 65
time_step = 66
time_step = 67
time

In [51]:
total_reward

50.00000000000044

#### Plot Results

### Julia scratch space

@show then expression to println result