# MCTS POMDP Heatmap

Reusing much of polar_pomdp0.45 for problem structure, particle filter, etc.

### Setup

In [1]:
using Plots
using ParticleFilters
using Distributions
using StaticArrays
using LinearAlgebra
using Random
using StatsBase
#using Reel
using SparseArrays
using GridInterpolations
using DataStructures
using DataFrames
using CSV
using Distributed

In [2]:
include("atan2.jl")
include("obs_rel.jl") # includes functions for generating observations for problem POMDP
;

In [3]:
# Shared random number generator with a fixed seed; passed to the transition
# function and read as a global by the MCTS routines below
rng = MersenneTwister(2)
# Target speed used by the transition function f (distance moved per step)
TGT_SPD = 1
;

### Problem Structure

Function to randomly determine next target course

In [4]:
# Draw the target's next course (degrees). With probability 0.9 the course is
# returned unchanged; otherwise it is shifted by 30 degrees left or right
# (chosen with equal probability) and wrapped back into [0, 360).
function next_crs(crs,rng)
    # Most of the time the target holds its course
    rand(rng) < .9 && return crs

    # Otherwise pick the turn direction, then wrap the result
    turn = rand(rng, [-1,1]) * 30
    turned = (crs + turn) % 360
    return turned < 0 ? turned + 360 : turned
end
;

True state transition function

In [5]:
# State is the tuple (r, θ, crs, spd): the target's position in polar form
# (range r, angle θ in degrees), the target course crs in degrees, and the
# own-ship (o/s) speed spd. Control is (turn in degrees, new own-ship speed).
#
# One step: rotate the frame by the commanded turn, convert to Cartesian,
# advance the target by TGT_SPD along its course while subtracting own-ship
# speed from the first Cartesian component, draw the target's next course,
# and convert back to polar form.
# NOTE(review): the argument order of atan2 (defined in atan2.jl) is not
# visible here — confirm that atan2(x, y) matches the cos/sin convention used.
function f(state, control, rng)
    # The incoming own-ship speed is ignored; the control supplies the new one
    rad, brg, hdg, _ = state
    turn = control[1]
    spd = control[2]

    # Apply the turn to the angle and wrap into [0, 360)
    brg += turn
    brg = (brg < 0 ? brg + 360 : brg) % 360

    # The course is expressed in the same frame, so it rotates the other way
    hdg -= turn
    hdg = (hdg < 0 ? hdg + 360 : hdg) % 360

    # Cartesian position after both vessels move for one step
    k = π/180
    px = rad*cos(k*brg) + TGT_SPD*cos(k*hdg) - spd
    py = rad*sin(k*brg) + TGT_SPD*sin(k*hdg)

    # Target may change course for the next step
    hdg = next_crs(hdg,rng)

    # Back to polar form, angle in [0, 360)
    rad = sqrt(px^2 + py^2)
    brg = atan2(px,py)*180/π
    if brg < 0
        brg += 360
    end

    return (rad, brg, hdg, spd)::NTuple{4, Real}
end
;

Wrapper for f that returns vector rather than Tuple for particle filter

In [6]:
"""
    f2(x, u, rng)

Advance state `x` under control `u` using `f` and return the result as a
`Vector{Float64}` instead of a tuple.

`f` returns a tuple whose elements can have mixed numeric types (the speed
entry is copied straight from the control, which is an integer in
`action_space`). Collecting with an explicit `Float64` element type keeps the
result concretely typed and consistent with the
`ParticleFilterModel{Vector{Float64}}` this function is registered with.
"""
function f2(x, u, rng)
    return collect(Float64, f(x, u, rng))
end
;

Reward function

In [7]:
"""
    r(s)

Reward for state `s`, based only on its first element (the range to the
target). Always returns a `Float64`.

- range > 150: `-0.1` (penalty for losing track of the contact)
- range <= 10: `-1.0` (collision avoidance)
- otherwise the reward climbs in 0.1 steps from `0.6` (range <= 20) to the
  "sweet spot" of `1.0` (50 < range <= 60), then falls in 0.1 steps to `0.2`
  (130 < range <= 140) and `0.1` (140 < range <= 150).
"""
function r(s)
    dist = s[1]

    dist > 150 && return -0.1  # lost track of contact
    dist <= 10 && return -1.0  # collision avoidance

    # Upper edge of each 10-wide band and the reward earned inside it
    upper_edges = (20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140)
    band_rewards = (0.6, 0.7, 0.8, 0.9, 1.0, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2)
    for (edge, reward) in zip(upper_edges, band_rewards)
        dist <= edge && return reward
    end

    # 140 < range <= 150 (also reached if no comparison above holds)
    return 0.1
end
;

Action space, and functions to convert an action to its index and back

In [8]:
# Every available action as (turn in degrees, speed): three turns x two speeds
action_space = (
    (-30, 1), (-30, 2),
    (0, 1), (0, 2),
    (30, 1), (30, 2),
)

# Map an action tuple to its 1-based position in action_space
function action_to_index(a)
    return trunc(Int, 2*(a[1]/30+1) + a[2])
end

# Inverse of action_to_index: even indices carry speed 2, odd indices speed 1;
# the turn is recovered from the index once the speed offset is removed
function index_to_action(a)
    speed = a % 2 == 0 ? 2 : 1
    turn = trunc(Int, (((a - speed) / 2) - 1) * 30)
    return (turn, speed)
end
;

### Particle Filter

The particle filter will be used to represent our belief state

In [9]:
# Number of particles carried by the filter
num_particles = 500
# Filter model over Vector{Float64} states: f2 supplies the dynamics; g is not
# defined in this file — presumably the observation weight function from
# obs_rel.jl (confirm)
model = ParticleFilterModel{Vector{Float64}}(f2, g)
# SIR particle filter built from the model and particle count above
pfilter = SIRParticleFilter(model, num_particles)
;

## MCTS Algorithm

#### MCTS Functions

Function to return index of optimal action using current Q values and possibly the exploration bonus

In [10]:
"""
    arg_max_action(Q, N, history, c=nothing, exploration_bonus=false)

Return the index of the action to take from the tree node identified by
`history`.

# Arguments
- `Q`: dictionary mapping `vcat(history, action_index)` to the value estimate.
- `N`: dictionary with the same keys holding visit counts.
- `history`: vector identifying the current node.
- `c`: exploration constant; only needed when a bonus is actually computed.
- `exploration_bonus`: when `false`, return the action with the highest `Q`.
  When `true`, return the first action that has never been tried, or, if all
  have been tried, the action maximizing `Q + c * sqrt(log(N_h)) / N_ha`,
  where `N_h` is the total visit count over all actions at this node.

# Throws
- `KeyError` if the node has not been expanded (no entries in `Q`/`N`).
- `ArgumentError` if a bonus has to be computed but `c` was not supplied.
"""
function arg_max_action(Q, N, history, c=nothing, exploration_bonus=false)

    action_indices = action_to_index.(action_space)

    # Key of each child node: the history followed by the action index
    child_keys = [vcat(history, a) for a in action_indices]

    # strictly best action
    if !exploration_bonus
        best = argmax(Float64[Q[key] for key in child_keys])
        return action_indices[best]
    end

    visits = [N[key] for key in child_keys]

    # ensure an action chosen zero times is always chosen
    untried = findfirst(iszero, visits)
    if untried !== nothing
        return action_indices[untried]
    end

    # c is only required once a bonus really has to be computed
    if c === nothing
        throw(ArgumentError("c must be supplied when exploration_bonus is true"))
    end

    # Guard against a total count below 1, where the log would be negative
    log_total = log(sum(visits))
    numerator = log_total < 0 ? zero(log_total) : sqrt(log_total)

    # NOTE(review): textbook UCB1 uses c * sqrt(log(N_h) / N_ha); here the
    # per-action count is NOT under the square root. The formula is kept as-is
    # because callers' c values are tuned against it — confirm it is intended.
    scores = Float64[Q[key] + c * numerator / n for (key, n) in zip(child_keys, visits)]

    # Translate the winning position back to an action index so both return
    # paths of the exploration branch yield the same kind of value
    return action_indices[argmax(scores)]

end
;

Function to roll out with uniformly random actions until the requested depth is exhausted

In [11]:
# Estimate the value of `state` by following uniformly random actions for
# `depth` steps and summing the rewards, each discounted by the global `lambda`
# per step. Uses the global `rng`. Returns 0 when no steps remain.
function rollout_random(state, depth)

    # Horizon exhausted: nothing further to accumulate
    depth == 0 && return 0

    # Sample an action index uniformly, then decode it into an action
    chosen = index_to_action(rand(rng,action_to_index.(action_space)))

    # Step the dynamics; no observation is needed during a rollout
    next_state = f2(state, chosen, rng)
    immediate = r(Tuple(next_state))

    # Discounted value of the remaining steps
    future = rollout_random(next_state, depth-1)
    return immediate + lambda * future

end
;

Simulate function includes search, expansion, and rollout

In [12]:
# One MCTS simulation from the node identified by `history`.
#
# Q and N are dictionaries keyed by (history followed by an action index); both
# are updated in place and also returned. The third element of the returned
# tuple is the discounted return sampled by this simulation.
#
# - depth == 0: return immediately with a return of 0.
# - node not yet in the tree: add a zeroed Q/N entry for every action and
#   return a random-rollout estimate from `state`.
# - otherwise: pick an action with the exploration bonus (constant `c`), step
#   the dynamics, sample an observation with h (not defined in this file —
#   presumably from obs_rel.jl), recurse on the extended history, and fold the
#   sampled return into the running average for the chosen action.
function simulate(Q, N, state, history, depth, c)

    # Depth budget spent
    depth == 0 && return (Q, N, 0)

    # Key of the child reached from this node by taking action index `a`
    child_key(a) = push!(copy(history), a)

    # expansion: the node is considered new if its first action has no entry
    if !haskey(Q, child_key(1))
        for a in action_to_index.(action_space)
            node = child_key(a)
            Q[node] = 0
            N[node] = 0
        end

        # rollout
        return (Q, N, rollout_random(state, depth))
    end

    # search: choose which action to explore
    chosen = arg_max_action(Q, N, history, c, true)

    # take the action; get new state, observation, and reward
    next_state = f2(state, index_to_action(chosen), rng)
    obs = h(next_state, rng)
    immediate = r(Tuple(next_state))

    # recurse with the history extended by the action and the observation
    deeper = append!(child_key(chosen), obs)
    future = simulate(Q, N, next_state, deeper, depth-1, c)[3]
    q = immediate + lambda * future

    # update the visit count and the incremental mean of the value
    node = child_key(chosen)
    N[node] += 1
    Q[node] += ((q - Q[node]) / N[node])

    return (Q, N, q)

end
;

Main MCTS function; called by MCTS wrapper at each time step to choose an action

In [13]:
"""
    best_action_from_state(state, depth, c; n_iterations=100)

Run MCTS from `state` and return the index of the action with the highest
estimated value at the root.

# Arguments
- `state`: starting state; converted with `float` before searching.
- `depth`: search depth passed to `simulate`.
- `c`: exploration constant passed to `simulate`.

# Keywords
- `n_iterations`: number of simulations to run from the root. Defaults to 100,
  the budget that was previously hard-coded (small, chosen for debugging).

Both `depth` and `n_iterations` must be at least 1: otherwise `simulate` never
expands the root and the final lookup in `arg_max_action` raises a `KeyError`.
"""
function best_action_from_state(state, depth, c; n_iterations=100)

    # Fresh search tree for every call
    Q = Dict{Array{Int64,1},Float64}()
    N = Dict{Array{Int64,1},Float64}()

    # empty history at top recursive call
    history = Int64[]

    # Converted once; simulate does not modify the state it is given
    root_state = float(state)

    # counted iterations for now, would switch to time for a production model
    for _ in 1:n_iterations
        simulate(Q, N, root_state, history, depth, c)
    end

    # No exploration bonus here: we want the index of the greedy action
    return arg_max_action(Q, N, history)

end
;

#### MCTS simulation

In [14]:
# global scope params (that are not being experimented with)
# lambda: discount factor applied to future rewards in rollout_random and simulate
lambda = 0.95
;

## Heat Map

In [15]:
# Heat-map sweep: for every target course (and own-ship speed) evaluate MCTS
# on a 401 x 401 grid of integer positions x, y in -200:200 and store the index
# of the chosen action. One CSV file is written per (course, speed) pair.

# Create the output directory before the long computation so that a missing
# directory cannot cause the results to be lost at write time
mkpath("mcts_heatmap")

for course in [0, 30, 60, 90, 120, 150, 180, 210, 240, 270, 300, 330]
    @show course

    for speed in [1]

        best_actions = zeros(Int64, 401, 401)

        for x = -200:200
            # progress output every 10 columns
            if x % 10 == 0
                @show x
            end

            for y = -200:200

                # shift -200:200 onto the array indices 1:401
                array_x = x + 201
                array_y = y + 201

                # generate state in polar form, angle in degrees within [0, 360)
                radius = sqrt(x ^ 2 + y ^ 2)
                theta = atan2(x,y) * 180 / pi
                if theta < 0
                    theta += 360
                elseif theta == 360
                    theta = 0
                end

                state = [radius, theta, course, speed]

                # select action (search depth 5, exploration constant 20)
                best_actions[array_x, array_y] = best_action_from_state(state, 5, 20)

            end
        end

        # Use the loop variable in the name so that adding more speeds does not
        # overwrite the same file; for speed = 1 the name is unchanged
        filename = string("mcts_heatmap/modifiedreward_depth5_course", course, "_speed", speed, ".csv")
        # NOTE(review): DataFrame(matrix) and the writeheader keyword depend on
        # the installed DataFrames/CSV versions — confirm before upgrading
        CSV.write(filename,  DataFrame(best_actions), writeheader=false)

    end
end

course = 0
x = -200
x = -190
x = -180
x = -170
x = -160
x = -150
x = -140
x = -130
x = -120
x = -110
x = -100
x = -90
x = -80
x = -70
x = -60
x = -50
x = -40
x = -30
x = -20
x = -10
x = 0
x = 10
x = 20
x = 30
x = 40
x = 50
x = 60
x = 70
x = 80
x = 90
x = 100
x = 110
x = 120
x = 130
x = 140
x = 150
x = 160
x = 170
x = 180
x = 190
x = 200
course = 30
x = -200
x = -190
x = -180
x = -170
x = -160
x = -150
x = -140
x = -130
x = -120
x = -110
x = -100
x = -90
x = -80
x = -70
x = -60
x = -50
x = -40
x = -30
x = -20
x = -10
x = 0
x = 10
x = 20
x = 30
x = 40
x = 50
x = 60
x = 70


InterruptException: InterruptException:

### Julia scratch space