In [16]:
# Supertype of the environment models accepted as the `mdp` argument by the
# learning routines in this file.
abstract type MDP end
# Supertype of environment states (the `s` / `s0` arguments of the routines below).
abstract type Estado end
# Supertype for actions. Not referenced by any signature in the visible code;
# presumably subtyped by concrete environments — TODO confirm.
abstract type Accion end

## $\epsilon$-Greedy

In [17]:
"""
    ep_greedy(mdp::MDP, ϵ::Float64, s::Estado, q_value)

Choose an action for state `s` with an ϵ-greedy rule.

With probability `ϵ` a legal action is drawn uniformly at random from
`aLegales(mdp, s)`; otherwise the legal action with the largest `q_value[(s, a)]`
is returned.

# Arguments
- `mdp`: environment, passed through to `aLegales`.
- `ϵ`: exploration probability.
- `s`: state for which an action is chosen.
- `q_value`: action-value table indexed by `(state, action)` tuples.

# Returns
The selected action. Ties between maximal actions are resolved by `findmax` on a
`Dict`, i.e. by that dictionary's iteration order.
"""
function ep_greedy(mdp::MDP, ϵ::Float64, s::Estado, q_value)
    # `rand()` is uniform on [0, 1) and can return exactly 0.0, so the comparison must
    # be strict: with `<=`, ϵ = 0.0 could still (very rarely) pick a random action.
    explorar = rand() < ϵ
    acciones = aLegales(mdp, s)

    if explorar
        return rand(acciones)
    end

    # Greedy choice: `findmax` on a Dict returns `(max_value, key)`; the key is the action.
    valores = Dict(a => q_value[(s, a)] for a in acciones)
    return findmax(valores)[2]
end

ep_greedy (generic function with 1 method)

## $Softmax$

In [18]:
"""
    softmax()

Placeholder for a softmax action-selection rule: it takes no arguments and always
returns `0`.
"""
softmax() = 0

softmax (generic function with 1 method)

In [19]:
"""
    politica(mdp::MDP, q_value)

Extract the greedy policy from an action-value table.

For every state in `mdp.states` that is not terminal (according to `terminal(mdp, s)`),
the legal action (from `aLegales(mdp, s)`) with the largest `q_value[(s, a)]` is
selected.

# Returns
A `Dict` mapping each non-terminal state to its greedy action. The dictionary is
empty when there are no non-terminal states.
"""
function politica(mdp::MDP, q_value)
    s_legales = [s for s in mdp.states if !terminal(mdp, s)]

    # Build the mapping directly. The previous version seeded the Dict with a random
    # state/action pair only to fix its type, which consumed two RNG draws and threw
    # when `s_legales` was empty; every seeded entry was overwritten anyway.
    return Dict(
        # `findmax` on a Dict returns `(max_value, key)`; the key is the action.
        s => findmax(Dict(a => q_value[(s, a)] for a in aLegales(mdp, s)))[2]
        for s in s_legales
    )
end

politica (generic function with 1 method)

## Sarsa

In [20]:
"""
    sarsa(mdp::MDP, q_value, s0::Estado, ϵ::Float64, α::Float64, episodes,
          random_start, runs)

Run tabular Sarsa (undiscounted: no discount factor is applied) for `runs` passes of
`episodes` episodes each, updating `q_value` in place.

# Arguments
- `mdp`: environment; used through `mdp.states`, `terminal`, `step`, `r` and `aLegales`.
- `q_value`: action-value table indexed by `(state, action)`; mutated in place.
  It is not reset between runs, so later runs continue from the learned values.
- `s0`: start state used when `random_start` is `false`.
- `ϵ`: exploration probability handed to `ep_greedy`.
- `α`: step size.
- `episodes`: number of episodes per run.
- `random_start`: when `true`, each episode starts from a random non-terminal state
  drawn from `mdp.states`.
- `runs`: number of passes over which the per-episode statistics are averaged.

# Returns
`(π_, steps, Rewards)`: the greedy policy from `politica(mdp, q_value)`, the number of
steps of each episode index averaged over runs, and the accumulated reward of each
episode index averaged over runs.
"""
function sarsa(mdp::MDP, q_value, s0::Estado, ϵ::Float64, α::Float64, episodes,
        random_start, runs)

    steps = zeros(episodes)
    Rewards = zeros(episodes)

    for _ ∈ 1:runs
        for ep ∈ 1:episodes
            # number of transitions taken in this episode
            n_steps = 0
            # reward accumulated in this episode
            rewards = 0.0

            # initialize state
            if random_start
                state = rand(mdp.states)
                while terminal(mdp, state)
                    state = rand(mdp.states)
                end
            else
                state = s0
            end

            # first action, chosen ϵ-greedily
            action = ep_greedy(mdp, ϵ, state, q_value)

            # keep going until a terminal state is reached
            while !terminal(mdp, state)
                next_state = step(mdp, state, action)

                # Evaluate the transition reward exactly once so the value that is
                # logged is the same one used in the update (the original called `r`
                # twice), and count it for every transition: the original left the
                # final transition's reward out of the episode return even though it
                # entered the Q-update.
                reward = r(mdp, state, action, next_state)
                rewards += reward

                if !terminal(mdp, next_state)
                    next_action = ep_greedy(mdp, ϵ, next_state, q_value)

                    # Sarsa update: bootstrap from the action actually selected next
                    q_value[(state, action)] += α * (reward +
                        q_value[(next_state, next_action)] - q_value[(state, action)])

                    action = next_action
                else
                    # Terminal successor: its own reward replaces the bootstrap term
                    rew = r(mdp, next_state)
                    q_value[(state, action)] += α * (reward + rew -
                        q_value[(state, action)])
                    rewards += rew
                end

                state = next_state
                n_steps += 1
            end

            Rewards[ep] += rewards
            steps[ep] += n_steps
        end
    end

    # average the per-episode statistics over the runs
    Rewards /= runs
    steps /= runs

    π_ = politica(mdp, q_value)
    return π_, steps, Rewards
end

sarsa (generic function with 1 method)

## Expected Sarsa

In [21]:
"""
    expected_sarsa(mdp::MDP, q_value, s0::Estado, ϵ::Float64, α::Float64, γ::Float64,
                   episodes, random_start, runs)

Run tabular Expected Sarsa for `runs` passes of `episodes` episodes each, updating
`q_value` in place (the table is not reset between runs).

For a non-terminal successor the target is `γ` times the expectation of
`q_value[(next_state, a)]` under the ϵ-greedy distribution, where the greedy mass
`1 - ϵ` is shared equally among all maximal actions. For a terminal successor the
target is `r(mdp, next_state)`, which is not multiplied by `γ`.

# Returns
`(π_, steps, Rewards)`: the greedy policy from `politica(mdp, q_value)`, and the
per-episode step counts and summed transition rewards, both averaged over runs.
"""
function expected_sarsa(mdp::MDP,q_value, s0::Estado, ϵ::Float64, α::Float64,γ::Float64, 
        episodes,random_start,runs)

    episode_steps = zeros(episodes)
    episode_returns = zeros(episodes)

    for _ ∈ 1:runs
        for ep ∈ 1:episodes
            n_steps = 0

            # start either at `s0` or at a random non-terminal state
            state = s0
            if random_start
                state = rand(mdp.states)
                while terminal(mdp, state)
                    state = rand(mdp.states)
                end
            end

            # first action, chosen ϵ-greedily
            action = ep_greedy(mdp, ϵ, state, q_value)
            total = 0.0

            # loop until a terminal state is reached
            while !terminal(mdp, state)
                next_state = step(mdp, state, action)
                reward = r(mdp, state, action, next_state)
                total += reward

                if terminal(mdp, next_state)
                    target = r(mdp, next_state)
                    next_action = nothing
                else
                    next_action = ep_greedy(mdp, ϵ, next_state, q_value)

                    # all actions that attain the maximal value in the next state
                    q_next = Dict(
                        a => q_value[(next_state, a)] for a in aLegales(mdp, next_state)
                    )
                    q_best = findmax(q_next)[1]
                    best_actions = [a for (a, q) in q_next if q == q_best]

                    accionesL = aLegales(mdp, next_state)
                    p_greedy = (1.0 - ϵ) / length(best_actions)
                    p_explore = ϵ / length(accionesL)

                    # expectation of Q under the ϵ-greedy distribution
                    expected = 0.0
                    for a in accionesL
                        p = a in best_actions ? p_greedy + p_explore : p_explore
                        expected += p * q_value[(next_state, a)]
                    end

                    target = expected * γ
                end

                q_value[(state, action)] += α * (reward + target - q_value[(state, action)])
                state = next_state
                action = next_action
                n_steps += 1
            end

            episode_returns[ep] += total
            episode_steps[ep] += n_steps
        end
    end

    # average the per-episode statistics over the runs
    episode_returns /= runs
    episode_steps /= runs

    π_ = politica(mdp, q_value)
    return π_, episode_steps, episode_returns
end

expected_sarsa (generic function with 1 method)

## Q-learning

In [22]:
"""
    q_learning(mdp::MDP, q_value, s0::Estado, ϵ::Float64, α::Float64, γ::Float64,
               episodes, random_start, runs)

Run tabular Q-learning for `runs` passes of `episodes` episodes each, updating
`q_value` in place (the table is not reset between runs).

# Arguments
- `mdp`: environment; used through `mdp.states`, `terminal`, `step`, `r` and `aLegales`.
- `q_value`: action-value table indexed by `(state, action)`; mutated in place.
- `s0`: start state used when `random_start` is `false`.
- `ϵ`: exploration probability handed to `ep_greedy`.
- `α`: step size.
- `γ`: discount factor applied to the bootstrap term.
- `episodes`: number of episodes per run.
- `random_start`: when `true`, each episode starts from a random non-terminal state.
- `runs`: number of passes over which the per-episode statistics are averaged.

# Returns
`(π_, steps, Rewards)`: the greedy policy from `politica(mdp, q_value)`, and the
per-episode step counts and summed transition rewards, both averaged over runs.
"""
function q_learning(mdp::MDP, q_value, s0::Estado, ϵ::Float64, α::Float64, γ::Float64,
        episodes, random_start, runs)

    steps = zeros(episodes)
    Rewards = zeros(episodes)

    for _ ∈ 1:runs
        for ep ∈ 1:episodes
            # number of transitions and accumulated reward of this episode
            n_steps = 0
            rewards = 0.0

            # initialize state
            if random_start
                state = rand(mdp.states)
                while terminal(mdp, state)
                    state = rand(mdp.states)
                end
            else
                state = s0
            end

            while !terminal(mdp, state)
                # behaviour policy: ϵ-greedy on the current table
                action = ep_greedy(mdp, ϵ, state, q_value)
                next_state = step(mdp, state, action)
                reward = r(mdp, state, action, next_state)
                rewards += reward

                # Bootstrap from the best action in the successor. A successor with no
                # legal actions (e.g. a terminal state) contributes 0.0; the original
                # called `maximum` on an empty array there, which throws.
                # NOTE(review): when a terminal state does expose legal actions, its
                # `q_value` entries are still used, as before.
                next_actions = aLegales(mdp, next_state)
                bootstrap = if isempty(next_actions)
                    0.0
                else
                    maximum(q_value[(next_state, a)] for a in next_actions)
                end

                # Q-learning update
                q_value[(state, action)] += α * (reward + γ * bootstrap -
                    q_value[(state, action)])

                state = next_state
                n_steps += 1
            end

            Rewards[ep] += rewards
            steps[ep] += n_steps
        end
    end

    # average the per-episode statistics over the runs
    Rewards /= runs
    steps /= runs

    π_ = politica(mdp, q_value)
    return π_, steps, Rewards
end

q_learning (generic function with 1 method)

# semi_gradient_n_step_sarsa

In [33]:
"""
    semi_gradient_n_step_sarsa(mdp::MDP, value_function, funAction, ϵ, n=1)

Play one episode of (undiscounted) n-step Sarsa, training `value_function` through
`learn`.

# Arguments
- `mdp`: environment. Here `step(mdp, s, a)` must return a `(next_state, reward)`
  pair, and `terminal(mdp, s)` tells whether `s` ends the episode.
- `value_function`: approximator queried with `value(value_function, mdp, s, a)` and
  trained with `learn(value_function, s, a, target)`.
- `funAction`: action selector called as `funAction(mdp, s, value_function, ϵ)`.
- `ϵ`: exploration parameter forwarded to `funAction`.
- `n`: number of steps in the n-step return (default 1).

The start state comes from `getInit()`. The episode is cut off once the 1-based time
counter reaches 9000.

# Returns
The final value of the time counter. It starts at 1 and is incremented once per loop
iteration, so it is one more than the number of iterations performed.
"""
function semi_gradient_n_step_sarsa(mdp::MDP, value_function, funAction, ϵ, n=1)

    s = getInit()

    # The original body referred to an undefined global `mc` everywhere the
    # environment was needed; the `mdp` argument is used instead.
    a = funAction(mdp, s, value_function, ϵ)

    # Trajectory buffers, 1-based: states[k] / actions[k] are the state and action at
    # time k, and rewards[k] is the reward received on entering states[k].
    states = [s]
    actions = [a]
    rewards = [0.0]

    time = 1

    # Index of the last state of the episode; set to the actual terminal time as soon
    # as a terminal state is reached. 9000 acts as a hard cap until then.
    T = 9000

    while true
        time += 1
        if time <= T
            next_s, reward = step(mdp, s, a)
            next_a = funAction(mdp, next_s, value_function, ϵ)

            # track new state and action
            push!(states, next_s)
            push!(actions, next_a)
            push!(rewards, reward)

            if terminal(mdp, next_s)
                T = time
            end

            # Advance only when a step was taken: on later iterations `next_s` and
            # `next_a` are not assigned, so advancing unconditionally would fail.
            s = next_s
            a = next_a
        end

        # time index whose estimate is updated in this iteration
        update_time = time - n
        # Arrays are 1-based, so the first valid index is 1 (the original tested
        # `>= 0`, which indexes position 0 whenever n >= 2).
        if update_time >= 1
            returns = 0.0
            # n-step return: rewards received after leaving states[update_time].
            # The sum starts at update_time + 1; starting at update_time would also
            # count the reward obtained on entering the state being updated.
            for t in (update_time + 1):min(T, update_time + n)
                returns += rewards[t]
            end

            # add the estimated state-action value to the return
            if update_time + n <= T
                returns += value(value_function, mdp, states[update_time + n],
                                 actions[update_time + n])
            end

            # update the value function (never for a terminal state)
            if !terminal(mdp, states[update_time])
                learn(value_function, states[update_time], actions[update_time],
                      returns)
            end
        end

        # the state just before the final one has been updated: episode finished
        if update_time == T - 1
            break
        end
    end
    return time

end

semi_gradient_n_step_sarsa (generic function with 2 methods)