In [None]:
using Random

# Simple multi-armed bandit environment.
struct Bandit
    # True reward probability of each action; entry `a` is the chance that
    # taking action `a` yields a reward.
    true_probs::Vector{Float64}
end

# Take `action` once in the bandit and return the sampled reward:
# 1.0 with probability `bandit.true_probs[action]`, otherwise 0.0.
function step(bandit::Bandit, action::Int)
    draw = rand()
    paid_out = draw < bandit.true_probs[action]
    return paid_out ? 1.0 : 0.0
end

# Policy "network" for a softmax policy over a fixed set of actions.
mutable struct PolicyNetwork
    # One score (logit) per action; `select_action` turns these into
    # probabilities with `softmax`.
    params::Vector{Float64}
end

# Numerically stable softmax: turn a vector of logits into probabilities.
# The largest logit is subtracted before exponentiating so `exp` cannot
# overflow; the shift cancels out in the normalization.
function softmax(logits::Vector{Float64})
    peak = maximum(logits)
    weights = exp.(logits .- peak)
    total = sum(weights)
    return weights ./ total
end

# Sample an action from the softmax policy.
# Returns `(action, probs)`: the sampled action index and the probability
# vector it was drawn from (callers use `probs` for the gradient update).
function select_action(policy::PolicyNetwork)
    probs = softmax(policy.params)
    # Inverse-CDF sampling with Base only. The previous `Categorical(probs)`
    # comes from the Distributions package, which this file never loads, so
    # the call failed with an UndefVarError.
    u = rand()
    idx = findfirst(c -> u < c, cumsum(probs))
    # Round-off can leave the last cumulative sum slightly below 1; in that
    # case fall back to the final action instead of returning `nothing`.
    action = something(idx, lastindex(probs))
    return action, probs
end

# REINFORCE on a multi-armed bandit with a softmax policy.
#
# Keyword arguments (defaults reproduce the original hard-coded experiment):
#   true_probs    - reward probability of each action
#   learning_rate - step size of the gradient-ascent update
#   num_episodes  - number of single-step episodes to run
#   baseline      - constant subtracted from the reward before the update
#   seed          - seed passed to `Random.seed!` for reproducibility
#
# Returns the learned action probabilities (softmax of the final parameters).
function reinforce(; true_probs::AbstractVector{<:Real}=[0.3, 0.7],
                   learning_rate::Real=0.1,
                   num_episodes::Integer=1000,
                   baseline::Real=0.5,
                   seed::Integer=42)
    Random.seed!(seed)
    bandit = Bandit(true_probs)
    policy = PolicyNetwork(zeros(length(true_probs)))  # start from a uniform policy

    for _ in 1:num_episodes
        action, probs = select_action(policy)
        reward = step(bandit, action)
        advantage = reward - baseline

        # For a softmax policy the gradient of log pi(action) with respect to
        # parameter i is (1 if i == action else 0) - probs[i].
        for i in eachindex(policy.params)
            grad_log_prob = (i == action ? 1.0 : 0.0) - probs[i]
            policy.params[i] += learning_rate * advantage * grad_log_prob
        end
    end
    return softmax(policy.params)
end

# Train the bandit policy and print the learned action probabilities.
learned_policy = reinforce()
println("Learned Policy Probabilities: ", learned_policy)

In [None]:
using Random

# Grid maze environment.
mutable struct MazeEnv
    # Maze map; cell codes: 0 = corridor, 1 = wall, 2 = goal.
    grid::Matrix{Int}
    # Position the agent is returned to on reset, used as an index into `grid`.
    start::Tuple{Int,Int}
    # Position that ends the episode when reached.
    goal::Tuple{Int,Int}
    # Current position of the agent.
    agent_pos::Tuple{Int,Int}
end

# Put the agent back on the start cell and return that position.
function reset!(env::MazeEnv)
    env.agent_pos = env.start
    current = env.agent_pos
    return current
end

# Apply `action` to the maze and return `(agent_pos, reward, done)`.
#
# Actions: 1 = left, 2 = right, 3 = up, 4 = down. Positions index `env.grid`
# as (row, column). The agent moves only if the target cell lies inside the
# grid and is not a wall; otherwise it stays where it is.
# Reward is 1.0 on reaching the goal and -0.01 for every other step.
function step!(env::MazeEnv, action::Int)
    moves = ((0, -1), (0, 1), (-1, 0), (1, 0))  # left, right, up, down
    drow, dcol = moves[action]
    new_pos = (env.agent_pos[1] + drow, env.agent_pos[2] + dcol)

    # Leaving the grid is treated like hitting a wall. Without this bounds
    # check, indexing the grid threw a BoundsError on the first move off the
    # edge (for example "left" from the start cell (1, 1)).
    if checkbounds(Bool, env.grid, new_pos...) && env.grid[new_pos...] != 1
        env.agent_pos = new_pos
    end

    done = env.agent_pos == env.goal
    reward = done ? 1.0 : -0.01
    return env.agent_pos, reward, done
end

# Policy "network" for the maze (a simple lookup table).
# NOTE(review): a struct with the same name but a different field type is
# defined earlier in this file; evaluating both in one session may fail with
# a redefinition error - confirm the two cells are run separately.
mutable struct PolicyNetwork
    # Maps each state (grid position) to its vector of action scores; the
    # scores are converted to probabilities with `softmax` when sampling.
    params::Dict{Tuple{Int,Int},Vector{Float64}}
end

# Softmax over a vector of logits, returning probabilities that sum to one.
# Logits are shifted by their maximum first so the exponentials stay finite.
function softmax(logits::Vector{Float64})
    shifted = logits .- maximum(logits)
    unnormalized = map(exp, shifted)
    return unnormalized / sum(unnormalized)
end

# Sample an action for `state` from the softmax policy stored in the table.
# Returns `(action, probs)`: the sampled action index and the probability
# vector it was drawn from (kept by the caller for the gradient update).
function select_action(policy::PolicyNetwork, state::Tuple{Int,Int})
    probs = softmax(policy.params[state])
    # Inverse-CDF sampling with Base only. The previous `Categorical(probs)`
    # comes from the Distributions package, which this file never loads, so
    # the call failed with an UndefVarError.
    u = rand()
    cumulative = 0.0
    # Default to the last action so round-off in the running sum can never
    # leave the sample unassigned.
    action = lastindex(probs)
    for (i, p) in pairs(probs)
        cumulative += p
        if u < cumulative
            action = i
            break
        end
    end
    return action, probs
end

# REINFORCE (Monte Carlo policy gradient) on a fixed 5x5 maze.
#
# Keyword arguments (defaults reproduce the original hard-coded experiment):
#   learning_rate - step size of the gradient-ascent update
#   num_episodes  - number of episodes to train for
#   max_steps     - cap on steps per episode so an episode always terminates,
#                   even if the policy never reaches the goal
#   seed          - seed passed to `Random.seed!` for reproducibility
#
# Returns the trained `PolicyNetwork` (per-state action scores).
function reinforce_maze(; learning_rate::Real=0.1,
                        num_episodes::Integer=500,
                        max_steps::Integer=10_000,
                        seed::Integer=42)
    Random.seed!(seed)
    grid = [0 0 1 0 0; 0 1 1 0 2; 0 0 0 0 1; 1 0 1 0 0; 0 0 0 1 0]
    env = MazeEnv(grid, (1, 1), (2, 5), (1, 1))
    num_actions = 4  # left, right, up, down

    # One zero-initialized score vector per non-wall cell (uniform policy).
    params = Dict{Tuple{Int,Int},Vector{Float64}}()
    for idx in CartesianIndices(grid)
        if grid[idx] != 1
            params[Tuple(idx)] = zeros(num_actions)
        end
    end
    policy = PolicyNetwork(params)

    for _ in 1:num_episodes
        state = reset!(env)
        # Concretely typed so the update loop below is not working on Any.
        trajectory = Tuple{Tuple{Int,Int},Int,Float64,Vector{Float64}}[]
        done = false

        # Roll out one episode with the current policy.
        while !done && length(trajectory) < max_steps
            action, probs = select_action(policy, state)
            new_state, reward, done = step!(env, action)
            push!(trajectory, (state, action, reward, probs))
            state = new_state
        end

        # Walk the episode backwards so G is the (undiscounted) return from
        # each step onward, and apply the softmax policy-gradient update with
        # the probabilities that were in effect when the action was sampled.
        G = 0.0
        for (visited, action, reward, probs) in Iterators.reverse(trajectory)
            G += reward
            scores = policy.params[visited]
            for i in eachindex(scores)
                indicator = i == action ? 1.0 : 0.0
                scores[i] += learning_rate * G * (indicator - probs[i])
            end
        end
    end
    return policy
end

# Train on the maze and print the returned policy object.
learned_policy = reinforce_maze()
println("Learned Policy: ", learned_policy)