# REINFORCE: Vanilla Policy Gradient

1. Initialize the policy parameter $\theta$ at random.
2. Generate one trajectory on policy $\pi_{\theta}$: $s_1, a_1, r_2, \cdots, s_T$.
3. For $t = 1, 2, \cdots, T - 1$ (every step at which an action was taken):
    - Estimate the return $G_t = \sum_{k=t+1}^{T} \gamma^{k-t-1} r_k$, i.e. the discounted sum of the rewards that follow step $t$.
    - Update policy parameters: $\theta \leftarrow \theta + \alpha \gamma^t G_t \nabla_\theta \log \pi_{\theta}(a_t | s_t)$

~~~python
    import torch
    from actor import Actor

    class REINFORCE:
        """Vanilla policy-gradient (REINFORCE) learner built around an ``Actor``.

        Each call to :meth:`update` consumes one recorded episode and applies a
        single optimizer step using the gradients accumulated over that episode.
        """

        def __init__(self, state_dim, hidden_dim, action_dim, learning_rate, gamma, device):
            """Create the actor network and its Adam optimizer.

            Args:
                state_dim: Passed to ``Actor`` as its first argument.
                hidden_dim: Passed to ``Actor`` as its second argument.
                action_dim: Passed to ``Actor`` as its third argument.
                learning_rate: Learning rate given to ``torch.optim.Adam``.
                gamma: Discount factor used when accumulating returns.
                device: Torch device the actor is moved to.
            """
            self.actor = Actor(state_dim, hidden_dim, action_dim).to(device)
            self.optimizer = torch.optim.Adam(self.actor.parameters(), lr=learning_rate)
            self.gamma = gamma
            self.device = device

        def update(self, rollout_buffer):
            """Perform one REINFORCE step from a single recorded episode.

            The episode is walked backwards so the discounted return can be built
            incrementally as ``G_t = reward_t + gamma * G_{t+1}``. The per-step loss
            is ``-log pi(a_t | s_t) * G_t``; the additional ``gamma ** t`` weighting
            that appears in some textbook statements of the rule is not applied.

            Args:
                rollout_buffer: Episode storage providing ``reward_list`` (supports
                    ``len`` and integer indexing) and ``obs_list`` (supports
                    ``[i, :]`` indexing).
                    NOTE(review): this method also reads ``action_list[i]`` as the
                    integer index of the action taken at step ``i`` -- the
                    attribute name is assumed by analogy with the other two
                    lists; confirm against the buffer class.
            """
            G = 0
            self.optimizer.zero_grad()
            for i in reversed(range(len(rollout_buffer.reward_list))):
                reward = rollout_buffer.reward_list[i]
                obs = rollout_buffer.obs_list[i, :]
                # int() accepts both plain ints and 0-d tensors/arrays.
                action = int(rollout_buffer.action_list[i])
                # Assumes the actor outputs action probabilities over its last
                # dimension (the original code took torch.log of this output).
                action_probs = self.actor(obs)
                # Only the log-probability of the action actually taken enters
                # the loss; taking the log of the whole vector produced a
                # non-scalar loss that backward() cannot differentiate.
                action_log_prob = torch.log(action_probs[..., action])
                G = self.gamma * G + reward
                loss = -action_log_prob * G
                # Gradients accumulate across steps; one optimizer step follows.
                loss.backward()
            self.optimizer.step()
~~~

In [1]:
import gymnasium as gym

from alg.policy_gradient import VPGAgent

# Create the Gymnasium CartPole-v1 environment the agent will be trained on.
env = gym.make('CartPole-v1')

# Hyperparameters handed to the agent and the training loop below.
learning_rate = 1e-4
gamma = 0.99  # presumably the reward discount factor -- confirm in VPGAgent
num_episodes = 1000

# VPGAgent comes from alg.policy_gradient; its implementation is not shown here.
agent = VPGAgent(env, learning_rate, gamma)

# Train for num_episodes episodes; the log that follows shows one
# "Episode: i/N, Total Reward: ..." line per episode.
agent.train(num_episodes)

Episode: 1/1000, Total Reward: 15.0
Episode: 2/1000, Total Reward: 52.0
Episode: 3/1000, Total Reward: 14.0
Episode: 4/1000, Total Reward: 16.0
Episode: 5/1000, Total Reward: 25.0
Episode: 6/1000, Total Reward: 31.0
Episode: 7/1000, Total Reward: 13.0
Episode: 8/1000, Total Reward: 18.0
Episode: 9/1000, Total Reward: 14.0
Episode: 10/1000, Total Reward: 16.0
Episode: 11/1000, Total Reward: 21.0
Episode: 12/1000, Total Reward: 26.0
Episode: 13/1000, Total Reward: 16.0
Episode: 14/1000, Total Reward: 38.0
Episode: 15/1000, Total Reward: 16.0
Episode: 16/1000, Total Reward: 15.0
Episode: 17/1000, Total Reward: 16.0
Episode: 18/1000, Total Reward: 34.0
Episode: 19/1000, Total Reward: 16.0
Episode: 20/1000, Total Reward: 30.0
Episode: 21/1000, Total Reward: 20.0
Episode: 22/1000, Total Reward: 23.0
Episode: 23/1000, Total Reward: 13.0
Episode: 24/1000, Total Reward: 22.0
Episode: 25/1000, Total Reward: 22.0
Episode: 26/1000, Total Reward: 19.0
Episode: 27/1000, Total Reward: 12.0
Episode: 2