# Policy Gradient Methods in Reinforcement Learning

**Objective**: Learn and implement Policy Gradient methods, specifically the REINFORCE algorithm.

## Key Concepts:
- **Policy Gradient**: Directly optimize the policy parameters using gradient ascent
- **REINFORCE**: A Monte Carlo policy gradient algorithm  
- **Policy Network**: Neural network that outputs action probabilities
- **Advantage**: How much better an action's reward is than a baseline (e.g., the average reward)

## Why Policy Gradients?
- Can handle continuous action spaces
- Can learn stochastic policies
- Can be more stable than value-based methods, since small parameter updates change the action probabilities only gradually
- Direct policy optimization

In [1]:
# Simple imports for Policy Gradient
import numpy as np
import matplotlib.pyplot as plt

# Seed NumPy's global RNG so every draw made through np.random in this
# notebook is reproducible from run to run.
np.random.seed(42)
print("Policy Gradient setup ready!")

Policy Gradient setup complete!
We'll implement REINFORCE algorithm step by step


In [None]:
# Environment: 3-armed bandit
class SimpleBandit:
    """Stationary multi-armed bandit with Bernoulli (0/1) rewards.

    Pulling arm ``a`` pays 1.0 with probability ``arm_probs[a]`` and 0.0
    otherwise. There is no state: every call to :meth:`step` is independent.

    Args:
        arm_probs: Success probability of each arm. When omitted, the
            three-armed problem ``[0.2, 0.5, 0.8]`` is used (arm 2 is best).

    Raises:
        ValueError: If ``arm_probs`` is empty or contains a value outside
            the interval [0, 1].
    """

    def __init__(self, arm_probs=None):
        if arm_probs is None:
            arm_probs = [0.2, 0.5, 0.8]  # arm 2 is best
        # Copy so later changes to the caller's sequence cannot alter the env.
        arm_probs = list(arm_probs)
        if not arm_probs:
            raise ValueError("arm_probs must contain at least one arm")
        if any(not 0.0 <= p <= 1.0 for p in arm_probs):
            raise ValueError("every arm probability must lie in [0, 1]")
        self.arm_probs = arm_probs

    def step(self, action):
        """Pull arm ``action`` and return the sampled reward (1.0 or 0.0)."""
        return 1.0 if np.random.random() < self.arm_probs[action] else 0.0

env = SimpleBandit()

In [None]:
# Simple policy using action probabilities
class SimplePolicy:
    """Stochastic policy stored as an explicit vector of action probabilities.

    The update acts directly on the probabilities (not on softmax logits):
    the chosen action's probability is shifted by ``learning_rate * advantage``
    and the opposite amount is split evenly across the other actions. The
    vector is then floored at ``min_prob`` and renormalized to sum to one.

    Args:
        n_actions: Number of discrete actions.
        learning_rate: Step size multiplying the advantage in each update.
        baseline: Value subtracted from the reward to form the advantage.
        min_prob: Floor applied to every probability before renormalizing,
            so no action's probability can be driven to zero.
    """

    def __init__(self, n_actions=3, learning_rate=0.1, baseline=0.5, min_prob=0.01):
        self.n_actions = n_actions
        self.lr = learning_rate
        self.baseline = baseline
        self.min_prob = min_prob
        # Start from the uniform distribution.
        self.action_probs = np.ones(n_actions) / n_actions

    def get_action(self):
        """Sample an action index according to the current probabilities."""
        return np.random.choice(self.n_actions, p=self.action_probs)

    def update(self, action, reward):
        """Shift probability toward (or away from) ``action`` given ``reward``.

        Args:
            action: Index of the action that was taken.
            reward: Reward observed for that action.
        """
        advantage = reward - self.baseline
        step = self.lr * advantage

        # The chosen action moves by +step; the remaining actions share -step
        # equally, so the vector still sums to one before clipping.
        delta = np.zeros(self.n_actions)
        if self.n_actions > 1:
            delta -= step / (self.n_actions - 1)
        delta[action] = step

        # Keep probabilities valid: floor, then renormalize.
        probs = np.maximum(self.action_probs + delta, self.min_prob)
        self.action_probs = probs / np.sum(probs)

policy = SimplePolicy()

In [None]:
# REINFORCE training
def train_policy_gradient(env, policy, episodes=200):
    """Run ``episodes`` one-step interactions, updating the policy after each.

    Every episode asks ``policy`` for an action, passes it to ``env.step``,
    and feeds the resulting reward back through ``policy.update``.

    Args:
        env: Object exposing ``step(action)`` that returns a reward.
        policy: Object exposing ``get_action()`` and ``update(action, reward)``.
        episodes: Number of interactions to run.

    Returns:
        List of the rewards observed, one per episode, in order.
    """
    def run_episode():
        chosen = policy.get_action()
        payoff = env.step(chosen)
        policy.update(chosen, payoff)
        return payoff

    return [run_episode() for _ in range(episodes)]

# Train and visualize
rewards = train_policy_gradient(env, policy, episodes=300)

plt.figure(figsize=(12, 4))

# Left panel: raw per-episode rewards with a trailing moving average on top.
plt.subplot(1, 2, 1)
plt.plot(rewards, alpha=0.3)
window = 20
# Trailing mean over the last `window` rewards ending at episode i. The slice
# starts at i - window + 1 so a full window holds exactly `window` values;
# the first few episodes average whatever history is available.
moving_avg = [
    np.mean(rewards[max(0, i - window + 1):i + 1])
    for i in range(len(rewards))
]
plt.plot(moving_avg, linewidth=2)
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.title('Learning Progress')

# Right panel: action probabilities of the policy after training.
plt.subplot(1, 2, 2)
arms = ['Arm 0', 'Arm 1', 'Arm 2']
plt.bar(arms, policy.action_probs, color=['red', 'orange', 'green'])
plt.ylabel('Probability')
plt.title('Final Policy')

plt.tight_layout()
plt.show()

In [None]:
# Comparison with random policy
class RandomPolicy:
    """Reference policy that never learns and picks actions at random."""

    def __init__(self, n_actions=3):
        """Remember how many actions are available."""
        self.n_actions = n_actions

    def get_action(self):
        """Return an action index drawn with ``np.random.choice``."""
        num_choices = self.n_actions
        return np.random.choice(num_choices)

# Evaluate the trained policy against the uniform-random reference.
random_policy = RandomPolicy()

# 100 pulls each; the trained policy is sampled first, then the random one.
n_eval_pulls = 100
pg_rewards = [env.step(policy.get_action()) for _ in range(n_eval_pulls)]
random_rewards = [env.step(random_policy.get_action()) for _ in range(n_eval_pulls)]

# Compute each average once and reuse it for both the chart and the printout.
policy_names = ['Random', 'Policy Gradient']
mean_rewards = [np.mean(random_rewards), np.mean(pg_rewards)]

plt.figure(figsize=(8, 4))
plt.bar(policy_names, mean_rewards, color=['red', 'blue'])
plt.ylabel('Average Reward')
plt.title('Policy Comparison')
plt.show()

print(f"Random Policy: {mean_rewards[0]:.3f}")
print(f"Policy Gradient: {mean_rewards[1]:.3f}")

In [None]:
# Final results
# Derive both the true best arm and the arm the policy currently favours,
# so the printed claim is only made when it actually holds.
true_best = int(np.argmax(env.arm_probs))
learned_best = int(np.argmax(policy.action_probs))

if learned_best == true_best:
    print("Policy learned to prefer the best arm:")
else:
    print(f"Policy did not converge to the best arm (it prefers arm {learned_best}):")
for i, prob in enumerate(policy.action_probs):
    print(f"Arm {i}: {prob:.3f}")
if learned_best == true_best:
    print(f"Best arm ({true_best}) has highest probability: {policy.action_probs[true_best]:.3f}")
else:
    print(f"Best arm ({true_best}) has probability: {policy.action_probs[true_best]:.3f}")

In [None]:
# Policy Gradient summary
print("Policy Gradient Method:")
print("- Directly optimizes action probabilities")
print("- Increases probability of good actions")
print("- Decreases probability of bad actions")
print("- Works for continuous and discrete actions")

print("\nBaseline variance reduction example:")
returns = np.array([10, 8, 12, 6, 14])
baseline = np.mean(returns)
advantages = returns - baseline
print(f"Returns: {returns}")
print(f"Advantages: {advantages}")
# Subtracting a constant leaves np.var unchanged, so comparing the variance of
# the returns with that of the advantages shows no reduction at all. What the
# baseline shrinks is the size of the weights that multiply grad-log-pi in the
# gradient estimate; their mean square is the quantity reported here.
raw_weight_power = np.mean(returns ** 2)
centered_weight_power = np.mean(advantages ** 2)
print(
    "Mean squared gradient weight reduced from "
    f"{raw_weight_power:.1f} to {centered_weight_power:.1f}"
)