In [15]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
from tqdm import tqdm

class Network(nn.Module):
    """Fully connected policy network.

    Two ReLU-activated hidden layers of width ``hidden_size`` followed by a
    linear head of width ``action_size`` whose outputs are normalised with a
    softmax over the last dimension.
    """

    def __init__(self, state_size, action_size, hidden_size):
        super().__init__()
        # Layers are created in the order fc1 -> fc2 -> policy; the attribute
        # names double as the state_dict keys.
        self.fc1 = nn.Linear(state_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.policy = nn.Linear(hidden_size, action_size)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        """Return softmax-normalised outputs of the policy head for ``x``."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = torch.relu(layer(hidden))
        action_scores = self.policy(hidden)
        return self.softmax(action_scores)


In [None]:

class PolicyGradient:
    """Policy-gradient agent built around a ``Network`` policy.

    The agent samples actions from the categorical distribution produced by
    the network and is trained by minimising
    ``-(log_prob(action) * reward).mean()`` with Adam.
    """

    def __init__(self, env, state_size, action_size, hidden_size, lr, gamma):
        """Store the configuration and build the policy network and optimizer.

        Args:
            env: Environment handle; stored on the instance, not used here.
            state_size: Number of input features of the policy network.
            action_size: Number of discrete actions (network output width).
            hidden_size: Width of the network's hidden layers.
            lr: Learning rate passed to Adam.
            gamma: Discount factor; stored on the instance for callers.
        """
        self.env = env
        self.state_size = state_size
        self.action_size = action_size
        self.lr = lr
        self.gamma = gamma
        self.model = Network(state_size, action_size, hidden_size)
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)

    def get_action(self, state):
        """Sample one action for a single observation.

        Args:
            state: A single observation (array-like of numbers).

        Returns:
            The sampled action index as a Python ``int``.
        """
        # np.asarray accepts float64 arrays and plain sequences alike and
        # yields the float32 input the network's parameters expect.
        state = torch.as_tensor(np.asarray(state, dtype=np.float32)).unsqueeze(0)
        # Only the sampled index is returned, so no autograd graph is needed.
        with torch.no_grad():
            probs = self.model(state)
        return Categorical(probs).sample().item()

    def compute_loss(self, states, actions, rewards):
        """Compute the policy-gradient loss for a batch of transitions.

        Args:
            states: Sequence of observations, one per step.
            actions: Sequence of integer action indices, one per step.
            rewards: Per-step weights for the log-probabilities (e.g.
                discounted returns); list, array or tensor of any float dtype.

        Returns:
            Scalar tensor ``-(log_prob(actions) * rewards).mean()``.
        """
        # Stack observations into one ndarray first: building a tensor from a
        # list of ndarrays is slow. torch.as_tensor with an explicit dtype
        # also accepts tensors of a different dtype, which the legacy
        # torch.FloatTensor constructor rejects with a TypeError.
        states = torch.as_tensor(np.asarray(states, dtype=np.float32))
        actions = torch.as_tensor(actions, dtype=torch.long)
        rewards = torch.as_tensor(rewards, dtype=torch.float32)

        probs = self.model(states)
        log_probs = Categorical(probs).log_prob(actions)
        return -(log_probs * rewards).mean()

    def train_step(self, states, actions, rewards):
        """Run one optimizer step on the given batch.

        Returns:
            The loss value (Python ``float``) computed before the update.
        """
        self.optimizer.zero_grad()
        loss = self.compute_loss(states, actions, rewards)
        loss.backward()
        self.optimizer.step()
        return loss.item()


In [None]:

# Parameters
# -- Reproducibility / environment --
seed = 12345              # passed to env.seed() and torch.manual_seed()
env_name = "CartPole-v0"  # id handed to gym.make()

# -- Training budget --
max_episodes = 500        # number of episodes in the training loop

# -- Model / optimisation --
hidden_size = 64          # width of the policy network's hidden layers
gamma = 0.99              # discount factor for the episode returns
lr = 7e-3                 # Adam learning rate


In [None]:

# Environment
# Instantiate the environment selected in the parameters cell.
env = gym.make(env_name)

# Dimensions the policy network needs: number of discrete actions and the
# length of the first axis of the observation space.
action_size = env.action_space.n
state_size = env.observation_space.shape[0]

# Seed the environment with the configured seed.
env.seed(seed)


In [None]:

# Agent
# Seed PyTorch *before* constructing the agent: the network's initial weights
# are drawn at construction time, so seeding afterwards would leave them (and
# therefore the whole run) non-reproducible.
torch.manual_seed(seed)
agent = PolicyGradient(env, state_size, action_size, hidden_size, lr, gamma)


def _discounted_returns(rewards, gamma):
    """Return the discounted return for every step of one episode.

    For rewards ``r_0 .. r_{T-1}`` the result at index ``t`` is
    ``r_t + gamma * r_{t+1} + gamma**2 * r_{t+2} + ...``.

    Args:
        rewards: Sequence of per-step rewards in chronological order.
        gamma: Discount factor.

    Returns:
        A list with one return per step, in chronological order.
    """
    returns = []
    running = 0.0
    # Accumulate from the last step backwards, appending (O(1)) and reversing
    # once at the end instead of inserting at the front on every step.
    for reward in reversed(rewards):
        running = reward + gamma * running
        returns.append(running)
    returns.reverse()
    return returns


if __name__ == "__main__":
    scores = []
    progress_bar = tqdm(range(max_episodes), desc="Training Progress")
    for episode in progress_bar:
        state = env.reset()
        episode_reward = 0
        done = False

        # Trajectory buffers for the current episode.
        states = []
        actions = []
        rewards = []

        # Roll out one full episode with the current policy.
        while not done:
            action = agent.get_action(state)
            next_state, reward, done, _ = env.step(action)

            states.append(state)
            actions.append(action)
            rewards.append(reward)

            state = next_state
            episode_reward += reward

        # Episode finished: record the score and update the progress bar.
        scores.append(episode_reward)
        progress_bar.set_postfix({'Episode': episode + 1, 'Reward': episode_reward})

        # Weight each step's log-probability by its discounted return. The
        # dtype is pinned to float32 so the tensor matches the model's
        # parameters regardless of the reward type the environment emits.
        discounted_rewards = torch.tensor(
            _discounted_returns(rewards, agent.gamma), dtype=torch.float32
        )

        # One policy update per episode; `loss` is kept for inspection.
        loss = agent.train_step(states, actions, discounted_rewards)
