### 1. Import the Necessary Packages

In [7]:
import gym
import numpy as np
from collections import deque
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

## Define Policy

In [3]:
class Policy(nn.Module):
    """Two-layer fully connected network mapping a state to action probabilities.

    Args:
        s_size: Number of input (state) features.
        h_size: Width of the single hidden layer.
        a_size: Number of discrete actions (size of the output distribution).
    """

    def __init__(self, s_size=4, h_size=16, a_size=2):
        super().__init__()
        self.fc1 = nn.Linear(s_size, h_size)
        self.fc2 = nn.Linear(h_size, a_size)

    def forward(self, x):
        """Return softmax action probabilities for ``x``.

        The softmax is taken over dimension 1 when ``x`` has more than one
        dimension, and over dimension 0 for a single un-batched vector.
        """
        hidden = F.relu(self.fc1(x))
        logits = self.fc2(hidden)
        softmax_dim = 1 if logits.dim() > 1 else 0
        return F.softmax(logits, dim=softmax_dim)

    def act(self, state):
        """Sample an action for a NumPy ``state``.

        Returns:
            A tuple ``(action, log_prob)``: the sampled action as a Python
            number and the log-probability tensor of that action under the
            current policy.
        """
        # Place the input on whichever device the network weights live on.
        target_device = self.fc1.weight.device
        observation = torch.from_numpy(state).float().to(target_device)
        distribution = Categorical(self(observation))
        choice = distribution.sample()
        return choice.item(), distribution.log_prob(choice)

## Run Episodes (one episode at a time, not batched)

In [6]:

def run_episode(policy, env, max_t):
    """Roll out a single episode of ``policy`` in ``env``.

    Args:
        policy: Object whose ``act(state)`` returns ``(action, log_prob)``.
        env: Environment whose ``reset()`` returns the initial state and whose
            ``step(action)`` returns ``(state, reward, done, info)``.
        max_t: Maximum number of steps to take before stopping.

    Returns:
        ``(rewards, saved_log_probs)``: per-step rewards and the matching
        per-step log-probabilities, in the order they were collected.
    """
    rewards, saved_log_probs = [], []
    observation = env.reset()
    for _ in range(max_t):
        action, log_prob = policy.act(observation)
        saved_log_probs.append(log_prob)
        observation, reward, finished, _info = env.step(action)
        rewards.append(reward)
        # Stop as soon as the environment reports the episode has ended.
        if finished:
            break
    return rewards, saved_log_probs

def reinforce(policy, env, n_episodes=1000, max_t=1000, gamma=1.0, print_every=100):
    """Train ``policy`` on ``env`` with a REINFORCE-style policy-gradient update.

    One episode is collected per iteration with ``run_episode``; the policy is
    then updated once with Adam (learning rate 1e-2). Every step's
    log-probability in the episode is weighted by the same discounted return
    of the whole episode.

    Args:
        policy: Module exposing ``parameters()`` and the ``act`` method used
            by ``run_episode``.
        env: Environment passed through to ``run_episode``.
        n_episodes: Maximum number of training episodes.
        max_t: Maximum number of steps per episode.
        gamma: Discount factor applied to the rewards.
        print_every: Print the running average score every this many episodes.

    Returns:
        List with the undiscounted score (sum of rewards) of every episode run.
    """
    optimizer = optim.Adam(policy.parameters(), lr=1e-2)

    scores_deque = deque(maxlen=100)  # sliding window of the most recent scores
    scores = []
    for i_episode in range(1, n_episodes + 1):
        rewards, saved_log_probs = run_episode(policy, env, max_t)
        episode_score = sum(rewards)
        scores_deque.append(episode_score)
        scores.append(episode_score)

        # An episode with no steps (e.g. max_t == 0) has nothing to learn from,
        # and torch.stack would raise on an empty list, so skip the update.
        if saved_log_probs:
            discounts = gamma ** np.arange(len(rewards))
            # Convert to a plain float so the product with the log-prob
            # tensors keeps their dtype.
            R = float(np.sum(discounts * rewards))

            # Minimising -log_prob * R performs gradient ascent on the return.
            policy_loss = (-torch.stack(saved_log_probs, dim=0) * R).sum(dim=0)

            optimizer.zero_grad()
            policy_loss.backward()
            optimizer.step()

        average_score = np.mean(scores_deque)
        if i_episode % print_every == 0:
            print('Episode {}\tAverage Score: {:.2f}'.format(i_episode, average_score))
        # Only judge "solved" on a full window: an average over a handful of
        # early episodes could otherwise stop training prematurely and report a
        # negative episode count below.
        window_full = len(scores_deque) == scores_deque.maxlen
        if window_full and average_score >= 195.0:
            print('Environment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode-100, average_score))
            break

    return scores
    

# Run on the first CUDA device when one is available; otherwise fall back to CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Seed PyTorch's global RNG before the policy is built and before any actions are sampled.
torch.manual_seed(0)
policy = Policy().to(device)

# NOTE(review): `env.seed` and the 4-tuple `env.step` result used by run_episode
# look like the classic Gym API — confirm the installed gym version supports them.
env = gym.make('CartPole-v0')
env.seed(0)

# Train with reinforce()'s default hyperparameters; `scores` holds the per-episode scores it returns.
scores = reinforce(policy, env)

Episode 100	Average Score: 29.59
Episode 200	Average Score: 48.25
Episode 300	Average Score: 52.83
Episode 400	Average Score: 59.80
Episode 500	Average Score: 59.14
Episode 600	Average Score: 58.05
Episode 700	Average Score: 81.31
Episode 800	Average Score: 62.43
Episode 900	Average Score: 57.80
Episode 1000	Average Score: 53.40
