In [None]:
# %load REINFORCE.py
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

import matplotlib.pyplot as plt
import numpy as np
import time 
from IPython.display import clear_output

In [None]:
# Hyperparameters
learning_rate = 0.0002   # step size handed to the Adam optimizer in Policy.__init__
gamma         = 0.98     # discount factor (the gamma of the return formula in section 1.2)

# I. Define Policy
## 1.1 init

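* Policy parameterization: softmax over action preferences
* The action preferences $h(a,s,\theta)$ are computed by an artificial neural network (ANN):
  * 1 hidden layer of 128 neurons
  * Activation function: ReLU
  * Output layer: softmax (i.e. the softmax over the action preferences!)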

## 1.2 train_net: update rule for $\theta$
$$ G \leftarrow \sum^T_{k=t+1} \gamma ^{k-t-1}R_k$$
$$ \theta \leftarrow \theta + \alpha \gamma ^t G \nabla \ln \pi (A_t|S_t, \theta) $$

In [None]:
class Policy(nn.Module):
    """Softmax policy network trained with the REINFORCE (Monte-Carlo policy gradient) rule.

    The network has one hidden ReLU layer and a softmax output, so ``forward``
    returns one probability per action. Steps of the current episode are
    buffered with ``put_data`` and consumed by ``train_net``.

    Args:
        n_obs: size of the observation vector (default 4, the size used for
            CartPole in this notebook).
        n_hidden: number of hidden units (default 128).
        n_actions: number of discrete actions (default 2).

    Note:
        Reads the notebook-level globals ``learning_rate`` (in ``__init__``)
        and ``gamma`` (in ``train_net``).
    """

    def __init__(self, n_obs=4, n_hidden=128, n_actions=2):
        super(Policy, self).__init__()
        self.data = []     # data of 1 entire episode: (reward, prob of the taken action) per step

        # I. ================================
        # 1.1 Initial Policy network (1)
        # The layers must be registered before the optimizer is built:
        # Adam refuses an empty parameter list.
        self.fc1 = nn.Linear(n_obs, n_hidden)
        self.fc2 = nn.Linear(n_hidden, n_actions)
        # ===================================

        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)

    def forward(self, x):
        """Map an observation tensor to a vector of action probabilities."""
        # I. ================================
        # 1.1 Initial Policy network (2)
        x = F.relu(self.fc1(x))
        # dim=-1 normalises over the action axis for a single observation
        # as well as for a batch of observations.
        x = F.softmax(self.fc2(x), dim=-1)
        # ===================================

        return x

    def put_data(self, item):     # adding data of 1 step
        """Buffer one step of the episode as ``(reward, prob_of_taken_action)``."""
        self.data.append(item)

    def train_net(self):
        """Run one REINFORCE update from the buffered episode, then clear the buffer.

        Does nothing when no step has been recorded.
        """
        self.optimizer.zero_grad()
        G = 0

        if not self.data:
            # Empty episode: there is no loss to back-propagate.
            return

        # III. ==============================
        # 3. Loop for each step of the episode, update theta
        # Walking the episode backwards gives the discounted return
        # incrementally: G_t = r_{t+1} + gamma * G_{t+1}.
        step_losses = []
        for r, prob in reversed(self.data):
            G = r + gamma * G
            # Gradient ascent on G * ln(pi) == gradient descent on -G * ln(pi).
            # The gamma^t factor of the textbook rule is left out, a common
            # simplification in practice.
            step_losses.append(-torch.log(prob) * G)
        # One backward pass over the summed loss yields the same gradients as
        # back-propagating every step separately.
        torch.stack(step_losses).sum().backward()
        # ===================================

        self.optimizer.step()
        self.data = []


In [None]:
# check output of policy network: should be a tensor with 2 entries
# (presumably one probability per CartPole action)
env = gym.make('CartPole-v1')
s = env.reset()[0]   # reset() returns a tuple; its first element is the initial observation
pi = Policy()
pi(torch.Tensor(s))

In [None]:
# Example action probabilities, shaped like the policy network's output.
output = torch.tensor([0.4690, 0.5310])
# A bare expression in the middle of a cell displays nothing, so print it explicitly.
print(output)

# introduce func: how to sample action
# Categorical(probs) is a discrete distribution over the indices 0..len(probs)-1.
m = Categorical(output)
print(m)
# sample() returns the drawn index as a 0-dim tensor.
a = m.sample()
print(a)

In [None]:
# get action a, from tensor a:
# .item() converts the single-element tensor returned by m.sample() into a plain Python number
a.item()

# II. Sample, Observe cartpole
1 episode

In [None]:
# Fresh environment and a freshly initialised policy for the single-episode demo below.
env = gym.make('CartPole-v1')
pi = Policy()

In [None]:
# how to sample next state
# A freshly created environment has to be reset before its first step.
s, _ = env.reset()
# step() returns (observation, reward, terminated, truncated, info).
s_prime, r, terminated, truncated, _ = env.step(0)
# The episode is over when either flag is set.
done = terminated or truncated

In [None]:
s, _ = env.reset()
done = False
t = 0   # number of environment steps completed so far

while not done:
    # 2.0 show
    # Gym documents the CartPole observation as
    # [cart position, cart velocity, pole angle (rad), pole angular velocity],
    # so the pole is drawn from s[2]; s[1] is the cart velocity.
    cart_x, pole_angle = s[0], s[2]
    plt.plot([-5, 5], [0,0], c='black')
    plt.scatter([-2.4, 2.4], [0,0], c='black')
    plt.scatter(cart_x, 0, marker='^', c='#FF4500', s = 400)
    # Pole tip at height 8, shifted sideways by the tilt measured from vertical.
    plt.plot([cart_x, cart_x + 8*np.tan(pole_angle)], [0,8])
    plt.xlim(-5,5)
    plt.ylim(-1,10)
    clear_output(True)
    plt.show()
    plt.pause(0.005)

    # II. ===============================
    # 2.1 Sample action, according to pi
    with torch.no_grad():   # pure rollout: nothing is trained in this cell
        prob = pi(torch.from_numpy(s).float())
    a = Categorical(prob).sample()

    # 2.2 observe next step
    s_prime, r, terminated, truncated, _ = env.step(a.item())
    # Either flag ends the episode; without this update the loop never stops.
    done = terminated or truncated
    s = s_prime
    # ===================================
    
    if done:
        print("Episode finished after {} timesteps".format(t+1))
    t += 1

# III. Sample, and train

In [None]:
# Fresh environment, policy and bookkeeping for the training run.
env = gym.make('CartPole-v1')
pi = Policy()
score = 0.0            # reward accumulator; zeroed after each progress printout
print_interval = 20    # number of episodes between progress printouts
li_score = []          # score history, appended to once per episode and plotted at the end

In [None]:
for n_epi in range(10000):
    s, _ = env.reset()
    done = False
    episode_score = 0.0   # reward collected in this episode only

    while not done: # CartPole-v1 is cut off (truncated) after 500 steps.

        # II. ===============================
        # 2.1 Sample action, according to pi
        prob = pi(torch.from_numpy(s).float())
        a = Categorical(prob).sample()

        # 2.2 observe next step,
        # and Record data of 1 step with "pi.put_data", putting (r, prob[a])
        s_prime, r, terminated, truncated, _ = env.step(a.item())
        # Either flag ends the episode; checking only `terminated` would miss
        # the time-limit cut-off.
        done = terminated or truncated
        # prob[a] stays attached to the graph so train_net can back-propagate through it.
        pi.put_data((r, prob[a]))
        s = s_prime
        episode_score += r
        # ===================================

    # 3. Train net
    pi.train_net()

    # Per-episode score goes into the history; `score` only feeds the printed average.
    score += episode_score
    li_score.append(episode_score)

    # Report after every full block of `print_interval` episodes, so the sum
    # being averaged always covers exactly `print_interval` episodes.
    if (n_epi + 1) % print_interval == 0:
        print("# of episode :{}, avg score : {}".format(n_epi + 1, score/print_interval))
        score = 0.0
env.close()

In [None]:
# Learning curve: one point per entry appended to li_score by the training loop.
plt.plot(li_score)
plt.xlabel('Episode')
plt.ylabel('Score')
plt.title('REINFORCE on CartPole-v1')
plt.show()   # also keeps the Line2D repr out of the cell output