# 方策ベースの強化学習1 

In [4]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [5]:
import sys

### pfrlライブラリのパスへの追加 

In [6]:
sys.path.append("/content/gdrive/MyDrive/repos/pfrl")

### インポート 

In [7]:
import pfrl
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

In [8]:
from tqdm.notebook import tqdm
import cv2

### 描画のための処理

[こちら](https://stackoverflow.com/a/61318224)あるいは[こちら](https://stackoverflow.com/a/61318224)を参考にした．

In [9]:
!apt-get install -y xvfb x11-utils
!pip install -q gym[box2d]==0.17.* pyvirtualdisplay==0.2.* PyOpenGL==3.1.* PyOpenGL-accelerate==3.1.*

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  libxxf86dga1
Suggested packages:
  mesa-utils
The following NEW packages will be installed:
  libxxf86dga1 x11-utils xvfb
0 upgraded, 3 newly installed, 0 to remove and 13 not upgraded.
Need to get 993 kB of archives.
After this operation, 2,981 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/main amd64 libxxf86dga1 amd64 2:1.1.4-1 [13.7 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/main amd64 x11-utils amd64 7.7+3build1 [196 kB]
Get:3 http://archive.ubuntu.com/ubuntu bionic-updates/universe amd64 xvfb amd64 2:1.19.6-1ubuntu4.8 [784 kB]
Fetched 993 kB in 1s (1,323 kB/s)
Selecting previously unselected package libxxf86dga1:amd64.
(Reading database ... 146374 files and directories currently installed.)
Preparing to unpack .../libxxf86dga1_2%3a1.1.4-1_amd64.deb ...
Unpacking libxxf86dga1:amd64 (2:

In [10]:
import pyvirtualdisplay
_display = pyvirtualdisplay.Display(visible=False,  # use False with Xvfb
                                    size=(1400, 900))
_ = _display.start()

### 離散行動・確率的方策 

離散行動の環境の例としてカートポールを利用し，方策ベースの学習手法としてREINFORCEを利用する．

In [12]:
import gym

In [10]:
discrete_env = gym.make("CartPole-v0")
print("observation space:", discrete_env.observation_space)
print("action space:", discrete_env.action_space)

observation space: Box(-3.4028234663852886e+38, 3.4028234663852886e+38, (4,), float32)
action space: Discrete(2)


#### 方策モデルの定義 

REINFORCEでは関数近似器として，状態を入力して行動と行動確率（サンプル値とその尤度の値）が取得できる方策モデルがあればよい．そのためモデルは最終出力として`torch.distributions`のオブジェクトを出力する．`pfrl.policies.SoftmaxCategoricalHead`はカテゴリカル分布を表し，ロジットを入力として`torch.distributions`のクラスを出力する`nn.Module`のサブクラスである．ここで注意するのが`pfrl.action_value.ActionValue`はQ関数の最終行に記述したが，モデルがそのオブジェクト自体を返していた．この`pfrl.policies`のheadクラスは通常の`nn.Module`を継承したクラスと同じように，forward内でコールして利用する．しかし，返り値は`torch.distributions`のクラスである．

In [17]:
class PolicyModel(nn.Module):
    def __init__(self, obs_size, n_actions):
        super().__init__()
        self.fc1 = nn.Linear(obs_size, 50)
        self.fc2 = nn.Linear(50, 50)
        self.fc3 = nn.Linear(50, n_actions)
        self.head = pfrl.policies.SoftmaxCategoricalHead()
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        logits = F.softmax(self.fc3(x), dim=1)  # カテゴリ次元をソフトマックス
        out = self.head(logits)
        return out

In [18]:
obs_size = discrete_env.observation_space.low.size
print("observation size:", obs_size)
n_actions = discrete_env.action_space.n
print("action size:",n_actions)
policy_model = PolicyModel(obs_size, n_actions)

observation size: 4
action size: 2


#### エージェントの定義

on-policyの学習になるので，explolerは必要ない．各種ハイパーパラメータはprflのexamplesを参考にした．[参考](https://github.com/pfnet/pfrl/blob/master/examples/gym/train_reinforce_gym.py)

In [19]:
reinforce_opt = torch.optim.Adam(policy_model.parameters(), lr=1.e-3)

gpu = -1

beta = 1e-4

batch_size = 10

phi = lambda x: x.astype(np.float32, copy=False)

reinforce_agent = pfrl.agents.REINFORCE(
    policy_model,
    reinforce_opt,
    gpu=gpu,
    beta=beta,
    batchsize=batch_size,
    max_grad_norm=1.0,
    phi=phi
)

#### 学習のイテレーション 

In [20]:
n_episodes = 1000  # エピソードの回数
max_episode_len = 200
for i in tqdm(range(1, n_episodes + 1)):
    obs = discrete_env.reset()  # 観測のリセット
    R = 0  # Return (sum ofrewards)
    t = 0  # time step
    while True:
        action = reinforce_agent.act(obs)
        obs, reward, done, _ = discrete_env.step(action)
        R += reward
        t += 1
        reset = t == max_episode_len
        reinforce_agent.observe(obs, reward, done, reset)
        if done or reset:
            break
    
    if i%50 == 0:
        print("episode:{}, return:{}".format(i, R))
    if i%100 == 0:
        print("statistics:", reinforce_agent.get_statistics())
        
print("Finshed")

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

episode:50, return:14.0
episode:100, return:20.0
statistics: [('average_entropy', 0.6311372789739743)]
episode:150, return:37.0
episode:200, return:13.0
statistics: [('average_entropy', 0.6840180225643128)]
episode:250, return:13.0
episode:300, return:11.0
statistics: [('average_entropy', 0.6869980144349104)]
episode:350, return:26.0
episode:400, return:22.0
statistics: [('average_entropy', 0.6835209326775368)]
episode:450, return:20.0
episode:500, return:17.0
statistics: [('average_entropy', 0.6765118268849852)]
episode:550, return:33.0
episode:600, return:27.0
statistics: [('average_entropy', 0.6709195391967641)]
episode:650, return:42.0
episode:700, return:30.0
statistics: [('average_entropy', 0.6615873527748659)]
episode:750, return:44.0
episode:800, return:21.0
statistics: [('average_entropy', 0.6564494581582058)]
episode:850, return:31.0
episode:900, return:14.0
statistics: [('average_entropy', 0.6516458360013909)]
episode:950, return:46.0
episode:1000, return:21.0
statistics: [(

Q関数と比べてreturnが安定していない．

#### 学習結果の可視化 

In [22]:
fourcc = cv2.VideoWriter_fourcc('m', 'p', '4', 'v')
out = cv2.VideoWriter('/content/gdrive/MyDrive/rl_tutorial/movies/pfrl_tutorial_reinforce_result.mp4', fourcc, 10, (600, 400))

max_episode_len = 200
with reinforce_agent.eval_mode():
    
    obs = discrete_env.reset()  # 観測のリセット
    R = 0  # Return (sum ofrewards)
    t = 0  # time step
    
    while True:
        action = reinforce_agent.act(obs)
        obs, reward, done, _ = discrete_env.step(action)
        R += reward
        t += 1
        reset = t == max_episode_len
        # 画像として取得，保存
        frame = discrete_env.render(mode="rgb_array")
        out.write(frame[:,:,::-1])       
        
        #reinforce_agent.observe(obs, reward, done, reset)
        if done or reset:
            break
            
discrete_env.close()
out.release()

print("episode length:", t)

episode length: 41


### 連続行動・確率的方策 

連続行動の環境としてペンデュラムを利用し，方策ベースの学習手法としてA2Cを利用する．

In [13]:
concrete_env = gym.make("Pendulum-v0")
print("observation space:", concrete_env.observation_space)
print("action space:", concrete_env.action_space)

observation space: Box(-8.0, 8.0, (3,), float32)
action space: Box(-2.0, 2.0, (1,), float32)


#### 環境の可視化 

In [24]:
concrete_env.reset()
array = concrete_env.render(mode="rgb_array")
concrete_env.close()
array.shape

(500, 500, 3)

In [14]:
fourcc = cv2.VideoWriter_fourcc('m', 'p', '4', 'v')
out = cv2.VideoWriter('/content/gdrive/MyDrive/rl_tutorial/movies/pfrl_tutorial_a2c.mp4', fourcc, 10, (500, 500))

max_episode_len = 200

R = 0  # Return (sum ofrewards)
t = 0  # time step
concrete_env.reset()

while True:
    action = concrete_env.action_space.sample()
    obs, reward, done, _ = concrete_env.step(action)
    R += reward
    t += 1
    reset = t == max_episode_len
    # 画像として取得，保存
    frame = concrete_env.render(mode="rgb_array")
    out.write(frame[:,:,::-1])
    
    if done or reset:
        break
        
concrete_env.close()
out.release()

print("episode length:", t)

episode length: 200


#### モデルの定義 

A2Cのモデルは方策モデルと状態の価値関数が一体化したものである．一般的には状態を入力として途中から，行動空間の次元数の確率変数を出力する`torch.distributions`のクラスのオブジェクトを返すブランチと
状態に対して一つの勝ちを返すブランチの二つに枝分かれさせる．

In [15]:
class PolicyValueModel(nn.Module):
    def __init__(self, obs_size, action_dim, action_low, action_high):
        super().__init__()
        self.fc1 = nn.Linear(obs_size, 32)
        self.bn1 = nn.BatchNorm1d(32)
        
        self.fc2 = nn.Linear(32, 128)
        self.bn2 = nn.BatchNorm1d(128)
        
        self.fc3 = nn.Linear(128, 256)
        self.bn3 = nn.BatchNorm1d(256)
        
        self.br1_fc1 = nn.Linear(256, 50)
        self.br1_bn1 = nn.BatchNorm1d(50)
        
        self.br1_fc2 = nn.Linear(50, action_dim)
        self.br1_bn2 = nn.BatchNorm1d(action_dim)
        self.policy_bound = pfrl.nn.BoundByTanh(action_low, action_high)
        self.policy_head = pfrl.policies.GaussianHeadWithFixedCovariance(0.1)
        
        self.br2_fc1 = nn.Linear(256, 50)
        self.br2_bn1 = nn.BatchNorm1d(50)
        self.value_head = nn.Linear(50, 1)
        
    def forward(self, x):
        x = F.relu(self.bn1(self.fc1(x)))
        x = F.relu(self.bn2(self.fc2(x)))
        branch = F.relu(self.bn3(self.fc3(x)))
        
        policy_x = F.relu(self.br1_bn1(self.br1_fc1(branch)))
        policy_x = self.policy_bound(self.br1_bn2(self.br1_fc2(policy_x)))
        out_policy = self.policy_head(policy_x)
        
        
        value_x = F.relu(self.br2_bn1(self.br2_fc1(branch))) 
        out_value = self.value_head(value_x)  # exampleの実装では最終層の活性化関数は無い
        
        return out_policy, out_value

In [16]:
obs_size = concrete_env.observation_space.low.size
action_dim = concrete_env.action_space.low.size
action_low = concrete_env.action_space.low.item()
action_high = concrete_env.action_space.high.item()

policy_value_model = PolicyValueModel(obs_size, action_dim, action_low, action_high)

#### マルチプロセス用のバッチ環境 

multiprocessingやjoblibでは，マルチプロセスで実行する関数をリストとして渡す．そこでここでも環境の作成を関数で行う．ここでその関数は引数が無いようにすることに注意．そのためランダムシード等を引数で与える場合は，
関数をラップするか`functiools.partial`を利用する．[参考](https://github.com/pfnet/pfrl/blob/master/examples/atari/train_a2c_ale.py)

ここでwindowsでjupyterを使う人にとって致命的なバグがあり，multiprocessingで利用する関数は.pyファイルに書いておきjupyterからimportしなければならない（[参考](https://stackoverflow.com/questions/45719956/python-multiprocessing-in-jupyter-on-windows-attributeerror-cant-get-attribut)）．以下の関数は`tutorial_make_env_ver1.py`に書かれている．<- joblibでラップするべき？

In [30]:
sys.path.append("/content/gdrive/MyDrive/rl_tutorial")
from tutorial_make_env_ver1 import make_concrete_env

In [None]:
"""
def make_concrete_env():
    one_concrete_env = gym.make("Pendulum-v0")
    return one_concrete_env
"""

'\ndef make_concrete_env():\n    one_concrete_env = gym.make("Pendulum-v0")\n    return one_concrete_env\n'

ミニバッチの環境(マルチプロセッシング可能)を作成するには，`pfrl.envs.MultiprocessVectorEnv`クラスを利用する．これは環境のリストをラップし，それぞれをマルチプロセスで動かす．インターフェースとしては既存と同じ`step`と`reset`が利用できるが，返り値は全てミニバッチとして返ってくる．ただし問題なのは，ミニバッチのサイズ(環境の数)がプロセス数と必ず一致することである([参考](https://github.com/pfnet/pfrl/blob/master/pfrl/envs/multiprocess_vector_env.py))．これはミニバッチ数を大きくしたいときに問題となる．もしそれを解決したいなら，`joblib`の`Pararel`等で実装しなおす必要がありそう．

In [34]:
process_number = 64
batch_concrete_env = pfrl.envs.MultiprocessVectorEnv([make_concrete_env for i in range(process_number)])

#### エージェントの定義 

ハイパーパラメータはこちらによる．([参考](https://github.com/pfnet/pfrl/blob/master/examples/atari/train_a2c_ale.py))

In [35]:
#a2c_opt = pfrl.optimizers.RMSpropEpsInsideSqrt(
#        policy_value_model.parameters(),
#        lr=1e-3,
#        eps=1e-5,
#        alpha=0.99,
#    )

a2c_opt =  torch.optim.Adam(policy_value_model.parameters(), lr=3e-4)

gamma = 0.99

update_steps = 5

phi = lambda x: x.astype(np.float32, copy=False)

use_gae = True

tau = 0.95

max_grad_norm = 40

gpu = -1

num_processes = process_number


a2c_agent = pfrl.agents.A2C(
    policy_value_model,
    a2c_opt,
    gamma=gamma,
    gpu=gpu,
    num_processes=process_number,
    update_steps=update_steps,
    phi=phi,
    use_gae=use_gae,
    tau=tau,
    max_grad_norm=max_grad_norm,
)

#### 学習のイテレーション

A2Cでは，ミニバッチごとにact, observeを行うので(obs, reward等が全てミニバッチ)ミニバッチに対応する`batch_act`,`batch_observe`を利用する．今回は少し強引だが，バッチ環境のうち一つでもdoneしても，学習を続行させるプログラムにしている．

In [36]:
n_episodes = 1000  # エピソードの回数
max_episode_len = 200
for i in tqdm(range(1, n_episodes + 1)):
    
    obss = batch_concrete_env.reset()  # 観測のリセット
    R = np.zeros((process_number,))  # Return (sum ofrewards)
    t = 0  # time step
    while True:
        actions = a2c_agent.batch_act(obss)
        obss, rewards, dones, _ = batch_concrete_env.step(actions)
        
        R += rewards
        t += 1
        
        resets = np.array([t == max_episode_len]*process_number)
        a2c_agent.batch_observe(obss, rewards, dones, resets)
        if resets[0]:
            break
    
    if i%50 == 0:
        print("episode:{}, mean_return:{}, t:{}".format(i, R.mean(), t))
    if i%100 == 0:
        print("statistics:", a2c_agent.get_statistics())
print("Finshed")
batch_concrete_env.close()  # バッチで利用する場合は明示的にクローズ

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

  "A2C currently does not support resetting an env without reaching a"


episode:50, mean_return:-1175.7830742616425, t:200
episode:100, mean_return:-1120.450325845581, t:200
statistics: [('average_actor', 458.906357498217), ('average_value', 6930.20006086006), ('average_entropy', -0.8674942965115855)]
episode:150, mean_return:-941.6859109575046, t:200
episode:200, mean_return:-1355.9904992415554, t:200
statistics: [('average_actor', 247.40206476980526), ('average_value', 6061.153092431643), ('average_entropy', -0.8833512419577557)]
episode:250, mean_return:-1105.8471729179723, t:200
episode:300, mean_return:-1097.0844730020835, t:200
statistics: [('average_actor', 605.8405753089817), ('average_value', 7735.061089967709), ('average_entropy', -0.8836410913779691)]
episode:350, mean_return:-1384.2258563084406, t:200
episode:400, mean_return:-1517.232408550611, t:200
statistics: [('average_actor', 542.8257823745596), ('average_value', 7255.166254936951), ('average_entropy', -0.8836463895412688)]
episode:450, mean_return:-1699.341645664927, t:200
episode:500, m

#### 学習結果の可視化

In [1]:
fourcc = cv2.VideoWriter_fourcc('m', 'p', '4', 'v')
out = cv2.VideoWriter('movies/tutorial/pfrl_tutorial_a2c_result.mp4', fourcc, 10, (500, 500))

max_episode_len = 200
with a2c_agent.eval_mode():
    
    obs = concrete_env.reset()  # 観測のリセット
    R = 0  # Return (sum ofrewards)
    t = 0  # time step
    
    while True:
        action = a2c_agent.act(obs)
        obs, reward, done, _ = concrete_env.step(action)
        R += reward
        t += 1
        reset = t == max_episode_len
        # 画像として取得，保存
        frame = concrete_env.render(mode="rgb_array")
        out.write(frame[:,:,::-1])       
        
        #a2c_agent.observe(obs, reward, done, reset)
        if done or reset:
            break
            
concrete_env.close()
out.release()

print("episode length:", t)

NameError: ignored

### 連続行動・決定的方策

連続行動の環境としてペンデュラムを利用し，学習手法としてDDPGを利用する．

#### 方策モデルの定義

In [17]:
class DPolicy(nn.Module):
    def __init__(self, obs_size, action_dim, action_low=-2., action_high=2.):
        super().__init__()
        self.fc1 = nn.Linear(obs_size, 50)
        self.fc2 = nn.Linear(50, 50)
        self.fc3 = nn.Linear(50, action_dim)
        self.policy_bound = pfrl.nn.BoundByTanh(action_low, action_high)
        self.head = pfrl.policies.DeterministicHead()
        
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.policy_bound(self.fc3(x))
        out = self.head(x)
        return out

In [18]:
obs_size = concrete_env.observation_space.low.size
action_dim = concrete_env.action_space.low.size
action_low = concrete_env.action_space.low.item()
action_high = concrete_env.action_space.high.item()

dpolicy_model = DPolicy(obs_size, action_dim, action_low, action_high)

#### 価値関数の定義  

In [19]:
class QFunc(nn.Module):
    def __init__(self, obs_size, action_dim):
        super().__init__()
        self.concat_obs_action = pfrl.nn.ConcatObsAndAction()
        self.fc1 = nn.Linear(obs_size+action_dim, 50)
        self.fc2 = nn.Linear(50, 50)
        self.fc3 = nn.Linear(50, 1)
        
    def forward(self, x):
        x = self.concat_obs_action(x)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)  # 最後は活性化関数は必要は無い
        return x

In [20]:
q_func = QFunc(obs_size, action_dim)

####  エージェントの定義

ハイパーパラメータはこちらを参考とした．([参考](https://github.com/pfnet/pfrl/blob/master/examples/mujoco/reproduction/ddpg/train_ddpg.py))

In [21]:
opt_a = torch.optim.Adam(dpolicy_model.parameters())
opt_c = torch.optim.Adam(q_func.parameters())

rbuf = pfrl.replay_buffers.ReplayBuffer(10 ** 6)

explorer = pfrl.explorers.AdditiveGaussian(
    scale=0.1, low=concrete_env.action_space.low, high=concrete_env.action_space.high
)

def burnin_action_func():
    """Select random actions until model is updated one or more times."""
    return np.random.uniform(concrete_env.action_space.low, concrete_env.action_space.high).astype(np.float32)

gpu = -1

phi = lambda x: x.astype(np.float32, copy=False)

ddpg_agent = pfrl.agents.DDPG(
    dpolicy_model,
    q_func,
    opt_a,
    opt_c,
    rbuf,
    phi=phi,
    gamma=0.99,
    explorer=explorer,
    replay_start_size=10000,
    target_update_method="soft",
    target_update_interval=1,
    update_interval=1,
    soft_update_tau=5e-3,
    n_times_update=1,
    gpu=gpu,
    minibatch_size=100,
    burnin_action_func=burnin_action_func,
)

#### 学習のイテレーション 

In [22]:
n_episodes = 1000  # エピソードの回数
max_episode_len = 200
for i in tqdm(range(1, n_episodes + 1)):
    obs = concrete_env.reset()  # 観測のリセット
    R = 0  # Return (sum ofrewards)
    t = 0  # time step
    while True:
        action = ddpg_agent.act(obs)
        obs, reward, done, _ = concrete_env.step(action)
        R += reward
        t += 1
        reset = t == max_episode_len
        ddpg_agent.observe(obs, reward, done, reset)
        if done or reset:
            break
    
    if i%50 == 0:
        print("episode:{}, return:{}".format(i, R))
    if i%100 == 0:
        print("statistics:", ddpg_agent.get_statistics())
        
print("Finshed")

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

episode:50, return:-1161.9399052654878
episode:100, return:-122.12085454592133
statistics: [('average_q', -77.58938), ('average_actor_loss', 74.69236209869385), ('average_critic_loss', 43.86319679498673), ('n_updates', 10001)]
episode:150, return:-129.42891704137153
episode:200, return:-247.81231782509693
statistics: [('average_q', -6.0514655), ('average_actor_loss', 4.3349128979444504), ('average_critic_loss', 19.149380059242247), ('n_updates', 30001)]
episode:250, return:-120.67450189419914
episode:300, return:-121.61343329644191
statistics: [('average_q', -11.601319), ('average_actor_loss', 13.167333605289459), ('average_critic_loss', 11.36274651169777), ('n_updates', 50001)]
episode:350, return:-122.65157773489214
episode:400, return:-123.05538413803674
statistics: [('average_q', -11.975549), ('average_actor_loss', 12.316498830318451), ('average_critic_loss', 14.24978187441826), ('n_updates', 70001)]
episode:450, return:-140.23478000737543
episode:500, return:-242.18539948868784
st

#### 学習結果の可視化 

In [23]:
fourcc = cv2.VideoWriter_fourcc('m', 'p', '4', 'v')
out = cv2.VideoWriter('movies/tutorial/pfrl_tutorial_ddpg_result.mp4', fourcc, 10, (500, 500))

max_episode_len = 200
with ddpg_agent.eval_mode():
    
    obs = concrete_env.reset()  # 観測のリセット
    R = 0  # Return (sum ofrewards)
    t = 0  # time step
    
    while True:
        action = ddpg_agent.act(obs)
        obs, reward, done, _ = concrete_env.step(action)
        R += reward
        t += 1
        reset = t == max_episode_len
        # 画像として取得，保存
        frame = concrete_env.render(mode="rgb_array")
        out.write(frame[:,:,::-1])       
        
        #a2c_agent.observe(obs, reward, done, reset)
        if done or reset:
            break
            
concrete_env.close()
out.release()

print("episode length:", t)

episode length: 200


a2cよりDDPGの方がうまく学習できている．

今回はa2cで連続行動・確率方策がうまく行かなかったので，次回はsoft actor criticで試してみる．