In [1]:
from google.colab import drive

# Mount Google Drive; later cells read and write paths under this mount point.
GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
import sys

### pfrlライブラリのパスへの追加

In [3]:
# Make the PFRL checkout stored on Google Drive importable.
# The membership check keeps this cell idempotent: re-running it does not
# append the same entry to sys.path a second time.
_pfrl_repo_dir = "/content/gdrive/MyDrive/repos/pfrl"
if _pfrl_repo_dir not in sys.path:
    sys.path.append(_pfrl_repo_dir)

### インポート 

In [4]:
import pfrl
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

In [5]:
import cv2
from pathlib import Path

In [6]:
from tqdm.notebook import tqdm

### 描画のための処理

[こちら](https://stackoverflow.com/a/61318224)を参考にした．

In [7]:
# Install Xvfb (a virtual X server) and x11-utils; the virtual display started
# in a later cell relies on Xvfb.
!apt-get install -y xvfb x11-utils

Reading package lists... Done
Building dependency tree       
Reading state information... Done
x11-utils is already the newest version (7.7+3build1).
xvfb is already the newest version (2:1.19.6-1ubuntu4.8).
0 upgraded, 0 newly installed, 0 to remove and 41 not upgraded.


In [8]:
!pip install -q gym[box2d]==0.17.* pyvirtualdisplay==0.2.* PyOpenGL==3.1.* PyOpenGL-accelerate==3.1.*

In [9]:
import pyvirtualdisplay

# Start an invisible virtual display (visible=False is the Xvfb setting) so that
# rendering calls have a display to draw on.
_display = pyvirtualdisplay.Display(size=(1400, 900), visible=False)
# Bind the return value so the notebook does not echo it as cell output.
_ = _display.start()

### 環境の作成 

今回はカートポールの環境を利用する．状態は [カートの位置，カートの速度，ポールの角度，ポールの角速度] の連続値．行動は左右のどちらへ移動するかの離散値．

In [10]:
import gym

In [11]:
# Build the CartPole environment and show its observation and action spaces.
env = gym.make("CartPole-v0")
print(f"observation space: {env.observation_space}")
print(f"action space: {env.action_space}")

observation space: Box(-3.4028234663852886e+38, 3.4028234663852886e+38, (4,), float32)
action space: Discrete(2)


#### 最初の状態を観測 

In [12]:
# Reset the environment and display the first observation it returns.
obs = env.reset()
print(f"initial observation: {obs}")

initial observation: [ 0.0411939   0.01694667  0.00651279 -0.02487047]


#### 適当な行動からObservation, rewardを取得 

In [13]:
# Take one randomly sampled action and show everything env.step() returns.
action = env.action_space.sample()
obs, r, done, info = env.step(action)
for label, value in (
    ("next obserbation:", obs),
    ("reward:", r),
    ("done:", done),
    ("info:", info),
):
    print(label, value)

next obserbation: [ 0.04153283  0.21197462  0.00601538 -0.31549144]
reward: 1.0
done: False
info: {}


### ランダム行動の可視化 

In [14]:
# Record one episode of randomly sampled actions to an MP4 file.
video_path = Path('/content/gdrive/MyDrive/rl_tutorial/movies/pfrl_tutorial_dqn.mp4')
# cv2.VideoWriter does not create missing directories; it just fails to open.
video_path.parent.mkdir(parents=True, exist_ok=True)

fourcc = cv2.VideoWriter_fourcc('m', 'p', '4', 'v')
out = cv2.VideoWriter(str(video_path), fourcc, 10, (600, 400))
if not out.isOpened():
    # Fail loudly instead of silently producing no video.
    raise IOError("could not open video file for writing: {}".format(video_path))

max_episode_len = 200

R = 0  # Return (sum of rewards)
t = 0  # time step
env.reset()

try:
    while True:
        action = env.action_space.sample()
        obs, reward, done, _ = env.step(action)
        R += reward
        t += 1
        # Force the episode to stop once the step cap is reached.
        reset = t == max_episode_len
        # Grab the rendered frame as an array and append it to the video.
        frame = env.render(mode="rgb_array")
        # Reverse the channel axis (RGB -> BGR, the order OpenCV writes).
        out.write(frame[:, :, ::-1])

        if done or reset:
            break
finally:
    # Always release the viewer and finalize the file, even if a step fails.
    env.close()
    out.release()

print("episode length:", t)

episode length: 29


### Q関数の定義 

ここで，最終行の`pfrl.action_value.DiscreteActionValue`とは，`torch.Tensor`をラップするクラスで，エージェント内部でのQ関数の値の扱いを簡単にするために利用する．そのため，Q関数の値によって方策を決定する手法のすべてでQ関数モデルの出力値をラップする必要がある．Q関数を方策の決定に用いない場合は付けるべきでない？

In [15]:
class MyQFunction(nn.Module):
    """Fully connected Q-function with two hidden layers for discrete actions.

    Args:
        obs_size: Number of input features (size of one observation vector).
        n_actions: Number of discrete actions; the network emits one value
            per action.
        n_hidden_channels: Width of both hidden layers. Defaults to 50, the
            value that was previously hard-coded.
    """

    def __init__(self, obs_size, n_actions, n_hidden_channels=50):
        super().__init__()
        self.fc1 = nn.Linear(obs_size, n_hidden_channels)
        self.fc2 = nn.Linear(n_hidden_channels, n_hidden_channels)
        self.fc3 = nn.Linear(n_hidden_channels, n_actions)

    def forward(self, x):
        """Compute action values for a batch of observations.

        Args:
            x: Input tensor whose last dimension is ``obs_size``
                (presumably shaped (batch, obs_size) -- confirm against caller).

        Returns:
            ``pfrl.action_value.DiscreteActionValue`` wrapping the output of
            the final linear layer.
        """
        hidden = F.relu(self.fc1(x))
        hidden = F.relu(self.fc2(hidden))
        q_values = self.fc3(hidden)
        # Wrap the raw tensor in PFRL's action-value container before returning.
        return pfrl.action_value.DiscreteActionValue(q_values)

In [16]:
# Read the network's input and output sizes off the environment's spaces.
obs_size = env.observation_space.low.size
n_actions = env.action_space.n
print(f"observation size: {obs_size}")
print(f"action size: {n_actions}")

# Instantiate the Q-function with matching dimensions.
q_func = MyQFunction(obs_size, n_actions)

observation size: 4
action size: 2


### エージェントの作成 

In [17]:
# Adam over the Q-function's parameters with an explicit eps of 1e-2.
optimizer = torch.optim.Adam(params=q_func.parameters(), eps=1e-2)

In [18]:
# --- Hyperparameters -------------------------------------------------------
gamma = 0.9  # discount factor
gpu = -1  # device id; -1 selects the CPU


def phi(x):
    """Cast an observation array to float32 (no copy if it already is)."""
    return x.astype(np.float32, copy=False)


# --- Agent components ------------------------------------------------------
# Epsilon-greedy exploration with a constant epsilon of 0.3; random actions
# are drawn from the environment's action space.
explorer = pfrl.explorers.ConstantEpsilonGreedy(
    epsilon=0.3,
    random_action_func=env.action_space.sample,
)

# Replay buffer with a capacity of one million entries.
replay_buffer = pfrl.replay_buffers.ReplayBuffer(capacity=1_000_000)

# --- Agent -----------------------------------------------------------------
agent = pfrl.agents.DoubleDQN(
    q_function=q_func,
    optimizer=optimizer,
    replay_buffer=replay_buffer,
    gamma=gamma,
    explorer=explorer,
    replay_start_size=500,
    update_interval=1,
    target_update_interval=100,
    phi=phi,
    gpu=gpu,
)

### 学習のイテレーション 

ループを抜ける判定は `done or reset`（`reset` は指定ステップ数への到達）でも，`reset` のみでもよい．これは `agent.observe` が `done` を受け取ってエピソードの終了を考慮できるからである．ただし `reset` のみで判定すると，エピソードが終了した（`done=True`）後も環境を `step` し続けることになり，環境側から warning が出る．煩わしい場合はその warning を非表示にするなどを検討する．

In [19]:
n_episodes = 1000  # number of training episodes
max_episode_len = 200  # step cap per episode

for episode in tqdm(range(1, n_episodes + 1)):
    obs = env.reset()  # start a fresh episode
    R = 0  # return (sum of rewards) of this episode
    t = 0  # time step within this episode
    done = reset = False
    # The episode ends on `done or reset`. Looping on `reset` alone would
    # also work, but it keeps stepping the environment after `done`, which
    # makes the environment emit a warning.
    while not (done or reset):
        action = agent.act(obs)
        obs, reward, done, _ = env.step(action)
        R += reward
        t += 1
        reset = t == max_episode_len
        agent.observe(obs, reward, done, reset)

    # Periodic progress report: return every 10 episodes, stats every 50.
    if episode % 10 == 0:
        print(f"episode:{episode}, return:{R}")
    if episode % 50 == 0:
        print("statistics:", agent.get_statistics())

print("Finshed")

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

episode:10, return:15.0
episode:20, return:10.0
episode:30, return:12.0
episode:40, return:13.0
episode:50, return:11.0
statistics: [('average_q', 0.5866759), ('average_loss', 0.2298720313631224), ('cumulative_steps', 565), ('n_updates', 66), ('rlen', 565)]
episode:60, return:11.0
episode:70, return:12.0
episode:80, return:14.0
episode:90, return:10.0
episode:100, return:8.0
statistics: [('average_q', 4.683429), ('average_loss', 0.22156397092621774), ('cumulative_steps', 1163), ('n_updates', 664), ('rlen', 1163)]
episode:110, return:12.0
episode:120, return:10.0
episode:130, return:27.0
episode:140, return:21.0
episode:150, return:200.0
statistics: [('average_q', 9.063849), ('average_loss', 0.2922391359321773), ('cumulative_steps', 2904), ('n_updates', 2405), ('rlen', 2904)]
episode:160, return:32.0
episode:170, return:111.0
episode:180, return:62.0
episode:190, return:200.0
episode:200, return:200.0
statistics: [('average_q', 9.922631), ('average_loss', 0.10506204106030054), ('cumulat

### 学習結果の可視化 

In [22]:
# Record one episode of the trained agent, run in evaluation mode, to an MP4 file.
video_path = Path('/content/gdrive/MyDrive/rl_tutorial/movies/pfrl_tutorial_dqn_result.mp4')
# cv2.VideoWriter does not create missing directories; it just fails to open.
video_path.parent.mkdir(parents=True, exist_ok=True)

fourcc = cv2.VideoWriter_fourcc('m', 'p', '4', 'v')
out = cv2.VideoWriter(str(video_path), fourcc, 10, (600, 400))
if not out.isOpened():
    # Fail loudly instead of silently producing no video.
    raise IOError("could not open video file for writing: {}".format(video_path))

max_episode_len = 200

try:
    with agent.eval_mode():
        obs = env.reset()  # start a fresh episode
        R = 0  # Return (sum of rewards)
        t = 0  # time step

        while True:
            # Only actions are queried here; agent.observe() is not called.
            action = agent.act(obs)
            obs, reward, done, _ = env.step(action)
            R += reward
            t += 1
            # Force the episode to stop once the step cap is reached.
            reset = t == max_episode_len
            # Grab the rendered frame as an array and append it to the video.
            frame = env.render(mode="rgb_array")
            # Reverse the channel axis (RGB -> BGR, the order OpenCV writes).
            out.write(frame[:, :, ::-1])

            if done or reset:
                break
finally:
    # Always release the viewer and finalize the file, even if a step fails.
    env.close()
    out.release()

print("episode length:", t)

episode length: 126


### エージェントの保存 

In [21]:
# Save the trained agent to tutorial_agents/pfrl_tutorial_dpn
# (a path relative to the current working directory).
save_path = Path("tutorial_agents", "pfrl_tutorial_dpn")
agent.save(save_path)

### 他のモデルもお試し

#### Rainbow 