In [439]:
import gym
import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple
from cmath import isnan
from PIL import Image

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T

env = gym.make('CartPole-v1', render_mode='rgb_array').unwrapped

# matplotlibの設定
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
  from IPython import display

plt.ion()

# gpuが使用される場合の設定
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

torch.autograd.set_detect_anomaly(True)

<torch.autograd.anomaly_mode.set_detect_anomaly at 0x7f9d2f73d6a0>

In [440]:
Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward', 'done'))

class ReplayMemory(object):
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0
    
    def push(self, *args):
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Transition(*args)
        self.position = (self.position + 1) % self.capacity # サイクルバッファ
    
    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)

In [441]:
class PolicyNetwork(nn.Module):
    def __init__(self, state_dim, action_dim, lr=0.003):
        super(PolicyNetwork, self).__init__()
        self.layer1 = nn.Linear(state_dim, 64)
        self.layer2 = nn.Linear(64, 64)
        self.layer3 = nn.Linear(64, 64)
        self.pi_mean = nn.Linear(64, action_dim)
        self.pi_stddev = nn.Linear(64, action_dim)

        self.optimizer = optim.Adam(self.parameters(), lr=lr)

    def forward(self, x):
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        x = F.relu(self.layer3(x))
        mean = self.pi_mean(x)
        stddev = self.pi_stddev(x)

        stddev = torch.exp(stddev)

        return mean, stddev

In [442]:
class DualQNetwork(nn.Module):
    def __init__(self, state_dim, action_dim, lr=0.003):
        super(DualQNetwork, self).__init__()
        # QNetwork 1
        self.layer1 = nn.Linear(state_dim + action_dim, 64)
        self.layer2 = nn.Linear(64, 64)
        self.layer3 = nn.Linear(64, 64)
        self.q1 = nn.Linear(64, 1)
        # QNetwork 2
        self.layer4 = nn.Linear(state_dim + action_dim, 64)
        self.layer5 = nn.Linear(64, 64)
        self.layer6 = nn.Linear(64, 64)
        self.q2 = nn.Linear(64, 1)

        self.optimizer = optim.Adam(self.parameters(), lr=lr)

    def forward(self, s, a):
        x = torch.cat((s, a), 1) # combination s and a
        # QNetwork 1
        x1 = F.relu(self.layer1(x))
        x1 = F.relu(self.layer2(x1))
        x1 = F.relu(self.layer3(x1))
        x1 = self.q1(x1)
        # QNetwork 2
        x2 = F.relu(self.layer4(x))
        x2 = F.relu(self.layer5(x2))
        x2 = F.relu(self.layer6(x2))
        x2 = self.q2(x2)

        return x1, x2


In [443]:
class SAC():
    def __init__(self, state_space, action_space, buffer_size, gamma, soft_target_tau, hard_target_interval, 
                 target_entropy, policy_lr, q_lr, alpha_lr):
        super(SAC, self).__init__()

        self.state_dim = state_space.shape[0]
        self.action_dim = action_space.shape[0]

        # Envアクション用にスケールする
        self.action_center = torch.FloatTensor((action_space.high + action_space.low) / 2)
        self.action_scale = torch.FloatTensor(action_space.high - self.action_center.detach().numpy())

        # Neural Networks
        self.policy_net = PolicyNetwork(self.state_dim, self.action_dim, policy_lr)
        
        self.q_net = DualQNetwork(self.state_dim, self.action_dim, q_lr)
        self.target_q_net = DualQNetwork(self.state_dim, self.action_dim, q_lr)

        for target_param, param in zip(self.target_q_net.parameters(), self.q_net.parameters()):
            target_param.data.copy_(param.data)

        self.replay_memory = ReplayMemory(buffer_size)

        
        self.target_entropy = -self.action_dim
        self.log_alpha = torch.zeros(1, requires_grad=True)
        self.alpha_optimizer = optim.Adam([self.log_alpha], lr=alpha_lr)

        # Hyper Parameters
        self.gamma = gamma
        self.soft_target_tau = soft_target_tau
        self.target_entropy = target_entropy

    def sample_action(self, state):
        mean, stddev = self.policy_net(state)

        # Reparameterization
        normal_random = torch.normal(0, 1, size=mean.shape)
        action_org = mean + stddev * normal_random

        # Squashed Gaussian Policy
        action = torch.tanh(action_org)

        return action, mean, stddev, action_org

    def scaled_sample_action(self, state):
        action, _, _, _ = self.sample_action(state)
        env_action = action * self.action_scale + self.action_center

        return env_action, action

    # 正規分布でのactionの対数確率密度関数logμ(a|s)
    def compute_logpi(self, mean, stddev, action):
        a1 = -0.5 * np.log(2*np.pi)
        a2 = -torch.log(stddev)
        a3 = -0.5 * (((action - mean) / stddev) ** 2)
        return a1 + a2 + a3

    # tanhで変換されたactionのlogπ(a|s)をaction_orgを使って計算
    def compute_logpi_sgp(self, mean, stddev, action_org):
        logmu = self.compute_logpi(mean, stddev, action_org)
        tmp = 1 - torch.tanh(action_org) ** 2
        tmp = torch.clip(tmp, 1e-10, 1.0)  # log(0)回避
        logpi = logmu - torch.sum(torch.log(tmp), 1, keepdim=True)
        return logpi


    def update(self, batch_size, q_net_sync=False):
        # 経験をバッチでサンプリング
        transitions = self.replay_memory.sample(batch_size)
        batch = Transition(*zip(*transitions))
        
        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        n_state_batch = torch.cat(batch.next_state)
        reward_batch = torch.cat(batch.reward)
        done_batch = torch.cat(batch.done)

        alpha = torch.exp(self.log_alpha)
        
        # Q(s,a)の推定値を計算し, Q値の損失関数を計算
        with torch.no_grad():
            n_action, n_mean, n_stddev, n_action_org = self.sample_action(n_state_batch)
            
            n_logpi = self.compute_logpi_sgp(n_mean, n_stddev, n_action_org)
            n_q1, n_q2 = self.target_q_net(n_state_batch, n_action)
          
            q_est = reward_batch + (1 - done_batch) * self.gamma * torch.minimum(n_q1, n_q2) - (alpha * n_logpi)
        q1, q2 = self.q_net(state_batch, action_batch)
        q1_loss = F.mse_loss(q1.float(), q_est.float())
        q2_loss = F.mse_loss(q2.float(), q_est.float())
        q_loss = q1_loss + q2_loss
        
        # q_lossからQNetworkを学習
        self.q_net.optimizer.zero_grad()
        print("aaaaaaaaa", q_loss.backward())
        self.q_net.optimizer.step()

        # 方策の損失関数を計算
        action, mean, stddev, action_org = self.sample_action(state_batch) # 現在の方策π(θ)で選ばれるactionについて評価     
        logpi = self.compute_logpi_sgp(mean, stddev, action_org)
        q1, q2 = self.q_net(state_batch, action)
        q_min = torch.minimum(q1, q2)
        policy_loss =  (-q_min + alpha.detach() * logpi).mean()

        # policy_lossからPolicyNetworkを学習
        self.policy_net.optimizer.zero_grad()
        policy_loss.backward()
        self.policy_net.optimizer.step()

        # αの自動調整
        alpha_loss = -(self.log_alpha * (logpi + self.target_entropy).detach()).mean()
        self.alpha_optimizer.zero_grad()
        alpha_loss.backward()
        self.alpha_optimizer.step()

        # ソフトターゲットで更新
        for target_param, param in zip(self.target_q_net.parameters(), self.q_net.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.soft_target_tau) + param.data * self.soft_target_tau)

        # q_net_syncフラグが有効ならq_netを同期させる
        if q_net_sync:
            for target_param, param in zip(self.target_q_net.parameters(), self.q_net.parameters()):
                target_param.data.copy_(param.data)

        return policy_loss, q_loss

In [444]:
env = gym.make('Pendulum-v1')

# ハイパーパラメータ
buffer_size = 1000  # Experienceのキュー容量
warmup_size = 500  # 学習するかどうかのExperienceの最低限の容量
train_interval = 10  # 学習する制御周期間隔
batch_size = 32  # バッチサイズ
gamma = 0.9  # 割引率
soft_target_tau = 0.02  # Soft TargetでTargetに近づく割合
hard_target_interval = 100  # Hard Targetで同期する間隔
lr = 0.003
# エントロピーαの目標値: -1xアクション数がいいらしい
target_entropy = -1 * env.action_space.shape[0]

sac = SAC(env.observation_space, env.action_space, buffer_size, gamma, soft_target_tau, hard_target_interval,
            target_entropy, lr, lr, lr)

step_count = 0
train_count = 0

# 記録用
history_rewards = []
history_metrics = []
history_metrics_y = []


# 学習ループ
for episode in range(500):
    state, _ = env.reset()
    done = False
    total_reward = 0
    step = 0

    metrics_list = []

    # １エピソード
    while not done:
        # アクションを決定
        env_action, action = sac.scaled_sample_action(torch.FloatTensor(state).unsqueeze(0))
        if isnan(env_action[0]):
            print("action is NaN. 学習失敗.")
            break
        # print("state:", state, "action:", action)

        n_state, reward, terminated, truncated, _ = env.step(env_action.detach().numpy()[0])
        n_state = np.asarray(n_state)
        step += 1
        total_reward += reward
        done = terminated or truncated

        sac.replay_memory.push(
            torch.tensor(state).reshape(1, -1),
            action,
            torch.tensor(n_state).reshape(1, -1),
            torch.tensor(reward).reshape(1, -1), 
            torch.tensor(done).reshape(1, -1).int())

        state = n_state

        # train_interval毎に, warmup貯まっていたら学習する
        if len(sac.replay_memory) >= warmup_size and step_count % train_interval == 0:
            q_net_sync = False
            if train_count % hard_target_interval == 0:
                q_net_sync = True
            # モデルの更新
            metrics = sac.update(
                batch_size,
                q_net_sync)
            train_count += 1
            metrics_list.append(metrics)
        step_count += 1

    # 報酬
    history_rewards.append(total_reward)

    # メトリクス
    if len(metrics_list) > 0:
        history_metrics.append(np.mean(metrics_list, axis=0))  # 平均を保存
        history_metrics_y.append(episode)

    #--- print
    interval = 20
    if episode % interval == 0:
        print("{} (min,ave,max)reward {:.1f} {:.1f} {:.1f}, alpha={:.3f}".format(
            episode,
            min(history_rewards[-interval:]),
            np.mean(history_rewards[-interval:]),
            max(history_rewards[-interval:]),
            torch.exp(sac.log_alpha).detach().numpy()[0],
        ))

env.close()


0 (min,ave,max)reward -767.3 -767.3 -767.3, alpha=1.000
aaaaaaaaa None


RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [64, 1]], which is output 0 of AsStridedBackward0, is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!