In [2]:
import time
import warnings
# Suppress every warning for the rest of the session (no category/module filter is given).
# NOTE(review): this also hides PyTorch's own UserWarnings raised by the cells below —
# consider narrowing the filter while debugging.
warnings.filterwarnings('ignore')

In [13]:
# Wall-clock start of this cell; the elapsed time is printed after main() returns.
start = time.time()


# https://github.com/seungeunrho/minimalRL

#PPO-LSTM
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
import time
import numpy as np

# Hyperparameters (read as module-level globals by PPO and main)
learning_rate = 0.0005  # step size handed to the Adam optimizer
gamma = 0.98  # discount factor in the TD target and the advantage recursion
lmbda = 0.95  # decay applied together with gamma in the advantage recursion
eps_clip = 0.1  # half-width of the clipping interval around a probability ratio of 1
K_epoch = 3  # optimisation passes over each collected batch
T_horizon = 20  # maximum environment steps collected before each update

class PPO(nn.Module):
    """PPO actor-critic with a shared LSTM trunk and a policy head that also
    sees the previous step's probability of action 0.

    The module doubles as its own rollout buffer (``self.data``) and owns its
    optimizer. It reads the module-level hyperparameters ``learning_rate``,
    ``gamma``, ``lmbda``, ``eps_clip`` and ``K_epoch``.
    """

    def __init__(self):
        super(PPO, self).__init__()
        self.data = []  # transitions queued by put_data(), drained by make_batch()

        self.fc1   = nn.Linear(4,64)
        self.lstm  = nn.LSTM(64,32)
        self.fc_pi = nn.Linear(32,2)
        # Second policy layer: 2 outputs of fc_pi + 1 previous-step probability -> 2 actions.
        self.fc_pi2   = nn.Linear(3,2)
        # NOTE: fc_pi3 is not used by pi() or v(). It is kept so that the set of
        # parameters (and the order in which layers are initialised) is unchanged.
        self.fc_pi3 = nn.Linear(2,2)

        self.fc_v  = nn.Linear(32,1)
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)

    def pi(self, x, hidden, prob_pre_action_0, prob_pre_action_1):
        """Return the action probabilities and the new LSTM state.

        Args:
            x: observation(s) with 4 features each; reshaped to (N, 1, 64)
                after ``fc1``, i.e. fed to the LSTM as a length-N sequence
                with batch size 1.
            hidden: ``(h, c)`` LSTM state to start from.
            prob_pre_action_0: previous-step probability of action 0, either a
                scalar (rollout) or one value per step (training).
            prob_pre_action_1: accepted for interface compatibility; currently
                not used by the network.

        Returns:
            ``(prob, lstm_hidden)`` where ``prob`` has shape (N, 1, 2) and is
            normalised over the last dimension.
        """
        x = F.relu(self.fc1(x))
        x = x.view(-1, 1, 64)
        x, lstm_hidden = self.lstm(x, hidden)
        x = self.fc_pi(x)

        # The previous probability is a plain input feature: accept floats or
        # tensors alike and make sure no gradient flows through it.
        prev_prob = torch.as_tensor(prob_pre_action_0, dtype=torch.float).detach().view(-1, 1, 1)
        prob = torch.cat([x, prev_prob], -1)
        prob = F.softmax(self.fc_pi2(prob), dim=2)

        return prob, lstm_hidden

    def v(self, x, hidden):
        """Return the state-value estimate(s), shape (N, 1, 1), for ``x`` starting from ``hidden``."""
        x = F.relu(self.fc1(x))
        x = x.view(-1, 1, 64)
        x, lstm_hidden = self.lstm(x, hidden)
        v = self.fc_v(x)
        return v

    def put_data(self, transition):
        """Append one transition tuple to the rollout buffer."""
        self.data.append(transition)

    @staticmethod
    def _to_float(value):
        """Return ``value`` as a Python float, detaching it first if it is a tensor."""
        if torch.is_tensor(value):
            return value.detach().item()
        return float(value)

    def make_batch(self):
        """Convert the buffered transitions into tensors and empty the buffer.

        Each transition is ``(s, a, r, s_prime, prob_a, h_in, h_out, done,
        prob_pre_action_0, prob_pre_action_1)``.

        Returns:
            ``(s, a, r, s_prime, done_mask, prob_a, h_in, h_out,
            prob_pre_action_0, prob_pre_action_1)`` where ``h_in`` / ``h_out``
            are the LSTM states of the FIRST transition only and ``done_mask``
            is 0.0 for terminal steps and 1.0 otherwise.

        Raises:
            IndexError: if the buffer is empty.
        """
        s_lst, a_lst, r_lst, s_prime_lst, prob_a_lst = [], [], [], [], []
        h_in_lst, h_out_lst, done_lst = [], [], []
        prob_pre_action_0_lst, prob_pre_action_1_lst = [], []

        for transition in self.data:
            s, a, r, s_prime, prob_a, h_in, h_out, done, prob_pre_action_0, prob_pre_action_1 = transition
            s_lst.append(s)
            a_lst.append([a])
            r_lst.append([r])
            s_prime_lst.append(s_prime)
            prob_a_lst.append([prob_a])
            h_in_lst.append(h_in)
            h_out_lst.append(h_out)
            done_lst.append([0 if done else 1])
            # Callers may hand over Python floats or 0-dim tensors (possibly
            # still attached to an autograd graph); store plain floats.
            prob_pre_action_0_lst.append(self._to_float(prob_pre_action_0))
            prob_pre_action_1_lst.append(self._to_float(prob_pre_action_1))

        # Stack observations with numpy first: building a tensor directly from
        # a list of ndarrays goes through a slow element-wise path.
        s = torch.tensor(np.array(s_lst), dtype=torch.float)
        s_prime = torch.tensor(np.array(s_prime_lst), dtype=torch.float)
        a = torch.tensor(a_lst)
        r = torch.tensor(r_lst, dtype=torch.float)
        done_mask = torch.tensor(done_lst, dtype=torch.float)
        prob_a = torch.tensor(prob_a_lst, dtype=torch.float)
        prob_pre_action_0 = torch.tensor(prob_pre_action_0_lst, dtype=torch.float)
        prob_pre_action_1 = torch.tensor(prob_pre_action_1_lst, dtype=torch.float)

        self.data = []
        return s,a,r,s_prime, done_mask, prob_a, h_in_lst[0], h_out_lst[0], prob_pre_action_0, prob_pre_action_1

    def train_net(self):
        """Run ``K_epoch`` clipped-surrogate PPO updates on the buffered rollout.

        Does nothing when the buffer is empty.
        """
        if not self.data:
            return

        s,a,r,s_prime,done_mask, prob_a, (h1_in, h2_in), (h1_out, h2_out), prob_pre_action_0, prob_pre_action_1 = self.make_batch()
        first_hidden  = (h1_in.detach(), h2_in.detach())
        second_hidden = (h1_out.detach(), h2_out.detach())

        for _ in range(K_epoch):
            v_prime = self.v(s_prime, second_hidden).squeeze(1)
            td_target = r + gamma * v_prime * done_mask
            v_s = self.v(s, first_hidden).squeeze(1)
            delta = td_target - v_s
            delta = delta.detach().numpy()

            # Discounted backward accumulation of the TD errors (GAE-style).
            advantage_lst = []
            advantage = 0.0
            for item in delta[::-1]:
                advantage = gamma * lmbda * advantage + item[0]
                advantage_lst.append([advantage])
            advantage_lst.reverse()
            advantage = torch.tensor(advantage_lst, dtype=torch.float)

            # The sequence is replayed from first_hidden, presumably because that
            # is the state the policy network started from when the batch was collected.
            pi, _ = self.pi(s, first_hidden, prob_pre_action_0, prob_pre_action_1)
            pi_a = pi.squeeze(1).gather(1,a)
            ratio = torch.exp(torch.log(pi_a) - torch.log(prob_a))  # a/b == exp(log(a) - log(b))

            surr1 = ratio * advantage
            surr2 = torch.clamp(ratio, 1-eps_clip, 1+eps_clip) * advantage
            loss = -torch.min(surr1, surr2) + F.smooth_l1_loss(v_s, td_target.detach())

            self.optimizer.zero_grad()
            # The graph is rebuilt from detached hidden states on every epoch,
            # so there is nothing to retain between backward passes.
            loss.mean().backward()
            self.optimizer.step()

def main(num_episodes=800, print_interval=20):
    """Train the PPO-LSTM agent on CartPole-v1 and print running average scores.

    Args:
        num_episodes: number of episodes to run (default 800, as before).
        print_interval: report the average score every this many episodes.

    The loop collects up to ``T_horizon`` steps, trains on them, and repeats
    until the episode ends. Both the 4-tuple and the 5-tuple ``env.step``
    return conventions are accepted, as is a ``reset`` that returns
    ``(observation, info)``.
    """
    env = gym.make('CartPole-v1')
    model = PPO()
    score = 0.0

    try:
        for n_epi in range(num_episodes):
            h_out = (torch.zeros([1, 1, 32], dtype=torch.float), torch.zeros([1, 1, 32], dtype=torch.float))
            s = env.reset()
            if isinstance(s, tuple):  # newer gym: reset() -> (observation, info)
                s = s[0]
            done = False

            # No previous policy output exists on the first step: start uniform.
            prob_pre_action_0 = 0.5
            prob_pre_action_1 = 0.5

            while not done:
                for _ in range(T_horizon):
                    h_in = h_out
                    # Acting needs no gradients; without no_grad the LSTM state
                    # would drag an ever-growing autograd graph through the episode.
                    with torch.no_grad():
                        prob, h_out = model.pi(torch.from_numpy(s).float(), h_in, prob_pre_action_0, prob_pre_action_1)
                    prob = prob.view(-1)
                    a = Categorical(prob).sample().item()

                    step_result = env.step(a)
                    if len(step_result) == 5:
                        # newer gym: (obs, reward, terminated, truncated, info)
                        s_prime, r, terminated, truncated, _ = step_result
                        done = terminated or truncated
                    else:
                        # classic gym: (obs, reward, done, info)
                        s_prime, r, done, _ = step_result

                    model.put_data((s, a, r/100.0, s_prime, prob[a].item(), h_in, h_out, done, prob_pre_action_0, prob_pre_action_1))

                    s = s_prime
                    # Keep plain floats so stored transitions hold no tensors.
                    prob_pre_action_0 = prob[0].item()
                    prob_pre_action_1 = prob[1].item()

                    score += r
                    if done:
                        break
                model.train_net()

            # Report after every full group of print_interval episodes, so the
            # divisor matches the number of episodes actually accumulated.
            if (n_epi + 1) % print_interval == 0:
                print("# of episode :{}, avg score : {:.1f}".format(n_epi + 1, score/print_interval))
                score = 0.0
    finally:
        env.close()

# Run the training loop when executed as a script / notebook cell.
if __name__ == '__main__':
    main()

# Report the wall-clock time elapsed since `start` was recorded.
end = time.time()
elapsed = end - start
print(f"{elapsed:.5f} sec")

# of episode :20, avg score : 28.9
# of episode :40, avg score : 26.2
# of episode :60, avg score : 30.9
# of episode :80, avg score : 112.5
# of episode :100, avg score : 116.8
# of episode :120, avg score : 110.9
# of episode :140, avg score : 181.8
# of episode :160, avg score : 103.4
# of episode :180, avg score : 288.1
# of episode :200, avg score : 329.4
# of episode :220, avg score : 290.0
# of episode :240, avg score : 190.1
# of episode :260, avg score : 320.1
# of episode :280, avg score : 457.8
# of episode :300, avg score : 285.8
# of episode :320, avg score : 148.9
# of episode :340, avg score : 282.1
# of episode :360, avg score : 432.6
# of episode :380, avg score : 416.8
# of episode :400, avg score : 410.6
# of episode :420, avg score : 272.7
# of episode :440, avg score : 222.7
# of episode :460, avg score : 306.0
# of episode :480, avg score : 326.6
# of episode :500, avg score : 319.0
# of episode :520, avg score : 325.4
# of episode :540, avg score : 289.5
# of epi

In [14]:
# Wall-clock start of this cell; the elapsed time is printed after main() returns.
start = time.time()

#PPO-LSTM
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
import time
import numpy as np

# Hyperparameters (read as module-level globals by PPO and main)
learning_rate = 0.0005  # step size handed to the Adam optimizer
gamma = 0.98  # discount factor in the TD target and the advantage recursion
lmbda = 0.95  # decay applied together with gamma in the advantage recursion
eps_clip = 0.1  # half-width of the clipping interval around a probability ratio of 1
K_epoch = 3  # optimisation passes over each collected batch
T_horizon = 20  # maximum environment steps collected before each update

class PPO(nn.Module):
    """PPO actor-critic with a shared LSTM trunk.

    The module doubles as its own rollout buffer (``self.data``) and owns its
    optimizer. It reads the module-level hyperparameters ``learning_rate``,
    ``gamma``, ``lmbda``, ``eps_clip`` and ``K_epoch``.
    """

    def __init__(self):
        super(PPO, self).__init__()
        self.data = []  # transitions queued by put_data(), drained by make_batch()

        self.fc1   = nn.Linear(4,64)
        self.lstm  = nn.LSTM(64,32)
        self.fc_pi = nn.Linear(32,2)
        self.fc_v  = nn.Linear(32,1)
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)

    def pi(self, x, hidden):
        """Return the action probabilities and the new LSTM state.

        Args:
            x: observation(s) with 4 features each; reshaped to (N, 1, 64)
                after ``fc1``, i.e. fed to the LSTM as a length-N sequence
                with batch size 1.
            hidden: ``(h, c)`` LSTM state to start from.

        Returns:
            ``(prob, lstm_hidden)`` where ``prob`` has shape (N, 1, 2) and is
            normalised over the last dimension.
        """
        x = F.relu(self.fc1(x))
        x = x.view(-1, 1, 64)
        x, lstm_hidden = self.lstm(x, hidden)
        x = self.fc_pi(x)
        prob = F.softmax(x, dim=2)
        return prob, lstm_hidden

    def v(self, x, hidden):
        """Return the state-value estimate(s), shape (N, 1, 1), for ``x`` starting from ``hidden``."""
        x = F.relu(self.fc1(x))
        x = x.view(-1, 1, 64)
        x, lstm_hidden = self.lstm(x, hidden)
        v = self.fc_v(x)
        return v

    def put_data(self, transition):
        """Append one transition tuple to the rollout buffer."""
        self.data.append(transition)

    def make_batch(self):
        """Convert the buffered transitions into tensors and empty the buffer.

        Each transition is ``(s, a, r, s_prime, prob_a, h_in, h_out, done)``.

        Returns:
            ``(s, a, r, s_prime, done_mask, prob_a, h_in, h_out)`` where
            ``h_in`` / ``h_out`` are the LSTM states of the FIRST transition
            only and ``done_mask`` is 0.0 for terminal steps and 1.0 otherwise.

        Raises:
            IndexError: if the buffer is empty.
        """
        s_lst, a_lst, r_lst, s_prime_lst, prob_a_lst = [], [], [], [], []
        h_in_lst, h_out_lst, done_lst = [], [], []

        for transition in self.data:
            s, a, r, s_prime, prob_a, h_in, h_out, done = transition

            s_lst.append(s)
            a_lst.append([a])
            r_lst.append([r])
            s_prime_lst.append(s_prime)
            prob_a_lst.append([prob_a])
            h_in_lst.append(h_in)
            h_out_lst.append(h_out)
            done_lst.append([0 if done else 1])

        # Stack observations with numpy first: building a tensor directly from
        # a list of ndarrays goes through a slow element-wise path.
        s = torch.tensor(np.array(s_lst), dtype=torch.float)
        s_prime = torch.tensor(np.array(s_prime_lst), dtype=torch.float)
        a = torch.tensor(a_lst)
        r = torch.tensor(r_lst, dtype=torch.float)
        done_mask = torch.tensor(done_lst, dtype=torch.float)
        prob_a = torch.tensor(prob_a_lst, dtype=torch.float)

        self.data = []
        return s,a,r,s_prime, done_mask, prob_a, h_in_lst[0], h_out_lst[0]

    def train_net(self):
        """Run ``K_epoch`` clipped-surrogate PPO updates on the buffered rollout.

        Does nothing when the buffer is empty.
        """
        if not self.data:
            return

        s,a,r,s_prime,done_mask, prob_a, (h1_in, h2_in), (h1_out, h2_out) = self.make_batch()
        first_hidden  = (h1_in.detach(), h2_in.detach())
        second_hidden = (h1_out.detach(), h2_out.detach())

        for _ in range(K_epoch):
            v_prime = self.v(s_prime, second_hidden).squeeze(1)
            td_target = r + gamma * v_prime * done_mask
            v_s = self.v(s, first_hidden).squeeze(1)
            delta = td_target - v_s
            delta = delta.detach().numpy()

            # Discounted backward accumulation of the TD errors (GAE-style).
            advantage_lst = []
            advantage = 0.0
            for item in delta[::-1]:
                advantage = gamma * lmbda * advantage + item[0]
                advantage_lst.append([advantage])
            advantage_lst.reverse()
            advantage = torch.tensor(advantage_lst, dtype=torch.float)

            pi, _ = self.pi(s, first_hidden)
            pi_a = pi.squeeze(1).gather(1,a)
            ratio = torch.exp(torch.log(pi_a) - torch.log(prob_a))  # a/b == exp(log(a) - log(b))

            surr1 = ratio * advantage
            surr2 = torch.clamp(ratio, 1-eps_clip, 1+eps_clip) * advantage
            loss = -torch.min(surr1, surr2) + F.smooth_l1_loss(v_s, td_target.detach())

            self.optimizer.zero_grad()
            # The graph is rebuilt from detached hidden states on every epoch,
            # so there is nothing to retain between backward passes.
            loss.mean().backward()
            self.optimizer.step()

def main(num_episodes=800, print_interval=20):
    """Train the PPO-LSTM agent on CartPole-v1 and print running average scores.

    Args:
        num_episodes: number of episodes to run (default 800, as before).
        print_interval: report the average score every this many episodes.

    The loop collects up to ``T_horizon`` steps, trains on them, and repeats
    until the episode ends. Both the 4-tuple and the 5-tuple ``env.step``
    return conventions are accepted, as is a ``reset`` that returns
    ``(observation, info)``.
    """
    env = gym.make('CartPole-v1')
    model = PPO()
    score = 0.0

    try:
        for n_epi in range(num_episodes):
            h_out = (torch.zeros([1, 1, 32], dtype=torch.float), torch.zeros([1, 1, 32], dtype=torch.float))
            s = env.reset()
            if isinstance(s, tuple):  # newer gym: reset() -> (observation, info)
                s = s[0]
            done = False

            while not done:
                for _ in range(T_horizon):
                    h_in = h_out
                    # Acting needs no gradients; without no_grad the LSTM state
                    # would drag an ever-growing autograd graph through the episode.
                    with torch.no_grad():
                        prob, h_out = model.pi(torch.from_numpy(s).float(), h_in)
                    prob = prob.view(-1)
                    a = Categorical(prob).sample().item()

                    step_result = env.step(a)
                    if len(step_result) == 5:
                        # newer gym: (obs, reward, terminated, truncated, info)
                        s_prime, r, terminated, truncated, _ = step_result
                        done = terminated or truncated
                    else:
                        # classic gym: (obs, reward, done, info)
                        s_prime, r, done, _ = step_result

                    model.put_data((s, a, r/100.0, s_prime, prob[a].item(), h_in, h_out, done))
                    s = s_prime

                    score += r
                    if done:
                        break

                model.train_net()

            # Report after every full group of print_interval episodes, so the
            # divisor matches the number of episodes actually accumulated.
            if (n_epi + 1) % print_interval == 0:
                print("# of episode :{}, avg score : {:.1f}".format(n_epi + 1, score/print_interval))
                score = 0.0
    finally:
        env.close()

# Run the training loop when executed as a script / notebook cell.
if __name__ == '__main__':
    main()

# Report the wall-clock time elapsed since `start` was recorded.
end = time.time()
elapsed = end - start
print(f"{elapsed:.5f} sec")

# of episode :20, avg score : 28.4
# of episode :40, avg score : 48.8
# of episode :60, avg score : 137.6
# of episode :80, avg score : 183.6
# of episode :100, avg score : 292.4
# of episode :120, avg score : 123.4
# of episode :140, avg score : 266.4
# of episode :160, avg score : 189.7
# of episode :180, avg score : 252.4
# of episode :200, avg score : 243.2
# of episode :220, avg score : 194.0
# of episode :240, avg score : 298.9
# of episode :260, avg score : 179.7
# of episode :280, avg score : 237.4
# of episode :300, avg score : 277.1
# of episode :320, avg score : 208.4
# of episode :340, avg score : 149.1
# of episode :360, avg score : 156.4
# of episode :380, avg score : 192.8
# of episode :400, avg score : 241.6
# of episode :420, avg score : 222.8
# of episode :440, avg score : 197.7
# of episode :460, avg score : 237.8
# of episode :480, avg score : 322.4
# of episode :500, avg score : 72.1
# of episode :520, avg score : 11.2
# of episode :540, avg score : 10.3
# of episo