In [1]:
! pip install gymnasium[classic-control]

[0m

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical
from torch.autograd import Variable
import matplotlib.pyplot as plt
import numpy as np
import gymnasium as gym
import math

In [3]:
class PolicyNet(nn.Module):
    def __init__(self):
        super(PolicyNet, self).__init__()

        self.fc1 = nn.Linear(4, 24)
        self.fc2 = nn.Linear(24, 36)
        self.fc3 = nn.Linear(36, 3)  # Prob of Left

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.sigmoid(self.fc3(x))
        return x

In [None]:
def transformState(S_t):
    return [ math.atan2( S_t[0], S_t[1]), math.atan2( S_t[2], S_t[3]), S_t[4], S_t[5] ]

episode_durations = []
episode_rewards = []
episode_success = []

def main():

    # Parameters
    num_episode = 1000
    batch_size = 5
    learning_rate = 0.1
    gamma = 0.99

    env = gym.make('Acrobot-v1')
    policy_net = PolicyNet()
    policy_optimizer = torch.optim.Adam(policy_net.parameters(), lr=learning_rate)

    # Batch History
    state_pool = []
    action_pool = []
    reward_pool = []
    steps = 0


    for e in range(num_episode):

        state = env.reset()[0]
        state = np.array(transformState(state))
        state = torch.from_numpy(state).float()
        state = Variable(state)

        for t in range(1000):

            probs = policy_net(state)
            m = Categorical(probs)
            action = m.sample()

            action = action.data.numpy().astype(int)
            stepData = env.step(action)
            next_state, reward, done, truncated, info = stepData[0], stepData[1], stepData[2], stepData[3], stepData[4]
            next_state = np.array(transformState(next_state))

            # To mark boundarys between episodes
            if done:
                reward = 0

            state_pool.append(state)
            action_pool.append(float(action))
            reward_pool.append(reward)

            state = next_state
            state = torch.from_numpy(state).float()
            state = Variable(state)

            steps += 1

            if done:
                episode_durations.append(t + 1)
                print("Episode:",e,"\tDuration:",t+1)
                episode_rewards.append(-t)

                break

        # Update policy
        if e > 0 and e % batch_size == 0:

            # Discount reward
            return_G = 0
            for i in reversed(range(steps)):
                if reward_pool[i] == 0:
                    return_G = 0
                else:
                    return_G = return_G * gamma + reward_pool[i]
                    reward_pool[i] = return_G

            # print(reward_pool)
            # print(len(reward_pool))

            # Normalize reward
            # reward_mean = np.mean(reward_pool)
            # reward_std = np.std(reward_pool)
            # for i in range(steps):
            #     reward_pool[i] = (reward_pool[i] - reward_mean) / reward_std

            # Gradient Desent
            policy_optimizer.zero_grad()

            for i in range(steps):
                state = state_pool[i]
                action = Variable(torch.FloatTensor([action_pool[i]]))
                reward = reward_pool[i]

                probs = policy_net(state)
                m = Categorical(probs)
                loss = -m.log_prob(action) * reward  # Negtive score function x reward
                loss.backward()

            policy_optimizer.step()
            
            
            state_pool = []
            action_pool = []
            reward_pool = []
            steps = 0


if __name__ == '__main__':
    main()

Episode: 3 	Duration: 740
Episode: 4 	Duration: 964
Episode: 5 	Duration: 743
Episode: 19 	Duration: 957
Episode: 29 	Duration: 668
Episode: 30 	Duration: 424
Episode: 31 	Duration: 970
Episode: 32 	Duration: 786
Episode: 33 	Duration: 856
Episode: 34 	Duration: 526
Episode: 38 	Duration: 673
Episode: 41 	Duration: 894
Episode: 43 	Duration: 830
Episode: 46 	Duration: 469
Episode: 47 	Duration: 248
Episode: 48 	Duration: 353
Episode: 49 	Duration: 338
Episode: 50 	Duration: 316
Episode: 51 	Duration: 167
Episode: 52 	Duration: 178
Episode: 53 	Duration: 169
Episode: 54 	Duration: 178
Episode: 55 	Duration: 186
Episode: 56 	Duration: 360
Episode: 57 	Duration: 186
Episode: 58 	Duration: 164
Episode: 59 	Duration: 157
Episode: 60 	Duration: 182
Episode: 61 	Duration: 196
Episode: 62 	Duration: 196
Episode: 63 	Duration: 183
Episode: 64 	Duration: 156
Episode: 65 	Duration: 166
Episode: 66 	Duration: 231
Episode: 67 	Duration: 164
Episode: 68 	Duration: 212
Episode: 69 	Duration: 196
Epis

In [None]:
plt.figure()
plt.plot(episode_rewards)
plt.savefig("lr-1.png")
plt.show()