In [1243]:
%matplotlib inline
import torch
from torch import nn
import torch.nn.functional as F
from torch.distributions import Categorical
import torch.optim as optim
import gymnasium as gym
import matplotlib.pyplot as plt
from collections import deque
import numpy as np
torch.manual_seed(0)



<torch._C.Generator at 0x10fa71330>

In [1244]:
class Policy(nn.Module):
    """Two-layer MLP that maps an observation to logits over discrete actions.

    Args:
        len_obs: Size of the observation vector (input width of the first layer).
        len_act: Number of discrete actions (output width of the second layer).
        len_hidden: Width of the single hidden layer.
    """

    def __init__(self, len_obs, len_act, len_hidden):
        super().__init__()
        self.l1 = nn.Linear(len_obs, len_hidden)
        self.l2 = nn.Linear(len_hidden, len_act)

    def forward(self, obs):
        """Return the unnormalised action logits for ``obs``.

        Args:
            obs: Float tensor whose last dimension equals ``len_obs``.

        Returns:
            Tensor of logits whose last dimension equals ``len_act``.
        """
        # torch.tanh is used instead of the F.tanh alias, which warns as
        # deprecated on some older PyTorch releases; the values are the same.
        hidden = torch.tanh(self.l1(obs))
        return self.l2(hidden)

    def action(self, obs):
        """Sample one action for a single observation.

        Args:
            obs: A single observation. May be a tensor, a numpy array or any
                sequence ``torch.as_tensor`` accepts; it is cast to float32.

        Returns:
            Tuple ``(action, log_prob)`` where ``action`` is the sampled action
            index as a Python ``int`` and ``log_prob`` is the log-probability
            of that action. For a 1-D observation ``log_prob`` has shape
            ``(1,)`` because a batch dimension is added before the forward
            pass. ``log_prob`` stays attached to the autograd graph.
        """
        # as_tensor leaves an existing float32 tensor untouched and converts
        # numpy arrays / lists, so callers may pass raw env observations.
        batch = torch.as_tensor(obs, dtype=torch.float32).unsqueeze(0)
        logits = self.forward(batch)
        act_dist = Categorical(logits=logits)
        action_selected = act_dist.sample()
        return action_selected.item(), act_dist.log_prob(action_selected)

In [1245]:
def calc_rtg_reward(rewards, gamma):
    """Compute the discounted reward-to-go for every step of an episode.

    Entry ``t`` of the result equals
    ``rewards[t] + gamma * rewards[t+1] + gamma**2 * rewards[t+2] + ...``.

    Args:
        rewards: Sequence of per-step rewards in chronological order.
        gamma: Discount factor applied once per step into the future.

    Returns:
        A list with the same length as ``rewards`` holding the discounted
        reward-to-go for each step (an empty list for empty input).
    """
    discounted_tail = 0
    backwards = []
    # Walk the episode from the last step to the first, carrying the already
    # discounted sum of everything that follows the current step.
    for step_reward in reversed(rewards):
        discounted_tail = step_reward + gamma * discounted_tail
        backwards.append(discounted_tail)
    backwards.reverse()
    return backwards

In [1257]:
def train(env, policy, num_eps, lr, gamma, ep_max_len):
    """Train ``policy`` on ``env`` with a REINFORCE-style reward-to-go update.

    Each iteration rolls out one episode, weights the log-probability of every
    sampled action by the discounted reward-to-go from that step, and takes one
    Adam step on the negated sum. The running mean of the episode score and
    episode length over the last 100 episodes is printed every 100 episodes.

    Args:
        env: Environment exposing the Gymnasium API: ``reset()`` returning
            ``(observation, info)`` and ``step(action)`` returning
            ``(observation, reward, terminated, truncated, info)``.
            Observations are passed to ``torch.from_numpy``, so they must be
            numpy arrays.
        policy: Module with an ``action(obs_tensor)`` method returning
            ``(action, log_prob)``; its parameters are updated in place.
        num_eps: Number of episodes (and therefore gradient updates) to run.
        lr: Learning rate handed to ``torch.optim.Adam``.
        gamma: Discount factor for the reward-to-go computation.
        ep_max_len: Maximum number of environment steps per episode.

    Returns:
        None.
    """
    scores_deque = deque(maxlen=100)
    action_len_deque = deque(maxlen=100)

    optimiser = torch.optim.Adam(policy.parameters(), lr)

    for i in range(num_eps):
        rewards = []
        log_probs = []
        state, _ = env.reset()

        for _ in range(ep_max_len):
            action, log_prob = policy.action(torch.from_numpy(state))
            log_probs.append(log_prob)
            state, reward, terminated, truncated, _ = env.step(action)
            rewards.append(reward)
            # A truncated episode (e.g. a time-limit wrapper) must also end the
            # rollout; stepping the environment again before reset() is invalid.
            if terminated or truncated:
                break

        scores_deque.append(sum(rewards))
        action_len_deque.append(len(rewards))

        # An episode with zero steps (ep_max_len <= 0) has nothing to learn
        # from, and torch.cat would raise on the empty list.
        if log_probs:
            stacked_log_probs = torch.cat(log_probs)
            rtg_rewards = torch.as_tensor(
                calc_rtg_reward(rewards, gamma),
                dtype=stacked_log_probs.dtype,
                device=stacked_log_probs.device,
            )
            # Negated because the optimiser minimises while the policy-gradient
            # objective is to be maximised.
            loss = -(rtg_rewards * stacked_log_probs).sum()

            optimiser.zero_grad()
            loss.backward()
            optimiser.step()

        if i % 100 == 0:
            print('Episode {}\tAverage Score: {:.2f}\t Avg Episode Length: {} '.format(i, np.mean(scores_deque),np.mean(action_len_deque)))

    return


In [1261]:

if __name__ == "__main__":

    # Training hyperparameters.
    NUM_EPS = 100000
    LEARNING_RATE = 0.0015
    GAMMA = 0.999
    MOMENTUM = 0.9  # NOTE(review): defined but not passed to train() below
    EP_MAX_LEN = 300
    ENV_SEED = 123

    env = gym.make("LunarLander-v2")
    env.action_space.seed(ENV_SEED)
    # action_space.seed() only affects action_space.sample(). Seeding the
    # environment's own RNG needs reset(seed=...); later reset() calls made
    # without a seed keep using this generator, so the run becomes repeatable.
    env.reset(seed=ENV_SEED)

    len_obs = env.observation_space.shape[0]
    len_act = env.action_space.n
    len_hidden = 48

    policy = Policy(len_obs = len_obs, len_act= len_act, len_hidden = len_hidden)
    train(env, policy, NUM_EPS, LEARNING_RATE, GAMMA, EP_MAX_LEN )


Episode 0	Average Score: -227.41	 episode Lenght: 125.0 
Episode 100	Average Score: -156.40	 episode Lenght: 88.73 
Episode 200	Average Score: -129.39	 episode Lenght: 84.91 
Episode 300	Average Score: -119.87	 episode Lenght: 80.52 
Episode 400	Average Score: -129.10	 episode Lenght: 77.24 
Episode 500	Average Score: -122.25	 episode Lenght: 75.41 
Episode 600	Average Score: -117.26	 episode Lenght: 83.47 
Episode 700	Average Score: -109.16	 episode Lenght: 96.27 
Episode 800	Average Score: -101.99	 episode Lenght: 97.09 
Episode 900	Average Score: -93.96	 episode Lenght: 103.88 
Episode 1000	Average Score: -80.29	 episode Lenght: 107.83 
Episode 1100	Average Score: -78.86	 episode Lenght: 119.03 
Episode 1200	Average Score: -78.73	 episode Lenght: 108.18 
Episode 1300	Average Score: -69.80	 episode Lenght: 107.95 
Episode 1400	Average Score: -61.58	 episode Lenght: 103.67 
Episode 1500	Average Score: -51.28	 episode Lenght: 109.97 
Episode 1600	Average Score: -35.65	 episode Lenght: 