In [22]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import deque
import random

In [23]:
class PolicyNetwork(nn.Module):
    """Three-layer fully connected network producing a softmax over actions.

    Args:
        state_dim: Number of input features.
        action_dim: Number of outputs (one probability per action).
        hidden_dim: Width of the two hidden layers.
    """

    def __init__(self, state_dim, action_dim, hidden_dim=256):
        super().__init__()
        # Attribute names double as state_dict keys, so saved checkpoints
        # depend on them staying fc1 / fc2 / fc3.
        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, action_dim)
        self.relu = nn.ReLU()

    def forward(self, x, temperature=1.0):
        """Return softmax(logits / temperature) taken over the last dimension.

        Args:
            x: Input tensor whose last dimension has size ``state_dim``.
            temperature: Divisor applied to the logits before the softmax.
        """
        hidden = self.relu(self.fc1(x))
        hidden = self.relu(self.fc2(hidden))
        logits = self.fc3(hidden)
        return nn.functional.softmax(logits / temperature, dim=-1)


In [24]:
class PolicyGradientAgent:
    """Monte-Carlo policy-gradient (REINFORCE-style) agent with an entropy bonus.

    Transitions ``(state, action, reward)`` are buffered with
    ``store_transition`` and consumed in one gradient step by ``learn``.

    Args:
        state_dim: Size of the state vector passed to the policy network.
        action_dim: Number of discrete actions.
        hidden_dim: Hidden width forwarded to ``PolicyNetwork``.
        lr: Adam learning rate.
        gamma: Discount factor used when accumulating returns.
        temperature: Softmax temperature used both for sampling and learning.
        entropy_coef: Weight of the entropy bonus subtracted from the loss.
    """

    def __init__(self, state_dim, action_dim, hidden_dim=256, lr=0.001, gamma=0.99,
                 temperature=1.0, entropy_coef=0.01):
        self.policy_net = PolicyNetwork(state_dim, action_dim, hidden_dim)
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=lr)
        self.gamma = gamma
        self.temperature = temperature
        self.entropy_coef = entropy_coef
        self.memory = []
        # Defined up front so choose_action/learn can place tensors even when
        # load_model has never been called.
        self.device = torch.device('cpu')

    def choose_action(self, state):
        """Sample an action index from the current policy for a single state."""
        state = torch.as_tensor(
            np.asarray(state), dtype=torch.float32, device=self.device
        ).unsqueeze(0)
        # Sampling needs no gradients; no_grad avoids building a graph per step.
        with torch.no_grad():
            probs = self.policy_net(state, self.temperature)[0].cpu().numpy()
        # Renormalise in float64 so float32 rounding cannot trip
        # np.random.choice's "probabilities do not sum to 1" check.
        probs = probs.astype(np.float64)
        probs /= probs.sum()
        action = np.random.choice(len(probs), p=probs)
        return action

    def store_transition(self, transition):
        """Append one ``(state, action, reward)`` tuple to the buffer."""
        self.memory.append(transition)

    @staticmethod
    def _discounted_returns(rewards, gamma):
        """Return G[t] = sum_k gamma**(k - t) * rewards[k], computed in one backward pass."""
        returns = np.zeros(len(rewards), dtype=np.float64)
        running = 0.0
        for t in range(len(rewards) - 1, -1, -1):
            running = rewards[t] + gamma * running
            returns[t] = running
        return returns

    def learn(self):
        """Run one policy-gradient step over the buffered transitions, then clear them.

        Does nothing when the buffer is empty.
        """
        if not self.memory:
            return

        states, actions, rewards = zip(*self.memory)

        # NOTE(review): normalize_rewards is not defined in the visible notebook
        # cells; it has to be in scope before learn() is called.
        rewards = np.asarray(normalize_rewards(rewards), dtype=np.float64)  # Normalize rewards
        returns = self._discounted_returns(rewards, self.gamma)

        states = torch.as_tensor(np.asarray(states, dtype=np.float32), device=self.device)
        actions = torch.as_tensor(np.asarray(actions, dtype=np.int64), device=self.device)
        returns = torch.as_tensor(returns, dtype=torch.float32, device=self.device)

        # Use the same temperature as choose_action so the gradient is taken
        # with respect to the distribution the actions were sampled from.
        probs = self.policy_net(states, self.temperature)
        # Clamp before the log so a probability that underflows to 0 cannot
        # produce 0 * -inf = NaN in the entropy term.
        log_probs = torch.log(probs.clamp_min(torch.finfo(probs.dtype).eps))
        chosen_log_probs = log_probs.gather(1, actions.unsqueeze(1)).squeeze(1)
        entropy = -(probs * log_probs).sum(dim=1)

        # Summed (not averaged) over time steps, as in the per-step accumulation
        # this replaces; the entropy term is the regulariser.
        loss = (-chosen_log_probs * returns - self.entropy_coef * entropy).sum()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.memory = []

    def save_model(self, path):
        """Write the network and optimizer state dicts to ``path``."""
        torch.save({
            'model_state_dict': self.policy_net.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict()
        }, path)

    def load_model(self, path, device):
        """Restore network and optimizer state from ``path`` onto ``device``.

        Raises whatever ``torch.load`` raises when the file is missing or
        unreadable.
        """
        checkpoint = torch.load(path, map_location=device)
        self.device = device
        self.policy_net.load_state_dict(checkpoint['model_state_dict'])
        # Move the network before restoring the optimizer so the optimizer
        # state is loaded against parameters already on their final device.
        self.policy_net.to(device)
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
       


In [25]:
# Run length: number of episodes and the per-episode step cap.
episodes, max_timesteps = 10, 1000

# Environment, rendered in a window, and the sizes the agent is built from.
env = gym.make('LunarLander-v2', render_mode='human')
action_dim = env.action_space.n
state_dim = env.observation_space.shape[0]

agent = PolicyGradientAgent(state_dim, action_dim, temperature=1.0)
rewards_per_episode = list()

In [26]:
# Checkpoint file read by the load cell below.
save_path = 'policy_gradient_model.pth'

In [27]:
# Resume from the saved checkpoint when one exists; otherwise keep the
# freshly initialised agent.
try:
    agent.load_model(save_path, device='cpu')
except FileNotFoundError:
    print("No checkpoint found, starting from scratch.")
else:
    print("Loading successful")

Loading successful


In [28]:
# Roll out the current policy for `episodes` episodes, recording each episode's
# total reward. The environment is closed even if a rollout fails or is
# interrupted, so the render window does not linger.
try:
    for episode in range(episodes):
        reset_result = env.reset()
        # Newer Gym releases return (observation, info) from reset(); older
        # ones return the observation alone.
        state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
        total_reward = 0
        for t in range(max_timesteps):
            action = agent.choose_action(state)
            step_result = env.step(action)
            if len(step_result) == 5:
                # Newer API: separate terminated / truncated flags.
                next_state, reward, terminated, truncated, _ = step_result
                done = terminated or truncated
            else:
                next_state, reward, done, _ = step_result
            # NOTE(review): transitions are buffered but agent.learn() is never
            # called in this cell, so the buffer grows across episodes.
            agent.store_transition((state, action, reward))
            state = next_state
            total_reward += reward
            if done:
                break
        rewards_per_episode.append(total_reward)
        print(f"Episode: {episode}, Reward: {total_reward}")
finally:
    env.close()
    

Episode: 0, Reward: 178.44862935470192
Episode: 1, Reward: 158.9581518097793
Episode: 2, Reward: 88.74128939079934
Episode: 3, Reward: 199.02253558151898
Episode: 4, Reward: 128.96397573108524
Episode: 5, Reward: 108.16536825862342
Episode: 6, Reward: -34.007050425718276
Episode: 7, Reward: 89.47118869891442
Episode: 8, Reward: 148.26956221289194
Episode: 9, Reward: 262.8064463790478
