In [1]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import os
from collections import deque, namedtuple

env = gym.make('LunarLander-v2')


  deprecation(
  deprecation(


In [2]:
class QNetwork(nn.Module):
    def __init__(self, state_size, action_size):
        super(QNetwork, self).__init__()
        self.fc1 = nn.Linear(state_size, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, action_size)

    def forward(self, state):
        x = torch.relu(self.fc1(state))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)




In [3]:
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
q_network = QNetwork(state_size, action_size)
target_q_network = QNetwork(state_size, action_size)
target_q_network.load_state_dict(q_network.state_dict())

<All keys matched successfully>

In [4]:
Experience = namedtuple('Experience', ['state', 'action', 'reward', 'next_state', 'done'])

class ReplayBuffer:
    def __init__(self, buffer_size):
        self.memory = deque(maxlen=buffer_size)

    def add(self, experience):
        self.memory.append(experience)

    def sample(self, batch_size):
        indices = np.random.choice(len(self.memory), batch_size, replace=False)
        experiences = [self.memory[idx] for idx in indices]
        return experiences

    def __len__(self):
        return len(self.memory)

buffer = ReplayBuffer(buffer_size=100000)


In [5]:
batch_size = 64
gamma = 0.99
alpha = 0.2  # Entropy regularization term
tau = 1e-3  # For soft update of target parameters
learning_rate = 1e-3
epsilon_start = 1.0
epsilon_end = 0.01
epsilon_decay = 0.995
target_update_frequency = 10  # Update target network every 10 episodes
model_filename = 'soft_q_learning_model.pth'

optimizer = optim.Adam(q_network.parameters(), lr=learning_rate)
epsilon = epsilon_start


In [6]:
def save_model(model, filename):
    torch.save(model.state_dict(), filename)

def load_model(model, filename):
    if os.path.isfile(filename):
        model.load_state_dict(torch.load(filename))
        print(f"Loaded model from {filename}")
    else:
        print(f"No model found at {filename}, starting training from scratch")


In [7]:
load_model(q_network, model_filename)


No model found at soft_q_learning_model.pth, starting training from scratch


In [8]:
def select_action(state, epsilon):
    if np.random.rand() < epsilon:
        return env.action_space.sample()
    else:
        state_tensor = torch.FloatTensor(state).unsqueeze(0)
        with torch.no_grad():
            q_values = q_network(state_tensor)
        return torch.argmax(q_values).item()


In [9]:
def soft_update(local_model, target_model, tau):
    for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
        target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

In [10]:
num_episodes = 1000
max_t = 1000

In [11]:
for episode in range(num_episodes):
    state = env.reset()
    total_reward = 0

    for t in range(max_t):
        action = select_action(state, epsilon)
        next_state, reward, done, _ = env.step(action)
        total_reward += reward

        buffer.add(Experience(state, action, reward, next_state, done))
        state = next_state

        if done:
            break

        if len(buffer) > batch_size:
            experiences = buffer.sample(batch_size)

            states = torch.FloatTensor([e.state for e in experiences])
            actions = torch.LongTensor([e.action for e in experiences])
            rewards = torch.FloatTensor([e.reward for e in experiences])
            next_states = torch.FloatTensor([e.next_state for e in experiences])
            dones = torch.FloatTensor([e.done for e in experiences])

            q_values = q_network(states).gather(1, actions.unsqueeze(1)).squeeze(1)
            with torch.no_grad():
                next_q_values = target_q_network(next_states)
                soft_q_values = next_q_values - alpha * next_q_values.logsumexp(dim=1, keepdim=True)
                target_q_values = rewards + (1 - dones) * gamma * soft_q_values.max(1)[0]

            loss = nn.MSELoss()(q_values, target_q_values)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if t % target_update_frequency == 0:
                soft_update(q_network, target_q_network, tau)

    print(f'Episode {episode}, Total Reward: {total_reward}')

    epsilon = max(epsilon_end, epsilon_decay * epsilon)

    if episode % 10 == 0:  # Save model every 10 episodes
        save_model(q_network, model_filename)
        print(f"Model saved at episode {episode}")


  if not isinstance(terminated, (bool, np.bool8)):
  states = torch.FloatTensor([e.state for e in experiences])


Episode 0, Total Reward: -308.1489577134298
Model saved at episode 0
Episode 1, Total Reward: -315.7497814349833
Episode 2, Total Reward: -83.24103282066842
Episode 3, Total Reward: -65.3382379643823
Episode 4, Total Reward: -101.10317076602718
Episode 5, Total Reward: -69.21158407285196
Episode 6, Total Reward: -48.397142941341585
Episode 7, Total Reward: -273.6733423491942
Episode 8, Total Reward: -256.06091347498625
Episode 9, Total Reward: -82.2341952725516
Episode 10, Total Reward: -249.29172460645705
Model saved at episode 10
Episode 11, Total Reward: -147.24271829138797
Episode 12, Total Reward: -78.40145390767479
Episode 13, Total Reward: -11.466880557226759
Episode 14, Total Reward: -235.5632548316504
Episode 15, Total Reward: -108.46780913072504
Episode 16, Total Reward: -83.88939972546093
Episode 17, Total Reward: -105.28335332180227
Episode 18, Total Reward: -243.46799871297367
Episode 19, Total Reward: -203.44697831644544
Episode 20, Total Reward: -172.43857246866074
Model

In [12]:
save_model(q_network, model_filename)