In [1]:
%pip install optuna
%pip install swig
%pip install gymnasium[box2d]

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import pickle
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque
import random
import optuna

In [3]:
env = gym.make('LunarLander-v2', render_mode="rgb_array")
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

In [4]:
class DQN(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(state_dim, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, action_dim)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)

In [5]:
class ReplayBuffer:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        state, action, reward, next_state, done = zip(*random.sample(self.buffer, batch_size))
        return state, action, reward, next_state, done

    def __len__(self):
        return len(self.buffer)

In [6]:
class DQNTrainer:
    def __init__(self, env, main_network, target_network, optimizer, replay_buffer, model_path='./model/model.pth', gamma=0.99, batch_size=64, target_update_frequency=1000):
        self.env = env
        self.main_network = main_network
        self.target_network = target_network
        self.optimizer = optimizer
        self.replay_buffer = replay_buffer
        self.model_path = model_path
        self.gamma = gamma
        self.batch_size = batch_size
        self.target_update_frequency = target_update_frequency
        self.step_count = 0

        # Load the model if it exists
        if os.path.exists(os.path.dirname(self.model_path)):
            if os.path.isfile(self.model_path):
                self.main_network.load_state_dict(torch.load(self.model_path))
                self.target_network.load_state_dict(torch.load(self.model_path))
                print("Loaded model from disk")
        else:
            os.makedirs(os.path.dirname(self.model_path))

    def train(self, num_episodes, save=True):
        total_rewards = []
        for episode in range(num_episodes):
            state, _ = self.env.reset()  # Extract the state from the returned tuple
            done = False
            total_reward = 0

            while not done:
                self.env.render()  # Add this line to render the environment
                # Ensure the state is in the correct shape by adding an extra dimension
                action = self.main_network(torch.FloatTensor(state).unsqueeze(0)).argmax(dim=1).item()
                next_state, reward, done, _, _ = self.env.step(action)  # Extract the next_state from the returned tuple
                self.replay_buffer.push(state, action, reward, next_state, done)
                state = next_state
                total_reward += reward

                if len(self.replay_buffer) >= self.batch_size:
                    self.update_network()

            total_rewards.append(total_reward)
            print(f"Episode {episode}, Total Reward: {total_reward}")

        # Save the model after training
        if save:
            torch.save(self.main_network.state_dict(), self.model_path)
            print("Saved model to disk")

        self.env.close()
        return sum(total_rewards) / len(total_rewards)  # Return average reward

    def update_network(self):
        state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.replay_buffer.sample(self.batch_size)

        # Convert to tensors
        state_batch = torch.FloatTensor(state_batch)
        action_batch = torch.LongTensor(action_batch)
        reward_batch = torch.FloatTensor(reward_batch)
        next_state_batch = torch.FloatTensor(next_state_batch)
        done_batch = torch.FloatTensor(done_batch)

        # Calculate the current Q-values
        q_values = self.main_network(state_batch).gather(1, action_batch.unsqueeze(1)).squeeze(1)

        # Calculate the target Q-values
        next_q_values = self.target_network(next_state_batch).max(1)[0]
        expected_q_values = reward_batch + self.gamma * next_q_values * (1 - done_batch)

        # Compute the loss
        loss = nn.MSELoss()(q_values, expected_q_values.detach())

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Periodically update the target network
        if self.step_count % self.target_update_frequency == 0:
            self.target_network.load_state_dict(self.main_network.state_dict())

        self.step_count += 1

In [7]:
class Optimizer:
    def __init__(self, env, main_network, target_network, replay_buffer, model_path, params_path='./model/params.pkl'):
        self.env = env
        self.main_network = main_network
        self.target_network = target_network
        self.replay_buffer = replay_buffer
        self.model_path = model_path
        self.params_path = params_path

    def objective(self, trial, n_episodes=10):
        lr = trial.suggest_loguniform('lr', 1e-5, 1e-1)
        gamma = trial.suggest_uniform('gamma', 0.9, 0.999)
        batch_size = trial.suggest_categorical('batch_size', [32, 64, 128])
        target_update_frequency = trial.suggest_categorical('target_update_frequency', [500, 1000, 2000])

        optimizer = optim.Adam(self.main_network.parameters(), lr=lr)
        trainer = DQNTrainer(self.env, self.main_network, self.target_network, optimizer, self.replay_buffer, self.model_path, gamma=gamma, batch_size=batch_size, target_update_frequency=target_update_frequency)
        reward = trainer.train(n_episodes, save=False)
        return reward

    def optimize(self, n_trials=100, save_params=True):
        if not TRAIN and os.path.isfile(self.params_path):
            with open(self.params_path, 'rb') as f:
                best_params = pickle.load(f)
            print("Loaded parameters from disk")
        elif not FINETUNE:
            best_params = {
                'lr': LEARNING_RATE, 
                'gamma': GAMMA, 
                'batch_size': BATCH_SIZE, 
                'target_update_frequency': TARGET_UPDATE_FREQUENCY
                }
            print(f"Using default parameters: {best_params}")
        else:
            print("Optimizing hyperparameters")
            study = optuna.create_study(direction='maximize')
            study.optimize(self.objective, n_trials=n_trials)
            best_params = study.best_params

            if save_params:
                with open(self.params_path, 'wb') as f:
                    pickle.dump(best_params, f)
                print("Saved parameters to disk")

        return best_params

In [8]:
TRAIN = True
FINETUNE = True

# Set the following hyperparameters if FINETUNE is False
GAMMA = 0.99
BATCH_SIZE = 64
TARGET_UPDATE_FREQUENCY = 1000
LEARNING_RATE = 1e-3

In [9]:
main_network = DQN(state_dim, action_dim)
target_network = DQN(state_dim, action_dim)
target_network.load_state_dict(main_network.state_dict())
target_network.eval()

replay_buffer = ReplayBuffer(10000)

In [10]:
STEP_COUNT = 0
optimizer = Optimizer(env, main_network, target_network, replay_buffer, "./model/model.pth", "./model/params.pkl")
best_params = optimizer.optimize(n_trials=2, save_params=True)
optimizer = optim.Adam(main_network.parameters(), lr=best_params['lr'])
trainer = DQNTrainer(env, main_network, target_network, optimizer, replay_buffer, "./model/model.pth", gamma=best_params['gamma'], batch_size=best_params['batch_size'], target_update_frequency=best_params['target_update_frequency'])
trainer.train(1000)

[I 2024-07-11 00:18:51,392] A new study created in memory with name: no-name-b0bf83e3-e24a-4035-a5b0-b2f305baffaf


Optimizing hyperparameters


  lr = trial.suggest_loguniform('lr', 1e-5, 1e-1)
  gamma = trial.suggest_uniform('gamma', 0.9, 0.999)


Episode 0, Total Reward: -318.9072366224961
Episode 1, Total Reward: -585.5581007282342


  state_batch = torch.FloatTensor(state_batch)


Episode 2, Total Reward: -189.40773445014497
Episode 3, Total Reward: -326.51467033600943
Episode 4, Total Reward: -308.29743967922377
Episode 5, Total Reward: -463.67328125518924
Episode 6, Total Reward: -146.8794712703745
Episode 7, Total Reward: -154.09308379207621
Episode 8, Total Reward: -336.2867523165479


[I 2024-07-11 00:18:55,418] Trial 0 finished with value: -308.19981628270676 and parameters: {'lr': 0.00790558268335284, 'gamma': 0.9509440699573725, 'batch_size': 128, 'target_update_frequency': 500}. Best is trial 0 with value: -308.19981628270676.


Episode 9, Total Reward: -252.3803923767715


  lr = trial.suggest_loguniform('lr', 1e-5, 1e-1)
  gamma = trial.suggest_uniform('gamma', 0.9, 0.999)


Episode 0, Total Reward: -251.33041391185637
Episode 1, Total Reward: -260.23173719721206
Episode 2, Total Reward: -323.2997105792823
Episode 3, Total Reward: -184.4195232359948
Episode 4, Total Reward: -325.3352141425354
Episode 5, Total Reward: -185.84395677956337
Episode 6, Total Reward: -192.58449262000602
Episode 7, Total Reward: -282.85551002242005
Episode 8, Total Reward: -206.73803979741643


[I 2024-07-11 00:18:59,550] Trial 1 finished with value: -251.79971288899037 and parameters: {'lr': 5.9018754034715956e-05, 'gamma': 0.9488550748981232, 'batch_size': 64, 'target_update_frequency': 500}. Best is trial 1 with value: -251.79971288899037.


Episode 9, Total Reward: -305.35853060361706
Saved parameters to disk
Episode 0, Total Reward: -219.274621806715
Episode 1, Total Reward: -290.0454661081469
Episode 2, Total Reward: -211.4889324724608
Episode 3, Total Reward: -269.0099830683223
Episode 4, Total Reward: -322.35969816703937
Episode 5, Total Reward: -371.8429359766541
Episode 6, Total Reward: -227.678740115424
Episode 7, Total Reward: -253.07858670278517
Episode 8, Total Reward: -152.0877508840339
Episode 9, Total Reward: -282.22767313355274
Episode 10, Total Reward: -418.1023112976398
Episode 11, Total Reward: -249.04633805513123
Episode 12, Total Reward: -179.12076063697714
Episode 13, Total Reward: -183.03040211573455
Episode 14, Total Reward: -177.52102553519498
Episode 15, Total Reward: -303.05141322051963
Episode 16, Total Reward: -190.80353646647467
Episode 17, Total Reward: -189.60018678119465
Episode 18, Total Reward: -173.71017218421332
Episode 19, Total Reward: -162.88537901682307
Episode 20, Total Reward: -192