In [9]:
import numpy as np
import torch
import torch.nn as nn
from torch.nn.utils import parameters_to_vector, vector_to_parameters
from pymoo.algorithms.soo.nonconvex.pso import PSO
from pymoo.core.population import Population
from pymoo.core.problem import Problem
import gymnasium as gym

In [None]:
class MyNet(nn.Module):
  def __init__(self, in_features: int, out_features: int):
    self.l1 = nn.Linear(in_features, 64)
    self.relu = nn.ReLU()
    self.l2 = nn.Linear(64, 32)
    self.l3 = nn.Linear(32, out_features)

  def forward(self, x):
    x = self.l1(x)
    x = self.relu(x)
    x = self.l2(x)
    x = self.relu(x)
    x = self.l3(x)
    return x

  def sample(self, state):
    pass

In [2]:
def make_network(in_features: int, out_features: int):
    return nn.Sequential(
        nn.Linear(in_features, 64),
        nn.ReLU(),
        nn.Linear(64, 32),
        nn.ReLU(),
        nn.Linear(32, out_features)
    )

In [3]:
actor_population = [
  make_network(8, 16)
  for _ in range(25)
]

In [4]:
vector_encoded_actors = np.asarray([
  parameters_to_vector(actor.parameters()).detach().cpu().numpy()
  for actor in actor_population
])

In [None]:
def run_episode(actor_policy, env, replay_buffer=None):
    """evaluate fitness of actor's policy on an environment"""
    total_reward = 0.0
    observation, _ = env.reset(seed=42) # initial state
    terminated = False
    truncated = False
    while not (terminated or truncated):
        action = actor_policy.sample(observation)
        new_observation, reward, terminated, truncated, _ = env.step(action)
        total_reward += reward
        if replay_buffer is not None:
            replay_buffer.append((observation, action, reward, new_observation))
        observation = new_observation
    return total_reward

class TheProblem(Problem):
    def __init__(self, env, actors: list[nn.Module], *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.env = env
        self.actors = actors

    def _evaluate(self, X, out, *args, **kwargs):
        """X is the set of solutions, not just one solution"""
        F = []
        for i, x in enumerate(X):
            actor = self.actors[i]
            vector_to_parameters(torch.from_numpy(x), actor.parameters())
            total_reward = run_episode(actor_policy=actor, env=self.env, replay_buffer=[])
            F.append(-total_reward)
        out["F"] = F

In [6]:
pso = PSO(
  pop_size=len(vector_encoded_actors),
  sampling=Population.new(X=vector_encoded_actors),
  adaptive=False,
  pertube_best=False,
  seed=0,
)

In [None]:
env = gym.make("LunarLander-v3")

problem = TheProblem(env, actor_population, n_var=vector_encoded_actors[0].shape[0], xl=-1.0, xu=1.0)

In [8]:
from pymoo.optimize import minimize

res = minimize(problem,
               pso,
               ('n_gen', 200),
               seed=1,
               verbose=True)

AttributeError: 'ellipsis' object has no attribute 'reset'

In [None]:
updated_vector_encoded_actors = res.pop.get("X")