In [8]:
from tqdm import tqdm

from gflownet import HyperGridEnv, ForwardHyperGridPolicy, UniformPolicy, TrajectoryBalanceObjective, \
    TrajectoryBalanceOptimizer
from gflownet.api.reward import Reward
from gflownet import ExploratoryPolicy, HyperGridProxy, RandomSampler
from gflownet import RewardPrioritizedReplayBuffer
from gflownet.trainer.trainer import Trainer

In [9]:
# Environment plus the forward/backward policy pair the objective is built from.
# The forward policy is constructed from the environment; the backward one is uniform.
size = 16
env = HyperGridEnv(size=size, n_dimensions=3, max_num_steps=2)
forward_policy = ForwardHyperGridPolicy(env=env)
backward_policy = UniformPolicy()

# Training objective: constructed from the two policies only (no reward is passed in).
objective = TrajectoryBalanceObjective(forward_policy=forward_policy, backward_policy=backward_policy)

# Proxy (same grid `size` as the environment) and the reward wrapped around it.
proxy = HyperGridProxy(size=size)
reward = Reward(proxy=proxy, reward_boosting='linear')

# Policy used only to sample training trajectories. It combines the learned forward
# policy with a fresh uniform policy; first_policy_weight=0.9 is presumably the
# weight given to `forward_policy` -- confirm against ExploratoryPolicy.
train_forward_policy = ExploratoryPolicy(
    first_policy=forward_policy, second_policy=UniformPolicy(), first_policy_weight=0.9
)

# Sampler that draws forward training trajectories from `env` with the exploratory policy.
train_forward_sampler = RandomSampler(policy=train_forward_policy, env=env, reward=reward)


In [10]:
# Minimal hand-written training loop over the objective defined above.
n_iterations = 1000
n_trajectories = 16
device = 'cpu'

# Put the sampler and the objective on the target device before any sampling happens.
train_forward_sampler.set_device(device)
objective.set_device(device)

# Optimizer configured by name, then bound to the objective it will update.
optimizer = TrajectoryBalanceOptimizer(cls_name='Adam', lr=1e-3, logZ_multiplier=10.0)
optimizer.initialize(model=objective)

pbar = tqdm(range(n_iterations), total=n_iterations)
for i in pbar:
    # Sample a fresh batch, compute the objective on it, and take one optimizer step.
    trajectories = train_forward_sampler.sample_trajectories(n_trajectories=n_trajectories)
    objective_output = objective.compute_objective_output(trajectories=trajectories)
    objective_output.loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    # Refresh the progress-bar text only on every 100th iteration (i = 0, 100, 200, ...).
    if not i % 100:
        pbar.set_description(f'Loss: {objective_output.loss.item():.2f}')

Loss: 1.05: 100%|██████████| 1000/1000 [00:05<00:00, 172.50it/s]


In [11]:
# Alternative to the manual loop: the Trainer class, which adds more functionality,
# e.g. sampling from a replay buffer.
#
# NOTE(review): `objective` is the same instance the manual loop in the previous cell
# optimizes, so if that cell has been run this presumably continues training from the
# already-updated parameters rather than starting from scratch -- confirm this is intended.

# Backward sampler: the backward policy run on the reversed environment, with the same reward.
backward_sampler = RandomSampler(policy=backward_policy, env=env.reversed(), reward=reward)
# Replay buffer that is handed the backward sampler.
replay_buffer = RewardPrioritizedReplayBuffer(sampler=backward_sampler)

# The trainer gets its own optimizer instance, configured with the same settings as
# the manual loop, and reuses n_trajectories / n_iterations / device from that cell.
trainer = Trainer(
    run_dir='../experiments/example_run',
    logger=None,
    train_forward_sampler=train_forward_sampler,
    train_replay_buffer=replay_buffer,
    train_forward_n_trajectories=n_trajectories,
    train_replay_n_trajectories=16,
    n_iterations=n_iterations,
    objective=objective,
    optimizer=TrajectoryBalanceOptimizer(cls_name='Adam', lr=1e-3, logZ_multiplier=10.0),
    device=device,
)

# Run training. Kept as the final expression so the value returned by train()
# is displayed as the cell output.
trainer.train()

Loss: 4.2197: 100%|██████████| 1000/1000 [00:21<00:00, 46.35it/s]


{'loss': 1.213454246520996, 'logZ': 3.3436150550842285, 'epoch': 10}