# SymbolicGym: Integration with Popular RL Libraries

This notebook demonstrates how to integrate SymbolicGym with popular reinforcement learning libraries:
1. Stable Baselines3
2. RLlib (Ray)
3. Custom PyTorch implementation

We'll solve the same SAT problem with each approach so that their performance can be compared.

In [None]:
import random
from collections import deque

import gymnasium as gym
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

# Report which SymbolicGym build is importable, or how to install it
try:
    import symbolicgym
except ImportError:
    print("SymbolicGym not found. Please install with: pip install -e .")
else:
    # Source checkouts may not define __version__, hence the 'dev' fallback
    print(f"Using SymbolicGym version: {getattr(symbolicgym, '__version__', 'dev')}")

# Seed every random number generator used in this notebook with one value
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

## 1. Creating a SAT Problem

First, we'll create a standard SAT problem to use with all RL frameworks.

In [None]:
def _random_3sat_clauses(num_vars, num_clauses):
    """Sample ``num_clauses`` random 3-literal clauses over ``num_vars`` variables.

    Each clause uses three distinct 1-based variable indices, and each literal
    is kept positive when a fresh uniform draw exceeds 0.5, negated otherwise.
    Draws come from NumPy's global RNG, so call ``np.random.seed`` beforehand
    for reproducible formulas.
    """
    clauses = []
    for _ in range(num_clauses):
        vars_in_clause = np.random.choice(num_vars, 3, replace=False) + 1
        # Convert to built-in int so clauses print and serialize as plain
        # integers rather than NumPy scalar objects.
        clauses.append(
            [int(v) if np.random.random() > 0.5 else -int(v) for v in vars_in_clause]
        )
    return clauses


def create_test_formula(difficulty="medium"):
    """Create a SAT formula with specified difficulty.

    Args:
        difficulty: ``"easy"`` returns a fixed 5-variable, 12-clause 3-SAT
            instance; ``"medium"`` returns a random instance with 10 variables
            and 42 clauses; any other value is treated as "hard" and returns a
            random instance with 20 variables and 85 clauses.

    Returns:
        A dict with keys ``"clauses"`` (list of 3-element lists of signed
        integer literals), ``"num_vars"`` and ``"name"``.
    """
    if difficulty == "easy":
        # Simple 3-SAT with 5 variables
        return {
            "clauses": [
                [1, 2, 3],
                [-1, -2, 4],
                [2, -3, 5],
                [-2, 3, -4],
                [1, -3, -5],
                [-1, 4, 5],
                [1, -4, -5],
                [-1, -2, -3],
                [2, 4, -5],
                [3, -4, 5],
                [-1, 3, 4],
                [1, 2, -5],
            ],
            "num_vars": 5,
            "name": "easy_3sat",
        }

    if difficulty == "medium":
        # Random 3-SAT with 10 variables (clause/variable ratio 4.2)
        num_vars, num_clauses, name = 10, 42, "medium_3sat"
    else:  # hard
        # Random 3-SAT with 20 variables near the phase transition (ratio 4.25)
        num_vars, num_clauses, name = 20, 85, "hard_3sat"

    return {
        "clauses": _random_3sat_clauses(num_vars, num_clauses),
        "num_vars": num_vars,
        "name": name,
    }


# Build the formula used by the rest of the notebook
formula = create_test_formula("medium")
print(
    f"Created {formula['name']} with {formula['num_vars']} variables and {len(formula['clauses'])} clauses"
)

# Preview at most the first five clauses, numbered from 1
print("\nSample clauses:")
for clause_number, clause in enumerate(formula["clauses"][:5], start=1):
    print(f"Clause {clause_number}: {clause}")

## 2. State Preprocessing

To use SymbolicGym with RL libraries, we need to convert the dictionary observations to flat vectors.

In [None]:
def preprocess_observation(observation):
    """Preprocess the SymbolicGym observation into a flat vector for RL algorithms.

    Args:
        observation: Mapping with array-like entries under the keys
            ``"variables"`` and ``"clauses"``.

    Returns:
        A 1-D ``np.float32`` array holding the ``"variables"`` entries followed
        by the ``"clauses"`` entries.
    """
    # Coerce to flat float32 so the result always matches the float32 Box
    # spaces declared by the wrappers in this notebook and can be sized with
    # len(), whatever dtype or rank the environment emits.
    variables = np.asarray(observation["variables"], dtype=np.float32).ravel()
    clauses = np.asarray(observation["clauses"], dtype=np.float32).ravel()

    # You can add more features here if desired
    return np.concatenate([variables, clauses])


# Build an environment and inspect one observation before and after flattening
from symbolicgym.envs.sat import SymbolicSatEnv

env = SymbolicSatEnv(formula=formula, reward_mode="dense", max_steps=100)
obs, _ = env.reset(seed=RANDOM_SEED)

# Raw dictionary observation, one line per entry
variables_view = obs["variables"]
clauses_view = obs["clauses"]
print("Raw observation:")
print(f"- variables: shape {variables_view.shape}, values: {variables_view}")
print(f"- clauses: shape {clauses_view.shape}, values: {clauses_view}")

# The same observation after flattening
preprocessed = preprocess_observation(obs)
print(f"\nPreprocessed state: shape {preprocessed.shape}, values: {preprocessed}")

## 3. Implementation with Custom PyTorch DQN

First, let's implement a solution using a custom PyTorch DQN agent.

In [None]:
class DQN(nn.Module):
    """Fully connected Q-network with two ReLU hidden layers.

    Maps a batch of state vectors of width ``input_size`` to one value per
    action (``output_size`` outputs).
    """

    def __init__(self, input_size, output_size, hidden_size=64):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return the output-layer values for input batch ``x``."""
        hidden = x
        # Both hidden layers are followed by ReLU; the output layer is linear
        for layer in (self.fc1, self.fc2):
            hidden = torch.relu(layer(hidden))
        return self.fc3(hidden)


class ReplayBuffer:
    """Bounded store of transitions with random sampling."""

    def __init__(self, capacity):
        # A deque with maxlen discards the oldest entry once it is full
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        """Append one ``(state, action, reward, next_state, done)`` tuple."""
        transition = (state, action, reward, next_state, done)
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Return ``batch_size`` stored transitions drawn without replacement."""
        return random.sample(self.buffer, k=batch_size)

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)


class DQNAgent:
    """Epsilon-greedy DQN agent with a target network and a replay buffer.

    Per-episode training metrics are collected in ``rewards_history``,
    ``loss_history`` and ``satisfaction_history``.
    """

    def __init__(
        self,
        state_size,
        action_size,
        hidden_size=64,
        lr=0.001,
        gamma=0.99,
        epsilon=0.9,
        epsilon_min=0.05,
        epsilon_decay=0.995,
        buffer_size=10000,
    ):
        """Build the networks, optimizer and replay buffer.

        Args:
            state_size: Length of the flat state vector fed to the networks.
            action_size: Number of discrete actions.
            hidden_size: Width of each hidden layer.
            lr: Adam learning rate.
            gamma: Discount factor.
            epsilon: Initial probability of taking a random action.
            epsilon_min: Lower bound for ``epsilon``.
            epsilon_decay: Factor applied to ``epsilon`` by ``decay_epsilon``.
            buffer_size: Maximum number of transitions kept for replay.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay

        # Initialize device
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Using device: {self.device}")

        # The target network starts as an exact copy of the policy network and
        # stays in eval mode; it is only ever updated by copying weights.
        self.policy_net = DQN(state_size, action_size, hidden_size).to(self.device)
        self.target_net = DQN(state_size, action_size, hidden_size).to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        # Only the policy network is optimized
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=lr)

        # Initialize replay buffer
        self.memory = ReplayBuffer(buffer_size)

        # Training metrics (one entry per episode)
        self.rewards_history = []
        self.loss_history = []
        self.satisfaction_history = []

    def _to_tensor(self, values, dtype):
        """Stack a sequence of scalars/arrays into one tensor on ``self.device``."""
        # Going through a single NumPy array avoids the slow element-by-element
        # path PyTorch takes for a sequence of separate ndarrays.
        return torch.as_tensor(np.asarray(values), dtype=dtype, device=self.device)

    def select_action(self, state):
        """Return an action index for ``state`` using epsilon-greedy selection.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the action with the highest policy-network value.
        """
        if random.random() > self.epsilon:
            with torch.no_grad():
                state_tensor = self._to_tensor(state, torch.float32).unsqueeze(0)
                return self.policy_net(state_tensor).argmax(dim=1).item()
        return random.randrange(self.action_size)

    def learn(self, batch_size):
        """Run one optimization step on a sampled replay batch.

        Returns:
            The batch MSE loss as a float, or ``0`` when the buffer holds fewer
            than ``batch_size`` transitions and no update is performed.
        """
        if len(self.memory) < batch_size:
            return 0

        # Sample transitions and regroup them column-wise
        transitions = self.memory.sample(batch_size)
        states, actions, rewards, next_states, dones = zip(*transitions)

        state_batch = self._to_tensor(states, torch.float32)
        action_batch = self._to_tensor(actions, torch.long).unsqueeze(1)
        reward_batch = self._to_tensor(rewards, torch.float32)
        next_state_batch = self._to_tensor(next_states, torch.float32)
        done_batch = self._to_tensor(dones, torch.float32)

        # Q(s_t, a): evaluate all actions, then pick the column of the action taken
        current_q_values = self.policy_net(state_batch).gather(1, action_batch)
        current_q_values = current_q_values.squeeze(1)

        # max_a Q_target(s_{t+1}, a); no gradient flows into the target network
        with torch.no_grad():
            next_q_values = self.target_net(next_state_batch).max(1)[0]

        # Bootstrapped target; the (1 - done) mask removes the future term
        # for transitions flagged as terminal.
        expected_q_values = reward_batch + self.gamma * next_q_values * (1 - done_batch)

        loss = nn.functional.mse_loss(current_q_values, expected_q_values)

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss.item()

    def update_target_net(self):
        """Copy the policy network's weights into the target network."""
        self.target_net.load_state_dict(self.policy_net.state_dict())

    def decay_epsilon(self):
        """Multiply ``epsilon`` by ``epsilon_decay``, never going below ``epsilon_min``."""
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def train(self, env, num_episodes, batch_size=64, target_update=10):
        """Train on ``env`` for ``num_episodes`` episodes.

        Observations are flattened with the module-level
        ``preprocess_observation``. After every environment step one
        ``learn`` call is made; the target network is refreshed every
        ``target_update`` episodes and ``epsilon`` is decayed once per episode.

        Args:
            env: Environment whose ``reset()`` returns ``(obs, info)`` and
                whose ``step(action)`` returns
                ``(obs, reward, terminated, truncated, info)``.
            num_episodes: Number of episodes to run.
            batch_size: Replay batch size passed to ``learn``.
            target_update: Episode interval between target-network updates.
        """
        for episode in range(num_episodes):
            # Reset environment
            obs, _ = env.reset()
            state = preprocess_observation(obs)

            # Initialize episode variables
            total_reward = 0
            max_satisfaction = 0
            episode_loss = 0
            steps = 0
            done = False

            while not done:
                steps += 1

                # Select and perform action
                action = self.select_action(state)
                next_obs, reward, terminated, truncated, info = env.step(action)
                next_state = preprocess_observation(next_obs)
                done = terminated or truncated

                # Store `terminated` rather than `done` as the terminal flag:
                # a truncated episode (e.g. a step limit) is cut short but its
                # last state is not terminal, so learn() must still bootstrap
                # from next_state for that transition.
                self.memory.push(state, action, reward, next_state, bool(terminated))

                # Move to the next state
                state = next_state
                total_reward += reward

                # Track max satisfaction
                if "satisfaction_ratio" in info:
                    max_satisfaction = max(max_satisfaction, info["satisfaction_ratio"])

                # Learn from experience (learn() returns 0 while the buffer warms up)
                episode_loss += self.learn(batch_size)

            # Update target network
            if episode % target_update == 0:
                self.update_target_net()

            # Decay exploration rate
            self.decay_epsilon()

            # Average over all steps, including those where no update happened
            mean_loss = episode_loss / max(1, steps)

            # Store metrics
            self.rewards_history.append(total_reward)
            self.loss_history.append(mean_loss)
            self.satisfaction_history.append(max_satisfaction)

            # Print progress
            if (episode + 1) % 10 == 0:
                print(
                    f"Episode {episode + 1}/{num_episodes}, "
                    f"Reward: {total_reward:.2f}, "
                    f"Loss: {mean_loss:.4f}, "
                    f"Satisfaction: {max_satisfaction:.2f}, "
                    f"Epsilon: {self.epsilon:.2f}"
                )

Now let's train our custom DQN agent on the SAT problem.

In [None]:
# Dedicated environment instance for the hand-written agent
env_custom = SymbolicSatEnv(formula=formula, reward_mode="dense", max_steps=100)

# Derive the network input/output sizes from one preprocessed observation
obs, _ = env_custom.reset(seed=RANDOM_SEED)
state = preprocess_observation(obs)
state_size, action_size = len(state), env_custom.action_space.n

print(f"State size: {state_size}, Action size: {action_size}")

# Build the agent with its hyperparameters spelled out, then train it
custom_agent = DQNAgent(
    state_size,
    action_size,
    hidden_size=64,
    lr=0.001,
    gamma=0.99,
    epsilon=0.9,
    epsilon_min=0.05,
    epsilon_decay=0.995,
)

print("Training custom PyTorch DQN agent...")
custom_agent.train(env_custom, 100, batch_size=64, target_update=10)

Let's visualize the training results for our custom DQN agent.

In [None]:
# One panel per tracked metric: (per-episode history, title, y-axis label)
panels = [
    (custom_agent.rewards_history, "Custom DQN: Episode Rewards", "Total Reward"),
    (custom_agent.loss_history, "Custom DQN: Training Loss", "Average Loss"),
    (
        custom_agent.satisfaction_history,
        "Custom DQN: Max Clause Satisfaction",
        "Satisfaction Ratio",
    ),
]

plt.figure(figsize=(15, 5))

# Draw the panels left to right in a single row
for position, (history, title, y_label) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    plt.plot(history)
    plt.title(title)
    plt.xlabel("Episode")
    plt.ylabel(y_label)
    plt.grid(True)

plt.tight_layout()
plt.show()

## 4. Implementation with Stable Baselines3

Now let's train an agent using [Stable Baselines3](https://stable-baselines3.readthedocs.io/), a popular RL library.

First, we need to create a wrapper to convert our dictionary observations to arrays.

In [None]:
# Import Stable-Baselines3 if present and record the outcome in sb3_available
try:
    import stable_baselines3
    from stable_baselines3 import DQN as SB3_DQN
    from stable_baselines3.common.evaluation import evaluate_policy
    from stable_baselines3.common.vec_env import DummyVecEnv
except ImportError:
    print("Stable-Baselines3 not found. Install with: pip install stable-baselines3")
    sb3_available = False
else:
    print(f"Using Stable-Baselines3 version: {stable_baselines3.__version__}")
    sb3_available = True

In [None]:
if sb3_available:

    class SATGymWrapper(gym.Wrapper):
        """Wrapper that returns flattened observations instead of dicts."""

        def __init__(self, env):
            super().__init__(env)
            # Reset once just to learn the length of the flattened observation
            initial_obs, _ = env.reset()
            flat_length = len(preprocess_observation(initial_obs))

            # Advertise the flattened observation space to the RL library
            self.observation_space = gym.spaces.Box(
                low=-1.0, high=1.0, shape=(flat_length,), dtype=np.float32
            )

        def reset(self, **kwargs):
            """Reset the wrapped env and return ``(flat_obs, info)``."""
            raw_obs, info = self.env.reset(**kwargs)
            return preprocess_observation(raw_obs), info

        def step(self, action):
            """Step the wrapped env, flattening only the observation."""
            raw_obs, reward, terminated, truncated, info = self.env.step(action)
            return preprocess_observation(raw_obs), reward, terminated, truncated, info

    # Build the wrapped environment, then the single-env VecEnv SB3 trains on
    raw_env = SymbolicSatEnv(formula=formula, reward_mode="dense", max_steps=100)
    env_sb3 = SATGymWrapper(raw_env)
    vec_env = DummyVecEnv([lambda: env_sb3])

    print("Training Stable-Baselines3 DQN agent...")

    # Hyperparameters gathered in one place and passed through as keywords
    sb3_hyperparams = {
        "learning_rate": 0.001,
        "buffer_size": 10000,
        "learning_starts": 1000,
        "batch_size": 64,
        "gamma": 0.99,
        "exploration_fraction": 0.3,
        "exploration_initial_eps": 0.9,
        "exploration_final_eps": 0.05,
        "train_freq": 1,
        "target_update_interval": 10,
        "verbose": 1,
    }
    model = SB3_DQN("MlpPolicy", vec_env, **sb3_hyperparams)

    # Train the model
    model.learn(total_timesteps=10000)

    # Evaluate the trained policy over ten episodes
    mean_reward, std_reward = evaluate_policy(model, vec_env, n_eval_episodes=10)
    print(f"Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

## 5. Implementation with RLlib (Ray)

Finally, let's implement a solution using [RLlib](https://docs.ray.io/en/latest/rllib/index.html), which is powerful for distributed training.

In [None]:
# Import Ray RLlib if present and record the outcome in rllib_available
try:
    import ray
    from ray import tune
    from ray.rllib.algorithms.dqn import DQNConfig
except ImportError:
    print("Ray RLlib not found. Install with: pip install ray[rllib]")
    rllib_available = False
else:
    print(f"Using Ray version: {ray.__version__}")
    rllib_available = True

In [None]:
if rllib_available:
    # Create environment wrapper for RLlib
    class SATGymWrapperRLlib(gym.Wrapper):
        """Environment built from an ``env_config`` dict, with flat observations.

        ``env_config["formula"]`` selects the SAT formula; when it is absent
        the notebook-level ``formula`` is used.
        """

        def __init__(self, env_config=None):
            env_config = env_config or {}
            formula_to_use = env_config.get("formula", formula)

            # Create environment
            env = SymbolicSatEnv(
                formula=formula_to_use, reward_mode="dense", max_steps=100
            )
            super().__init__(env)

            # Reset once just to learn the length of the flattened observation
            obs, _ = env.reset()
            processed_obs = preprocess_observation(obs)

            # Define new observation space
            self.observation_space = gym.spaces.Box(
                low=-1.0, high=1.0, shape=(len(processed_obs),), dtype=np.float32
            )

        def reset(self, **kwargs):
            """Reset the wrapped env and return ``(flat_obs, info)``."""
            obs, info = self.env.reset(**kwargs)
            return preprocess_observation(obs), info

        def step(self, action):
            """Step the wrapped env, flattening only the observation."""
            obs, reward, terminated, truncated, info = self.env.step(action)
            processed_obs = preprocess_observation(obs)
            return processed_obs, reward, terminated, truncated, info

    # Initialize Ray
    ray.init(ignore_reinit_error=True)

    # Everything after ray.init() runs under try/finally so the local Ray
    # runtime is shut down even if configuration, training or result lookup
    # raises.
    try:
        # Configure the algorithm
        # NOTE(review): `.exploration(...)`, `.resources(...)` and the
        # `episode_reward_mean` metric depend on the installed Ray/RLlib
        # version - confirm against the version in use.
        config = (
            DQNConfig()
            .environment(env=SATGymWrapperRLlib, env_config={"formula": formula})
            .training(gamma=0.99, lr=0.001, train_batch_size=64)
            .exploration(
                exploration_config={
                    "initial_epsilon": 0.9,
                    "final_epsilon": 0.05,
                    "epsilon_timesteps": 10000,
                }
            )
            .resources(num_gpus=0)
        )

        print("Training Ray RLlib DQN agent...")

        # Train the model
        stop = {"training_iteration": 100}
        results = tune.run(
            "DQN",
            config=config.to_dict(),
            stop=stop,
            checkpoint_at_end=True,
            checkpoint_freq=10,
            verbose=1,
        )

        # Get the best trial
        best_trial = results.get_best_trial("episode_reward_mean", "max")
        print(f"Best trial: {best_trial.trial_id}")
        print(f"Best trial final reward: {best_trial.last_result['episode_reward_mean']}")
    finally:
        # Cleanup Ray
        ray.shutdown()

## 6. Compare Performance

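Let's compare the performance of the custom PyTorch DQN and the Stable-Baselines3 agent. (Evaluating the RLlib agent needs a more involved setup and is omitted here.)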

In [None]:
def _run_greedy_episode(env, encode_obs, choose_action):
    """Play one episode and return ``(total_reward, steps, solved)``.

    ``encode_obs`` converts a raw observation into what ``choose_action``
    expects; ``solved`` is read from ``info["solved"]`` on the final step.
    """
    obs, _ = env.reset()
    state = encode_obs(obs)
    total_reward = 0
    steps = 0
    solved = False
    done = False

    while not done:
        steps += 1
        obs, reward, terminated, truncated, info = env.step(choose_action(state))
        state = encode_obs(obs)
        done = terminated or truncated
        total_reward += reward

        # Only the final step's info decides whether the episode was solved
        if done and info.get("solved", False):
            solved = True

    return total_reward, steps, solved


def evaluate_agent(agent_type, agent, env, num_episodes=20):
    """Evaluate an agent on the environment.

    Args:
        agent_type: ``"custom"`` for an agent exposing ``policy_net`` and
            ``device`` (actions are the argmax of the network output on
            observations flattened with ``preprocess_observation``), or
            ``"sb3"`` for an agent exposing
            ``predict(obs, deterministic=True)``, which is fed the
            environment's observations unchanged.
        agent: The trained agent.
        env: Environment whose ``reset()`` returns ``(obs, info)`` and whose
            ``step(action)`` returns
            ``(obs, reward, terminated, truncated, info)``.
        num_episodes: Number of evaluation episodes; must be at least 1.

    Returns:
        Dict with ``mean_reward``, ``std_reward``, ``mean_steps``,
        ``std_steps`` and ``solved_ratio``.

    Raises:
        ValueError: If ``agent_type`` is not supported or ``num_episodes`` is
            smaller than 1 (previously these produced NaN statistics or a
            division by zero).
    """
    if num_episodes < 1:
        raise ValueError(f"num_episodes must be at least 1, got {num_episodes}")

    if agent_type == "custom":
        encode_obs = preprocess_observation

        def choose_action(state):
            # Select action without exploration
            with torch.no_grad():
                state_tensor = torch.as_tensor(
                    np.asarray(state), dtype=torch.float32, device=agent.device
                ).unsqueeze(0)
                return agent.policy_net(state_tensor).max(1)[1].item()

    elif agent_type == "sb3":

        def encode_obs(obs):
            return obs

        def choose_action(state):
            action, _ = agent.predict(state, deterministic=True)
            # predict() hands back a NumPy array; pass the env a plain int
            return int(np.asarray(action).item())

    else:
        raise ValueError(f"Unsupported agent_type: {agent_type!r}")

    rewards = []
    steps = []
    solved = 0

    for _ in range(num_episodes):
        episode_reward, episode_steps, episode_solved = _run_greedy_episode(
            env, encode_obs, choose_action
        )
        rewards.append(episode_reward)
        steps.append(episode_steps)
        solved += int(episode_solved)

    return {
        "mean_reward": np.mean(rewards),
        "std_reward": np.std(rewards),
        "mean_steps": np.mean(steps),
        "std_steps": np.std(steps),
        "solved_ratio": solved / num_episodes,
    }


# Evaluation metrics, keyed by agent type
results = {}

# Custom DQN: evaluate, store the metrics, then print a one-line summary
print("\nEvaluating Custom PyTorch DQN...")
custom_metrics = evaluate_agent("custom", custom_agent, env_custom)
results["custom"] = custom_metrics
print(
    f"Custom DQN - Mean reward: {custom_metrics['mean_reward']:.2f}, "
    f"Solved: {custom_metrics['solved_ratio'] * 100:.1f}%, "
    f"Mean steps: {custom_metrics['mean_steps']:.1f}"
)

# Stable-Baselines3: the same report, only when the library was importable
if sb3_available:
    print("\nEvaluating Stable-Baselines3 DQN...")
    sb3_metrics = evaluate_agent("sb3", model, env_sb3)
    results["sb3"] = sb3_metrics
    print(
        f"SB3 DQN - Mean reward: {sb3_metrics['mean_reward']:.2f}, "
        f"Solved: {sb3_metrics['solved_ratio'] * 100:.1f}%, "
        f"Mean steps: {sb3_metrics['mean_steps']:.1f}"
    )

# RLlib evaluation needs a more complex approach and is omitted for brevity

## Conclusion

We've demonstrated how to integrate SymbolicGym with different reinforcement learning libraries:

1. **Custom PyTorch DQN**: Provides maximum flexibility but requires more manual implementation
2. **Stable-Baselines3**: Offers a good balance of simplicity and performance with minimal custom code
3. **Ray RLlib**: Powerful for distributed training and hyperparameter tuning

The key steps for integration with any RL library are:
1. Create a wrapper to convert dictionary observations to flat arrays
2. Define appropriate observation and action spaces
3. Configure the RL algorithm to match the SAT environment characteristics

Each library has its strengths:
- Custom implementation gives complete control over the algorithm
- Stable-Baselines3 provides well-tested implementations for quick experimentation
- RLlib excels at scaling to large distributed training

Choose the approach that best fits your specific requirements for solving SAT problems with reinforcement learning.