In [1]:
import matplotlib.pyplot as plt
import numpy as np
import sympy as sp
import tensorflow as tf

from environments.binomials import BinomialBuchbergerEnv, LeadMonomialWrapper
from agents.networks import ParallelMultilayerPerceptron

# Environment

In [2]:
variables = sp.symbols('x y z')
domain = sp.FF(32003)
order = 'grevlex'
degree = 3
size = 5

env = LeadMonomialWrapper(BinomialBuchbergerEnv(degree, size, len(variables)))
env_full = LeadMonomialWrapper(BinomialBuchbergerEnv(degree, size, len(variables)), k=2)

# Agents

In [3]:
class RandomAgent:

    def act(self, state):
        return np.random.randint(state.shape[0])

In [4]:
class DegreeAgent:

    def __init__(self, random=False):
        self.random = random

    def act(self, state):
        n = state.shape[2]//2  # number of variables
        degs = np.sum(np.maximum(state[:, :, :n], state[:, :, n:]), axis=2)
        if self.random:
            indices = np.where(degs == np.min(degs))[0]
            return np.random.choice(indices)
        else:
            return np.argmin(degs)

In [5]:
def discounted_rewards(rewards, gamma):
    out = np.empty(len(rewards))
    cumulative_reward = 0
    for i in reversed(range(len(rewards))):
        cumulative_reward = rewards[i] + gamma * cumulative_reward
        out[i] = cumulative_reward
    return list(out)


class PGAgent:
    """A policy gradient agent."""

    def __init__(self, network, learning_rate=0.00025, gamma=0.99):
        self.model = self._buildModel(network, learning_rate)
        self.gamma = gamma

    def act(self, state):
        """Choose an action for the given state."""
        probs = self.model.predict(np.expand_dims(state, axis=0))[0]
        return np.random.choice(len(probs), p=probs)

    def train(self, env, episodes):
        """Train the agent using policy gradients."""
        reward_out = np.zeros(episodes)

        total_states = []
        total_actions = []
        total_rewards = []
        total_baselines = []

        # generate rollouts and discounted rewards
        for i in range(episodes):
            state = env.reset()
            done = False
            states = []
            actions = []
            rewards = []
            baselines = []
            while not done:
                action = self.act(state)
                next_state, reward, done, _ = env.step(action)
                reward_out[i] += reward
                states.append(state)
                actions.append(action)
                rewards.append(reward)
                baselines.append(state.shape[0])
                state = next_state
            rewards = discounted_rewards(rewards, self.gamma)

            total_states += states
            total_actions += actions
            total_rewards += rewards
            total_baselines += baselines

        # produce and normalize advantages
        advantages = np.array(total_rewards) + np.array(total_baselines)
        advantages -= np.mean(advantages)
        advantages /= np.std(advantages)
        
        # fit to advantages to perform policy gradient step
        for i in range(len(total_states)):
            state = np.expand_dims(total_states[i], axis=0)
            advantage = np.zeros((1, state.shape[1]))
            advantage[0, total_actions[i]] = advantages[i]  
            self.model.fit(state, advantage, verbose=0)

        return reward_out

    def loadModel(self, filename):
        self.model.load_weights(filename)
        
    def saveModel(self, filename):
        self.model.save_weights(filename)

    def _buildModel(self, network, learning_rate):
        model = tf.keras.models.clone_model(network)
        loss = 'categorical_crossentropy'
        optimizer = tf.keras.optimizers.Adam(learning_rate)
        model.compile(loss=loss, optimizer=optimizer)
        return model

In [6]:
# set weights on a network so it outputs degree
# use max(a, b) = 0.5 * (relu(a) + relu(b) + relu(a-b) + relu(b-a))

n = len(variables)
scale = 10

filters0 = np.zeros((1, 1, 2*n, 4*n))
filters0[0,0,:,:] = np.vstack((np.hstack((np.eye(n), np.zeros((n,n)), np.eye(n), -np.eye(n))),
                               np.hstack((np.zeros((n,n)), np.eye(n), -np.eye(n), np.eye(n)))))
bias0 = np.zeros(4*n)
filters1 = np.full((1, 1, 4*n, 1), -0.5*scale)
bias1 = np.zeros(1)

network = ParallelMultilayerPerceptron(2*n, [4*n])
SetPGAgent = PGAgent(network)
SetPGAgent.model.predict(np.expand_dims(env.reset(), axis=0))  # need to run before setting weights
SetPGAgent.model.set_weights([filters0, bias0, filters1, bias1])

# Testing

In [7]:
def test(agent, env, episodes):
    rewards = np.zeros(episodes)
    for i in range(episodes):
        state = env.reset()
        done = False
        while not done:
            action = agent.act(state)
            state, reward, done, _ = env.step(action)
            rewards[i] += reward
    return rewards

In [8]:
agent = RandomAgent()
rewards = test(agent, env, 100)

np.mean(rewards), np.std(rewards), np.min(rewards), np.max(rewards)

(-40.56, 17.784442639565626, -105.0, -15.0)

In [9]:
agent = DegreeAgent(random=False)
rewards = test(agent, env, 100)

np.mean(rewards), np.std(rewards), np.min(rewards), np.max(rewards)

(-33.17, 13.121017491033232, -78.0, -15.0)

In [10]:
agent = DegreeAgent(random=True)
rewards = test(agent, env, 100)

np.mean(rewards), np.std(rewards), np.min(rewards), np.max(rewards)

(-32.73, 13.261866384487515, -78.0, -15.0)

In [11]:
agent = SetPGAgent
rewards = test(agent, env, 100)

np.mean(rewards), np.std(rewards), np.min(rewards), np.max(rewards)

(-32.45, 12.658890156723851, -91.0, -15.0)

In [12]:
n = len(variables)
network = ParallelMultilayerPerceptron(4*n, [16*n, 16*n])
agent = PGAgent(network, learning_rate=0.00001, gamma=1.0)

In [13]:
rewards = []
i = 0

while True:
    i += 1
    r = np.mean(agent.train(env_full, 1000))
    rewards.append(r)
    print("\rEpoch {} - avg_reward: {}".format(i, r), end="")

Epoch 25 - avg_reward: -40.864

KeyboardInterrupt: 

In [None]:
plt.plot(rewards)