In [1]:
import numpy as np
import sympy as sp
import tensorflow as tf

from environments.buchberger import BinomialBuchbergerEnv
from agents.pg import PGAgent
from agents.buchberger import RandomAgent, DegreeAgent

# Network

In [2]:
# Scoring network: kernel-size-1 convolutions are applied to an input of shape
# (rows, 1, 6) with a variable number of rows, producing one value per row;
# the values are flattened and passed through a softmax.
model = tf.keras.models.Sequential([
    tf.keras.layers.InputLayer(input_shape=(None, 1, 6)),
    tf.keras.layers.Conv2D(32, 1, activation='relu'),
    tf.keras.layers.Conv2D(32, 1, activation='relu'),
    tf.keras.layers.Conv2D(1, 1, activation='linear'),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Activation('softmax'),
])

model.compile(optimizer='adam', loss='categorical_crossentropy')

In [3]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, None, 1, 32)       224       
_________________________________________________________________
conv2d_1 (Conv2D)            (None, None, 1, 32)       1056      
_________________________________________________________________
conv2d_2 (Conv2D)            (None, None, 1, 1)        33        
_________________________________________________________________
flatten (Flatten)            (None, None)              0         
_________________________________________________________________
activation (Activation)      (None, None)              0         
Total params: 1,313
Trainable params: 1,313
Non-trainable params: 0
_________________________________________________________________


In [4]:
# Smoke test on random data: 1000 samples, each with 100 rows of 6 features,
# and a matching (1000, 100) target array.
# NOTE(review): targets are uniform random numbers, not probability rows --
# presumably only the shapes are being exercised here; confirm.
input1 = np.random.rand(1000, 100, 1, 6)
output1 = np.random.rand(1000, 100)
model.fit(input1, output1)

Epoch 1/1


<tensorflow.python.keras.callbacks.History at 0x11184f438>

In [5]:
# Same smoke test with 10 rows per sample instead of 100, fitting the same
# model object on a different row count.
# NOTE(review): targets are again uniform random numbers, not probability rows.
input2 = np.random.rand(1000, 10, 1, 6)
output2 = np.random.rand(1000, 10)
model.fit(input2, output2)

Epoch 1/1


<tensorflow.python.keras.callbacks.History at 0x1118465f8>

# Environment

In [6]:
# Parameters handed to the environment constructor.
# NOTE(review): the precise meaning of `degree` and `size` is defined by
# BinomialBuchbergerEnv, which is not shown in this notebook.
degree = 2
size = 5
env = BinomialBuchbergerEnv(degree, size)

In [7]:
def lm(f, order):
    """Return the leading monomial of `f` under `order` as a polynomial.

    The monomial is rebuilt as a polynomial over the same generators and
    the same coefficient domain as `f`.
    """
    leading = sp.LM(f, order=order)
    return sp.poly(leading, *f.gens, domain=f.domain)

def state_to_tensor(state):
    """Encode a `(G, P)` state as an array with one row per pair in `P`.

    Each row is the grevlex leading-monomial degree list of `G[pair[0]]`
    followed by that of `G[pair[1]]`. A singleton axis is inserted at
    position 1 of the resulting array.
    """
    G, P = state

    def lead_degrees(index):
        # Degree list of the grevlex leading monomial of generator `index`.
        return sp.degree_list(lm(G[index], 'grevlex'))

    rows = [lead_degrees(pair[0]) + lead_degrees(pair[1]) for pair in P]
    return np.expand_dims(np.array(rows), axis=1)

In [8]:
class LeadMonomialWrapper:
    """Adapter around a Buchberger environment that exposes tensor observations.

    The raw environment state is kept in `self.state`; callers receive
    `state_to_tensor(self.state)` and select a pair by its integer position.
    """

    def __init__(self, env):
        # No raw state is available until reset() is called.
        self.state = None
        self.env = env

    def reset(self):
        """Reset the wrapped environment and return the encoded state."""
        raw_state = self.env.reset()
        self.state = raw_state
        return state_to_tensor(raw_state)

    def step(self, action):
        """Translate the index `action` into a pair and step the environment."""
        _, pairs = self.state
        # The index refers to the iteration order of the current pair collection.
        chosen_pair = list(pairs)[action]
        self.state, reward, done, info = self.env.step(chosen_pair)
        observation = state_to_tensor(self.state)
        return observation, reward, done, info

# Agent

In [9]:
def discounted_rewards(rewards, gamma):
    """Return the discounted reward-to-go for every step of an episode.

    Entry `i` of the result equals `rewards[i] + gamma * result[i + 1]`,
    with the value past the final step taken as 0. The result is a list
    with the same length as `rewards`.
    """
    count = len(rewards)
    out = np.empty(count)
    running = 0
    # Walk backwards so each step can reuse the total of the step after it.
    for idx in range(count - 1, -1, -1):
        running = rewards[idx] + gamma * running
        out[idx] = running
    return list(out)


class PGAgent:
    """A policy gradient agent.

    Note: this definition shadows the `PGAgent` imported from `agents.pg`
    at the top of the notebook.
    """

    def __init__(self, network, learning_rate=0.00025, gamma=0.99):
        self.model = self._buildModel(network, learning_rate)
        self.gamma = gamma

    def act(self, state):
        """Sample an action (row index) from the policy for `state`."""
        batch = np.expand_dims(state, axis=0)
        probs = self.model.predict(batch)[0]
        return np.random.choice(len(probs), p=probs)

    def train(self, env, episodes):
        """Collect `episodes` rollouts, then fit the model once per visited state."""
        total_states, total_actions, total_rewards = [], [], []

        # Generate rollouts and their discounted rewards.
        for _ in range(episodes):
            states, actions, rewards = self._rollout(env)
            total_states.extend(states)
            total_actions.extend(actions)
            total_rewards.extend(discounted_rewards(rewards, self.gamma))

        # One fit call per step: the target row is zero everywhere except at
        # the action taken, where it holds that step's discounted reward.
        for visited, taken, ret in zip(total_states, total_actions, total_rewards):
            batch = np.expand_dims(visited, axis=0)
            target = np.zeros((1, batch.shape[1]))
            target[0, taken] = ret
            self.model.fit(batch, target, verbose=0)

    def test(self, env, episodes):
        """Run `episodes` episodes on `env` and return each episode's total reward."""
        rewards = np.zeros(episodes)
        for episode in range(episodes):
            state, done = env.reset(), False
            while not done:
                state, reward, done, _ = env.step(self.act(state))
                rewards[episode] += reward
        return rewards

    def save(self, name):
        """Write the model weights to `name`."""
        self.model.save_weights(name)

    def load(self, name):
        """Read the model weights from `name`."""
        self.model.load_weights(name)

    def _rollout(self, env):
        """Play one episode and return its states, actions and raw rewards."""
        states, actions, rewards = [], [], []
        state = env.reset()
        done = False
        while not done:
            action = self.act(state)
            next_state, reward, done, _ = env.step(action)
            states.append(state)
            actions.append(action)
            rewards.append(reward)
            state = next_state
        return states, actions, rewards

    def _buildModel(self, network, learning_rate):
        """Clone `network` and compile it with Adam and categorical cross-entropy."""
        # NOTE(review): presumably the clone does not inherit the trained
        # weights of `network` -- confirm against the Keras documentation.
        model = tf.keras.models.clone_model(network)
        model.compile(
            loss='categorical_crossentropy',
            optimizer=tf.keras.optimizers.Adam(learning_rate),
        )
        return model

# Testing

In [10]:
# Re-create the environment for the evaluation runs below, using the same
# constructor arguments as the earlier Environment cell.
degree = 2
size = 5
env = BinomialBuchbergerEnv(degree, size)

In [11]:
# Baseline: run RandomAgent on the raw environment and record the total
# reward of each episode.
agent = RandomAgent()

episodes = 100
rewards = np.zeros(episodes)

for i in range(episodes):
    state = env.reset()
    done = False
    # Play one episode to termination, accumulating its reward.
    while not done:
        action = agent.act(state)
        state, reward, done, _ = env.step(action)
        rewards[i] += reward
        
# Show the per-episode totals followed by their mean.
print(rewards, np.mean(rewards))

[-36. -21. -28. -45. -28. -36. -15. -36. -21. -21. -21. -66. -28. -15.
 -28. -21. -45. -21. -36. -66. -21. -55. -21. -36. -55. -45. -36. -91.
 -28. -45. -15. -55. -21. -45. -10. -55. -28. -36. -21. -15. -36. -21.
 -45. -55. -21. -21. -36. -55. -21. -15. -15. -10. -66. -45. -21. -21.
 -28. -28. -78. -55. -21. -45. -28. -55. -15. -21. -66. -78. -45. -10.
 -66. -36. -21. -36. -45. -21. -21. -21. -21. -21. -28. -21. -55. -21.
 -21. -28. -55. -15. -28. -45. -66. -28. -28. -55. -55. -21. -10. -28.
 -21. -21.] -34.04


In [12]:
# Baseline: run DegreeAgent on the raw environment and record the total
# reward of each episode (same loop as the RandomAgent cell).
agent = DegreeAgent()

episodes = 100
rewards = np.zeros(episodes)

for i in range(episodes):
    state = env.reset()
    done = False
    # Play one episode to termination, accumulating its reward.
    while not done:
        action = agent.act(state)
        state, reward, done, _ = env.step(action)
        rewards[i] += reward
        
# Show the per-episode totals followed by their mean.
print(rewards, np.mean(rewards))

[-15. -21. -21. -28. -28. -21. -21. -21. -15. -21. -28. -15. -21. -21.
 -15. -21. -15. -21. -28. -21. -21. -15. -28. -21. -21. -21. -10. -15.
 -21. -28. -15. -21. -21. -15. -21. -21. -15. -15. -15. -21. -21. -28.
 -15. -28. -21. -28. -15. -15. -28. -36. -15. -21. -21. -21. -21. -21.
 -28. -21. -15. -28. -15. -21. -21. -21. -15. -15. -15. -28. -10. -28.
 -28. -21. -21. -15. -21. -21. -28. -15. -15. -10. -21. -15. -21. -21.
 -28. -21. -21. -36. -15. -10. -28. -21. -21. -36. -15. -21. -21. -15.
 -28. -21.] -20.66


In [13]:
# Wrap the environment so the agent sees tensor observations and picks pairs
# by index. The guard keeps this cell idempotent: re-running it must not wrap
# an already-wrapped environment, because the wrapper expects the inner
# environment to return raw (G, P) states.
if not isinstance(env, LeadMonomialWrapper):
    env = LeadMonomialWrapper(env)
agent = PGAgent(model)

# Alternate between 10 training episodes and 10 evaluation episodes,
# printing the mean evaluation reward after every round.
epochs = 100
for i in range(epochs):
    agent.train(env, 10)
    print(np.mean(agent.test(env, 10)))

-27.0
-30.2
-39.6
-39.1
-32.9
-32.2
-35.0
-39.1
-36.3
-42.6
-33.0
-36.4
-35.2
-36.2
-29.8
-30.9
-34.0
-39.0
-34.4
-31.0
-39.3
-39.4
-37.4
-18.6
-33.8
-33.6
-35.8
-29.6
-25.2
-29.4
-27.8
-33.9
-30.9
-25.3


KeyboardInterrupt: 