In [1]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

# Simple RNN Cell

In [2]:
# Forward pass of a minimal recurrent cell written directly in NumPy.
timesteps = 100        # number of steps in the input sequence
input_features = 32    # size of each per-step input vector
output_features = 64   # size of the state / per-step output vector

# Random stand-in data: one input vector per timestep.
inputs = np.random.random((timesteps, input_features))

# Initial state is all zeros.
state_t = np.zeros((output_features,))

# Random parameters: W maps the input, U maps the previous state, b is the bias.
W = np.random.random((output_features, input_features))
U = np.random.random((output_features, output_features))
b = np.random.random((output_features,))

successive_outputs = []
for input_t in inputs:
    # Combine the current input with the previous state to get the new output.
    output_t = np.tanh(np.dot(W, input_t) + np.dot(U, state_t) + b)
    successive_outputs.append(output_t)
    # The output of this step becomes the state for the next step.
    state_t = output_t

# Stack the per-step vectors along a new leading axis so the result has shape
# (timesteps, output_features). Concatenating 1-D vectors along axis 0 would
# instead flatten everything into a single vector.
final_output_sequence = np.stack(successive_outputs, axis=0)

# Example Using Keras

In [3]:
max_features = 10000  # passed as num_words when loading the IMDB data
maxlen = 500          # passed as maxlen when padding the sequences

# Load the IMDB train/test splits, then run every split of reviews through
# pad_sequences with the same maxlen.
pad_sequences = tf.keras.preprocessing.sequence.pad_sequences
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.imdb.load_data(
    num_words=max_features
)
x_train, x_test = [pad_sequences(split, maxlen=maxlen) for split in (x_train, x_test)]

In [4]:
# Binary classifier: embedding -> simple recurrent layer -> sigmoid output.
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(max_features, 32))
model.add(tf.keras.layers.SimpleRNN(32))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['acc'],
)

In [5]:
# history = model.fit(x_train, y_train, epochs=10, batch_size=128, validation_split=0.2)

In [6]:
# Same classifier as before, with the SimpleRNN layer swapped for an LSTM.
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(max_features, 32))
model.add(tf.keras.layers.LSTM(32))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['acc'],
)

In [7]:
# history = model.fit(x_train, y_train, epochs=10, batch_size=128, validation_split=0.2)

# RNN for Value Function

In [8]:
from environments.binomials import BinomialBuchbergerEnv, LeadMonomialWrapper

In [9]:
# Constructor arguments for BinomialBuchbergerEnv.
# NOTE(review): presumably the degree of the binomials, the number of
# generators, and the number of variables -- confirm against the environment.
degree = 3
size = 5
variables = 3

# Build the base environment, then wrap it.
base_env = BinomialBuchbergerEnv(degree, size, variables)
env = LeadMonomialWrapper(base_env)

In [10]:
# Value model: an LSTM over a variable-length sequence of rows, each with
# 2*variables features, followed by a single linear output.
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.LSTM(32, input_shape=(None, 2*variables)))
model.add(tf.keras.layers.Dense(1, activation='linear'))

model.compile(optimizer='adam', loss='mse')

In [11]:
class DegreeAgent:
    """Agent that picks the entry of the state with the smallest degree.

    The degree of an entry is the sum, over the last axis, of the elementwise
    maximum of the first and second halves of that axis.

    Parameters
    ----------
    random : bool
        If True, ties for the smallest degree are broken uniformly at random;
        otherwise the first minimum found by ``np.argmin`` is returned.
    """

    def __init__(self, random=False):
        self.random = random

    def act(self, state):
        """Return the index of a minimal-degree entry of `state`.

        `state` must be a 3-D array whose last axis has even length.
        """
        half = state.shape[2] // 2  # number of variables
        left, right = state[:, :, :half], state[:, :, half:]
        degs = np.maximum(left, right).sum(axis=2)

        if not self.random:
            return np.argmin(degs)

        # Indices along the first axis of every entry attaining the minimum.
        candidates = np.where(degs == degs.min())[0]
        return np.random.choice(candidates)

In [12]:
def discounted_rewards(rewards, gamma):
    """Return the discounted reward-to-go for every step of an episode.

    Entry ``i`` of the result is
    ``rewards[i] + gamma * rewards[i+1] + gamma**2 * rewards[i+2] + ...``.

    Parameters
    ----------
    rewards : sequence of numbers
        Per-step rewards, in the order they were received.
    gamma : float
        Discount factor.

    Returns
    -------
    list
        One value per input reward (empty if `rewards` is empty).
    """
    returns = np.empty(len(rewards))
    running = 0
    # Walk backwards so each step can reuse the total already computed for
    # the step after it.
    idx = len(rewards)
    while idx > 0:
        idx -= 1
        running = rewards[idx] + gamma * running
        returns[idx] = running
    return list(returns)


def train(model, agent, env, episodes, gamma=1.0):
    """Train the value model on the agent's performance.

    Rolls out `episodes` episodes of `agent` acting in `env`, records the
    state seen at every step together with its discounted reward-to-go, and
    then fits `model` on those (state, return) pairs one sample per
    ``model.fit`` call.

    Parameters
    ----------
    model : object with a Keras-style ``fit(x, y, verbose=...)`` method
    agent : object with an ``act(state)`` method
    env : object with ``reset()`` and a 4-tuple-returning ``step(action)``
    episodes : int
        Number of rollouts to generate.
    gamma : float
        Discount factor passed to `discounted_rewards`.
    """
    total_states = []
    total_rewards = []

    # generate rollouts and discounted rewards
    for _ in range(episodes):
        state = env.reset()
        done = False
        states = []
        rewards = []
        while not done:
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            # Keep only index 0 along the second axis of the state.
            states.append(state[:, 0, :])
            rewards.append(reward)
            state = next_state

        total_states.extend(states)
        total_rewards.extend(discounted_rewards(rewards, gamma))

    # fit to discounted rewards, one sample (batch of size 1) at a time
    for state, target in zip(total_states, total_rewards):
        batch_state = np.expand_dims(state, axis=0)
        batch_value = np.expand_dims(np.array(target), axis=0)
        model.fit(batch_state, batch_value, verbose=0)

In [13]:
%time train(model, DegreeAgent(), env, 1000)

CPU times: user 1h 19min 57s, sys: 22min 21s, total: 1h 42min 18s
Wall time: 21min 29s


In [14]:
def train_batched(model, agent, env, episodes, gamma=1.0):
    """Train the value model on the agent's performance, in batches.

    Rolls out `episodes` episodes of `agent` acting in `env`, records the
    state seen at every step together with its discounted reward-to-go,
    groups the samples by the number of rows in the state, and calls
    ``model.fit`` once per group.

    Parameters
    ----------
    model : object with a Keras-style ``fit(x, y, verbose=...)`` method
    agent : object with an ``act(state)`` method
    env : object with ``reset()`` and a 4-tuple-returning ``step(action)``
    episodes : int
        Number of rollouts to generate.
    gamma : float
        Discount factor passed to `discounted_rewards`.
    """
    total_states = []
    total_rewards = []

    # generate rollouts and discounted rewards
    for _ in range(episodes):
        state = env.reset()
        done = False
        states = []
        rewards = []
        while not done:
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            # Keep only index 0 along the second axis of the state.
            states.append(state[:, 0, :])
            rewards.append(reward)
            state = next_state

        total_states.extend(states)
        total_rewards.extend(discounted_rewards(rewards, gamma))

    # process into batches: states can only be stacked into one array when
    # they have the same number of rows, so group them by that count
    batches = {}
    for state, target in zip(total_states, total_rewards):
        group_states, group_values = batches.setdefault(state.shape[0], ([], []))
        group_states.append(state)
        group_values.append(target)

    # fit to discounted rewards, one fit call per group
    for group_states, group_values in batches.values():
        model.fit(np.stack(group_states), np.array(group_values), verbose=0)

In [15]:
%time train_batched(model, DegreeAgent(), env, 1000)

CPU times: user 3min 26s, sys: 48 s, total: 4min 14s
Wall time: 57.1 s
