In [1]:
import kagglegym

# Build the kagglegym environment (the training and baseline cells below
# create their own fresh `env` again before running).
env = kagglegym.make()
# Reset the environment to obtain the first observation.
observation = env.reset()

# Display the shape of the first observation's feature table.
observation.features.shape

(968, 110)

In [2]:
# --- Training hyperparameters ---
batch_size = 5        # intended number of episodes between parameter updates
learning_rate = 1e-2  # optimizer step size; tune for faster or more stable training
gamma = 0.99          # discount factor applied to rewards

# --- Network dimensions, derived from the first observation ---
D = observation.features.shape[1]  # input dimensionality (number of feature columns)
H = D // 3                         # number of hidden layer neurons

[D, H]

[110, 36]

In [7]:
import tensorflow as tf

# Start from an empty default graph so re-running this cell does not fail on
# the already-existing variables "W1" / "W2".
tf.reset_default_graph()

# Forward pass: a two-layer network without bias terms that maps a batch of
# observations (one row per sample, D columns) to a single sigmoid output
# per row.
# NOTE(review): the original comment described this output as the probability
# of "moving left or right"; in this notebook the value is submitted as the
# prediction `y` by the training cell.
observations = tf.placeholder(tf.float32, [None,D] , name="input_x")
W1 = tf.get_variable("W1", shape=[D, H],
           initializer=tf.contrib.layers.xavier_initializer())
layer1 = tf.nn.relu(tf.matmul(observations,W1))
W2 = tf.get_variable("W2", shape=[H, 1],
           initializer=tf.contrib.layers.xavier_initializer())
score = tf.matmul(layer1,W2)
probability = tf.nn.sigmoid(score)

# From here on we define the parts of the graph needed for learning.
# tvars collects the trainable variables created so far in this graph
# (presumably [W1, W2] in creation order -- the training cell relies on that).
tvars = tf.trainable_variables()
input_y = tf.placeholder(tf.float32,[None,1], name="input_y")
advantages = tf.placeholder(tf.float32,name="reward_signal")

# The loss function. The intent is to push the weights towards outputs that
# earned a good advantage (reward over time) and away from those that did not.
# The two TODO lines below are the log-likelihood form that is still to be
# fixed; the active loss is a simplified stand-in.
#TODO fix loglik = tf.log(input_y*(input_y - probability) + (1 - input_y)*(input_y + probability))
#TODO fix loss = -tf.reduce_mean(loglik * advantages) 
# NOTE(review): the training cell feeds input_y with the network's own output,
# so (input_y - probability) evaluates to zero there even though its gradient
# with respect to the weights does not -- confirm this is intended.
loss = -tf.reduce_mean((input_y - probability) * advantages)
newGrads = tf.gradients(loss,tvars)

# Gradients are computed (newGrads) and applied (updateGrads) in two separate
# steps so they can be accumulated outside the graph before an update.
# Accumulating over several episodes is meant to average out noise in the
# reward signal.
adam = tf.train.AdamOptimizer(learning_rate=learning_rate) # Our optimizer
# Placeholders through which the accumulated gradients are fed back in,
# one per trainable variable.
W1Grad = tf.placeholder(tf.float32,name="batch_grad1") 
W2Grad = tf.placeholder(tf.float32,name="batch_grad2")
# Pair each gradient placeholder with its variable, in tvars order.
batchGrad = [W1Grad,W2Grad]
updateGrads = adam.apply_gradients(zip(batchGrad,tvars))


In [8]:
import numpy as np

def discount_rewards(r, discount=None):
    """Compute the discounted cumulative reward for every step.

    Walks the rewards from last to first, so that entry ``t`` of the result
    is ``r[t] + discount * r[t+1] + discount**2 * r[t+2] + ...``.

    Args:
        r: Array-like of rewards. The first axis is treated as time, so both
            1D input and a column of shape ``(N, 1)`` are accepted.
        discount: Discount factor. When omitted, the notebook-level ``gamma``
            is used, which keeps existing ``discount_rewards(r)`` calls
            working unchanged.

    Returns:
        A new numpy array with the same shape as ``r``. Floating-point input
        keeps its dtype; any other dtype (e.g. integer rewards) is promoted
        to float64 so that fractional discounted values are not truncated.
    """
    if discount is None:
        discount = gamma
    r = np.asarray(r)
    # np.zeros_like(r) would inherit an integer dtype and silently truncate
    # the discounted values, so pick a floating dtype explicitly.
    out_dtype = r.dtype if np.issubdtype(r.dtype, np.floating) else np.float64
    discounted_r = np.zeros(r.shape, dtype=out_dtype)
    running_add = 0
    # Iterate backwards so each step can reuse the already-discounted future.
    for t in reversed(range(r.shape[0])):
        running_add = running_add * discount + r[t]
        discounted_r[t] = running_add
    return discounted_r


In [9]:
class Debug:
    """Throttled printer: emits messages only when ``state`` is a multiple of 100."""

    def __init__(self, state):
        """Remember the counter value that decides whether logging is active."""
        self.state = state

    def log(self, msg):
        """Print ``msg`` if the stored state is divisible by 100; otherwise do nothing."""
        if self.state % 100:
            # Non-zero remainder: stay silent.
            return
        print(msg)


In [11]:
# Create a fresh environment and fetch its first observation.
env = kagglegym.make()
observation = env.reset()

init = tf.global_variables_initializer()

# Launch the graph
with tf.Session() as sess:
    sess.run(init)

    # gradBuffer holds one zeroed accumulator per trainable variable, shaped
    # like the variable itself. Gradients are summed into it and then fed to
    # the optimizer through the W1Grad / W2Grad placeholders.
    gradBuffer = [np.zeros_like(weights) for weights in sess.run(tvars)]
    print("gradBuffer", len(gradBuffer), len(gradBuffer[0]), len(gradBuffer[1]))

    done = False
    while not done:
        x = observation.features.fillna(.0)

        # Positional lookup of the first row's timestamp: unlike x["timestamp"][0]
        # it does not depend on the frame's index containing the label 0.
        timestamp = x["timestamp"].iloc[0]
        # Debug only prints when the timestamp is a multiple of 100.
        debug = Debug(timestamp)
        debug.log(x.tail(3))
        debug.log("Timestamp #{}".format(timestamp))

        # Run the network and submit its output as the prediction column "y".
        y = sess.run(probability,feed_dict={observations: x})
        action = observation.target
        action["y"] = y
        # Step the environment; `reward` scores the action just submitted.
        observation, reward, done, info = env.step(action)

        # Spread the scalar reward over the rows, weighted by each row's output.
        rewards = np.vstack(y * reward)

        debug.log(action.tail(3))
        debug.log("Reward is {}".format(reward))

        # NOTE(review): discounting here runs over the rows of a single step,
        # not over successive time steps -- confirm this is the intended signal.
        discounted_epr = discount_rewards(rewards)
        # Standardise the signal (helps control the gradient estimator variance).
        # A constant signal has zero standard deviation; dividing by it would
        # turn the advantages, and then the weights, into NaN, so only centre it.
        discounted_epr -= np.mean(discounted_epr)
        epr_std = np.std(discounted_epr)
        if epr_std > 0:
            discounted_epr /= epr_std

        # Compute the gradients for this step and add them to the accumulators.
        tGrad = sess.run(newGrads,feed_dict={observations: x, input_y: y, advantages: discounted_epr})
        for ix,grad in enumerate(tGrad):
            debug.log("grad-{0} is {1}".format(str(ix), grad))
            gradBuffer[ix] += grad

        # The update is applied after every step (batch_size is not consulted
        # here), then the accumulators are cleared for the next step.
        sess.run(updateGrads,feed_dict={W1Grad: gradBuffer[0],W2Grad:gradBuffer[1]})
        for ix,grad in enumerate(gradBuffer):
            gradBuffer[ix] = np.zeros_like(grad)

gradBuffer 2 110 36
   id  timestamp  derived_0  derived_1  derived_2  derived_3  derived_4  \
0   0        907   0.241634   0.101084   0.002550  -0.028882   0.398276   
1   7        907   0.204701   1.925502   0.753222  -0.237052   0.154132   
2  11        907  -0.073517  -0.057697   1.611172   0.310672  -0.003293   

   fundamental_0  fundamental_1  fundamental_2      ...       technical_35  \
0      -0.272570       0.538591       0.241426      ...          -0.081623   
1      -0.324891      -0.110032      -0.495813      ...          -0.005009   
2       0.272259       0.088949      -0.171558      ...          -0.071035   

   technical_36  technical_37  technical_38  technical_39  technical_40  \
0     -0.194489 -4.041816e-10 -4.041816e-10 -3.861803e-10     -0.057166   
1     -0.144726 -1.360235e-02 -1.360235e-02 -1.360235e-02     -0.017764   
2     -0.082874 -2.924539e-15 -6.483912e-23 -7.500384e-01     -0.045536   

   technical_41  technical_42  technical_43  technical_44  
0    

In [154]:
# Baseline run: step through the whole environment submitting each
# observation's own target frame unchanged, then show the final info dict.
env = kagglegym.make()
observation = env.reset()

done = False
while not done:
    # Progress marker every 100 timestamps.
    timestamp = observation.features["timestamp"][0]
    if timestamp % 100 == 0:
        print("Timestamp #{}".format(timestamp))

    target = observation.target
    observation, reward, done, info = env.step(target)

info

Timestamp #1000
Timestamp #1100
Timestamp #1200
Timestamp #1300
Timestamp #1400
Timestamp #1500
Timestamp #1600
Timestamp #1700
Timestamp #1800


{'public_score': -0.0083462397338969572}

In [129]:
import tensorflow.contrib.slim as slim

class agent():
    """Softmax policy network plus the ops for accumulated policy-gradient updates.

    Builds everything into the current default TensorFlow graph.

    Args:
        lr: Learning rate for the Adam optimizer.
        s_size: Width of the state vector fed to ``state_in``.
        a_size: Number of outputs of the softmax layer.
        h_size: Number of units in the hidden ReLU layer.
    """

    def __init__(self, lr, s_size, a_size, h_size):
        # Feed-forward part: state -> hidden ReLU layer -> softmax output,
        # both layers without bias terms.
        self.state_in = tf.placeholder(dtype=tf.float32, shape=[None, s_size])
        hidden_layer = slim.fully_connected(
            self.state_in, h_size,
            activation_fn=tf.nn.relu, biases_initializer=None)
        self.output = slim.fully_connected(
            hidden_layer, a_size,
            activation_fn=tf.nn.softmax, biases_initializer=None)
        self.chosen_action = tf.argmax(self.output, 1)

        # Training inputs: one reward and one action index per row.
        self.reward_holder = tf.placeholder(dtype=tf.float32, shape=[None])
        self.action_holder = tf.placeholder(dtype=tf.int32, shape=[None])

        # Pick, for every row, the output belonging to the given action by
        # indexing into the flattened output: row * n_columns + action.
        row_offsets = tf.range(0, tf.shape(self.output)[0]) * tf.shape(self.output)[1]
        self.indexes = row_offsets + self.action_holder
        flat_output = tf.reshape(self.output, [-1])
        self.responsible_outputs = tf.gather(flat_output, self.indexes)

        # Negative mean of log-output weighted by reward.
        self.loss = -tf.reduce_mean(tf.log(self.responsible_outputs) * self.reward_holder)

        # One gradient placeholder per trainable variable in the graph, so
        # gradients can be accumulated outside the graph and fed back in.
        tvars = tf.trainable_variables()
        self.gradient_holders = [
            tf.placeholder(tf.float32, name=str(position) + '_holder')
            for position in range(len(tvars))
        ]

        self.gradients = tf.gradients(self.loss, tvars)

        optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        self.update_batch = optimizer.apply_gradients(zip(self.gradient_holders, tvars))


In [130]:
# TODO implement arbitrary actions from https://github.com/awjuliani/DeepRL-Agents/blob/master/Vanilla-Policy.ipynb

# Rebuild the graph from scratch so that the agent's variables are the only
# trainable variables it picks up.
tf.reset_default_graph() #Clear the Tensorflow graph.
# NOTE(review): s_size=4 / a_size=2 do not match the feature width D computed
# earlier in this notebook -- presumably placeholder values; confirm before training.
myAgent = agent(lr=1e-2,s_size=4,a_size=2,h_size=8) #Load the agent.

init = tf.global_variables_initializer()
# Launch the graph; for now this only initialises the variables.
with tf.Session() as sess:
    sess.run(init)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
