In [1]:
import kagglegym

# Create the kagglegym environment object used by the rest of the notebook.
env = kagglegym.make()
# reset() returns the first observation; later cells read its .features
# (a frame of model inputs) and .target (the frame submitted via env.step).
observation = env.reset()

# Last expression of the cell: displays (rows, columns) of the first feature frame.
observation.features.shape

(968, 110)

In [2]:
# Hyperparameters shared by the graph-definition and training cells.
batch_size = 5        # intended number of episodes per parameter update
learning_rate = 1e-2  # optimizer step size; tune for speed vs. stability
gamma = 0.99          # discount factor applied to rewards

# Network sizes are derived from the first observation's feature frame.
D = observation.features.shape[1]  # input dimensionality (number of feature columns)
H = D // 3                         # hidden-layer width: one third of the inputs, rounded down

[D, H]

[110, 36]

In [3]:
import tensorflow as tf

# Start from an empty default graph so re-running this cell does not fail on
# already-existing variables named "W1"/"W2".
tf.reset_default_graph()

# Forward pass of the policy network: a [batch, D] observation matrix goes
# through one bias-free ReLU hidden layer of H units and a bias-free linear
# layer to a single score per row, which is squashed by a sigmoid.
observations = tf.placeholder(tf.float32, [None,D] , name="input_x")
W1 = tf.get_variable("W1", shape=[D, H],
           initializer=tf.contrib.layers.xavier_initializer())
layer1 = tf.nn.relu(tf.matmul(observations,W1))
W2 = tf.get_variable("W2", shape=[H, 1],
           initializer=tf.contrib.layers.xavier_initializer())
score = tf.matmul(layer1,W2)
# NOTE(review): the sigmoid confines the output to (0, 1); confirm that range
# is appropriate for the target this output is compared against.
probability = tf.nn.sigmoid(score)

# From here on we define the parts of the graph needed for learning.
# tvars is expected to be [W1, W2] (creation order); the zip with batchGrad
# further down relies on that ordering.
tvars = tf.trainable_variables()
input_y = tf.placeholder(tf.float32,[None,1], name="input_y")
advantages = tf.placeholder(tf.float32,name="reward_signal")

# The loss function. It is meant to push the weights towards outputs that
# earned a good advantage (reward over time) and away from those that did not.
# The log-likelihood formulation below is disabled pending a fix; the active
# loss is a stand-in that weights (input_y - probability) by the advantages.
#TODO fix loglik = tf.log(input_y*(input_y - probability) + (1 - input_y)*(input_y + probability))
#TODO fix loss = -tf.reduce_mean(loglik * advantages) 
loss = -tf.reduce_mean((input_y - probability) * advantages)
newGrads = tf.gradients(loss,tvars)

# Gradients are computed by newGrads, accumulated outside the graph, and then
# fed back in through the placeholders below so they can be applied in one step.
# Accumulating before applying is intended to average out noise in the reward signal.
adam = tf.train.AdamOptimizer(learning_rate=learning_rate) # Our optimizer
# Placeholders through which the accumulated gradients are fed when we update.
W1Grad = tf.placeholder(tf.float32,name="batch_grad1") 
W2Grad = tf.placeholder(tf.float32,name="batch_grad2")
# Paired positionally with tvars: W1Grad -> first variable, W2Grad -> second.
batchGrad = [W1Grad,W2Grad]
updateGrads = adam.apply_gradients(zip(batchGrad,tvars))


In [4]:
import numpy as np

def discount_rewards(r, discount=None):
    """Compute the discounted cumulative reward for every step.

    Walks the rewards from last to first, so that
    ``out[t] = r[t] + discount * out[t + 1]``.

    Parameters
    ----------
    r : array-like
        Rewards, either 1-D with shape ``(n,)`` or a column with shape ``(n, 1)``.
    discount : float, optional
        Discount factor. When omitted, the notebook-level ``gamma``
        hyperparameter is used, which is the original behavior.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``r``. Floating-point inputs keep their
        dtype; any other dtype is promoted to ``float64`` so that the
        discounted values are not truncated when stored.
    """
    if discount is None:
        discount = gamma  # notebook-level hyperparameter

    r = np.asarray(r)
    # np.zeros_like(r) would inherit an integer dtype and silently truncate
    # every fractional discounted value, so pick a floating dtype explicitly.
    out_dtype = r.dtype if np.issubdtype(r.dtype, np.floating) else np.float64
    discounted_r = np.zeros(r.shape, dtype=out_dtype)

    running_add = 0
    for t in reversed(range(r.size)):
        running_add = running_add * discount + r[t]
        discounted_r[t] = running_add
    return discounted_r


In [5]:
class Debug:
    """Throttled logger that only speaks when its counter is a multiple of 300.

    ``state`` is any value supporting ``% 300`` (the notebook passes the
    current timestamp), stored as-is on the instance.
    """

    def __init__(self, state):
        self.state = state

    def log(self, msg):
        """Print ``msg`` when ``state`` is divisible by 300; otherwise do nothing."""
        if self.state % 300 != 0:
            return
        print(msg)


In [6]:
# Load the training data from the HDF5 store using pandas' built-in HDF5 support.
import pandas as pd

with pd.HDFStore("../input/train.h5", "r") as train:
    # "train" is the only dataframe stored in the file.
    df = train.get("train")

# Row/column counts for a few sample timestamps, one shape per line.
print(*(df[df["timestamp"] == stamp].shape for stamp in (909, 1000, 1100)), sep="\n")

# Keep the slice for timestamp 910 around and peek at its last rows.
t910 = df[df["timestamp"] == 910]
print(t910.shape)
t910.tail(3)


(968, 111)
(959, 111)
(974, 111)
(967, 111)


Unnamed: 0,id,timestamp,derived_0,derived_1,derived_2,derived_3,derived_4,fundamental_0,fundamental_1,fundamental_2,...,technical_36,technical_37,technical_38,technical_39,technical_40,technical_41,technical_42,technical_43,technical_44,y
811134,2154,910,0.074597,-0.015414,0.29208,0.343019,0.044199,-0.228965,0.179896,-0.157234,...,0.138343,-0.8638552,-1.679721e-15,-0.8638552,0.070375,-0.140491,-0.007442452,-0.015625,-0.050378,-0.004286
811135,2155,910,0.020373,-0.053024,,-0.014809,,-0.095301,-0.09729,,...,0.018055,-0.00034692,-0.00034692,-0.00034692,-0.075559,0.211429,-7.994175e-27,-3.330669e-16,-0.004531,-0.002657
811136,2156,910,-0.010002,-0.047333,,0.086585,,0.307436,0.22972,,...,-0.300517,-9.636440000000001e-17,-9.636440000000001e-17,-9.636440000000001e-17,0.075476,0.008847,0.8564127,-2.0,0.003307,-0.005547


In [7]:
def extend(x, target):
    """Pad a DataFrame with zero-filled rows until it has ``target`` rows.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame to pad. It is never modified in place.
    target : int
        Minimum number of rows the result must have.

    Returns
    -------
    pandas.DataFrame
        ``x`` itself when it already has at least ``target`` rows; otherwise a
        new frame consisting of ``x`` followed by rows of ``0.0`` whose index
        labels are the row positions ``len(x) .. target - 1``.
    """
    rows = x.shape[0]
    if rows >= target:
        return x
    # Build all padding rows at once: concatenating one row per iteration
    # copies the whole frame each time and is quadratic in the padding size.
    padding = pd.DataFrame(.0, index=range(rows, target), columns=x.columns)
    return pd.concat([x, padding])

In [9]:
# Create a fresh environment and obtain its first observation.
env = kagglegym.make()
observation = env.reset()

init = tf.global_variables_initializer()

# Launch the graph and train the policy network while stepping through the
# environment, one timestamp per iteration.
with tf.Session() as sess:
    sess.run(init)

    # gradBuffer holds one accumulator per trainable variable. It is created
    # from the variables' current values only to get matching shapes, then zeroed.
    gradBuffer = sess.run(tvars)
    for ix,grad in enumerate(gradBuffer):
        gradBuffer[ix] = grad * 0
    print("gradBuffer", len(gradBuffer), len(gradBuffer[0]), len(gradBuffer[1]))

    done = False
    while not done:
        # Missing feature values are fed to the network as zeros.
        x = observation.features.fillna(.0)
        timestamp = x["timestamp"][0]
        # Debug only prints when the timestamp is a multiple of 300.
        debug = Debug(timestamp)
        debug.log("Timestamp #{}".format(timestamp))

        # Run the policy network and submit its output as this step's prediction.
        y = sess.run(probability,feed_dict={observations: x})
        action = observation.target
        action["y"] = y
        # Step the environment and get new measurements.
        observation, reward, done, info = env.step(action)

        # Look up the true targets for this timestamp in the training frame.
        perfect_action = df[df["timestamp"] == timestamp][["id", "y"]].reset_index(drop=True)
        pa = perfect_action

        # Per-row reward: negative absolute error between truth and prediction.
        y = action["y"]
        rewards = pa["y"] - y
        debug.log("rewards.tail {}".format(rewards.tail(3)))
        rewards = -np.abs(rewards)
        rewards = np.vstack(rewards)  # column vector, one row per instrument

        # `reward` is the environment's own score for the submitted action.
        debug.log("Reward is {}".format(reward))

        # NOTE(review): discounting here runs across the rows of a single
        # timestamp, not across time steps -- confirm this is intended.
        discounted_epr = discount_rewards(rewards)
        # Standardize the rewards (helps control the gradient estimator variance).
        discounted_epr -= np.mean(discounted_epr)
        reward_std = np.std(discounted_epr)
        # Skip the scaling when all rewards are equal: dividing by a zero std
        # would turn the advantages, and then the weights, into NaN/inf.
        if reward_std > 0:
            discounted_epr /= reward_std

        # Column vector of true targets. Series.reshape is deprecated/removed
        # in pandas, so reshape the underlying ndarray instead.
        py = pa["y"].values.reshape(-1, 1)
        debug.log("y={}".format(y[0:3]))
        debug.log("py={}".format(py[0:3]))

        # Compute this step's gradients and add them to the accumulators.
        tGrad = sess.run(newGrads,feed_dict={observations: x, input_y: py, advantages: discounted_epr})
        for ix,grad in enumerate(tGrad):
            debug.log("grad-{0} is {1}".format(str(ix), grad[0:3]))
            gradBuffer[ix] += grad

        # Apply the accumulated gradients and clear the accumulators.
        # NOTE(review): this runs on every step; batch_size is not consulted here.
        sess.run(updateGrads,feed_dict={W1Grad: gradBuffer[0],W2Grad:gradBuffer[1]})
        for ix,grad in enumerate(gradBuffer):
            gradBuffer[ix] = grad * 0

gradBuffer 2 110 36
Timestamp #1000
rewards.tail 956    0.025702
957   -0.006563
958    0.000643
Name: y, dtype: float32
Reward is -1
y=0    0.0
1    0.0
2    0.0
Name: y, dtype: float32
py=[[ -5.83076896e-03]
 [  8.23848459e-05]
 [  3.94059252e-03]]
grad-0 is [[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]]
grad-1 is [[ 0.]
 [ 0.]
 [ 0.]]
Timestamp #1100
rewards.tail 971    0.012944
972    0.000578
973    0.001801
Name: y, dtype: float32
Reward is -1
y=0    0.0
1    0.0
2    0.0
Name: y, dtype: float32
py=[[-0.01675762]
 [ 0.00074415]
 [-0.01008121]]
grad-0 is [[ 0.  0.  0.  0.  0.  0.  0.  0.

In [154]:
# Baseline run: step through the whole environment submitting each
# observation's target frame unchanged, then show the final info dict.
env = kagglegym.make()
observation = env.reset()

done = False
while not done:
    target = observation.target
    timestamp = observation.features["timestamp"][0]
    # Progress marker every 100 timestamps.
    if timestamp % 100 == 0:
        print("Timestamp #{}".format(timestamp))

    observation, reward, done, info = env.step(target)

info

Timestamp #1000
Timestamp #1100
Timestamp #1200
Timestamp #1300
Timestamp #1400
Timestamp #1500
Timestamp #1600
Timestamp #1700
Timestamp #1800


{'public_score': -0.0083462397338969572}

In [129]:
import tensorflow.contrib.slim as slim

class agent():
    """Two-layer softmax policy network with a policy-gradient training graph.

    Constructing an instance adds the following to the default graph:
    a state placeholder, a bias-free ReLU hidden layer of ``h_size`` units,
    a bias-free softmax output over ``a_size`` actions, a loss built from the
    log-probability of the fed actions weighted by the fed rewards, gradient
    tensors for every trainable variable, and an Adam update op that consumes
    externally accumulated gradients through placeholders.
    """

    def __init__(self, lr, s_size,a_size,h_size):
        # Feed-forward part: state in, action distribution out.
        self.state_in = tf.placeholder(dtype=tf.float32, shape=[None, s_size])
        hidden_layer = slim.fully_connected(
            self.state_in, h_size,
            activation_fn=tf.nn.relu, biases_initializer=None)
        self.output = slim.fully_connected(
            hidden_layer, a_size,
            activation_fn=tf.nn.softmax, biases_initializer=None)
        self.chosen_action = tf.argmax(self.output, 1)

        # Training inputs: the reward received and the action taken, per row.
        self.reward_holder = tf.placeholder(dtype=tf.float32, shape=[None])
        self.action_holder = tf.placeholder(dtype=tf.int32, shape=[None])

        # Pick, for every row, the output belonging to the action that was
        # taken: flatten the [rows, actions] matrix and index it at
        # row * n_actions + action.
        n_rows = tf.shape(self.output)[0]
        n_actions = tf.shape(self.output)[1]
        self.indexes = tf.range(0, n_rows) * n_actions + self.action_holder
        flat_output = tf.reshape(self.output, [-1])
        self.responsible_outputs = tf.gather(flat_output, self.indexes)

        # Policy-gradient loss: reward-weighted negative log-probability.
        log_prob = tf.log(self.responsible_outputs)
        self.loss = -tf.reduce_mean(log_prob * self.reward_holder)

        # One feedable placeholder per trainable variable, in variable order,
        # so accumulated gradients can be applied in a single update.
        tvars = tf.trainable_variables()
        self.gradient_holders = [
            tf.placeholder(tf.float32, name=str(idx)+'_holder')
            for idx, _ in enumerate(tvars)
        ]

        self.gradients = tf.gradients(self.loss, tvars)

        optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        self.update_batch = optimizer.apply_gradients(zip(self.gradient_holders, tvars))


In [130]:
# TODO implement arbitrary actions from https://github.com/awjuliani/DeepRL-Agents/blob/master/Vanilla-Policy.ipynb

# Clear the default TensorFlow graph so the agent's variables are the only
# trainable variables in it (agent collects all of them via tf.trainable_variables()).
tf.reset_default_graph() #Clear the Tensorflow graph.
# NOTE(review): s_size=4 / a_size=2 do not match the 110-column feature frame
# used earlier in this notebook -- presumably placeholders from the linked
# example; confirm before training on this data.
myAgent = agent(lr=1e-2,s_size=4,a_size=2,h_size=8) #Load the agent.

init = tf.global_variables_initializer()
# Launch the graph; for now this only initializes the variables.
with tf.Session() as sess:
    sess.run(init)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
