# Model - 1

# Q-Table Learning

In [74]:
import gym
import numpy as np

# Load the environment

In [75]:
env = gym.make('FrozenLake-v0')

[2017-10-26 19:15:55,070] Making new env: FrozenLake-v0


# Q-Table Learning Algorithm

In [None]:
#Initialize table with all zeros
Q = np.zeros([env.observation_space.n, env.action_space.n])
# Set learning parameters
lr = .8
y = .95
num_episodes = 2000
#create lists to contain total rewards and steps per episode
jList = []
rList = []
for i in range(num_episodes):
    #Reset environment and get first new observation
    s = env.reset()
    rAll = 0
    d = False
    j = 0
    # Q-learning Algoithm
    while j<99:
        j+=1
        #Choose an action by greedily (with noise) picking from Q table
        a = np.argmax(Q[s,:] + np.random.randn(1, env.action_space.n)*(1./(i+1)))
        #Get new state and reward from environment
        s1, r, d, _ = env.step(a)
        #Update Q-Table with new knowledge
        Q[s, a] = Q[s, a] + lr*(r + y*np.max(Q[s1, :]) - Q[s, a])
        rAll += r
        s = s1
        if d==True:
            break
            
    jList.append(j)
    rList.append(rAll)
        

In [None]:
print "Score over time: " +  str(sum(rList)/num_episodes)


In [None]:
print "Final Q-Table Values"
print Q

In [None]:
plt.plot(rList)

In [None]:
plt.plot(jList)

# Model - 2

# Q-Learning with Neural Networks

# Load the Env


In [None]:
import gym
import numpy as np
import random
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
env = gym.make('FrozenLake-v0')


# The Q-Network Approach

In [None]:
tf.reset_default_graph()

In [None]:
#These lines establish the feed-forward part of the network used to choose actions
inputs1 = tf.placeholder(shape=[1, 16], dtype=tf.float32)
W = tf.Variable(tf.random_uniform([16,4],0,0.01))
Qout = tf.matmul(inputs1, W)
predict = tf.argmax(Qout, 1)

#Below we obtain the loss by taking the sum of squares difference between the target and prediction Q values.
nextQ = tf.placeholder(shape = [1,4], dtype=tf.float32)
loss = tf.reduce_sum(tf.square(nextQ - Qout))
trainer = tf.train.GradientDescentOptimizer(learning_rate=0.1)
updateModel = trainer.minimize(loss)

# Training the Network

In [None]:
init = tf.initialize_all_variables()

#set Learning parameters
y = .99
e = 0.1
num_episodes = 2000
#create lists to contain total rewards and steps per episode
jList = []
rList = []
with tf.Session() as sess:
    sess.run(init)
    for i in range(num_episodes):
        #Reset environment and get first new observation
        s = env.reset()
        rAll = 0
        d = False
        j = 0
        # Q-Network
        while j<99:
            j+=1
            #Choose an action by greedily (with e chance of random action) from the Q-network
            a, allQ = sess.run([predict, Qout], feed_dict = {inputs1:np.identity(16)[s:s+1]})
            if np.random.rand(1) < e:
                a[0] = env.action_space.sample()
            #Get new state and reward from environment
            s1, r, d, _ = env.step(a[0])
            #Obtain the Q' values by feeding the new state through our network
            Q1 = sess.run(Qout, feed_dict={inputs1:np.identity(16)[s1:s1+1]})
            #Obtain maxQ' and set our target value for chosen action.
            maxQ1 = np.max(Q1)
            targetQ = allQ
            targetQ[0, a[0]] = r + y*maxQ1
            
            #Train our network using target and predicted Q values
            _,W1 = sess.run([updateModel, W], feed_dict={inputs1:np.identity(16)[s:s+1], nextQ:targetQ})
            
            rAll += r
            s = s1
            if d==True:
                #Reduce chance of random action as we train the model.
                e = 1./((i/50) + 10)
                break

        jList.append(j)
        rList.append(rAll)
print "Percent of succesful episodes: " + str(sum(rList)/num_episodes) + "%"

In [None]:
plt.plot(rList)


In [None]:
plt.plot(jList)


# Model - 3

# The Multi-armed bandit


In [3]:
import tensorflow as tf
import numpy as np

In [5]:
bandits = [0.3, 0, -0.3, 0.5]
num_bandits = len(bandits)
def pullBandit(bandit):
    #Get a random number.
    result = np.random.randn(1)
    if result > bandit:
        #return a positive reward.
        return 1
    else:
        #return a negative reward.
        return -1

In [53]:
tf.reset_default_graph()

weights = tf.Variable(tf.ones([num_bandits]))
chosen_action = tf.argmax(weights, 0)

reward_holder = tf.placeholder(shape = [1], dtype=tf.float32)
action_holder = tf.placeholder(shape = [1], dtype=tf.int32)
responsible_weight = tf.slice(weights, action_holder, [1])
loss = -(tf.log(responsible_weight)*reward_holder)
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
update = optimizer.minimize(loss)


# Training the Agent

In [55]:
total_episodes = 1000
total_reward = np.zeros(num_bandits)
e = 0.1

init = tf.initialize_all_variables()

with tf.Session() as sess:
    sess.run(init)
    i=0
    while i<total_episodes:
        
        if np.random.rand(1)<e:
            action = np.random.randint(num_bandits)
        else:
            action = sess.run(chosen_action)
        
        reward = pullBandit(bandits[action])
        
        #Update the network
        _, resp, ww = sess.run([update, responsible_weight, weights], feed_dict={reward_holder:[reward],
                                                                            action_holder:[action]})
        print "ww:", ww
        print "reward: ", reward
        print "action: ", action
        
        print "loss: ",-(np.log(resp[0])*reward)
        #Update our running tally of scores. 
        total_reward[action] += reward
        if i%50 == 0:
            print "Running reward for the " + str(num_bandits) + " bandits: " + str(total_reward)
        i+=1
        print "The agent thinks bandit " + str(np.argmax(ww)+1) + " is the most promising...."
        
        if np.argmax(ww) == np.argmax(-np.array(bandits)):
            print "...and it was right!"
        else:
            print "...and it was wrong!"

# Model - 4


# The Contextual Bandits

In [76]:
import tensorflow as tf
import tensorflow.contrib.slim as slim
import numpy as np

In [77]:
class contextual_bandit():
    def __init__(self):
        self.state = 0
        self.bandits = np.array([[0.2,0,-0.0,-5],[0.1,-5,1,0.25],[-5,5,5,5]])
        self.num_bandits = self.bandits.shape[0]
        self.num_actions = self.bandits.shape[1]
        
    def getBandit(self):
        self.state = np.random.randint(0, self.num_bandits)
        return self.state
    
    def pullArm(self, action):
        bandit = self.bandits[self.state, action]
        result = np.random.randn(1)
        if result > bandit:
            #return a positive reward.
            return 1
        else:
            #return a negative reward.
            return -1

In [83]:
class agent():
    def __init__(self, lr, s_size, a_size):
        self.state_in = tf.placeholder(shape=[1], dtype=tf.int32)
        state_in_OH = slim.one_hot_encoding(self.state_in, s_size)
        output = slim.fully_connected(state_in_OH, a_size, biases_initializer=None,
                                     activation_fn = tf.nn.sigmoid,
                                     weights_initializer = tf.ones_initializer())
        self.output = tf.reshape(output, [-1])
        self.chosen_action = tf.argmax(self.output, 0)
        
        
        self.reward_holder = tf.placeholder(shape = [1], dtype = tf.float32)
        self.action_holder = tf.placeholder(shape = [1], dtype = tf.int32)
        self.responsible_weight = tf.slice(self.output, self.action_holder, [1])
        self.loss = -(tf.log(self.responsible_weight)*self.reward_holder)
        
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=lr)
        self.update = optimizer.minimize(self.loss)
        

In [84]:
tf.reset_default_graph()

cBandit = contextual_bandit()
myAgent = agent(lr=0.001, s_size = cBandit.num_bandits, a_size = cBandit.num_actions)
weights = tf.trainable_variables()[0]

total_episodes = 10000
total_reward = np.zeros([cBandit.num_bandits, cBandit.num_actions])
e=0.1

init = tf.initialize_all_variables()

# Launch the tensorflow graph
with tf.Session() as sess:
    sess.run(init)
    i=0
    while i < total_episodes:
        s = cBandit.getBandit()
        
        if np.random.rand(1) < e:
            action = np.random.randint(cBandit.num_actions)
        else:
            action = sess.run(myAgent.chosen_action, feed_dict={myAgent.state_in:[s]})
        
        reward = cBandit.pullArm(action)
        
        #Update the network.
        
        _, ww = sess.run([myAgent.update,weights], feed_dict={myAgent.reward_holder:[reward],
                                    myAgent.action_holder:[action],
                                    myAgent.state_in:[s]} )
        #Update our running tally of scores.
        total_reward[s,action] += reward
        if i % 500 == 0:
            print "Mean reward for each of the " + str(cBandit.num_bandits) + " bandits: " + str(np.mean(total_reward,axis=1))
        i+=1
        
for a in range(cBandit.num_bandits):
    print "The agent thinks action " + str(np.argmax(ww[a])+1) + " for bandit " + str(a+1) + " is the most promising...."
    if np.argmax(ww[a]) == np.argmin(cBandit.bandits[a]):
        print "...and it was right!"
    else:
        print "...and it was wrong!"

Instructions for updating:
Use `tf.global_variables_initializer` instead.


[2017-10-26 19:19:59,749] From /home/cse/ug/14075009/virtualenv-1.9/myVE/lib/python2.7/site-packages/tensorflow/python/util/tf_should_use.py:175: initialize_all_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02.
Instructions for updating:
Use `tf.global_variables_initializer` instead.


Mean reward for each of the 3 bandits: [ 0.    0.    0.25]
Mean reward for each of the 3 bandits: [  1.75  36.25  36.75]
Mean reward for each of the 3 bandits: [ 10.75  73.25  74.25]
Mean reward for each of the 3 bandits: [   8.75  108.5   112.  ]
Mean reward for each of the 3 bandits: [  14.    148.    145.75]
Mean reward for each of the 3 bandits: [  14.75  189.5   177.  ]
Mean reward for each of the 3 bandits: [  19.    226.    209.25]
Mean reward for each of the 3 bandits: [  20.25  264.25  246.25]
Mean reward for each of the 3 bandits: [  17.    297.75  286.  ]
Mean reward for each of the 3 bandits: [  41.5   335.25  319.  ]
Mean reward for each of the 3 bandits: [  78.75  375.    355.5 ]
Mean reward for each of the 3 bandits: [ 121.5   408.75  390.  ]
Mean reward for each of the 3 bandits: [ 165.5   445.75  422.  ]
Mean reward for each of the 3 bandits: [ 203.    478.5   463.25]
Mean reward for each of the 3 bandits: [ 242.    513.75  498.5 ]
Mean reward for each of the 3 bandits

# Model - 5


# Vanilla Policy Gradient Agent

In [133]:
import tensorflow as tf
import tensorflow.contrib.slim as slim
import numpy as np
import gym
import matplotlib.pyplot as plt
%matplotlib inline

try:
    xrange = xrange
except:
    xrange = range

In [134]:
env = gym.make('CartPole-v0')


[2017-10-30 16:12:32,670] Making new env: CartPole-v0


# The Policy-Based Agent

In [135]:
gamma = 0.99

def discount_reward(r):
    """ take 1D float array of rewards and compute discounted reward """
    discounted_r = np.zeros_like(r)
    running_add = 0
    for t in reversed(xrange(0, r.size)):
        running_add = r[t] + running_add*gamma
        discounted_r[t] = running_add
    return discounted_r

In [136]:
class agent():
    def __init__(self, lr, s_size, a_size, h_size):
        #These lines established the feed-forward part of the network.
        #The agent takes a state and produces an action.
        self.state_in = tf.placeholder(shape=[None, s_size], dtype=tf.float32)
        hidden = slim.fully_connected(self.state_in, h_size, biases_initializer=None, activation_fn=tf.nn.relu)
        self.output = slim.fully_connected(hidden, a_size, activation_fn = tf.nn.softmax, biases_initializer=None)
        self.chosen_action = tf.argmax(self.output, 1)
        
        #The next six lines establish the training proceedure.
        #We feed the reward and chosen action into the network
        #to compute the loss, and use it to update the network.
        self.reward_holder = tf.placeholder(shape=[None], dtype=tf.float32)
        self.action_holder = tf.placeholder(shape=[None], dtype=tf.int32)
        
        self.indexes = tf.range(0, tf.shape(self.output)[0]) * tf.shape(self.output)[1] + self.action_holder
        self.responible_outputs = tf.gather(tf.reshape(self.output, [-1]), self.indexes)
        
        self.loss = -tf.reduce_mean(tf.log(self.responible_outputs)*self.reward_holder)
        
        tvars = tf.trainable_variables()
        self.gradient_holders = []
        for idx, var in enumerate(tvars):
            placeholder = tf.placeholder(tf.float32, name=str(idx)+'_holder')
            self.gradient_holders.append(placeholder)
        
        self.gradients = tf.gradients(self.loss, tvars)
        
        optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        self.update_batch = optimizer.apply_gradients(zip(self.gradient_holders, tvars))
        

# Training the Agent

In [None]:
tf.reset_default_graph() #Clear the Tensorflow graph.

myAgent = agent(lr=1e-2, s_size=4, a_size=2, h_size=8)

total_episodes = 5000
max_ep = 999
update_frequecy = 5

init = tf.initialize_all_variables()

with tf.Session() as sess:
    sess.run(init)
    i=0
    total_reward = []
    total_length = []
    
    gradBuffer = sess.run(tf.trainable_variables())
    for ix, grad in enumerate(gradBuffer):
        gradBuffer[ix] = grad*0
        
    while i < total_episodes:
        s = env.reset()
        running_reward = 0
        ep_history = []
        for j in range(max_ep):
            #Probabilistically pick an action given our network outputs.
            a_dist = sess.run(myAgent.output, feed_dict = {myAgent.state_in:[s]})
            a = np.random.choice(a_dist[0], p = a_dist[0])
            a = np.argmax(a_dist == a)
            
            env.render()
            s1, r, d, _ = env.step(a)
            ep_history.append([s,a,r,s1])
            s = s1
            running_reward += r
            
            if d==True:
                #Update the network
                ep_history = np.array(ep_history)
                ep_history[:, 2] = discount_reward(ep_history[:,2])
                feed_dict = {myAgent.reward_holder:ep_history[:,2], 
                            myAgent.action_holder:ep_history[:, 1],
                            myAgent.state_in:np.vstack(ep_history[:, 0])}
                grads = sess.run(myAgent.gradients, feed_dict=feed_dict)
                
                for idx, grad in enumerate(grads):
                    gradBuffer[idx] += grad
                
                if i%update_frequecy==0 and i!=0:
                    feed_dict = dictionary = dict(zip(myAgent.gradient_holders, gradBuffer))
                    _ = sess.run(myAgent.update_batch, feed_dict = feed_dict)
                    for ix, grad in enumerate(gradBuffer):
                        gradBuffer[ix] = grad*0
                    
                total_reward.append(running_reward)
                total_length.append(j)
                break
        
        if i % 100 == 0:
            print(np.mean(total_reward[-100:]))
        i += 1
            
            

Instructions for updating:
Use `tf.global_variables_initializer` instead.


[2017-10-30 16:13:53,859] From /home/cse/ug/14075009/virtualenv-1.9/myVE/lib/python2.7/site-packages/tensorflow/python/util/tf_should_use.py:175: initialize_all_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02.
Instructions for updating:
Use `tf.global_variables_initializer` instead.


# Model - 6

## Learning with Model

In [6]:
from __future__ import print_function
import numpy as np
try:
    import cPickle as pickle
except ModuleNotFoundError:
    import pickle
import tensorflow as tf
%matplotlib inline
import matplotlib.pyplot as plt
import math

In [7]:
import sys
if sys.version_info.major > 2:
    xrange = range
del sys

In [8]:
import gym
env = gym.make('CartPole-v0')

### Setting Hyper-parameters

In [9]:
# hyperparameters
H = 8 # number of hidden layer neurons
learning_rate = 1e-2
gamma = 0.99 # discount factor for reward
decay_rate = 0.99 # decay factor for RMSProp leaky sum of grad^2
resume = False # resume from previous checkpoint?

model_bs = 3 # Batch size when learning from model
real_bs = 3 # Batch size when learning from real environment

# model initialization
D = 4 # input dimensionality

### Policy Network

In [13]:
#Network for Agent policy
tf.reset_default_graph()
observations = tf.placeholder(tf.float32, [None, 4], name = "input_x")
W1 = tf.get_variable("W1", shape=[4,H], initializer = tf.contrib.layers.xavier_initializer())

layer1 = tf.nn.relu(tf.matmul(observations, W1))
W2 = tf.get_variable("W2", shape=[H,1], initializer=tf.contrib.layers.xavier_initializer())

score = tf.matmul(layer1, W2)
probability = tf.nn.sigmoid(score)

tvars = tf.trainable_variables()
input_y = tf.placeholder(tf.float32, [None, 1], name = "input_y")
advantages = tf.placeholder(tf.float32, name = "reward_signal")

adam = tf.train.AdamOptimizer(learning_rate=learning_rate)

W1Grad = tf.placeholder(tf.float32, name = "batch_grad1")
W2Grad = tf.placeholder(tf.float32, name = "batch_grad2")
batchGrad = [W1Grad, W2Grad]

loglik = tf.log(input_y*(input_y-probability) + (1-input_y)*(input_y + probability))
loss = -tf.reduce_mean(loglik*advantages)
newGrads = tf.gradients(loss, tvars)
updateGrads = adam.apply_gradients(zip(batchGrad, tvars))

### Model Network


In [None]:
#Here we implement a multi-layer neural network that predicts the next observation,
#reward, and done state from a current state and action.
mH = 256 # model layer size

input_data = tf.placeholder(tf.float32, [None, 5])
with tf.variable_scope('rnnlm'):
    softmax_w = tf.get_variable("softmax_w", [mH, 50])
    softmax_b = tf.get_variable("softmax_b", [50])

previous_state = tf.placeholder(tf.float32, [None,5] , name="previous_state")
W1M = tf.get_variable("W1M", shape=[5, mH],
           initializer=tf.contrib.layers.xavier_initializer())
B1M = tf.Variable(tf.zeros([mH]),name="B1M")
layer1M = tf.nn.relu(tf.matmul(previous_state,W1M) + B1M)
W2M = tf.get_variable("W2M", shape=[mH, mH],
           initializer=tf.contrib.layers.xavier_initializer())
B2M = tf.Variable(tf.zeros([mH]),name="B2M")
layer2M = tf.nn.relu(tf.matmul(layer1M,W2M) + B2M)
wO = tf.get_variable("wO", shape=[mH, 4],
           initializer=tf.contrib.layers.xavier_initializer())
wR = tf.get_variable("wR", shape=[mH, 1],
           initializer=tf.contrib.layers.xavier_initializer())
wD = tf.get_variable("wD", shape=[mH, 1],
           initializer=tf.contrib.layers.xavier_initializer())

bO = tf.Variable(tf.zeros([4]),name="bO")
bR = tf.Variable(tf.zeros([1]),name="bR")
bD = tf.Variable(tf.ones([1]),name="bD")


predicted_observation = tf.matmul(layer2M,wO,name="predicted_observation") + bO
predicted_reward = tf.matmul(layer2M,wR,name="predicted_reward") + bR
predicted_done = tf.sigmoid(tf.matmul(layer2M,wD,name="predicted_done") + bD)

true_observation = tf.placeholder(tf.float32,[None,4],name="true_observation")
true_reward = tf.placeholder(tf.float32,[None,1],name="true_reward")
true_done = tf.placeholder(tf.float32,[None,1],name="true_done")


predicted_state = tf.concat(1,[predicted_observation,predicted_reward,predicted_done])

observation_loss = tf.square(true_observation - predicted_observation)

reward_loss = tf.square(true_reward - predicted_reward)

done_loss = tf.mul(predicted_done, true_done) + tf.mul(1-predicted_done, 1-true_done)
done_loss = -tf.log(done_loss)

model_loss = tf.reduce_mean(observation_loss + done_loss + reward_loss)

modelAdam = tf.train.AdamOptimizer(learning_rate=learning_rate)
updateModel = modelAdam.minimize(model_loss)


In [None]:

Helper-functions
In [9]:
def resetGradBuffer(gradBuffer):
    for ix,grad in enumerate(gradBuffer):
        gradBuffer[ix] = grad * 0
    return gradBuffer
        
def discount_rewards(r):
    """ take 1D float array of rewards and compute discounted reward """
    discounted_r = np.zeros_like(r)
    running_add = 0
    for t in reversed(xrange(0, r.size)):
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    return discounted_r


# This function uses our model to produce a new state when given a previous state and action
def stepModel(sess, xs, action):
    toFeed = np.reshape(np.hstack([xs[-1][0],np.array(action)]),[1,5])
    myPredict = sess.run([predicted_state],feed_dict={previous_state: toFeed})
    reward = myPredict[0][:,4]
    observation = myPredict[0][:,0:4]
    observation[:,0] = np.clip(observation[:,0],-2.4,2.4)
    observation[:,2] = np.clip(observation[:,2],-0.4,0.4)
    doneP = np.clip(myPredict[0][:,5],0,1)
    if doneP > 0.1 or len(xs)>= 300:
        done = True
    else:
        done = False
    return observation, reward, done
Training the Policy and Model
In [10]:
xs,drs,ys,ds = [],[],[],[]
running_reward = None
reward_sum = 0
episode_number = 1
real_episodes = 1
init = tf.initialize_all_variables()
batch_size = real_bs

drawFromModel = False # When set to True, will use model for observations
trainTheModel = True # Whether to train the model
trainThePolicy = False # Whether to train the policy
switch_point = 1

# Launch the graph
with tf.Session() as sess:
    rendering = False
    sess.run(init)
    observation = env.reset()
    x = observation
    gradBuffer = sess.run(tvars)
    gradBuffer = resetGradBuffer(gradBuffer)
    
    while episode_number <= 5000:
        # Start displaying environment once performance is acceptably high.
        if (reward_sum/batch_size > 150 and drawFromModel == False) or rendering == True : 
            env.render()
            rendering = True
            
        x = np.reshape(observation,[1,4])

        tfprob = sess.run(probability,feed_dict={observations: x})
        action = 1 if np.random.uniform() < tfprob else 0

        # record various intermediates (needed later for backprop)
        xs.append(x) 
        y = 1 if action == 0 else 0 
        ys.append(y)
        
        # step the  model or real environment and get new measurements
        if drawFromModel == False:
            observation, reward, done, info = env.step(action)
        else:
            observation, reward, done = stepModel(sess,xs,action)
                
        reward_sum += reward
        
        ds.append(done*1)
        drs.append(reward) # record reward (has to be done after we call step() to get reward for previous action)

        if done: 
            
            if drawFromModel == False: 
                real_episodes += 1
            episode_number += 1

            # stack together all inputs, hidden states, action gradients, and rewards for this episode
            epx = np.vstack(xs)
            epy = np.vstack(ys)
            epr = np.vstack(drs)
            epd = np.vstack(ds)
            xs,drs,ys,ds = [],[],[],[] # reset array memory
            
            if trainTheModel == True:
                actions = np.array([np.abs(y-1) for y in epy][:-1])
                state_prevs = epx[:-1,:]
                state_prevs = np.hstack([state_prevs,actions])
                state_nexts = epx[1:,:]
                rewards = np.array(epr[1:,:])
                dones = np.array(epd[1:,:])
                state_nextsAll = np.hstack([state_nexts,rewards,dones])

                feed_dict={previous_state: state_prevs, true_observation: state_nexts,true_done:dones,true_reward:rewards}
                loss,pState,_ = sess.run([model_loss,predicted_state,updateModel],feed_dict)
            if trainThePolicy == True:
                discounted_epr = discount_rewards(epr).astype('float32')
                discounted_epr -= np.mean(discounted_epr)
                discounted_epr /= np.std(discounted_epr)
                tGrad = sess.run(newGrads,feed_dict={observations: epx, input_y: epy, advantages: discounted_epr})
                
                # If gradients becom too large, end training process
                if np.sum(tGrad[0] == tGrad[0]) == 0:
                    break
                for ix,grad in enumerate(tGrad):
                    gradBuffer[ix] += grad
                
            if switch_point + batch_size == episode_number: 
                switch_point = episode_number
                if trainThePolicy == True:
                    sess.run(updateGrads,feed_dict={W1Grad: gradBuffer[0],W2Grad:gradBuffer[1]})
                    gradBuffer = resetGradBuffer(gradBuffer)

                running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
                if drawFromModel == False:
                    print 'World Perf: Episode %f. Reward %f. action: %f. mean reward %f.' % (real_episodes,reward_sum/real_bs,action, running_reward/real_bs)
                    if reward_sum/batch_size > 200:
                        break
                reward_sum = 0

                # Once the model has been trained on 100 episodes, we start alternating between training the policy
                # from the model and training the model from the real environment.
                if episode_number > 100:
                    drawFromModel = not drawFromModel
                    trainTheModel = not trainTheModel
                    trainThePolicy = not trainThePolicy
            
            if drawFromModel == True:
                observation = np.random.uniform(-0.1,0.1,[4]) # Generate reasonable starting point
                batch_size = model_bs
            else:
                observation = env.reset()
                batch_size = real_bs
                
print real_episodes
World Perf: Episode 4.000000. Reward 34.666667. action: 1.000000. mean reward 34.666667.
World Perf: Episode 7.000000. Reward 18.333333. action: 1.000000. mean reward 34.503333.
World Perf: Episode 10.000000. Reward 30.666667. action: 1.000000. mean reward 34.464967.
World Perf: Episode 13.000000. Reward 34.333333. action: 0.000000. mean reward 34.463650.
World Perf: Episode 16.000000. Reward 39.666667. action: 0.000000. mean reward 34.515680.
World Perf: Episode 19.000000. Reward 20.000000. action: 0.000000. mean reward 34.370524.
World Perf: Episode 22.000000. Reward 30.333333. action: 0.000000. mean reward 34.330152.
World Perf: Episode 25.000000. Reward 20.333333. action: 1.000000. mean reward 34.190184.
World Perf: Episode 28.000000. Reward 19.333333. action: 1.000000. mean reward 34.041615.
World Perf: Episode 31.000000. Reward 32.333333. action: 0.000000. mean reward 34.024532.
World Perf: Episode 34.000000. Reward 26.333333. action: 0.000000. mean reward 33.947620.
World Perf: Episode 37.000000. Reward 25.000000. action: 0.000000. mean reward 33.858144.
World Perf: Episode 40.000000. Reward 41.000000. action: 1.000000. mean reward 33.929563.
World Perf: Episode 43.000000. Reward 26.333333. action: 1.000000. mean reward 33.853600.
World Perf: Episode 46.000000. Reward 25.333333. action: 0.000000. mean reward 33.768398.
World Perf: Episode 49.000000. Reward 22.000000. action: 0.000000. mean reward 33.650714.
World Perf: Episode 52.000000. Reward 35.666667. action: 1.000000. mean reward 33.670873.
World Perf: Episode 55.000000. Reward 30.000000. action: 1.000000. mean reward 33.634165.
World Perf: Episode 58.000000. Reward 29.000000. action: 1.000000. mean reward 33.587823.
World Perf: Episode 61.000000. Reward 31.666667. action: 1.000000. mean reward 33.568611.
World Perf: Episode 64.000000. Reward 29.000000. action: 0.000000. mean reward 33.522925.
World Perf: Episode 67.000000. Reward 27.000000. action: 0.000000. mean reward 33.457696.
World Perf: Episode 70.000000. Reward 58.333333. action: 1.000000. mean reward 33.706452.
World Perf: Episode 73.000000. Reward 29.333333. action: 0.000000. mean reward 33.662721.
World Perf: Episode 76.000000. Reward 12.333333. action: 0.000000. mean reward 33.449427.
World Perf: Episode 79.000000. Reward 25.000000. action: 0.000000. mean reward 33.364933.
World Perf: Episode 82.000000. Reward 21.666667. action: 1.000000. mean reward 33.247950.
World Perf: Episode 85.000000. Reward 27.333333. action: 0.000000. mean reward 33.188804.
World Perf: Episode 88.000000. Reward 34.000000. action: 1.000000. mean reward 33.196916.
World Perf: Episode 91.000000. Reward 22.333333. action: 0.000000. mean reward 33.088280.
World Perf: Episode 94.000000. Reward 43.000000. action: 1.000000. mean reward 33.187397.
World Perf: Episode 97.000000. Reward 14.000000. action: 0.000000. mean reward 32.995523.
World Perf: Episode 100.000000. Reward 19.000000. action: 1.000000. mean reward 32.855568.
World Perf: Episode 103.000000. Reward 23.333333. action: 1.000000. mean reward 32.760346.
World Perf: Episode 106.000000. Reward 41.666667. action: 0.000000. mean reward 32.733059.
World Perf: Episode 109.000000. Reward 48.000000. action: 1.000000. mean reward 32.898281.
World Perf: Episode 112.000000. Reward 17.000000. action: 1.000000. mean reward 33.378304.
World Perf: Episode 115.000000. Reward 52.333333. action: 0.000000. mean reward 33.407272.
World Perf: Episode 118.000000. Reward 16.000000. action: 1.000000. mean reward 32.948177.
World Perf: Episode 121.000000. Reward 22.000000. action: 0.000000. mean reward 32.596844.
World Perf: Episode 124.000000. Reward 30.000000. action: 1.000000. mean reward 32.334019.
World Perf: Episode 127.000000. Reward 22.666667. action: 1.000000. mean reward 31.985229.
World Perf: Episode 130.000000. Reward 75.333333. action: 1.000000. mean reward 32.219444.
World Perf: Episode 133.000000. Reward 37.666667. action: 0.000000. mean reward 32.033123.
World Perf: Episode 136.000000. Reward 34.666667. action: 1.000000. mean reward 36.278812.
World Perf: Episode 139.000000. Reward 44.333333. action: 0.000000. mean reward 37.793564.
World Perf: Episode 142.000000. Reward 43.000000. action: 0.000000. mean reward 37.605556.
World Perf: Episode 145.000000. Reward 17.333333. action: 0.000000. mean reward 37.140411.
World Perf: Episode 148.000000. Reward 58.000000. action: 0.000000. mean reward 37.012058.
World Perf: Episode 151.000000. Reward 54.000000. action: 0.000000. mean reward 36.910122.
World Perf: Episode 154.000000. Reward 43.666667. action: 0.000000. mean reward 36.677944.
World Perf: Episode 157.000000. Reward 49.666667. action: 1.000000. mean reward 36.497761.
World Perf: Episode 160.000000. Reward 33.333333. action: 1.000000. mean reward 36.195156.
World Perf: Episode 163.000000. Reward 26.333333. action: 0.000000. mean reward 35.835846.
World Perf: Episode 166.000000. Reward 36.333333. action: 0.000000. mean reward 35.844830.
World Perf: Episode 169.000000. Reward 28.333333. action: 1.000000. mean reward 35.657898.
World Perf: Episode 172.000000. Reward 47.333333. action: 1.000000. mean reward 35.645672.
World Perf: Episode 175.000000. Reward 54.000000. action: 1.000000. mean reward 35.669083.
World Perf: Episode 178.000000. Reward 39.666667. action: 1.000000. mean reward 35.657860.
World Perf: Episode 181.000000. Reward 29.000000. action: 0.000000. mean reward 36.254162.
World Perf: Episode 184.000000. Reward 40.333333. action: 1.000000. mean reward 38.040600.
World Perf: Episode 187.000000. Reward 47.000000. action: 1.000000. mean reward 39.061661.
World Perf: Episode 190.000000. Reward 50.666667. action: 0.000000. mean reward 40.936543.
World Perf: Episode 193.000000. Reward 41.666667. action: 1.000000. mean reward 41.026863.
World Perf: Episode 196.000000. Reward 39.666667. action: 0.000000. mean reward 42.870926.
World Perf: Episode 199.000000. Reward 40.333333. action: 1.000000. mean reward 43.584274.
World Perf: Episode 202.000000. Reward 37.333333. action: 0.000000. mean reward 43.347317.
World Perf: Episode 205.000000. Reward 40.333333. action: 0.000000. mean reward 44.021179.
World Perf: Episode 208.000000. Reward 21.333333. action: 1.000000. mean reward 43.482227.
World Perf: Episode 211.000000. Reward 41.666667. action: 0.000000. mean reward 43.259212.
World Perf: Episode 214.000000. Reward 34.000000. action: 1.000000. mean reward 44.416565.
World Perf: Episode 217.000000. Reward 34.666667. action: 0.000000. mean reward 44.088322.
World Perf: Episode 220.000000. Reward 34.666667. action: 1.000000. mean reward 44.904999.
World Perf: Episode 223.000000. Reward 47.333333. action: 1.000000. mean reward 44.721806.
World Perf: Episode 226.000000. Reward 44.666667. action: 1.000000. mean reward 45.130310.
World Perf: Episode 229.000000. Reward 43.333333. action: 0.000000. mean reward 44.777817.
World Perf: Episode 232.000000. Reward 43.333333. action: 0.000000. mean reward 44.769470.
World Perf: Episode 235.000000. Reward 44.333333. action: 0.000000. mean reward 46.332897.
World Perf: Episode 238.000000. Reward 45.000000. action: 1.000000. mean reward 45.947269.
World Perf: Episode 241.000000. Reward 54.666667. action: 0.000000. mean reward 46.343395.
World Perf: Episode 244.000000. Reward 24.000000. action: 1.000000. mean reward 45.764530.
World Perf: Episode 247.000000. Reward 34.333333. action: 1.000000. mean reward 45.315842.
World Perf: Episode 250.000000. Reward 48.333333. action: 0.000000. mean reward 45.853481.
World Perf: Episode 253.000000. Reward 67.333333. action: 0.000000. mean reward 45.776394.
World Perf: Episode 256.000000. Reward 34.000000. action: 1.000000. mean reward 47.594440.
World Perf: Episode 259.000000. Reward 38.333333. action: 0.000000. mean reward 47.181637.
World Perf: Episode 262.000000. Reward 66.000000. action: 0.000000. mean reward 47.100784.
World Perf: Episode 265.000000. Reward 34.333333. action: 0.000000. mean reward 46.671185.
World Perf: Episode 268.0000