In [1]:
import numpy as np
import tensorflow as tf
import gym
import random
import matplotlib.pyplot as plt
from collections import deque

%matplotlib inline



def stats(rewards):
    print("Mean reward: ", np.mean(rewards))
    rewards_100 = []
    for i in range(100, len(rewards) + 1):
        rewards_100.append(np.mean(rewards[i-100:i]))
    #print("Max 100 rewards mean: ", np.max(rewards_100))
    #re = np.argmax(rewards_100)
    #print("Max 100 rewards from episode: %d, to episode: %d" % (re, re + 99))
    plt.plot(rewards_100)
    plt.xlabel('episodes')
    plt.ylabel('rewards')
    plt.show()
    
def tolerant_mean(arrs):
    lens = [len(i) for i in arrs]
    arr = np.ma.empty((np.max(lens),len(arrs)))
    arr.mask = True
    for idx, l in enumerate(arrs):
        arr[:len(l),idx] = l
    return arr.mean(axis = -1), arr.std(axis=-1)
    
def train(num_iterations=10):
    env = gym.make('FrozenLake-v0')
    env._max_episode_steps = 1000
    max_num_episodes = 2000
    checkpoint = 400
    replay_experience_maxlen = 50000
    batch_size = 64
    #num_iterations = 10
    #final_episodes = []

    learning_rate = 0.001
    decay = 0.99
    training_rewards = []
        

    augs = ['ras', None, 'gn']
    final_rewards = []
        
    for aug in augs:
        curr_rewards = []
        final_episodes = []
        for i in range(num_iterations):
            print("----------------ITERATION: ", i, aug)
            tf.compat.v1.reset_default_graph()
            X = tf.placeholder(dtype=tf.float32, shape=(None, 4))
            y = tf.placeholder(dtype=tf.float32, shape=(None, 2))
            net = tf.contrib.layers.fully_connected(X, 15)
            Q = tf.contrib.layers.fully_connected(net, 2, activation_fn=None)
            mse = tf.contrib.losses.mean_squared_error(y, Q)
            train_step = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(mse)





            sess = tf.InteractiveSession()
            tf.global_variables_initializer().run()

            # Initialize empty experiences
            replay_experience = deque(maxlen=replay_experience_maxlen)
            for episode in range(max_num_episodes):
                state = env.reset()
                epsilon = 1./((episode/50) + 10)
                done = False
                episode_reward = 0
                while not done:
                    # Calculate Q(s, a) for all a
                    Q_s_A = Q.eval(feed_dict={X: state.reshape((1, 4))})

                    # Choose action based on epsilon-greedy policy
                    if np.random.random_sample() < epsilon:
                        action = env.action_space.sample()
                    else:
                        action = np.argmax(Q_s_A[0])

                    # Perform action
                    next_state, reward, done, _ = env.step(action)

                    # Append final reward for each episode
                    episode_reward += reward

                    # Change 0 reward to -1 to learn more from punishment
                    if done:
                        reward = -1.0

                    # Save experience
                    replay_experience.append([state, action, reward, next_state, done])

                    # Switch to next state
                    state = next_state

                    # Do training if replay_experience contains enough sample > batch_size
                    if len(replay_experience) > batch_size:
                        

                        ## 1- Sample from replay experience
                        batch = random.sample(replay_experience, batch_size)
                        states = np.vstack([x[0] for x in batch])

                        actions = np.array([x[1] for x in batch])
                        rewards = np.array([x[2] for x in batch])
                        next_states = np.vstack([x[3] for x in batch])
                        if aug == 'ras':
                            low = np.random.uniform(0.6, 0.8)
                            high = np.random.uniform(1.2, 1.4)
                            scaling = np.random.uniform(low, high)
                            next_states *= scaling
                            states *= scaling
                        elif aug == 'gn':
                            mean = np.mean(states)
                            std = np.std(states)
                            noise = np.random.normal(mean, std)
                            states += noise
                            next_states += noise

                        episodes_done = np.array([x[4] for x in batch])
                        target_Q = Q.eval(feed_dict={X: states})
                        target_Q[range(batch_size), actions] = rewards + decay * np.max(Q.eval(feed_dict={X: next_states}), axis=1) * ~episodes_done
                        train_step.run(feed_dict={X: states, y: target_Q})

                if (episode + 1) % checkpoint == 0:
                    print("Episode: %d" % (episode))


            print("Play 10 times with optimal policy")
            avg_rewards = []
            for i in range(10):
                state = env.reset()
                done = False
                total_reward = 0
                while not done:
                    state, reward, done, _ = env.step(np.argmax(Q.eval(feed_dict={X: state.reshape((1, 4))})))
                    #print(state, reward, done)
                    total_reward += reward
                    #env.render()
                avg_rewards.append(total_reward)
                print("Iteration: %d, Total Reward: %d" % (i, total_reward))
            final_rewards.append([np.mean(avg_rewards), np.std(avg_rewards)])
    return final_rewards




In [2]:
rewards = train(5)

----------------ITERATION:  0 ras
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Please use `layer.__call__` method instead.
Instructions for updating:
Use tf.losses.mean_squared_error instead.
Instructions for updating:
Use tf.losses.compute_weighted_loss instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Instructions for updating:
Use tf.losses.add_loss instead.


ValueError: cannot reshape array of size 1 into shape (1,4)

In [15]:
print(rewards)
print(len(rewards))

np_rewards = np.array(rewards)

print(np_rewards[:5,0])
ras_mean = np.mean(np_rewards[:5,0])
ras_std = np.std(np_rewards[:5,0])

print(ras_mean, ras_std)

base_mean = np.mean(np_rewards[5:10,0])
base_std = np.std(np_rewards[5:10,0])

print(base_mean, base_std)

gn_mean = np.mean(np_rewards[10:,0])
gn_std = np.std(np_rewards[10:,0])

print(gn_mean, gn_std)

'''y_ras, _ = tolerant_mean(rewards[0])
stats(y_ras)
y_base, _ = tolerant_mean(rewards[1])
stats(y_base)
y_gn, _ = tolerant_mean(rewards[2])
stats(y_gn)
plt.plot(np.arange(len(y_gn))+1, y_gn, color='red', label='Gaussian Noise')
plt.plot(np.arange(len(y_ras))+1, y_ras, color='blue', label='Random Amplitude Scaling')
plt.plot(np.arange(len(y_base))+1, y_base, color='green', label='Baseline')
plt.xlabel('epsidoes')
plt.ylabel('reward')
plt.legend()
plt.show()'''

[[70.6, 49.70754469896899], [152.2, 4.833218389437828], [1000.0, 0.0], [1000.0, 0.0], [1000.0, 0.0], [9.8, 0.8717797887081348], [22.8, 10.85172797300043], [150.9, 10.367738422626218], [868.3, 218.65180081581767], [11.8, 0.8717797887081348], [158.5, 34.575280186861825], [147.0, 39.5145542806698], [140.4, 50.227880703848136], [186.9, 29.152872928752668], [122.4, 31.978742939646644]]
15
[  70.6  152.2 1000.  1000.  1000. ]
644.5600000000001 436.0874297660963
212.71999999999997 332.03066966772803
151.04 21.407344534061203


"y_ras, _ = tolerant_mean(rewards[0])\nstats(y_ras)\ny_base, _ = tolerant_mean(rewards[1])\nstats(y_base)\ny_gn, _ = tolerant_mean(rewards[2])\nstats(y_gn)\nplt.plot(np.arange(len(y_gn))+1, y_gn, color='red', label='Gaussian Noise')\nplt.plot(np.arange(len(y_ras))+1, y_ras, color='blue', label='Random Amplitude Scaling')\nplt.plot(np.arange(len(y_base))+1, y_base, color='green', label='Baseline')\nplt.xlabel('epsidoes')\nplt.ylabel('reward')\nplt.legend()\nplt.show()"