In [9]:
import numpy as np
import tensorflow as tf
import gym
import random
import matplotlib.pyplot as plt
from collections import deque
%matplotlib inline

id16 = np.identity(16)
def onehot(x):
    return id16[x : x + 1]

def stats(rewards):
    print("Mean reward: ", np.mean(rewards))
    rewards_100 = []
    for i in range(100, len(rewards) + 1):
        rewards_100.append(np.mean(rewards[i-100:i]))
    print("Max 100 rewards mean: ", np.max(rewards_100))
    re = np.argmax(rewards_100)
    print("Max 100 rewards from episode: %d, to episode: %d" % (re - 100, re))
    #plt.plot(rewards_100)
    #plt.show()
    
def train(num_iterations=10):
    

    learning_rate = 0.001
    decay = 0.99
    augs = ['ras', None, 'gn']
    final_rewards = []
    
    for aug in augs:
        for i in range(num_iterations):
            print(f"---Iteration #{i} w/ {aug}")
            env = gym.make('FrozenLake-v0')
            env._max_episode_steps = 10000
            max_num_episodes = 5000
            checkpoint = 200
            replay_experience_maxlen = 50000
            batch_size = 64
            tf.reset_default_graph()
            X = tf.placeholder(dtype=tf.float32, shape=(None, 4 * 4))
            y = tf.placeholder(dtype=tf.float32, shape=(None, 4))
            Q = tf.contrib.layers.fully_connected(X, 4, activation_fn=None)
            mse = tf.contrib.losses.mean_squared_error(y, Q)
            train_step = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(mse)

            id16 = np.identity(16)
            all_rewards = []

            sess = tf.InteractiveSession()
            tf.global_variables_initializer().run()

            # Initialize empty experiences
            replay_experience = deque(maxlen=replay_experience_maxlen)
            for episode in range(max_num_episodes):
                state = env.reset()
                epsilon = 1./((episode/50) + 10)
                done = False
                while not done:
                    # Calculate Q(s, a) for all a
                    Q_s_A = Q.eval(feed_dict={X: onehot(state)})

                    # Choose action based on epsilon-greedy policy
                    if np.random.random_sample() < epsilon:
                        action = env.action_space.sample()
                    else:
                        action = np.argmax(Q_s_A[0])

                    # Perform action
                    next_state, reward, done, _ = env.step(action)

                    # Append final reward for each episode
                    if done:
                        all_rewards.append(reward)

                    # Change 0 reward to -1 to learn more from punishment
                    if done and reward < 0.5:
                        reward = -1.0

                    # Save experience
                    replay_experience.append([onehot(state), action, reward, onehot(next_state), done])

                    # Switch to next state
                    state = next_state

                    # Do training if replay_experience contains enough sample > batch_size
                    if len(replay_experience) > batch_size:
                        ## 1- Sample from replay experience
                        batch = random.sample(replay_experience, batch_size)
                        states = np.vstack([x[0] for x in batch])
                        actions = np.array([x[1] for x in batch])
                        rewards = np.array([x[2] for x in batch])
                        next_states = np.vstack([x[3] for x in batch])
                        if aug == "ras":
                            low = np.random.uniform(0.6, 0.8)
                            high = np.random.uniform(1.2, 1.4)
                            scaling = np.random.uniform(low, high)
                            next_states *= scaling
                            states *= scaling
                        elif aug == 'gn':
                            #print('gn')
                            mean = np.mean(states)
                            std = np.std(states)
                            noise = np.random.normal(mean, std)
                            states += noise
                            next_states += noise
                        episodes_done = np.array([x[4] for x in batch])
                        target_Q = Q.eval(feed_dict={X: states})
                        target_Q[range(batch_size), actions] = rewards + decay * np.max(Q.eval(feed_dict={X: next_states}), axis=1) * ~episodes_done
                        train_step.run(feed_dict={X: states, y: target_Q})

                if (episode + 1) % checkpoint == 0:
                    print("Episode: ", episode, np.mean(all_rewards))

                if episode == max_num_episodes - 1:# or (len(all_rewards) >= 100 and np.mean(all_rewards[-100:]) >= 0.75):
                    #training_episodes.append(episode)
                    break
            print("Play 10 times with optimal policy")
            avg_rewards = []
            for i in range(1000):
                state = env.reset()
                done = False
                total_reward = 0
                while not done:
                    # Calculate Q(s, a) for all a
                    Q_s_A = Q.eval(feed_dict={X: onehot(state)})

                    # Choose action based on epsilon-greedy policy
                    action = np.argmax(Q_s_A[0])
                        
                    next_state, reward, done, _ = env.step(action)

                    total_reward += reward
                    # Switch to next state
                    state = next_state
                    #env.render()
                avg_rewards.append(total_reward)
                #print("Iteration: %d, Total Reward: %d" % (i, total_reward))
            print('AVG REWARD: ', np.mean(avg_rewards))
            final_rewards.append(np.mean(avg_rewards))
            
            env.close()
            stats(all_rewards)
            
    return final_rewards

In [10]:
num_iterations = 5
episodes  = train(num_iterations)
ras_mean = np.mean(episodes[:num_iterations]) 
ras_std = np.std(episodes[:num_iterations]) 
base_mean = np.mean(episodes[num_iterations:2*num_iterations]) 
base_std = np.std(episodes[num_iterations:2*num_iterations])
gn_mean = np.mean(episodes[2*num_iterations:]) 
gn_std = np.std(episodes[2*num_iterations:])
print("Base stats: ", base_mean, base_std)
print("RAS stats: ", ras_mean, ras_std)
print("GN stats: ", gn_mean, gn_std)

---Iteration #0 w/ ras
Episode:  199 0.05
Episode:  399 0.1525
Episode:  599 0.27666666666666667
Episode:  799 0.35375
Episode:  999 0.409
Episode:  1199 0.43833333333333335
Episode:  1399 0.4742857142857143
Episode:  1599 0.500625
Episode:  1799 0.5272222222222223
Episode:  1999 0.543
Episode:  2199 0.5563636363636364
Episode:  2399 0.5675
Episode:  2599 0.578076923076923
Episode:  2799 0.5889285714285715
Episode:  2999 0.5976666666666667
Episode:  3199 0.6040625
Episode:  3399 0.6111764705882353
Episode:  3599 0.6163888888888889
Episode:  3799 0.6228947368421053
Episode:  3999 0.6305
Episode:  4199 0.6390476190476191
Episode:  4399 0.6427272727272727
Episode:  4599 0.6460869565217391
Episode:  4799 0.650625
Episode:  4999 0.6524
Play 10 times with optimal policy
AVG REWARD:  0.844
Mean reward:  0.6524
Max 100 rewards mean:  0.9
Max 100 rewards from episode: 3942, to episode: 4042
---Iteration #1 w/ ras
Episode:  199 0.085
Episode:  399 0.2125
Episode:  599 0.28833333333333333
Episode

Episode:  1799 0.5761111111111111
Episode:  1999 0.5905
Episode:  2199 0.6
Episode:  2399 0.61125
Episode:  2599 0.6184615384615385
Episode:  2799 0.6282142857142857
Episode:  2999 0.6376666666666667
Episode:  3199 0.6459375
Episode:  3399 0.6494117647058824
Episode:  3599 0.6561111111111111
Episode:  3799 0.6623684210526316
Episode:  3999 0.66675
Episode:  4199 0.6735714285714286
Episode:  4399 0.6768181818181818
Episode:  4599 0.6789130434782609
Episode:  4799 0.68375
Episode:  4999 0.6848
Play 10 times with optimal policy
AVG REWARD:  0.744
Mean reward:  0.6848
Max 100 rewards mean:  0.85
Max 100 rewards from episode: 3931, to episode: 4031
---Iteration #0 w/ gn
Episode:  199 0.08
Episode:  399 0.125
Episode:  599 0.11166666666666666
Episode:  799 0.1075
Episode:  999 0.168
Episode:  1199 0.24
Episode:  1399 0.29
Episode:  1599 0.33875
Episode:  1799 0.3761111111111111
Episode:  1999 0.408
Episode:  2199 0.43
Episode:  2399 0.45375
Episode:  2599 0.46923076923076923
Episode:  2799 0