In [159]:
%matplotlib inline

import gym
import itertools
import matplotlib
import numpy as np
import sys
import sklearn.pipeline
import sklearn.preprocessing
import tensorflow as tf

if "../" not in sys.path:
  sys.path.append("../") 

from collections import deque, namedtuple
from lib import plotting
from sklearn.linear_model import SGDRegressor
from sklearn.kernel_approximation import RBFSampler

matplotlib.style.use('ggplot')

In [160]:
env = gym.envs.make("Pong-v0")

[2016-09-15 09:54:50,441] Making new env: Pong-v0


In [173]:
# For Pong, only 2 (up) and 3 (down) are valid actions
VALID_ACTIONS = [2, 3]

In [232]:
class Estimator():
    def __init__(self, scope="estimator"):
        self.scope = scope
        with tf.variable_scope(scope):
            self._build_model()

    def preprocess_state(self, s):
        # Crop the Atari image to 160x160 pixels
        return s[:,34:-16,:,:]
    
    def _build_model(self):        
        # Placeholders for our input
        self.X_pl = tf.placeholder(shape=[None, 160, 160, 3], dtype=tf.float32)
        self.y_pl = tf.placeholder(shape=[None, len(VALID_ACTIONS)], dtype=tf.float32)
        self.y_weights = tf.placeholder(shape=[None, len(VALID_ACTIONS)], dtype=tf.float32)

        X_grayscale = tf.to_float(tf.image.rgb_to_grayscale(self.X_pl))
        conv1 = tf.contrib.layers.conv2d(
            X_grayscale, 16, 8, 4, activation_fn=tf.nn.relu)
        conv2 = tf.contrib.layers.conv2d(
            conv1, 32, 4, 2, activation_fn=tf.nn.relu)
        conv2_flat = tf.contrib.layers.flatten(conv2)
        fc = tf.contrib.layers.fully_connected(conv2_flat, len(VALID_ACTIONS))
        self.predictions = fc

        # Calculate the loss
        self.losses = (self.y_pl - fc)**2
        self.losses_masked = self.losses * self.y_weights
        self.losses_per_example = tf.reduce_sum(self.losses_masked, reduction_indices=1)
        self.loss = tf.reduce_mean(self.losses_per_example)
        self.train_op = tf.contrib.layers.optimize_loss(
            self.loss, tf.contrib.framework.get_global_step(), learning_rate=0.001, optimizer="Adam")
    
    def predict(self, s):
        sess = tf.get_default_session()
        # Crop the image to 160x160 pixels
        state = self.preprocess_state(s)
        feed_dict = { self.X_pl: state }
        return sess.run(self.predictions, feed_dict)
    
    def update(self, s, a, y):
        sess = tf.get_default_session()
        state = self.preprocess_state(s)
        loss_weights = np.zeros([len(y), len(VALID_ACTIONS)])
        loss_weights[:, a] = 1.0
        y_feed = np.zeros_like(loss_weights)
        y_feed[:, a] = y
        feed_dict = { self.X_pl: state, self.y_pl: y_feed, self.y_weights: loss_weights }
        return sess.run(self.train_op, feed_dict)

In [233]:
tf.reset_default_graph()
e = Estimator(scope="aa")
with tf.Session() as sess:
    sess.run(tf.initialize_all_variables())
    observation = env.reset()
    observations = np.expand_dims(observation, 0)
    print(e.predict(observations))

    # Example training step
    y = np.array([[33.0]])
    a = np.array([1])
    print(e.update(observations, a, y))

# tv = tf.trainable_variables()
# v = tv[0]
# print(v.name)

# e.close()

[[ 23.40990448   0.        ]]
1089.0


In [246]:
def copy_model_parameters(estimator1, estimator2):
    e1_params = [t for t in tf.trainable_variables() if t.name.startswith(estimator1.scope)]
    e1_params = sorted(e1_params, key=lambda v: v.name)
    e2_params = [t for t in tf.trainable_variables() if t.name.startswith(estimator2.scope)]
    e2_params = sorted(e2_params, key=lambda v: v.name)
    # print([v.name for v in e1_params])
    # print([v.name for v in e2_params])
    # print([v.name for v in tf.trainable_variables()])
    
    update_ops = []
    for e1_v, e2_v in zip(e1_params, e2_params):
        op = e2_v.assign(e1_v)
        update_ops.append(op)
    # print(update_ops)
    
    sess = tf.get_default_session()
    sess.run(update_ops)
    

# observation = env.reset()
# tf.reset_default_graph()
# e1 = Estimator(scope="e1")
# e2 = Estimator(scope="e2")
# with tf.Session() as sess:
#     sess.run(tf.initialize_all_variables())
#     print(e1.predict(observations))
#     print(e2.predict(observations))
#     copy_model_parameters(e1, e2)
#     print(e2.predict(observations))

In [247]:
def make_epsilon_greedy_policy(estimator, epsilon, nA):
    """
    Creates an epsilon-greedy policy based on a given Q-function approximator and epsilon.
    
    Args:
        estimator: An estimator that returns q values for a given state
        epsilon: The probability to select a random action . float between 0 and 1.
        nA: Number of actions in the environment.
    
    Returns:
        A function that takes the observation as an argument and returns
        the probabilities for each action in the form of a numpy array of length nA.
    
    """
    def policy_fn(observation):
        A = np.ones(nA, dtype=float) * epsilon / nA
        q_values = estimator.predict(np.expand_dims(observation, 0))[0]
        best_action = np.argmax(q_values)
        A[best_action] += (1.0 - epsilon)
        return A
    return policy_fn

In [256]:
def deep_q_learning(env, estimator, target_estimator, num_episodes,
                    replay_memory_size=500000,
                    update_target_estimator_every=1000, discount_factor=1.0,
                    epsilon=0.5, epsilon_decay=1.0, epsilon_min=0.1, batch_size=8):
    """
    Q-Learning algorithm for fff-policy TD control using Function Approximation.
    Finds the optimal greedy policy while following an epsilon-greedy policy.
    
    Args:
        env: OpenAI environment.
        estimator: Action-Value function estimator
        num_episodes: Number of episodes to run for.
        discount_factor: Lambda time discount factor.
        epsilon: Chance the sample a random action. Float betwen 0 and 1.
        epsilon_decay: Each episode, epsilon is decayed by this factor
    
    Returns:
        An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """

    Transition = namedtuple("Transition", ["state", "action", "reward", "next_state"])
    replay_memory = deque(maxlen=replay_memory_size)
    
    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes))
    
    
    total_t = 0
    
    for i_episode in range(num_episodes):
        
        if total_t % update_target_estimator_every == 0:
            copy_model_parameters(estimator, target_estimator)
        
        # The policy we're following
        epsilon_ = max(epsilon_decay**i_episode, epsilon_min)
        policy = make_epsilon_greedy_policy(
            estimator, epsilon_ , len(VALID_ACTIONS))
        
        # Print out which episode we're on, useful for debugging.
        # Also print reward for last episode
        last_reward = stats.episode_rewards[i_episode - 1]
        # print("\rEpisode {}/{} ({})".format(i_episode + 1, num_episodes, last_reward), end="")
        # sys.stdout.flush()
        
        # Reset the environment and pick the first action
        state = env.reset()
        
        # One step in the environment
        for t in itertools.count():
            
            # env.render()
            
            print("\rStep {} @ Episode {}/{} ({})".format(t, i_episode + 1, num_episodes, last_reward), end="")
            sys.stdout.flush()
            
            # Take a step
            action_probs = policy(state)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            next_state, reward, done, _ = env.step(VALID_ACTIONS[action])
            
            # Save transition
            replay_memory.append(Transition(state, action, reward, next_state))
            
            # Update statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            # Sample from the replay memory
            sample_len = min(batch_size, len(replay_memory))
            sample_idx = np.random.choice(len(replay_memory), sample_len, replace=False)
            samples = [replay_memory[_] for _ in sample_idx]
            states_batch, action_batch, reward_batch, next_states_batch = map(np.array, zip(*[_ for _ in samples]))
            
            # Calculate q values and targets
            q_values_next = target_estimator.predict(next_states_batch)
            targets_batch = reward_batch * discount_factor * np.amax(q_values_next, axis=1)
            
            # TODO: Set terminal targets to reward
            
            # Perform gradient descent update
            estimator.update(states_batch, action_batch, targets_batch)

            # TD Update
            # next_state_batch = np.expand_dims(next_state, 0)
            # q_values_next = estimator.predict(next_state_batch)[0]
            
            # Q-Value TD Target
            # td_target = reward + discount_factor * np.max(q_values_next)
            
            # SARSA TD Target for on policy-training
            # next_action_probs = policy(next_state)
            # next_action = np.random.choice(np.arange(len(next_action_probs)), p=next_action_probs)             
            # td_target = reward + discount_factor * q_values_next[next_action]
            
            # Update the function approximator using our target
            # estimator.update(np.array([state]), np.array([action]), np.array([td_target]))
                
            if done:
                break
            
            total_t += 1
            state = next_state
    
    return stats

In [257]:
tf.reset_default_graph()
q_estimator = Estimator(scope="q")
target_estimator = Estimator(scope="target_q")
stats = None
with tf.Session() as sess:
    sess.run(tf.initialize_all_variables())
    stats = deep_q_learning(env, q_estimator, target_estimator, 10000, update_target_estimator_every=10000, epsilon=1.0, epsilon_decay=0.999, epsilon_min=0.1)

Step 934 @ Episode 1/1000 (0.0)

KeyboardInterrupt: 