In [None]:
%matplotlib inline

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import math
import pickle

In [None]:
class Environ:
    """Single-angle environment driven by a fixed-size push.

    Attributes set by ``reset``:
      th  -- current angle
      dth -- current angular velocity
      g   -- coefficient of the ``-g*sin(th)`` restoring term
      seq -- the last ``NUM_STATES`` angles, most recent first
    """

    def __init__(self, theta, dtheta):
        self.reset(theta, dtheta)

    def reset(self, theta, dtheta):
        """Restart from angle ``theta`` / velocity ``dtheta`` with a zeroed history."""
        self.g = 0.01
        self.th, self.dth = theta, dtheta
        self.seq = np.zeros(NUM_STATES)

    def get_reward(self):
        """Return ``-cos(th)``; positive values are multiplied by 5."""
        height = -np.cos(self.th)
        return 5*height if height > 0 else height

    def update_state(self, action):
        """Advance one step; only the sign of ``action`` is used."""
        push = 0.005 * np.sign(action)
        # Velocity first, then angle, so the angle moves by the updated velocity.
        self.dth += -self.g*np.sin(self.th)+push
        self.th += self.dth
        # Shift the history one slot towards the end and store the new angle
        # at the front.
        self.seq[1:NUM_STATES] = self.seq[:NUM_STATES-1]
        self.seq[0] = self.th

In [None]:
class Qnet(object): # super class
    """Q-learning logic shared by the network classes.

    This class builds no graph itself. The methods below read the following
    attributes, which a subclass must define: ``x`` (input placeholder),
    ``q`` (Q-value output), ``y_`` (target placeholder), ``loss``,
    ``train_step`` and ``sess``.
    """
    def __init__(self):
        pass

    def get_action_values(self, seq):
        """Return the Q-value of every action for the single state ``seq``."""
        action_values = self.q.eval(session=self.sess, feed_dict={self.x:[seq]})
        return action_values[0]

    def update_model(self, experience_memory, gamma=0.9):
        """Run one training step on a random batch of stored experiences.

        Each experience is a flat vector laid out as
        ``[seq (NUM_STATES values), action, reward, new_seq]``.

        Args:
            experience_memory: sequence of experience vectors.
            gamma: discount applied to the best next-state Q-value.

        Returns:
            ``None`` if fewer than ``BATCH_NUM`` experiences are available,
            otherwise the loss on the batch evaluated after the training step.

        Raises:
            ValueError: if a stored action is not a member of ``Actions``.
        """
        if len(experience_memory) < BATCH_NUM:
            return

        # Sample BATCH_NUM experiences uniformly, with replacement.
        batch_index = np.random.randint(0, len(experience_memory), BATCH_NUM)
        batch = np.array([experience_memory[i] for i in batch_index])
        xs = batch[:, 0:NUM_STATES]
        actions = batch[:, NUM_STATES]
        rewards = batch[:, NUM_STATES+1]
        new_xs = batch[:, NUM_STATES+2:]

        # Start from the current predictions so that only the entry of the
        # action actually taken contributes to the loss.
        targets = self.q.eval(session=self.sess, feed_dict={self.x: xs})
        # One batched evaluation of all next states instead of one call each.
        next_values = self.q.eval(session=self.sess, feed_dict={self.x: new_xs})
        best_next = np.max(next_values, axis=1)

        action_list = list(Actions)
        for i in range(len(batch)):
            action_index = action_list.index(actions[i])
            # Index by sample AND action: writing targets[action_index] would
            # overwrite an entire row belonging to another sample.
            targets[i, action_index] = rewards[i] + gamma * best_next[i]

        self.sess.run(self.train_step, feed_dict={self.x:xs, self.y_:targets})
        return self.loss.eval(session = self.sess, feed_dict={self.x:xs, self.y_:targets})

In [None]:
class Agent:
    """Epsilon-greedy action selection plus ``TOP`` slots of stored episodes."""

    def __init__(self):
        self.epsilon = 0.7
        # One score and one (initially empty) episode record per slot.
        self.best_episodes = [-1000 for _ in range(TOP)]
        self.episode_memory = [list() for _ in range(TOP)]

    def get_action(self, qnet, seq, train=True):
        """Pick an action for state ``seq``.

        While training, a uniformly random action is returned with
        probability ``epsilon``; otherwise the action with the largest
        Q-value reported by ``qnet`` is returned.
        """
        explore = train and np.random.random() < self.epsilon
        if explore:
            return np.random.choice(Actions)
        greedy_index = np.argmax(qnet.get_action_values(seq))
        return Actions[greedy_index]

    def get_memory(self):
        """Return all stored experiences as one flat list, slot by slot."""
        return [step for episode in self.episode_memory for step in episode]

In [None]:
def run(qnet, agent, train=True, steps=300, keep_prob=0.05):
    """Play one episode, possibly store it, and optionally train on memory.

    Args:
        qnet: object providing ``update_model(memory)``; also handed to
            ``agent.get_action``.
        agent: object providing ``get_action``, ``get_memory`` and the
            ``best_episodes`` / ``episode_memory`` lists.
        train: forwarded to ``agent.get_action``; when true, one
            ``qnet.update_model`` call is made after the episode.
        steps: number of environment steps in the episode.
        keep_prob: probability of storing the episode even when its total
            reward does not beat the lowest stored score.

    Returns:
        ``(qnet, total_reward, log, loss)`` where ``log`` holds one
        ``[angle, action]`` pair per step and ``loss`` is the value returned
        by ``update_model`` (``0`` when ``train`` is false).
    """
    env = Environ(0, 0)
    total_reward = 0
    log = []
    episode_record = []

    # Run single episode to record experience
    for _ in range(steps):
        old_seq = env.seq.copy()
        action = agent.get_action(qnet, old_seq, train)
        env.update_state(action)
        reward = env.get_reward()
        total_reward += reward
        new_seq = env.seq.copy()
        # Flat experience vector: [old_seq, action, reward, new_seq].
        episode_record.append(np.hstack([old_seq, action, reward, new_seq]))
        log.append([env.th, action])

    # Store the episode if it beats the lowest stored score, or at random
    # with probability keep_prob. The oldest slot (index 0) is dropped.
    if np.min(agent.best_episodes) < total_reward or np.random.random() < keep_prob:
        agent.best_episodes = agent.best_episodes[1:]
        agent.best_episodes.append(total_reward)
        agent.episode_memory = agent.episode_memory[1:]
        agent.episode_memory.append(episode_record)

    loss = qnet.update_model(agent.get_memory()) if train else 0

    return qnet, total_reward, log, loss

In [None]:
class Perceptron(Qnet):
    """Q-network with no hidden layer: ``q = x . w + b``."""

    def __init__(self):
        n_actions = len(Actions)
        init_std = 1.0 / math.sqrt(2.0)

        with tf.name_scope('input'):
            self.x = tf.placeholder(tf.float32, [None, NUM_STATES])

        with tf.name_scope('output'):
            w_init = tf.truncated_normal([NUM_STATES, n_actions], stddev=init_std)
            self.w = tf.Variable(w_init, name='weights')
            self.b = tf.Variable(tf.zeros([n_actions]), name='biases')
            self.q = tf.matmul(self.x, self.w) + self.b

        with tf.name_scope('optimizer'):
            # y_ receives the target Q-values; the loss is the mean squared
            # difference between targets and predictions.
            self.y_ = tf.placeholder(tf.float32, [None, n_actions])
            error = self.y_ - self.q
            self.loss = tf.reduce_mean(tf.square(error))
            self.train_step = tf.train.AdamOptimizer().minimize(self.loss)

        # Each instance owns its session.
        # NOTE(review): later TensorFlow 1.x releases expose this initializer
        # as tf.global_variables_initializer -- confirm against the installed
        # version before switching.
        self.sess = tf.Session()
        init_op = tf.initialize_all_variables()
        self.sess.run(init_op)

In [None]:
class SingleNet(Qnet):
    """Q-network with one ReLU hidden layer of ``H1_UNITS`` units."""

    def __init__(self):
        n_actions = len(Actions)
        init_std = 1.0 / math.sqrt(2.0)

        with tf.name_scope('input'):
            self.x = tf.placeholder(tf.float32, [None, NUM_STATES])

        with tf.name_scope('hidden1'):
            w1_init = tf.truncated_normal([NUM_STATES, H1_UNITS], stddev=init_std)
            self.w1 = tf.Variable(w1_init, name='weights')
            self.b1 = tf.Variable(tf.zeros([1, H1_UNITS]), name='biases')
            self.hidden1 = tf.nn.relu(tf.matmul(self.x, self.w1) + self.b1)

        with tf.name_scope('output'):
            w_init = tf.truncated_normal([H1_UNITS, n_actions], stddev=init_std)
            self.w = tf.Variable(w_init, name='weights')
            self.b = tf.Variable(tf.zeros([n_actions]), name='biases')
            self.q = tf.matmul(self.hidden1, self.w) + self.b

        with tf.name_scope('optimizer'):
            # y_ receives the target Q-values; the loss is the mean squared
            # difference between targets and predictions.
            self.y_ = tf.placeholder(tf.float32, [None, n_actions])
            error = self.y_ - self.q
            self.loss = tf.reduce_mean(tf.square(error))
            self.train_step = tf.train.AdamOptimizer().minimize(self.loss)

        # Each instance owns its session.
        # NOTE(review): later TensorFlow 1.x releases expose this initializer
        # as tf.global_variables_initializer -- confirm against the installed
        # version before switching.
        self.sess = tf.Session()
        init_op = tf.initialize_all_variables()
        self.sess.run(init_op)

In [None]:
class DoubleNet(Qnet):
    """Q-network with two ReLU hidden layers (``H1_UNITS`` then ``H2_UNITS``)."""

    def __init__(self):
        n_actions = len(Actions)
        init_std = 1.0 / math.sqrt(2.0)

        with tf.name_scope('input'):
            self.x = tf.placeholder(tf.float32, [None, NUM_STATES])

        with tf.name_scope('hidden1'):
            w1_init = tf.truncated_normal([NUM_STATES, H1_UNITS], stddev=init_std)
            self.w1 = tf.Variable(w1_init, name='weights')
            self.b1 = tf.Variable(tf.zeros([1, H1_UNITS]), name='biases')
            self.hidden1 = tf.nn.relu(tf.matmul(self.x, self.w1) + self.b1)

        with tf.name_scope('hidden2'):
            w2_init = tf.truncated_normal([H1_UNITS, H2_UNITS], stddev=init_std)
            self.w2 = tf.Variable(w2_init, name='weights')
            self.b2 = tf.Variable(tf.zeros([1, H2_UNITS]), name='biases')
            self.hidden2 = tf.nn.relu(tf.matmul(self.hidden1, self.w2) + self.b2)

        with tf.name_scope('output'):
            w_init = tf.truncated_normal([H2_UNITS, n_actions], stddev=init_std)
            self.w = tf.Variable(w_init, name='weights')
            self.b = tf.Variable(tf.zeros([n_actions]), name='biases')
            self.q = tf.matmul(self.hidden2, self.w) + self.b

        with tf.name_scope('optimizer'):
            # y_ receives the target Q-values; the loss is the mean squared
            # difference between targets and predictions.
            self.y_ = tf.placeholder(tf.float32, [None, n_actions])
            error = self.y_ - self.q
            self.loss = tf.reduce_mean(tf.square(error))
            self.train_step = tf.train.AdamOptimizer().minimize(self.loss)

        # Each instance owns its session.
        # NOTE(review): later TensorFlow 1.x releases expose this initializer
        # as tf.global_variables_initializer -- confirm against the installed
        # version before switching.
        self.sess = tf.Session()
        init_op = tf.initialize_all_variables()
        self.sess.run(init_op)

## Perceptron (no hidden layer)

In [None]:
# Settings read as globals by the classes and by run().
NUM_STATES = 4    # State = the 4 most recent angles
TOP = 1000        # Number of episode slots in the agent's memory
BATCH_NUM = 500   # Experiences sampled per model update
Actions = [-1, 1] # Possible actions
N = 50000         # Number of training episodes

qnet = Perceptron()
agent = Agent()

best = -1000
saver = tf.train.Saver()
for i in range(N):
    # One exploratory episode followed by one model update.
    qnet, total_reward, log, loss = run(qnet, agent, train=True)
    if (i+1) % 10 != 0:
        continue

    # Every 10th iteration: evaluate with train=False (no random exploration,
    # no model update).
    qnet, total_reward, log, _ = run(qnet, agent, train=False)
    if total_reward > best:
        # New best evaluation: keep its trajectory and a checkpoint.
        best = total_reward
        with open('log_for_l0_%04d' % int(best), mode='wb') as f:
            pickle.dump(log, f)
        saver.save(qnet.sess, 'train_data_l0', global_step=i)
    print (i+1, best, total_reward, loss)

## Single hidden layer

In [None]:
# Settings read as globals by the classes and by run().
NUM_STATES = 4    # State = the 4 most recent angles
H1_UNITS = 1024   # Units in the hidden layer
TOP = 1000        # Number of episode slots in the agent's memory
BATCH_NUM = 500   # Experiences sampled per model update
Actions = [-1, 1] # Possible actions
N = 50000         # Number of training episodes

qnet = SingleNet()
agent = Agent()

best = -1000
saver = tf.train.Saver()
for i in range(N):
    # One exploratory episode followed by one model update.
    qnet, total_reward, log, loss = run(qnet, agent, train=True)
    if (i+1) % 10 != 0:
        continue

    # Every 10th iteration: evaluate with train=False (no random exploration,
    # no model update).
    qnet, total_reward, log, _ = run(qnet, agent, train=False)
    if total_reward > best:
        # New best evaluation: keep its trajectory and a checkpoint.
        best = total_reward
        with open('log_for_l1_%04d' % int(best), mode='wb') as f:
            pickle.dump(log, f)
        saver.save(qnet.sess, 'train_data_l1', global_step=i)
    print (i+1, best, total_reward, loss)

## Two hidden layers

In [None]:
# Settings read as globals by the classes and by run().
NUM_STATES = 4    # State = the 4 most recent angles
H1_UNITS = 512    # Units in the first hidden layer
H2_UNITS = 1024   # Units in the second hidden layer
TOP = 1000        # Number of episode slots in the agent's memory
BATCH_NUM = 500   # Experiences sampled per model update
Actions = [-1, 1] # Possible actions
N = 50000         # Number of training episodes

qnet = DoubleNet()
agent = Agent()

best = -1000
saver = tf.train.Saver()
for i in range(N):
    # One exploratory episode followed by one model update.
    qnet, total_reward, log, loss = run(qnet, agent, train=True)
    if (i+1) % 10 != 0:
        continue

    # Every 10th iteration: evaluate with train=False (no random exploration,
    # no model update).
    qnet, total_reward, log, _ = run(qnet, agent, train=False)
    if total_reward > best:
        # New best evaluation: keep its trajectory and a checkpoint.
        best = total_reward
        with open('log_for_l2_%04d' % int(best), mode='wb') as f:
            pickle.dump(log, f)
        saver.save(qnet.sess, 'train_data_l2', global_step=i)
    print (i+1, best, total_reward, loss)