# Reinforcement Learning - Glider

### Dustin D. Gerrard
### July 15, 2018

In [None]:
from glider import Glider
import tensorflow as tf
import numpy as np
import math
import matplotlib.pyplot as plt
import random

# Neural Fitted Q Iteration: http://ml.informatik.uni-freiburg.de/former/_media/publications/rieecml05.pdf

# STATES 
# (x, y, x_dot, y_dot, phi)
# NOTE(review): later cells plot state[2] as the vertical coordinate and
# combine state[1] and state[3] into a landing speed, which suggests the
# layout in use is (x, x_dot, y, y_dot, phi) -- confirm against glider.Glider.

# DISCRETE ACTIONS
# Control inputs the episode loops sample from with random.choice().
ACTIONS = (-1.0, 0.0, 1.0)

# TARGET (X)
# Desired landing x-coordinate; the target cost is abs(target_x - landing x).
target_x = 125

In [None]:
def get_epsilon(epsilon_init, epsilon_min, episode_step, num_steps_epsilon_decay):
    """Return the exploration rate for ``episode_step``.

    The rate starts at ``epsilon_init`` and is scaled by a factor of 0.9 for
    every ``num_steps_epsilon_decay`` steps elapsed (fractional steps decay
    proportionally). The result is never lower than ``epsilon_min``.
    """
    decay_exponent = episode_step/num_steps_epsilon_decay
    decayed = epsilon_init*np.power(0.9, decay_exponent)
    return epsilon_min if decayed < epsilon_min else decayed

In [None]:
def plot_trajectory(episode_list, target=125.0):
    """Draw one episode's flight path and mark the landing target.

    Args:
        episode_list: sequence of per-step records whose first element is the
            state. state[0] is plotted on the horizontal axis and state[2]
            on the vertical axis.
        target: x-coordinate of the target marker, drawn at height 0.
            Defaults to 125.0, the value that used to be hard-coded here.

    The figure is deliberately not shown; the caller invokes plt.show() once
    so that several trajectories can share the same axes.
    """
    states = [record[0] for record in episode_list]
    xs = [state[0] for state in states]
    ys = [state[2] for state in states]
    plt.plot(xs, ys)
    plt.scatter(target, 0)


In [None]:
def training_data(episode_list, total_cost, w_effort):
    """Turn one episode into (inputs, targets) for fitting the network.

    Args:
        episode_list: per-step records; record[0] is the state and record[2]
            is the cost incurred on that step.
        total_cost: target value assigned to the first step of the episode.
        w_effort: weight applied to each step cost before it is subtracted.

    Returns:
        A pair of lists. The first holds the first five entries of each
        state; the second holds the matching target, which starts at
        ``total_cost`` and shrinks by ``cost * w_effort`` after every step.
    """
    inputs, targets = [], []
    remaining = total_cost
    for record in episode_list:
        inputs.append(record[0][0:5])
        targets.append(remaining)
        remaining -= record[2]*w_effort
    return inputs, targets

In [2]:
# Layer widths: 5 inputs, two hidden layers of 10 units each, 1 output.
num_input = 5
num_l1 = 10
num_l2 = 10
num_out = 1
# Weight matrices keyed by layer, initialised from tf.random_normal.
W = {
    'h1':tf.Variable(tf.random_normal([num_input,num_l1])),
    'h2':tf.Variable(tf.random_normal([num_l1, num_l2])),
    'out':tf.Variable(tf.random_normal([num_l2, num_out]))
}    
# Bias vectors keyed by layer, one entry per unit of that layer.
b = {
    'b1':tf.Variable(tf.random_normal([num_l1])),
    'b2':tf.Variable(tf.random_normal([num_l2])),
    'out':tf.Variable(tf.random_normal([num_out]))
}

def multilayer_perceptron(x, W, b):
    """Build a two-hidden-layer ReLU network with a linear output layer.

    Args:
        x: input tensor that is matrix-multiplied with W['h1'].
        W: dict of weight tensors with keys 'h1', 'h2' and 'out'.
        b: dict of bias tensors with keys 'b1', 'b2' and 'out'.

    Returns:
        The output-layer tensor (no activation applied).
    """
    hidden_1 = tf.nn.relu(tf.add(tf.matmul(x, W['h1']), b['b1']))
    hidden_2 = tf.nn.relu(tf.add(tf.matmul(hidden_1, W['h2']), b['b2']))
    return tf.add(tf.matmul(hidden_2, W['out']), b['out'])

In [None]:
# main learning loop
# Baseline rollouts: every action is drawn uniformly at random from ACTIONS,
# the episode cost is computed, and each trajectory is drawn on one figure.

w_effort = 1.0
w_target = 6.0
num_episodes = 10 # 10
max_step = 1000 # 10k
env = Glider()
for episode in range(num_episodes):
    if episode%10 == 0:
        print('Episode: ' + str(episode))
    # NOTE: epsilon is computed for a planned epsilon-greedy policy but is not
    # consulted yet; the actions below are purely random.
    epsilon = get_epsilon(0.99, 0.1, episode, 10)
    episode_list = []
    state = env.reset()
    effort_cost = 0.0
    for step in range(max_step):
        action = random.choice(ACTIONS)
        cost, done = env.step(action)
        new_state = env.get_state()
        # Each record stores the state the action was taken FROM.
        episode_list.append((state, action, cost))
        state = new_state
        if done:
            break
        # The cost of the terminating step is not added to the effort cost.
        effort_cost += cost
    # NOTE(review): the last record holds the state *before* the final step,
    # so this is the x position one step ahead of termination; `state` holds
    # the state after it -- confirm which one is intended.
    glider_landing_x = episode_list[-1][0][0]
    target_cost = abs(target_x - glider_landing_x)
    total_cost = w_target*target_cost + w_effort*effort_cost

    # plot trajectory
    # Successive plt.plot calls already share the current axes, so no hold
    # call is needed (pyplot.hold no longer exists in Matplotlib 3.x).
    plot_trajectory(episode_list)

plt.show()

In [None]:
effort_cost

In [None]:
target_cost*w_target

In [None]:
total_cost

In [None]:
# Speed computed from the last recorded state of the most recent episode.
state_landing = episode_list[-1][0]
print(state_landing)
# NOTE(review): treats state[1] and state[3] as the two velocity components
# -- confirm against the Glider state layout.
landing_velocity = math.sqrt(state_landing[1]**2 + state_landing[3]**2)
landing_velocity

In [None]:
# NFQ main learning loop
# Each episode is rolled out with random actions; afterwards the Q-network is
# fitted for one gradient step on that episode's (state, cost-to-go) pairs.

# neural net
# The graph is built exactly once. Creating the loss/optimizer or re-running
# the initializer inside the episode loop would add new ops every episode and
# reset the weights, throwing away everything learned so far.
sess = tf.Session()
x = tf.placeholder("float", [None,5])   # network inputs, 5 values per row
y = tf.placeholder("float", [None,1])   # regression targets, 1 value per row
Q_net = multilayer_perceptron(x, W, b)
loss = tf.losses.mean_squared_error(y, Q_net)
train_step = tf.train.GradientDescentOptimizer(1e-8).minimize(loss)

sess.run(tf.global_variables_initializer())

w_effort = 1.0
w_target = 6.0
num_episodes = 10 # 10
max_step = 1000 # 10k
env = Glider()
for episode in range(num_episodes):
    if episode%10 == 0:
        print('Episode: ' + str(episode))
    # NOTE: epsilon and the probed Q values below are computed but not yet
    # used to pick the action; the policy is still purely random.
    epsilon = get_epsilon(0.99, 0.1, episode, 10)
    episode_list = []
    state = env.reset()
    effort_cost = 0.0
    for step in range(max_step):
        # Query the network for each candidate action.
        # NOTE(review): assumes env.probe_step returns a 5-element vector
        # that can be fed to the network without advancing the environment
        # -- confirm against glider.Glider.
        state_action_1 = env.probe_step(1.0)
        state_action_0 = env.probe_step(0.0)
        state_action_minus_1 = env.probe_step(-1.0)
        Q_1 = sess.run(Q_net, {x : [state_action_1,]})
        Q_0 = sess.run(Q_net, {x : [state_action_0,]})
        Q_m1 = sess.run(Q_net, {x : [state_action_minus_1,]})
        print(Q_1)
        print(Q_0)
        print(Q_m1)
        action = random.choice(ACTIONS)
        cost, done = env.step(action)
        new_state = env.get_state()
        # Each record stores the state the action was taken FROM.
        episode_list.append((state, action, cost))
        state = new_state
        if done:
            break
        # The cost of the terminating step is not added to the effort cost.
        effort_cost += cost

    glider_landing_x = episode_list[-1][0][0]
    target_cost = abs(target_x - glider_landing_x)
    total_cost = w_target*target_cost + w_effort*effort_cost

    # train MLP
    A, B = training_data(episode_list, total_cost, w_effort)
    labels = np.transpose([B])   # column vector: one target per row of A
    feed = {x: A, y: labels}
    print(sess.run(loss, feed))
    sess.run(train_step, feed)
    print(sess.run(loss, feed))

    # plot trajectory
    # Successive plt.plot calls already share the current axes, so no hold
    # call is needed (pyplot.hold no longer exists in Matplotlib 3.x).
    plot_trajectory(episode_list)

plt.show()

In [None]:
print(state_action_1)
print(state_action_0)
print(state_action_minus_1)


In [None]:
Q_1 = sess.run(Q_net, {x : [[1,2,3,4,5],]})

In [None]:
print(sess.run(prediction_1))

In [None]:
done

In [None]:
# multiple inputs/outputs
# Fit the network on every step of the most recent episode at once.

import tensorflow as tf

A, B = training_data(episode_list, total_cost, w_effort)

sess = tf.Session()
# The inputs are data, not parameters: mark them non-trainable so that
# minimize() only updates the network weights and leaves the states alone.
x = tf.Variable(A, trainable=False, dtype=tf.float32)
# multilayer_perceptron needs the weight and bias dicts as well as the input.
a = multilayer_perceptron(x, W, b)
print(a)
labels = np.transpose([B])   # column vector: one target per row of A
predictions = a              # reuse the graph built above instead of a copy
loss = tf.losses.mean_squared_error(labels, predictions)
train_step = tf.train.GradientDescentOptimizer(1e-8).minimize(loss)
# NOTE: this also re-initialises W and b, i.e. training starts from scratch.
sess.run(tf.global_variables_initializer())
print(sess.run(loss))
sess.run(train_step)
print(sess.run(loss))

for i in range(100000):
    sess.run(train_step)
    # Report the loss every 1000 iterations.
    if i%1000 == 0:
        print(sess.run(loss))

In [None]:
a, b = training_data(episode_list, total_cost, w_effort)
a

In [None]:
X_NN0 = training_data(episode_list, total_cost, w_effort)

In [None]:
episode_list[len(episode_list)-2][2]

In [None]:
plot_trajectory(episode_list)

In [None]:
len(episode_list)

In [None]:
import tensorflow

In [None]:
action = random.choice(ACTIONS)
action

In [None]:
 X, Y, A = [], [], [],
for i in range(len(episode_list)):
    state = episode_list[i][0]
    X.append(state[0])
    Y.append(state[2])
    A.append(episode_list[i][1])

In [None]:
episode_list

In [None]:
state = env.reset()
env.get_state()

In [None]:
env.step(1.0)
env.get_state()

In [None]:
plt.show()

In [None]:
done

In [None]:
# Scratch experiment: multiply one hand-written row by a random 5x2 matrix.
# WARNING: this cell rebinds the notebook-level names num_l1 and W. After it
# runs, W only contains 'h1', so multilayer_perceptron(x, W, b) will fail
# until the cell that defines the full W/b dicts is executed again.
x = tf.Variable([[0.0,1.0,2.0,3.0,4.0],])
# multilayer_perceptron(x)
num_input = 5 
num_l1 = 2
W = {'h1':tf.Variable(tf.random_normal([num_input,num_l1]))}
a = tf.matmul(x, W['h1'])
# W['h1']
sess = tf.Session()
# Printing the Variable object itself shows its graph description, not values.
print(W['h1'])
# Variables must be initialised in this session before they can be evaluated.
sess.run(tf.global_variables_initializer())
print(sess.run(W['h1']))
print(sess.run(x))
print(sess.run(a))

In [None]:
# Forward pass of the network on a single hand-written input row.
x = tf.Variable([[0.0,1.0,2.0,3.0,4.0],])
# multilayer_perceptron takes the weight and bias dicts as well as the input.
# W must be the full dict with 'h1', 'h2' and 'out', not the 'h1'-only dict
# created by the matmul scratch cell.
a = multilayer_perceptron(x, W, b)
print(a)
sess = tf.Session()
sess.run(tf.global_variables_initializer())
print(sess.run(a))
sess.close()

In [None]:
print(sess.run(a))

In [39]:
# Working neural net

import tensorflow as tf

# Layer widths: 5 inputs, two hidden layers of 10 units each, 1 output.
num_input = 5
num_l1 = 10
num_l2 = 10
num_out = 1
# Fresh weight matrices keyed by layer; this rebinds the notebook-level W.
W = {
    'h1':tf.Variable(tf.random_normal([num_input,num_l1])),
    'h2':tf.Variable(tf.random_normal([num_l1, num_l2])),
    'out':tf.Variable(tf.random_normal([num_l2, num_out]))
}    
# Fresh bias vectors keyed by layer; this rebinds the notebook-level b.
b = {
    'b1':tf.Variable(tf.random_normal([num_l1])),
    'b2':tf.Variable(tf.random_normal([num_l2])),
    'out':tf.Variable(tf.random_normal([num_out]))
}

def multilayer_perceptron(x, W, b):
    """Build a two-hidden-layer ReLU network with a linear output layer.

    Args:
        x: input tensor that is matrix-multiplied with W['h1'].
        W: dict of weight tensors with keys 'h1', 'h2' and 'out'.
        b: dict of bias tensors with keys 'b1', 'b2' and 'out'.

    Returns:
        The output-layer tensor (no activation applied).
    """
    hidden_1 = tf.nn.relu(tf.add(tf.matmul(x, W['h1']), b['b1']))
    hidden_2 = tf.nn.relu(tf.add(tf.matmul(hidden_1, W['h2']), b['b2']))
    return tf.add(tf.matmul(hidden_2, W['out']), b['out'])

# Feed inputs through a placeholder so different batches can be supplied
# on every run call.
# x = tf.Variable([[0.0,1.0,2.0,3.0,4.0],])
x = tf.placeholder("float", [None,num_input])
y_ = multilayer_perceptron(x, W, b)
# Print the tensor built in this cell (not a leftover from an earlier cell).
print(y_)

y = tf.placeholder(tf.float32, [None, 1])   # one target value per input row
labels = y
predictions = y_
loss = tf.losses.mean_squared_error(labels, predictions)
train_step = tf.train.GradientDescentOptimizer(0.001).minimize(loss)

# A single session is enough; it is created after the graph is complete and
# the variables are initialised with the non-deprecated initializer.
sess = tf.Session()
sess.run(tf.global_variables_initializer())

# Running train_step yields None, so each printed pair is [None, loss].
print(sess.run([train_step, loss], feed_dict={x:[[1,2,3,4,5],[2,3,4,5,6]], y:[[1],[2]]}))

for i in range(100):
    print(sess.run([train_step, loss], feed_dict={x:[[1,2,3,4,5],], y:[[1]]}))

Tensor("Add_47:0", shape=(?, 1), dtype=float32)
Instructions for updating:
Use `tf.global_variables_initializer` instead.
[None, 103.09693]
[None, 11.722347]
[None, 1.5383936]
[None, 0.15390986]
[None, 0.014678886]
[None, 0.0013974456]
[None, 0.00013286489]
[None, 1.2633786e-05]
[None, 1.1973219e-06]
[None, 1.1201075e-07]
[None, 1.055929e-08]
[None, 9.2768815e-10]
[None, 5.287859e-11]
[None, 6.9633188e-13]
[None, 6.9633188e-13]
[None, 6.9633188e-13]
[None, 6.9633188e-13]
[None, 6.9633188e-13]
[None, 1.2789769e-13]
[None, 1.2789769e-13]
[None, 1.2789769e-13]
[None, 1.2789769e-13]
[None, 1.2789769e-13]
[None, 1.2789769e-13]
[None, 1.2789769e-13]
[None, 1.2789769e-13]
[None, 1.2789769e-13]
[None, 1.2789769e-13]
[None, 1.2789769e-13]
[None, 1.2789769e-13]
[None, 1.2789769e-13]
[None, 1.2789769e-13]
[None, 1.2789769e-13]
[None, 1.2789769e-13]
[None, 1.2789769e-13]
[None, 1.2789769e-13]
[None, 1.2789769e-13]
[None, 1.2789769e-13]
[None, 1.2789769e-13]
[None, 1.2789769e-13]
[None, 1.2789769e-

In [12]:
sess.run(x)

array([[ 0.16691241,  0.59427404,  2.37687016,  2.3382268 ,  3.4388082 ]], dtype=float32)

In [None]:
# Query the trained network for one input row. y_ is the network output built
# on the placeholder x; the name `a` belongs to older scratch cells and does
# not refer to this model.
model_result = sess.run(y_ , {x : [[0.0,1.0,2.0,3.0,4.0],]})
model_result[0][0]

In [None]:
# Network Parameters
# Sizes for an MNIST-shaped classifier template.
n_hidden_1 = 256 # 1st layer number of features
n_hidden_2 = 256 # 2nd layer number of features
n_input = 784 # MNIST data input (img shape: 28*28)
n_classes = 10 # MNIST total classes (0-9 digits)
 
# tf Graph input
# NOTE: this rebinds the notebook-level x and y placeholders to the
# MNIST-sized shapes, replacing the 5-input glider placeholders.
x = tf.placeholder("float", [None, n_input])
y = tf.placeholder("float", [None, n_classes])

def multilayer_perceptron(x, weights, biases):
    """Build a two-hidden-layer ReLU network with a linear output layer.

    Args:
        x: input tensor that is matrix-multiplied with weights['h1'].
        weights: dict of weight tensors with keys 'h1', 'h2' and 'out'.
        biases: dict of bias tensors with keys 'b1', 'b2' and 'out'.

    Returns:
        The output-layer tensor (no activation applied).
    """
    # First hidden layer: affine transform followed by ReLU
    hidden_1 = tf.nn.relu(tf.add(tf.matmul(x, weights['h1']), biases['b1']))
    # Second hidden layer: affine transform followed by ReLU
    hidden_2 = tf.nn.relu(tf.add(tf.matmul(hidden_1, weights['h2']), biases['b2']))
    # Linear output layer
    return tf.matmul(hidden_2, weights['out']) + biases['out']
 
# Store layers weight & bias
# Weight matrices for the two hidden layers and the output layer.
weights = {
    'h1': tf.Variable(tf.random_normal([n_input, n_hidden_1])),
    'h2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2])),
    'out': tf.Variable(tf.random_normal([n_hidden_2, n_classes])),
}
# Bias vectors, one entry per unit of the corresponding layer.
biases = {
    'b1': tf.Variable(tf.random_normal([n_hidden_1])),
    'b2': tf.Variable(tf.random_normal([n_hidden_2])),
    'out': tf.Variable(tf.random_normal([n_classes])),
}

# Construct model
pred = multilayer_perceptron(x, weights, biases)