# q_learning_player.rb
require 'ruby-fann'

class QLearningPlayer
  attr_accessor :x, :y, :game

  def initialize
    @x = 0
    @y = 0
    @actions = [:left, :right, :up, :down]
    @first_run = true

    @discount = 0.9
    @epsilon = 0.1
    @max_epsilon = 0.9
    @epsilon_increase_factor = 800.0

    @replay_memory_size = 500
    @replay_memory_pointer = 0
    @replay_memory = []
    @replay_batch_size = 400

    @runs = 0

    @r = Random.new
  end
  def initialize_q_neural_network
    # Setup model
    # Input size is the size of the map plus the number of actions
    # Output size is one (the estimated Q-value for that state/action pair)
    @q_nn_model = RubyFann::Standard.new(
      num_inputs: @game.map_size_x*@game.map_size_y + @actions.length,
      hidden_neurons: [(@game.map_size_x*@game.map_size_y + @actions.length)],
      num_outputs: 1)

    @q_nn_model.set_learning_rate(0.2)
    @q_nn_model.set_activation_function_hidden(:sigmoid_symmetric)
    @q_nn_model.set_activation_function_output(:sigmoid_symmetric)
  end
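
  # Illustration (added for clarity, not part of the original code): the network
  # input is a one-hot encoding of the player position followed by a one-hot
  # encoding of the action under evaluation. Assuming a hypothetical 4x3 map,
  # the vector has 4*3 + 4 = 16 entries; a player at x=2, y=1 evaluating :up
  # (action index 2) would be encoded as:
  #
  #   input = Array.new(4*3 + 4, 0)
  #   input[2 + 4*1] = 1   # position one-hot: index x + map_size_x*y = 6
  #   input[4*3 + 2] = 1   # action one-hot: index map_size_x*map_size_y + action_index = 14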
  def get_input
    # Pause to make sure humans can follow along
    # Increase the pause with the number of runs
    sleep 0.05 + 0.01*(@runs/400.0)
    @runs += 1

    if @first_run
      # If this is the first run initialize the Q-neural network
      initialize_q_neural_network
      @first_run = false
    else
      # If this is not the first run
      # Evaluate what happened on the last action and calculate the reward
      r = 0 # default is 0
      if !@game.new_game and @old_score < @game.score
        r = 1 # reward is 1 if our score increased
      elsif !@game.new_game and @old_score > @game.score
        r = -1 # reward is -1 if our score decreased
      elsif !@game.new_game
        r = -0.1 # small penalty for a move that did not change the score
      end

      # Capture the current state
      # The network input is a map_size_x * map_size_y + actions.length vector with a 1 at the player position
      input_state = Array.new(@game.map_size_x*@game.map_size_y + @actions.length, 0)
      input_state[@x + (@game.map_size_x*@y)] = 1

      # Add reward, old state and current input state to the replay memory
      @replay_memory[@replay_memory_pointer] = {reward: r, old_input_state: @old_input_state, input_state: input_state}
      # Increment the memory pointer, wrapping around when the buffer is full
      @replay_memory_pointer = (@replay_memory_pointer + 1) % @replay_memory_size

      # Once the replay memory is full, train the network on a batch of states from the memory
      if @replay_memory.length >= @replay_memory_size
        # Randomly sample a batch of experiences from the memory and train the network with them
        @batch = @replay_memory.sample(@replay_batch_size)

        training_x_data = []
        training_y_data = []
        # For each sampled experience calculate a new Q-value based on the current network and the reward
        @batch.each do |m|
          # To get the entire Q-table row of the current state run the network once for every possible action
          q_table_row = []
          @actions.length.times do |a|
            # Create the neural network input vector for this action
            input_state_action = m[:input_state].clone
            # Set a 1 in the action location of the input vector
            input_state_action[(@game.map_size_x*@game.map_size_y) + a] = 1
            # Run the network for this action and get the Q-table row entry
            q_table_row[a] = @q_nn_model.run(input_state_action).first
          end

          # Update the Q-value: Q(s,a) = r + discount * max over a' of Q(s',a')
          updated_q_value = m[:reward] + @discount * q_table_row.max

          # Add to the training set
          training_x_data.push(m[:old_input_state])
          training_y_data.push([updated_q_value])
        end

        # Train the network on the batch for a single epoch
        train = RubyFann::TrainData.new(inputs: training_x_data, desired_outputs: training_y_data)
        @q_nn_model.train_on_data(train, 1, 1, 0.01)
      end
    end
    # Capture the current state and score
    # The network input is a map_size_x * map_size_y + actions.length vector with a 1 at the player position
    input_state = Array.new(@game.map_size_x*@game.map_size_y + @actions.length, 0)
    input_state[@x + (@game.map_size_x*@y)] = 1

    # Choose an action based on the Q-value estimates for the current state
    # If a random number is higher than epsilon we take a random action
    # The effective epsilon slowly increases with the number of runs up to @max_epsilon - this encourages early exploration
    epsilon_run_factor = (@runs/@epsilon_increase_factor) > (@max_epsilon-@epsilon) ? (@max_epsilon-@epsilon) : (@runs/@epsilon_increase_factor)
    if @r.rand > (@epsilon + epsilon_run_factor)
      # Select a random action
      @action_taken_index = @r.rand(@actions.length)
    else
      # To get the entire Q-table row of the current state run the network once for every possible action
      q_table_row = []
      @actions.length.times do |a|
        # Create the neural network input vector for this action
        input_state_action = input_state.clone
        # Set a 1 in the action location of the input vector
        input_state_action[(@game.map_size_x*@game.map_size_y) + a] = 1
        # Run the network for this action and get the Q-table row entry
        q_table_row[a] = @q_nn_model.run(input_state_action).first
      end
      # Select the action with the highest estimated Q-value
      @action_taken_index = q_table_row.each_with_index.max[1]
    end
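
    # Worked example (added for clarity): with @epsilon = 0.1, @max_epsilon = 0.9
    # and @epsilon_increase_factor = 800.0, the probability of taking the greedy
    # (network-chosen) action above is roughly @epsilon + @runs/800.0,
    # e.g. about 0.1 + 400/800.0 = 0.6 after 400 runs, capped at 0.9 from run 640 on.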
    # Save score and current state
    @old_score = @game.score
    # Set action taken in input state before storing it
    input_state[(@game.map_size_x*@game.map_size_y) + @action_taken_index] = 1
    @old_input_state = input_state

    # Take action
    return @actions[@action_taken_index]
  end

  def print_table
    # Note: this class stores its Q-function in the neural network rather than
    # in a @q_table array, so there is nothing to print unless @q_table has
    # been set elsewhere.
    return if @q_table.nil?
    @q_table.length.times do |i|
      puts @q_table[i].to_s
    end
  end
end
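
# Minimal smoke-test sketch (added for illustration, not from the original
# project): the player only needs a game object that responds to map_size_x,
# map_size_y, score and new_game, plus something that applies the returned
# action. StubGame below is a hypothetical stand-in for the real game class
# this player is written for, and the up/down direction mapping is arbitrary.
if __FILE__ == $0
  StubGame = Struct.new(:map_size_x, :map_size_y, :score, :new_game)

  game = StubGame.new(4, 4, 0, false)
  player = QLearningPlayer.new
  player.game = game

  10.times do
    action = player.get_input
    # Apply the chosen action, keeping the player on the map
    case action
    when :left  then player.x = [player.x - 1, 0].max
    when :right then player.x = [player.x + 1, game.map_size_x - 1].min
    when :up    then player.y = [player.y - 1, 0].max
    when :down  then player.y = [player.y + 1, game.map_size_y - 1].min
    end
    puts "took #{action}, now at (#{player.x}, #{player.y})"
  end
end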