Commit b526e0e (1 parent: 17a43f7). Showing 2 changed files with 131 additions and 0 deletions.
@@ -0,0 +1,129 @@
''' An example of learning an NFSP Agent on Leduc Hold'em

The hyperparameters are slightly tuned for better performance, based on
the experience of playing against the agent through the human interfaces.
Thus, they may differ from the version without the saving code. Please
contact us if you find better hyperparameters. Models can be saved with
a similar procedure for other games/algorithms. For loading models, refer to
https://github.com/datamllab/rlcard/blob/master/rlcard/models/pretrained_models.py
'''

import os
import tensorflow as tf

import rlcard
from rlcard.agents.nfsp_agent import NFSPAgent
from rlcard.agents.random_agent import RandomAgent
from rlcard.utils.utils import set_global_seed
from rlcard.utils.logger import Logger

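# Two environment instances are used below: `env` collects self-play
# training data, while `eval_env` evaluates the first NFSP agent
# against a random agent.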
# Make environment
env = rlcard.make('leduc-holdem')
eval_env = rlcard.make('leduc-holdem')

# Set the iteration numbers and how frequently we evaluate/save the plot
evaluate_every = 100000
save_plot_every = 100000
evaluate_num = 10000
episode_num = 100000000

# Set the number of steps for collecting normalization statistics
# and the initial memory size
memory_init_size = 1000
norm_step = 1000

# The paths for saving the logs and learning curves
root_path = './experiments/leduc_holdem_nfsp_result/'
log_path = root_path + 'log.txt'
csv_path = root_path + 'performance.csv'
figure_path = root_path + 'figures/'

# Model save path
if not os.path.exists('models'):
    os.makedirs('models')
model_path = 'models/leduc_holdem_nfsp/model'

# Set a global seed
set_global_seed(0)
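
# NFSP trains two policies per player: an inner RL agent that learns a
# best response, and a supervised network that learns the average policy.
# anticipatory_param below is the probability that an agent acts with its
# best response in a given episode; otherwise it acts with the average policy.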

with tf.Session() as sess:
    # Set agents
    global_step = tf.Variable(0, name='global_step', trainable=False)
    agents = []
    for i in range(env.player_num):
        agent = NFSPAgent(sess,
                          scope='nfsp' + str(i),
                          anticipatory_param=0.1,
                          action_num=env.action_num,
                          state_shape=env.state_shape,
                          rl_learning_rate=0.1,
                          sl_learning_rate=0.005,
                          hidden_layers_sizes=[128, 128],
                          min_buffer_size_to_learn=memory_init_size,
                          q_replay_memory_init_size=memory_init_size,
                          q_epsilon_start=0.06,
                          q_epsilon_end=0.0,
                          q_norm_step=norm_step,
                          q_mlp_layers=[128, 128])
        agents.append(agent)

    sess.run(tf.global_variables_initializer())

    saver = tf.train.Saver()

    random_agent = RandomAgent(action_num=eval_env.action_num)

    env.set_agents(agents)
    eval_env.set_agents([agents[0], random_agent])

    # Count the number of steps
    step_counters = [0 for _ in range(env.player_num)]

    # Init a Logger to plot the learning curve
    logger = Logger(xlabel='timestep', ylabel='reward', legend='NFSP on Leduc Holdem', log_path=log_path, csv_path=csv_path)

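    # Each episode: every agent first samples which of its two policies to
    # act with, self-play then generates trajectories, and the transitions
    # are fed into the agents' memories, with periodic training and evaluation.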
    for episode in range(episode_num):

        # First sample a policy for the episode
        for agent in agents:
            agent.sample_episode_policy()

        # Generate data from the environment
        trajectories, _ = env.run(is_training=True)

        # Feed transitions into agent memory, and train the agent
        for i in range(env.player_num):
            for ts in trajectories[i]:
                agents[i].feed(ts)
                step_counters[i] += 1

                # Train the agent once the buffers are warm: training starts
                # after memory_init_size + norm_step steps, then runs every 64 steps
                train_count = step_counters[i] - (memory_init_size + norm_step)
                if train_count > 0 and train_count % 64 == 0:
                    rl_loss = agents[i].train_rl()
                    sl_loss = agents[i].train_sl()
                    print('\rINFO - Agent {}, step {}, rl-loss: {}, sl-loss: {}'.format(i, step_counters[i], rl_loss, sl_loss), end='')

        # Evaluate the performance. Play with random agents.
        if episode % evaluate_every == 0:
            # Save the model
            saver.save(sess, model_path)

            reward = 0
            for eval_episode in range(evaluate_num):
                _, payoffs = eval_env.run(is_training=False)
                reward += payoffs[0]

            logger.log('\n########## Evaluation ##########')
            logger.log('Timestep: {} Average reward is {}'.format(env.timestep, float(reward)/evaluate_num))

            # Add point to logger
            logger.add_point(x=env.timestep, y=float(reward)/evaluate_num)

        # Make plot
        if episode % save_plot_every == 0 and episode > 0:
            logger.make_plot(save_path=figure_path+str(episode)+'.png')

    # Make the final plot
    logger.make_plot(save_path=figure_path+'final_'+str(episode)+'.png')
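
For loading the saved model, the docstring above points to rlcard's pretrained_models.py. As a rough sketch (not rlcard's own loader), the standard TF1 pattern is to rebuild the same graph and restore the checkpoint; the NFSPAgent constructor arguments must mirror the training script so that the variable shapes match:

import tensorflow as tf

import rlcard
from rlcard.agents.nfsp_agent import NFSPAgent

eval_env = rlcard.make('leduc-holdem')

with tf.Session() as sess:
    # Rebuild the same agents (and thus the same TF variables) as in training
    agents = []
    for i in range(eval_env.player_num):
        agents.append(NFSPAgent(sess,
                                scope='nfsp' + str(i),
                                anticipatory_param=0.1,
                                action_num=eval_env.action_num,
                                state_shape=eval_env.state_shape,
                                hidden_layers_sizes=[128, 128],
                                q_mlp_layers=[128, 128]))

    # Restore the weights written by saver.save() in the training script
    saver = tf.train.Saver()
    saver.restore(sess, 'models/leduc_holdem_nfsp/model')

    # Play one evaluation game with the restored agents
    eval_env.set_agents(agents)
    _, payoffs = eval_env.run(is_training=False)
    print('Payoffs:', payoffs)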