Add save example
daochenzha committed Nov 5, 2019
1 parent 17a43f7 commit b526e0e
Showing 2 changed files with 131 additions and 0 deletions.
2 changes: 2 additions & 0 deletions examples/README.md
@@ -29,3 +29,5 @@ We provide some running examples. We will update the examples if we achieve bett
* `uno_nfsp.py`: train NFSP on UNO.
* `uno_random.py`: run random agents on UNO.
* `uno_single.py`: train DQN on UNO as single-agent environment.
* **Save models**: refer to `leduc_holdem_nfsp_save_model.py`.
* **Load models**: refer to [rlcard/models/pretrained\_models.py](../rlcard/models/pretrained_models.py). A minimal loading sketch follows below.
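
For orientation, here is a minimal loading sketch (not part of this commit): it rebuilds the same `NFSPAgent` graph as the training script below and restores the checkpoint with `tf.train.Saver`. The constructor arguments shown are copied from the training script; whether the remaining `NFSPAgent` arguments have suitable defaults is an assumption, so match the full argument list from the training script if restoring fails. `pretrained_models.py` remains the canonical reference.

```python
# Minimal loading sketch: assumes the checkpoint written by
# examples/leduc_holdem_nfsp_save_model.py exists under models/.
import tensorflow as tf

import rlcard
from rlcard.agents.nfsp_agent import NFSPAgent

env = rlcard.make('leduc-holdem')

with tf.Session() as sess:
    # Rebuild the same graph: the scopes and layer sizes must match
    # the ones used when the checkpoint was saved
    agents = []
    for i in range(env.player_num):
        agents.append(NFSPAgent(sess,
                                scope='nfsp' + str(i),
                                action_num=env.action_num,
                                state_shape=env.state_shape,
                                hidden_layers_sizes=[128, 128],
                                q_mlp_layers=[128, 128]))

    # Restore the saved variables instead of re-initializing them
    saver = tf.train.Saver()
    saver.restore(sess, 'models/leduc_holdem_nfsp/model')

    env.set_agents(agents)
```

Note that `saver.restore` takes the place of the `tf.global_variables_initializer()` call used in the training script.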
129 changes: 129 additions & 0 deletions examples/leduc_holdem_nfsp_save_model.py
@@ -0,0 +1,129 @@
''' An example of training an NFSP agent on Leduc Hold'em and saving the model.
The hyperparameters are slightly tuned for better performance, based on
playing against the agent through the human interfaces, so they may differ
from those in the version without the save code. Please contact us if you
find better hyperparameters. Models for other games/algorithms can be saved
with a similar procedure. For loading models, refer to
https://github.com/datamllab/rlcard/blob/master/rlcard/models/pretrained_models.py
'''

import os
import tensorflow as tf

import rlcard
from rlcard.agents.nfsp_agent import NFSPAgent
from rlcard.agents.random_agent import RandomAgent
from rlcard.utils.utils import set_global_seed
from rlcard.utils.logger import Logger

# Make environment
env = rlcard.make('leduc-holdem')
eval_env = rlcard.make('leduc-holdem')

# Set the number of training episodes and how frequently we evaluate and save plots
evaluate_every = 100000
save_plot_every = 100000
evaluate_num = 10000
episode_num = 100000000

# Set the number of steps for collecting normalization statistics
# and the initial memory size
memory_init_size = 1000
norm_step = 1000

# The paths for saving the logs and learning curves
root_path = './experiments/leduc_holdem_nfsp_result/'
log_path = root_path + 'log.txt'
csv_path = root_path + 'performance.csv'
figure_path = root_path + 'figures/'

# Model save path
if not os.path.exists('models'):
    os.makedirs('models')
model_path = 'models/leduc_holdem_nfsp/model'


# Set a global seed
set_global_seed(0)

with tf.Session() as sess:
    # Set agents
    global_step = tf.Variable(0, name='global_step', trainable=False)
    agents = []
    for i in range(env.player_num):
        agent = NFSPAgent(sess,
                          scope='nfsp' + str(i),
                          anticipatory_param=0.1,
                          action_num=env.action_num,
                          state_shape=env.state_shape,
                          rl_learning_rate=0.1,
                          sl_learning_rate=0.005,
                          hidden_layers_sizes=[128, 128],
                          min_buffer_size_to_learn=memory_init_size,
                          q_replay_memory_init_size=memory_init_size,
                          q_epsilon_start=0.06,
                          q_epsilon_end=0.0,
                          q_norm_step=norm_step,
                          q_mlp_layers=[128, 128])
        agents.append(agent)

    sess.run(tf.global_variables_initializer())

    saver = tf.train.Saver()

    random_agent = RandomAgent(action_num=eval_env.action_num)

    env.set_agents(agents)
    eval_env.set_agents([agents[0], random_agent])

    # Count the number of steps
    step_counters = [0 for _ in range(env.player_num)]

    # Init a Logger to plot the learning curve
    logger = Logger(xlabel='timestep', ylabel='reward', legend='NFSP on Leduc Holdem', log_path=log_path, csv_path=csv_path)

    for episode in range(episode_num):

        # First sample a policy for the episode
        for agent in agents:
            agent.sample_episode_policy()

        # Generate data from the environment
        trajectories, _ = env.run(is_training=True)

        # Feed transitions into agent memory, and train the agent
        for i in range(env.player_num):
            for ts in trajectories[i]:
                agents[i].feed(ts)
                step_counters[i] += 1

                # Train the agent
                train_count = step_counters[i] - (memory_init_size + norm_step)
                if train_count > 0 and train_count % 64 == 0:
                    rl_loss = agents[i].train_rl()
                    sl_loss = agents[i].train_sl()
                    print('\rINFO - Agent {}, step {}, rl-loss: {}, sl-loss: {}'.format(i, step_counters[i], rl_loss, sl_loss), end='')

        # Evaluate the performance. Play with random agents.
        if episode % evaluate_every == 0:
            # Save model
            saver.save(sess, model_path)

            reward = 0
            for eval_episode in range(evaluate_num):
                _, payoffs = eval_env.run(is_training=False)
                reward += payoffs[0]

            logger.log('\n########## Evaluation ##########')
            logger.log('Timestep: {} Average reward is {}'.format(env.timestep, float(reward)/evaluate_num))

            # Add point to logger
            logger.add_point(x=env.timestep, y=float(reward)/evaluate_num)

        # Make plot
        if episode % save_plot_every == 0 and episode > 0:
            logger.make_plot(save_path=figure_path+str(episode)+'.png')

    # Make the final plot
    logger.make_plot(save_path=figure_path+'final_'+str(episode)+'.png')
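
For reference, each `saver.save(sess, model_path)` call above writes a standard TensorFlow 1.x checkpoint set under `models/leduc_holdem_nfsp/`; the exact data-shard names can vary, but the directory typically contains:

```
checkpoint
model.data-00000-of-00001
model.index
model.meta
```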
